Merge pull request #873 from sha2fiddy/feature/455/kg-sanitize-punctuation

fix: use permissive validator for KG entity values
This commit is contained in:
Igor Lins e Silva
2026-04-14 14:15:33 -03:00
committed by GitHub
3 changed files with 83 additions and 8 deletions
+24
View File
@@ -47,6 +47,30 @@ def sanitize_name(value: str, field_name: str = "name") -> str:
return value return value
def sanitize_kg_value(value: str, field_name: str = "value") -> str:
"""Validate a knowledge-graph entity name (subject or object).
More permissive than sanitize_name — allows punctuation like commas,
colons, and parentheses that are common in natural-language KG values.
Only blocks null bytes and over-length strings.
Not used for wing/room names (which have filesystem constraints) or
predicates (which should be simple relationship identifiers).
"""
if not isinstance(value, str) or not value.strip():
raise ValueError(f"{field_name} must be a non-empty string")
value = value.strip()
if len(value) > MAX_NAME_LENGTH:
raise ValueError(f"{field_name} exceeds maximum length of {MAX_NAME_LENGTH} characters")
if "\x00" in value:
raise ValueError(f"{field_name} contains null bytes")
return value
def sanitize_content(value: str, max_length: int = 100_000) -> str: def sanitize_content(value: str, max_length: int = 100_000) -> str:
"""Validate drawer/diary content length.""" """Validate drawer/diary content length."""
if not isinstance(value, str) or not value.strip(): if not isinstance(value, str) or not value.strip():
+7 -7
View File
@@ -30,7 +30,7 @@ import time
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from .config import MempalaceConfig, sanitize_name, sanitize_content from .config import MempalaceConfig, sanitize_kg_value, sanitize_name, sanitize_content
from .version import __version__ from .version import __version__
from .backends.chroma import ChromaBackend, ChromaCollection from .backends.chroma import ChromaBackend, ChromaCollection
from .query_sanitizer import sanitize_query from .query_sanitizer import sanitize_query
@@ -810,7 +810,7 @@ def tool_update_drawer(drawer_id: str, content: str = None, wing: str = None, ro
def tool_kg_query(entity: str, as_of: str = None, direction: str = "both"): def tool_kg_query(entity: str, as_of: str = None, direction: str = "both"):
"""Query the knowledge graph for an entity's relationships.""" """Query the knowledge graph for an entity's relationships."""
try: try:
entity = sanitize_name(entity, "entity") entity = sanitize_kg_value(entity, "entity")
except ValueError as e: except ValueError as e:
return {"error": str(e)} return {"error": str(e)}
if direction not in ("outgoing", "incoming", "both"): if direction not in ("outgoing", "incoming", "both"):
@@ -824,9 +824,9 @@ def tool_kg_add(
): ):
"""Add a relationship to the knowledge graph.""" """Add a relationship to the knowledge graph."""
try: try:
subject = sanitize_name(subject, "subject") subject = sanitize_kg_value(subject, "subject")
predicate = sanitize_name(predicate, "predicate") predicate = sanitize_name(predicate, "predicate")
object = sanitize_name(object, "object") object = sanitize_kg_value(object, "object")
except ValueError as e: except ValueError as e:
return {"success": False, "error": str(e)} return {"success": False, "error": str(e)}
@@ -849,9 +849,9 @@ def tool_kg_add(
def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = None): def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = None):
"""Mark a fact as no longer true (set end date).""" """Mark a fact as no longer true (set end date)."""
try: try:
subject = sanitize_name(subject, "subject") subject = sanitize_kg_value(subject, "subject")
predicate = sanitize_name(predicate, "predicate") predicate = sanitize_name(predicate, "predicate")
object = sanitize_name(object, "object") object = sanitize_kg_value(object, "object")
except ValueError as e: except ValueError as e:
return {"success": False, "error": str(e)} return {"success": False, "error": str(e)}
_wal_log( _wal_log(
@@ -870,7 +870,7 @@ def tool_kg_timeline(entity: str = None):
"""Get chronological timeline of facts, optionally for one entity.""" """Get chronological timeline of facts, optionally for one entity."""
if entity is not None: if entity is not None:
try: try:
entity = sanitize_name(entity, "entity") entity = sanitize_kg_value(entity, "entity")
except ValueError as e: except ValueError as e:
return {"error": str(e)} return {"error": str(e)}
results = _kg.timeline(entity) results = _kg.timeline(entity)
+52 -1
View File
@@ -3,7 +3,7 @@ import json
import tempfile import tempfile
import pytest import pytest
from mempalace.config import MempalaceConfig, sanitize_name from mempalace.config import MempalaceConfig, sanitize_kg_value, sanitize_name
def test_default_config(): def test_default_config():
@@ -66,3 +66,54 @@ def test_sanitize_name_rejects_path_traversal():
def test_sanitize_name_rejects_empty(): def test_sanitize_name_rejects_empty():
with pytest.raises(ValueError): with pytest.raises(ValueError):
sanitize_name("") sanitize_name("")
# --- sanitize_kg_value ---
def test_kg_value_accepts_commas():
assert sanitize_kg_value("Alice, Bob, and Carol") == "Alice, Bob, and Carol"
def test_kg_value_accepts_colons():
assert sanitize_kg_value("role: engineer") == "role: engineer"
def test_kg_value_accepts_parentheses():
assert sanitize_kg_value("Python (programming)") == "Python (programming)"
def test_kg_value_accepts_slashes():
assert sanitize_kg_value("owner/repo") == "owner/repo"
def test_kg_value_accepts_hash():
assert sanitize_kg_value("issue #123") == "issue #123"
def test_kg_value_accepts_unicode():
assert sanitize_kg_value("Jānis Bērziņš") == "Jānis Bērziņš"
def test_kg_value_strips_whitespace():
assert sanitize_kg_value(" hello ") == "hello"
def test_kg_value_rejects_empty():
with pytest.raises(ValueError):
sanitize_kg_value("")
def test_kg_value_rejects_whitespace_only():
with pytest.raises(ValueError):
sanitize_kg_value(" ")
def test_kg_value_rejects_null_bytes():
with pytest.raises(ValueError):
sanitize_kg_value("hello\x00world")
def test_kg_value_rejects_over_length():
with pytest.raises(ValueError):
sanitize_kg_value("a" * 129)