Merge pull request #873 from sha2fiddy/feature/455/kg-sanitize-punctuation
fix: use permissive validator for KG entity values
This commit is contained in:
@@ -47,6 +47,30 @@ def sanitize_name(value: str, field_name: str = "name") -> str:
|
||||
return value
|
||||
|
||||
|
||||
def sanitize_kg_value(value: str, field_name: str = "value") -> str:
|
||||
"""Validate a knowledge-graph entity name (subject or object).
|
||||
|
||||
More permissive than sanitize_name — allows punctuation like commas,
|
||||
colons, and parentheses that are common in natural-language KG values.
|
||||
Only blocks null bytes and over-length strings.
|
||||
|
||||
Not used for wing/room names (which have filesystem constraints) or
|
||||
predicates (which should be simple relationship identifiers).
|
||||
"""
|
||||
if not isinstance(value, str) or not value.strip():
|
||||
raise ValueError(f"{field_name} must be a non-empty string")
|
||||
|
||||
value = value.strip()
|
||||
|
||||
if len(value) > MAX_NAME_LENGTH:
|
||||
raise ValueError(f"{field_name} exceeds maximum length of {MAX_NAME_LENGTH} characters")
|
||||
|
||||
if "\x00" in value:
|
||||
raise ValueError(f"{field_name} contains null bytes")
|
||||
|
||||
return value
|
||||
|
||||
|
||||
def sanitize_content(value: str, max_length: int = 100_000) -> str:
|
||||
"""Validate drawer/diary content length."""
|
||||
if not isinstance(value, str) or not value.strip():
|
||||
|
||||
@@ -30,7 +30,7 @@ import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from .config import MempalaceConfig, sanitize_name, sanitize_content
|
||||
from .config import MempalaceConfig, sanitize_kg_value, sanitize_name, sanitize_content
|
||||
from .version import __version__
|
||||
from .backends.chroma import ChromaBackend, ChromaCollection
|
||||
from .query_sanitizer import sanitize_query
|
||||
@@ -810,7 +810,7 @@ def tool_update_drawer(drawer_id: str, content: str = None, wing: str = None, ro
|
||||
def tool_kg_query(entity: str, as_of: str = None, direction: str = "both"):
|
||||
"""Query the knowledge graph for an entity's relationships."""
|
||||
try:
|
||||
entity = sanitize_name(entity, "entity")
|
||||
entity = sanitize_kg_value(entity, "entity")
|
||||
except ValueError as e:
|
||||
return {"error": str(e)}
|
||||
if direction not in ("outgoing", "incoming", "both"):
|
||||
@@ -824,9 +824,9 @@ def tool_kg_add(
|
||||
):
|
||||
"""Add a relationship to the knowledge graph."""
|
||||
try:
|
||||
subject = sanitize_name(subject, "subject")
|
||||
subject = sanitize_kg_value(subject, "subject")
|
||||
predicate = sanitize_name(predicate, "predicate")
|
||||
object = sanitize_name(object, "object")
|
||||
object = sanitize_kg_value(object, "object")
|
||||
except ValueError as e:
|
||||
return {"success": False, "error": str(e)}
|
||||
|
||||
@@ -849,9 +849,9 @@ def tool_kg_add(
|
||||
def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = None):
|
||||
"""Mark a fact as no longer true (set end date)."""
|
||||
try:
|
||||
subject = sanitize_name(subject, "subject")
|
||||
subject = sanitize_kg_value(subject, "subject")
|
||||
predicate = sanitize_name(predicate, "predicate")
|
||||
object = sanitize_name(object, "object")
|
||||
object = sanitize_kg_value(object, "object")
|
||||
except ValueError as e:
|
||||
return {"success": False, "error": str(e)}
|
||||
_wal_log(
|
||||
@@ -870,7 +870,7 @@ def tool_kg_timeline(entity: str = None):
|
||||
"""Get chronological timeline of facts, optionally for one entity."""
|
||||
if entity is not None:
|
||||
try:
|
||||
entity = sanitize_name(entity, "entity")
|
||||
entity = sanitize_kg_value(entity, "entity")
|
||||
except ValueError as e:
|
||||
return {"error": str(e)}
|
||||
results = _kg.timeline(entity)
|
||||
|
||||
+52
-1
@@ -3,7 +3,7 @@ import json
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
from mempalace.config import MempalaceConfig, sanitize_name
|
||||
from mempalace.config import MempalaceConfig, sanitize_kg_value, sanitize_name
|
||||
|
||||
|
||||
def test_default_config():
|
||||
@@ -66,3 +66,54 @@ def test_sanitize_name_rejects_path_traversal():
|
||||
def test_sanitize_name_rejects_empty():
|
||||
with pytest.raises(ValueError):
|
||||
sanitize_name("")
|
||||
|
||||
|
||||
# --- sanitize_kg_value ---
|
||||
|
||||
|
||||
def test_kg_value_accepts_commas():
|
||||
assert sanitize_kg_value("Alice, Bob, and Carol") == "Alice, Bob, and Carol"
|
||||
|
||||
|
||||
def test_kg_value_accepts_colons():
|
||||
assert sanitize_kg_value("role: engineer") == "role: engineer"
|
||||
|
||||
|
||||
def test_kg_value_accepts_parentheses():
|
||||
assert sanitize_kg_value("Python (programming)") == "Python (programming)"
|
||||
|
||||
|
||||
def test_kg_value_accepts_slashes():
|
||||
assert sanitize_kg_value("owner/repo") == "owner/repo"
|
||||
|
||||
|
||||
def test_kg_value_accepts_hash():
|
||||
assert sanitize_kg_value("issue #123") == "issue #123"
|
||||
|
||||
|
||||
def test_kg_value_accepts_unicode():
|
||||
assert sanitize_kg_value("Jānis Bērziņš") == "Jānis Bērziņš"
|
||||
|
||||
|
||||
def test_kg_value_strips_whitespace():
|
||||
assert sanitize_kg_value(" hello ") == "hello"
|
||||
|
||||
|
||||
def test_kg_value_rejects_empty():
|
||||
with pytest.raises(ValueError):
|
||||
sanitize_kg_value("")
|
||||
|
||||
|
||||
def test_kg_value_rejects_whitespace_only():
|
||||
with pytest.raises(ValueError):
|
||||
sanitize_kg_value(" ")
|
||||
|
||||
|
||||
def test_kg_value_rejects_null_bytes():
|
||||
with pytest.raises(ValueError):
|
||||
sanitize_kg_value("hello\x00world")
|
||||
|
||||
|
||||
def test_kg_value_rejects_over_length():
|
||||
with pytest.raises(ValueError):
|
||||
sanitize_kg_value("a" * 129)
|
||||
|
||||
Reference in New Issue
Block a user