Merge pull request #873 from sha2fiddy/feature/455/kg-sanitize-punctuation
fix: use permissive validator for KG entity values
This commit is contained in:
@@ -47,6 +47,30 @@ def sanitize_name(value: str, field_name: str = "name") -> str:
|
|||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_kg_value(value: str, field_name: str = "value") -> str:
|
||||||
|
"""Validate a knowledge-graph entity name (subject or object).
|
||||||
|
|
||||||
|
More permissive than sanitize_name — allows punctuation like commas,
|
||||||
|
colons, and parentheses that are common in natural-language KG values.
|
||||||
|
Only blocks null bytes and over-length strings.
|
||||||
|
|
||||||
|
Not used for wing/room names (which have filesystem constraints) or
|
||||||
|
predicates (which should be simple relationship identifiers).
|
||||||
|
"""
|
||||||
|
if not isinstance(value, str) or not value.strip():
|
||||||
|
raise ValueError(f"{field_name} must be a non-empty string")
|
||||||
|
|
||||||
|
value = value.strip()
|
||||||
|
|
||||||
|
if len(value) > MAX_NAME_LENGTH:
|
||||||
|
raise ValueError(f"{field_name} exceeds maximum length of {MAX_NAME_LENGTH} characters")
|
||||||
|
|
||||||
|
if "\x00" in value:
|
||||||
|
raise ValueError(f"{field_name} contains null bytes")
|
||||||
|
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
def sanitize_content(value: str, max_length: int = 100_000) -> str:
|
def sanitize_content(value: str, max_length: int = 100_000) -> str:
|
||||||
"""Validate drawer/diary content length."""
|
"""Validate drawer/diary content length."""
|
||||||
if not isinstance(value, str) or not value.strip():
|
if not isinstance(value, str) or not value.strip():
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ import time
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from .config import MempalaceConfig, sanitize_name, sanitize_content
|
from .config import MempalaceConfig, sanitize_kg_value, sanitize_name, sanitize_content
|
||||||
from .version import __version__
|
from .version import __version__
|
||||||
from .backends.chroma import ChromaBackend, ChromaCollection
|
from .backends.chroma import ChromaBackend, ChromaCollection
|
||||||
from .query_sanitizer import sanitize_query
|
from .query_sanitizer import sanitize_query
|
||||||
@@ -810,7 +810,7 @@ def tool_update_drawer(drawer_id: str, content: str = None, wing: str = None, ro
|
|||||||
def tool_kg_query(entity: str, as_of: str = None, direction: str = "both"):
|
def tool_kg_query(entity: str, as_of: str = None, direction: str = "both"):
|
||||||
"""Query the knowledge graph for an entity's relationships."""
|
"""Query the knowledge graph for an entity's relationships."""
|
||||||
try:
|
try:
|
||||||
entity = sanitize_name(entity, "entity")
|
entity = sanitize_kg_value(entity, "entity")
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
return {"error": str(e)}
|
return {"error": str(e)}
|
||||||
if direction not in ("outgoing", "incoming", "both"):
|
if direction not in ("outgoing", "incoming", "both"):
|
||||||
@@ -824,9 +824,9 @@ def tool_kg_add(
|
|||||||
):
|
):
|
||||||
"""Add a relationship to the knowledge graph."""
|
"""Add a relationship to the knowledge graph."""
|
||||||
try:
|
try:
|
||||||
subject = sanitize_name(subject, "subject")
|
subject = sanitize_kg_value(subject, "subject")
|
||||||
predicate = sanitize_name(predicate, "predicate")
|
predicate = sanitize_name(predicate, "predicate")
|
||||||
object = sanitize_name(object, "object")
|
object = sanitize_kg_value(object, "object")
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
return {"success": False, "error": str(e)}
|
return {"success": False, "error": str(e)}
|
||||||
|
|
||||||
@@ -849,9 +849,9 @@ def tool_kg_add(
|
|||||||
def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = None):
|
def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = None):
|
||||||
"""Mark a fact as no longer true (set end date)."""
|
"""Mark a fact as no longer true (set end date)."""
|
||||||
try:
|
try:
|
||||||
subject = sanitize_name(subject, "subject")
|
subject = sanitize_kg_value(subject, "subject")
|
||||||
predicate = sanitize_name(predicate, "predicate")
|
predicate = sanitize_name(predicate, "predicate")
|
||||||
object = sanitize_name(object, "object")
|
object = sanitize_kg_value(object, "object")
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
return {"success": False, "error": str(e)}
|
return {"success": False, "error": str(e)}
|
||||||
_wal_log(
|
_wal_log(
|
||||||
@@ -870,7 +870,7 @@ def tool_kg_timeline(entity: str = None):
|
|||||||
"""Get chronological timeline of facts, optionally for one entity."""
|
"""Get chronological timeline of facts, optionally for one entity."""
|
||||||
if entity is not None:
|
if entity is not None:
|
||||||
try:
|
try:
|
||||||
entity = sanitize_name(entity, "entity")
|
entity = sanitize_kg_value(entity, "entity")
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
return {"error": str(e)}
|
return {"error": str(e)}
|
||||||
results = _kg.timeline(entity)
|
results = _kg.timeline(entity)
|
||||||
|
|||||||
+52
-1
@@ -3,7 +3,7 @@ import json
|
|||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from mempalace.config import MempalaceConfig, sanitize_name
|
from mempalace.config import MempalaceConfig, sanitize_kg_value, sanitize_name
|
||||||
|
|
||||||
|
|
||||||
def test_default_config():
|
def test_default_config():
|
||||||
@@ -66,3 +66,54 @@ def test_sanitize_name_rejects_path_traversal():
|
|||||||
def test_sanitize_name_rejects_empty():
|
def test_sanitize_name_rejects_empty():
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
sanitize_name("")
|
sanitize_name("")
|
||||||
|
|
||||||
|
|
||||||
|
# --- sanitize_kg_value ---
|
||||||
|
|
||||||
|
|
||||||
|
def test_kg_value_accepts_commas():
|
||||||
|
assert sanitize_kg_value("Alice, Bob, and Carol") == "Alice, Bob, and Carol"
|
||||||
|
|
||||||
|
|
||||||
|
def test_kg_value_accepts_colons():
|
||||||
|
assert sanitize_kg_value("role: engineer") == "role: engineer"
|
||||||
|
|
||||||
|
|
||||||
|
def test_kg_value_accepts_parentheses():
|
||||||
|
assert sanitize_kg_value("Python (programming)") == "Python (programming)"
|
||||||
|
|
||||||
|
|
||||||
|
def test_kg_value_accepts_slashes():
|
||||||
|
assert sanitize_kg_value("owner/repo") == "owner/repo"
|
||||||
|
|
||||||
|
|
||||||
|
def test_kg_value_accepts_hash():
|
||||||
|
assert sanitize_kg_value("issue #123") == "issue #123"
|
||||||
|
|
||||||
|
|
||||||
|
def test_kg_value_accepts_unicode():
|
||||||
|
assert sanitize_kg_value("Jānis Bērziņš") == "Jānis Bērziņš"
|
||||||
|
|
||||||
|
|
||||||
|
def test_kg_value_strips_whitespace():
|
||||||
|
assert sanitize_kg_value(" hello ") == "hello"
|
||||||
|
|
||||||
|
|
||||||
|
def test_kg_value_rejects_empty():
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
sanitize_kg_value("")
|
||||||
|
|
||||||
|
|
||||||
|
def test_kg_value_rejects_whitespace_only():
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
sanitize_kg_value(" ")
|
||||||
|
|
||||||
|
|
||||||
|
def test_kg_value_rejects_null_bytes():
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
sanitize_kg_value("hello\x00world")
|
||||||
|
|
||||||
|
|
||||||
|
def test_kg_value_rejects_over_length():
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
sanitize_kg_value("a" * 129)
|
||||||
|
|||||||
Reference in New Issue
Block a user