fix: use upsert and deterministic IDs to prevent data stagnation
MCP tool_add_drawer: - Make drawer_id content-based: hash full content instead of content[:100] + timestamp. Same content → same ID, eliminating TOCTOU race conditions - Switch from col.add() to col.upsert() so re-filing with updated content updates the existing drawer miner.add_drawer: - Switch from collection.add() to collection.upsert() so re-mining a modified file updates instead of silently failing - Remove the try/except catching 'already exists' — upsert handles this naturally Findings: #11 (HIGH — add ignores updates), #6 (MEDIUM — TOCTOU), #13 (MEDIUM — non-deterministic IDs) Includes test infrastructure from PR #131. 92 tests pass.
This commit is contained in:
@@ -292,10 +292,10 @@ def tool_add_drawer(
|
|||||||
"matches": dup["matches"],
|
"matches": dup["matches"],
|
||||||
}
|
}
|
||||||
|
|
||||||
drawer_id = f"drawer_{wing}_{room}_{hashlib.md5((content[:100] + datetime.now().isoformat()).encode()).hexdigest()[:16]}"
|
drawer_id = f"drawer_{wing}_{room}_{hashlib.md5(content.encode()).hexdigest()[:16]}"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
col.add(
|
col.upsert(
|
||||||
ids=[drawer_id],
|
ids=[drawer_id],
|
||||||
documents=[content],
|
documents=[content],
|
||||||
metadatas=[
|
metadatas=[
|
||||||
|
|||||||
+2
-4
@@ -417,7 +417,7 @@ def add_drawer(
|
|||||||
"""Add one drawer to the palace."""
|
"""Add one drawer to the palace."""
|
||||||
drawer_id = f"drawer_{wing}_{room}_{hashlib.md5((source_file + str(chunk_index)).encode(), usedforsecurity=False).hexdigest()[:16]}"
|
drawer_id = f"drawer_{wing}_{room}_{hashlib.md5((source_file + str(chunk_index)).encode(), usedforsecurity=False).hexdigest()[:16]}"
|
||||||
try:
|
try:
|
||||||
collection.add(
|
collection.upsert(
|
||||||
documents=[content],
|
documents=[content],
|
||||||
ids=[drawer_id],
|
ids=[drawer_id],
|
||||||
metadatas=[
|
metadatas=[
|
||||||
@@ -432,9 +432,7 @@ def add_drawer(
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
return True
|
return True
|
||||||
except Exception as e:
|
except Exception:
|
||||||
if "already exists" in str(e).lower() or "duplicate" in str(e).lower():
|
|
||||||
return False
|
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ timeline, stats, and edge cases (duplicate triples, ID collisions).
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class TestEntityOperations:
|
class TestEntityOperations:
|
||||||
def test_add_entity(self, kg):
|
def test_add_entity(self, kg):
|
||||||
eid = kg.add_entity("Alice", entity_type="person")
|
eid = kg.add_entity("Alice", entity_type="person")
|
||||||
@@ -124,7 +125,6 @@ class TestWALMode:
|
|||||||
conn.close()
|
conn.close()
|
||||||
assert mode == "wal"
|
assert mode == "wal"
|
||||||
|
|
||||||
|
|
||||||
class TestStats:
|
class TestStats:
|
||||||
def test_stats_empty(self, kg):
|
def test_stats_empty(self, kg):
|
||||||
stats = kg.stats()
|
stats = kg.stats()
|
||||||
|
|||||||
Reference in New Issue
Block a user