From 89904ed03f1270b0cfa7b8bb32d1aa93d9bbb894 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Sat, 18 Apr 2026 17:17:50 -0300 Subject: [PATCH] fix(sources): address Copilot review on #1014 Five findings from the automated review, fixed with targeted tests where behavior changed: 1. Transformation Protocol (transforms.py). The registry mixed a bytes-to-str transform (utf8_replace_invalid) with str-to-str transforms under a single Callable[..., str] type, misleading static type checkers and adapter authors. Introduced a Transformation Protocol with __call__(data: bytes|str) -> str and retyped the registry + get_transformation return. 2. Drawer-id collision risk (context.py). Switched _build_drawer_id from sha1[:16]=64 bits to sha256[:24]=96 bits. 64 bits sits uncomfortably close to the birthday bound for palace-sized corpora; 96 bits keeps the collision probability negligible while preserving the existing _ layout adapters rely on. 3. Fresh-schema KG columns (knowledge_graph.py). source_drawer_id and adapter_name now live in the canonical CREATE TABLE so new palaces don't take an ALTER round-trip on first open. _migrate_schema stays for legacy palaces (SQLite has no ADD COLUMN IF NOT EXISTS, so PRAGMA introspection is still needed there). 4. Identity-shim comment (transforms.py). Comment said the adapter-specific transforms "raise if invoked without adapter context" but they return the input unchanged. Updated the comment to match the actual identity- shim behavior Copilot suggested. 5. Test docstring (test_sources.py). Comment mentioned default_factory=list but SourceRef.options uses default_factory=dict. Corrected. Tests: 1020 passed (up from 1018), +2 new tests for the sha256 id shape and the fresh-schema column presence on new palaces. --- mempalace/knowledge_graph.py | 13 ++++++---- mempalace/sources/context.py | 12 ++++++---- mempalace/sources/transforms.py | 31 ++++++++++++++++++------ tests/test_sources.py | 42 ++++++++++++++++++++++++++++++++- 4 files changed, 80 insertions(+), 18 deletions(-) diff --git a/mempalace/knowledge_graph.py b/mempalace/knowledge_graph.py index 7dde5e5..9096ab2 100644 --- a/mempalace/knowledge_graph.py +++ b/mempalace/knowledge_graph.py @@ -83,6 +83,8 @@ class KnowledgeGraph: confidence REAL DEFAULT 1.0, source_closet TEXT, source_file TEXT, + source_drawer_id TEXT, + adapter_name TEXT, extracted_at TEXT DEFAULT CURRENT_TIMESTAMP, FOREIGN KEY (subject) REFERENCES entities(id), FOREIGN KEY (object) REFERENCES entities(id) @@ -99,11 +101,12 @@ class KnowledgeGraph: def _migrate_schema(self, conn): """Backwards-compatible schema migration for older triples tables. - RFC 002 §5.5 adds two optional provenance columns so adapter-written - triples can be traced back to (a) the specific drawer that produced - them and (b) the adapter that authored them. Older palaces predate - these columns; SQLite has no ``ADD COLUMN IF NOT EXISTS``, so we - introspect the schema first and only issue the ALTER if needed. + Fresh palaces get ``source_drawer_id`` / ``adapter_name`` (RFC 002 §5.5) + directly from the canonical ``CREATE TABLE`` above, so this path is a + no-op on new installs. It exists for palaces that were created before + those columns were added: SQLite has no ``ADD COLUMN IF NOT EXISTS``, + so we introspect the schema and only issue the ALTER when the column + is missing. """ existing = {row["name"] for row in conn.execute("PRAGMA table_info(triples)")} if "source_drawer_id" not in existing: diff --git a/mempalace/sources/context.py b/mempalace/sources/context.py index f9b2a0b..c5b8644 100644 --- a/mempalace/sources/context.py +++ b/mempalace/sources/context.py @@ -126,15 +126,17 @@ class PalaceContext: def _build_drawer_id(record: DrawerRecord) -> str: - """Deterministic drawer id: ``_``. + """Deterministic drawer id: ``_``. Matches the shape existing miners rely on (``source_file`` + chunk index pair) while keeping the id chroma-safe (no separators that collide with - existing metadata values). Adapters that need a different id scheme can - bypass :meth:`PalaceContext.upsert_drawer` and write through - ``drawer_collection.upsert`` directly. + existing metadata values). 96-bit SHA-256 prefix keeps collision risk + negligible across corpora the size of a palace (sha1@64 bits was too + close to the birthday bound for large ingests). Adapters that need a + different id scheme can bypass :meth:`PalaceContext.upsert_drawer` and + write through ``drawer_collection.upsert`` directly. """ import hashlib - digest = hashlib.sha1(record.source_file.encode("utf-8")).hexdigest()[:16] + digest = hashlib.sha256(record.source_file.encode("utf-8")).hexdigest()[:24] return f"{digest}_{record.chunk_index}" diff --git a/mempalace/sources/transforms.py b/mempalace/sources/transforms.py index 2c2ade2..6fb702b 100644 --- a/mempalace/sources/transforms.py +++ b/mempalace/sources/transforms.py @@ -24,7 +24,19 @@ conformance suite can locate and apply them. from __future__ import annotations import re -from typing import Callable +from typing import Protocol, Union + + +class Transformation(Protocol): + """Callable signature every reserved transformation conforms to. + + Accepts the current stage of the pipeline — ``bytes`` on input + (``utf8_replace_invalid``) or ``str`` after decoding — and returns ``str``. + Adapters compose them in declaration order; the first step operates on the + original source bytes, every subsequent step on the prior step's output. + """ + + def __call__(self, data: Union[bytes, str], /) -> str: ... # --------------------------------------------------------------------------- @@ -90,12 +102,14 @@ def blank_line_drop(text: str) -> str: # The following reserved transformations are declared in the spec but are # deeply adapter-specific. Rather than guess a single reference implementation -# now, we provide identity shims that raise if invoked without adapter-supplied -# context. Adapters that declare these MUST either override with a concrete -# implementation or provide a namespaced reference under +# now, we provide identity shims that leave the input unchanged when no +# adapter-specific implementation is available. Adapters that declare these +# MUST either override with a concrete implementation or provide a namespaced +# reference under # ``mempalace.sources.transforms._`` (per the # module docstring). The conformance suite looks up the adapter-specific -# implementation first, falling back to these only when none exists. +# implementation first, falling back to these identity shims only when none +# exists. def strip_tool_chrome(text: str) -> str: @@ -146,7 +160,10 @@ def speaker_role_assignment(text: str) -> str: # Reserved transformation name → reference implementation. # Adapters look up by name to compose a round-trip pipeline during testing. -RESERVED_TRANSFORMATIONS: dict[str, Callable[..., str]] = { +# The value conforms to the :class:`Transformation` protocol above; we type +# it as that Protocol rather than a concrete ``Callable`` so static checkers +# accept both the bytes→str (``utf8_replace_invalid``) and str→str shapes. +RESERVED_TRANSFORMATIONS: dict[str, Transformation] = { "utf8_replace_invalid": utf8_replace_invalid, "newline_normalize": newline_normalize, "whitespace_trim": whitespace_trim, @@ -163,7 +180,7 @@ RESERVED_TRANSFORMATIONS: dict[str, Callable[..., str]] = { } -def get_transformation(name: str) -> Callable[..., str]: +def get_transformation(name: str) -> Transformation: """Resolve a reserved transformation by name. Raises :class:`KeyError` if the name is neither reserved nor registered as diff --git a/tests/test_sources.py b/tests/test_sources.py index c8bd8fe..be24c32 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -103,7 +103,7 @@ def test_source_summary_default_uses_adapter_name(): def test_source_ref_options_default_is_empty_dict(): - # Frozen dataclass must not share default_factory=list instance across instances. + # Frozen dataclass must not share a default_factory=dict instance across instances. a = SourceRef(uri="a") b = SourceRef(uri="b") a.options["touched"] = True @@ -280,6 +280,27 @@ def test_palace_context_upsert_drawer_stamps_adapter_metadata(): assert meta["chunk_index"] == 2 +def test_palace_context_drawer_id_is_sha256_prefix_not_sha1(): + """Guards against the pre-review sha1[:16]=64-bit id scheme. + + 64-bit ids sit close to the birthday bound for palace-sized corpora. + The helper uses sha256[:24]=96 bits so collision risk stays negligible. + """ + import hashlib + + from mempalace.sources.context import _build_drawer_id + + src = "/an/absolute/path/to/a/file.txt" + record = DrawerRecord(content="x", source_file=src, chunk_index=3) + drawer_id = _build_drawer_id(record) + + expected_prefix = hashlib.sha256(src.encode("utf-8")).hexdigest()[:24] + assert drawer_id == f"{expected_prefix}_3" + # Negative: the old sha1 scheme MUST NOT produce the same id. + sha1_prefix = hashlib.sha1(src.encode("utf-8")).hexdigest()[:16] + assert drawer_id != f"{sha1_prefix}_3" + + def test_palace_context_skip_current_item_sets_flag(): ctx = PalaceContext( drawer_collection=_FakeCollection(), @@ -355,6 +376,25 @@ def test_knowledge_graph_add_triple_accepts_source_drawer_id_and_adapter_name(tm kg.close() +def test_knowledge_graph_fresh_schema_includes_new_columns(tmp_path): + """Brand-new palaces should get source_drawer_id / adapter_name directly + from CREATE TABLE, not via a post-hoc ALTER. _migrate_schema exists only + for legacy palaces.""" + import sqlite3 + + from mempalace.knowledge_graph import KnowledgeGraph + + kg = KnowledgeGraph(db_path=str(tmp_path / "fresh.sqlite3")) + try: + conn = sqlite3.connect(str(tmp_path / "fresh.sqlite3")) + cols = {row[1] for row in conn.execute("PRAGMA table_info(triples)")} + conn.close() + assert "source_drawer_id" in cols + assert "adapter_name" in cols + finally: + kg.close() + + def test_knowledge_graph_migration_adds_missing_columns_to_old_schema(tmp_path): """An old-schema triples table (pre-RFC 002) should auto-migrate on open.""" import sqlite3