fix(sources): address Copilot review on #1014

Five findings from the automated review, fixed with targeted tests where
behavior changed:

1. Transformation Protocol (transforms.py). The registry mixed a bytes-to-str
   transform (utf8_replace_invalid) with str-to-str transforms under a single
   Callable[..., str] type, misleading static type checkers and adapter
   authors. Introduced a Transformation Protocol with __call__(data: bytes|str)
   -> str and retyped the registry + get_transformation return.

2. Drawer-id collision risk (context.py). Switched _build_drawer_id from
   sha1[:16]=64 bits to sha256[:24]=96 bits. 64 bits sits uncomfortably
   close to the birthday bound for palace-sized corpora; 96 bits keeps the
   collision probability negligible while preserving the existing
   <prefix>_<chunk> layout adapters rely on.

3. Fresh-schema KG columns (knowledge_graph.py). source_drawer_id and
   adapter_name now live in the canonical CREATE TABLE so new palaces don't
   take an ALTER round-trip on first open. _migrate_schema stays for legacy
   palaces (SQLite has no ADD COLUMN IF NOT EXISTS, so PRAGMA introspection
   is still needed there).

4. Identity-shim comment (transforms.py). Comment said the adapter-specific
   transforms "raise if invoked without adapter context" but they return
   the input unchanged. Updated the comment to match the actual identity-
   shim behavior Copilot suggested.

5. Test docstring (test_sources.py). Comment mentioned default_factory=list
   but SourceRef.options uses default_factory=dict. Corrected.

Tests: 1020 passed (up from 1018), +2 new tests for the sha256 id shape
and the fresh-schema column presence on new palaces.
This commit is contained in:
Igor Lins e Silva
2026-04-18 17:17:50 -03:00
parent 552e9927b7
commit 89904ed03f
4 changed files with 80 additions and 18 deletions
+8 -5
View File
@@ -83,6 +83,8 @@ class KnowledgeGraph:
confidence REAL DEFAULT 1.0, confidence REAL DEFAULT 1.0,
source_closet TEXT, source_closet TEXT,
source_file TEXT, source_file TEXT,
source_drawer_id TEXT,
adapter_name TEXT,
extracted_at TEXT DEFAULT CURRENT_TIMESTAMP, extracted_at TEXT DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (subject) REFERENCES entities(id), FOREIGN KEY (subject) REFERENCES entities(id),
FOREIGN KEY (object) REFERENCES entities(id) FOREIGN KEY (object) REFERENCES entities(id)
@@ -99,11 +101,12 @@ class KnowledgeGraph:
def _migrate_schema(self, conn): def _migrate_schema(self, conn):
"""Backwards-compatible schema migration for older triples tables. """Backwards-compatible schema migration for older triples tables.
RFC 002 §5.5 adds two optional provenance columns so adapter-written Fresh palaces get ``source_drawer_id`` / ``adapter_name`` (RFC 002 §5.5)
triples can be traced back to (a) the specific drawer that produced directly from the canonical ``CREATE TABLE`` above, so this path is a
them and (b) the adapter that authored them. Older palaces predate no-op on new installs. It exists for palaces that were created before
these columns; SQLite has no ``ADD COLUMN IF NOT EXISTS``, so we those columns were added: SQLite has no ``ADD COLUMN IF NOT EXISTS``,
introspect the schema first and only issue the ALTER if needed. so we introspect the schema and only issue the ALTER when the column
is missing.
""" """
existing = {row["name"] for row in conn.execute("PRAGMA table_info(triples)")} existing = {row["name"] for row in conn.execute("PRAGMA table_info(triples)")}
if "source_drawer_id" not in existing: if "source_drawer_id" not in existing:
+7 -5
View File
@@ -126,15 +126,17 @@ class PalaceContext:
def _build_drawer_id(record: DrawerRecord) -> str: def _build_drawer_id(record: DrawerRecord) -> str:
"""Deterministic drawer id: ``<sha1(source_file)>_<chunk_index>``. """Deterministic drawer id: ``<sha256(source_file)[:24]>_<chunk_index>``.
Matches the shape existing miners rely on (``source_file`` + chunk index Matches the shape existing miners rely on (``source_file`` + chunk index
pair) while keeping the id chroma-safe (no separators that collide with pair) while keeping the id chroma-safe (no separators that collide with
existing metadata values). Adapters that need a different id scheme can existing metadata values). 96-bit SHA-256 prefix keeps collision risk
bypass :meth:`PalaceContext.upsert_drawer` and write through negligible across corpora the size of a palace (sha1@64 bits was too
``drawer_collection.upsert`` directly. close to the birthday bound for large ingests). Adapters that need a
different id scheme can bypass :meth:`PalaceContext.upsert_drawer` and
write through ``drawer_collection.upsert`` directly.
""" """
import hashlib import hashlib
digest = hashlib.sha1(record.source_file.encode("utf-8")).hexdigest()[:16] digest = hashlib.sha256(record.source_file.encode("utf-8")).hexdigest()[:24]
return f"{digest}_{record.chunk_index}" return f"{digest}_{record.chunk_index}"
+24 -7
View File
@@ -24,7 +24,19 @@ conformance suite can locate and apply them.
from __future__ import annotations from __future__ import annotations
import re import re
from typing import Callable from typing import Protocol, Union
class Transformation(Protocol):
"""Callable signature every reserved transformation conforms to.
Accepts the current stage of the pipeline — ``bytes`` on input
(``utf8_replace_invalid``) or ``str`` after decoding — and returns ``str``.
Adapters compose them in declaration order; the first step operates on the
original source bytes, every subsequent step on the prior step's output.
"""
def __call__(self, data: Union[bytes, str], /) -> str: ...
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -90,12 +102,14 @@ def blank_line_drop(text: str) -> str:
# The following reserved transformations are declared in the spec but are # The following reserved transformations are declared in the spec but are
# deeply adapter-specific. Rather than guess a single reference implementation # deeply adapter-specific. Rather than guess a single reference implementation
# now, we provide identity shims that raise if invoked without adapter-supplied # now, we provide identity shims that leave the input unchanged when no
# context. Adapters that declare these MUST either override with a concrete # adapter-specific implementation is available. Adapters that declare these
# implementation or provide a namespaced reference under # MUST either override with a concrete implementation or provide a namespaced
# reference under
# ``mempalace.sources.transforms.<adapter_name>_<transform_name>`` (per the # ``mempalace.sources.transforms.<adapter_name>_<transform_name>`` (per the
# module docstring). The conformance suite looks up the adapter-specific # module docstring). The conformance suite looks up the adapter-specific
# implementation first, falling back to these only when none exists. # implementation first, falling back to these identity shims only when none
# exists.
def strip_tool_chrome(text: str) -> str: def strip_tool_chrome(text: str) -> str:
@@ -146,7 +160,10 @@ def speaker_role_assignment(text: str) -> str:
# Reserved transformation name → reference implementation. # Reserved transformation name → reference implementation.
# Adapters look up by name to compose a round-trip pipeline during testing. # Adapters look up by name to compose a round-trip pipeline during testing.
RESERVED_TRANSFORMATIONS: dict[str, Callable[..., str]] = { # The value conforms to the :class:`Transformation` protocol above; we type
# it as that Protocol rather than a concrete ``Callable`` so static checkers
# accept both the bytes→str (``utf8_replace_invalid``) and str→str shapes.
RESERVED_TRANSFORMATIONS: dict[str, Transformation] = {
"utf8_replace_invalid": utf8_replace_invalid, "utf8_replace_invalid": utf8_replace_invalid,
"newline_normalize": newline_normalize, "newline_normalize": newline_normalize,
"whitespace_trim": whitespace_trim, "whitespace_trim": whitespace_trim,
@@ -163,7 +180,7 @@ RESERVED_TRANSFORMATIONS: dict[str, Callable[..., str]] = {
} }
def get_transformation(name: str) -> Callable[..., str]: def get_transformation(name: str) -> Transformation:
"""Resolve a reserved transformation by name. """Resolve a reserved transformation by name.
Raises :class:`KeyError` if the name is neither reserved nor registered as Raises :class:`KeyError` if the name is neither reserved nor registered as
+41 -1
View File
@@ -103,7 +103,7 @@ def test_source_summary_default_uses_adapter_name():
def test_source_ref_options_default_is_empty_dict(): def test_source_ref_options_default_is_empty_dict():
# Frozen dataclass must not share default_factory=list instance across instances. # Frozen dataclass must not share a default_factory=dict instance across instances.
a = SourceRef(uri="a") a = SourceRef(uri="a")
b = SourceRef(uri="b") b = SourceRef(uri="b")
a.options["touched"] = True a.options["touched"] = True
@@ -280,6 +280,27 @@ def test_palace_context_upsert_drawer_stamps_adapter_metadata():
assert meta["chunk_index"] == 2 assert meta["chunk_index"] == 2
def test_palace_context_drawer_id_is_sha256_prefix_not_sha1():
"""Guards against the pre-review sha1[:16]=64-bit id scheme.
64-bit ids sit close to the birthday bound for palace-sized corpora.
The helper uses sha256[:24]=96 bits so collision risk stays negligible.
"""
import hashlib
from mempalace.sources.context import _build_drawer_id
src = "/an/absolute/path/to/a/file.txt"
record = DrawerRecord(content="x", source_file=src, chunk_index=3)
drawer_id = _build_drawer_id(record)
expected_prefix = hashlib.sha256(src.encode("utf-8")).hexdigest()[:24]
assert drawer_id == f"{expected_prefix}_3"
# Negative: the old sha1 scheme MUST NOT produce the same id.
sha1_prefix = hashlib.sha1(src.encode("utf-8")).hexdigest()[:16]
assert drawer_id != f"{sha1_prefix}_3"
def test_palace_context_skip_current_item_sets_flag(): def test_palace_context_skip_current_item_sets_flag():
ctx = PalaceContext( ctx = PalaceContext(
drawer_collection=_FakeCollection(), drawer_collection=_FakeCollection(),
@@ -355,6 +376,25 @@ def test_knowledge_graph_add_triple_accepts_source_drawer_id_and_adapter_name(tm
kg.close() kg.close()
def test_knowledge_graph_fresh_schema_includes_new_columns(tmp_path):
"""Brand-new palaces should get source_drawer_id / adapter_name directly
from CREATE TABLE, not via a post-hoc ALTER. _migrate_schema exists only
for legacy palaces."""
import sqlite3
from mempalace.knowledge_graph import KnowledgeGraph
kg = KnowledgeGraph(db_path=str(tmp_path / "fresh.sqlite3"))
try:
conn = sqlite3.connect(str(tmp_path / "fresh.sqlite3"))
cols = {row[1] for row in conn.execute("PRAGMA table_info(triples)")}
conn.close()
assert "source_drawer_id" in cols
assert "adapter_name" in cols
finally:
kg.close()
def test_knowledge_graph_migration_adds_missing_columns_to_old_schema(tmp_path): def test_knowledge_graph_migration_adds_missing_columns_to_old_schema(tmp_path):
"""An old-schema triples table (pre-RFC 002) should auto-migrate on open.""" """An old-schema triples table (pre-RFC 002) should auto-migrate on open."""
import sqlite3 import sqlite3