fix(sources): address Copilot review on #1014
Five findings from the automated review, fixed with targeted tests where behavior changed: 1. Transformation Protocol (transforms.py). The registry mixed a bytes-to-str transform (utf8_replace_invalid) with str-to-str transforms under a single Callable[..., str] type, misleading static type checkers and adapter authors. Introduced a Transformation Protocol with __call__(data: bytes|str) -> str and retyped the registry + get_transformation return. 2. Drawer-id collision risk (context.py). Switched _build_drawer_id from sha1[:16]=64 bits to sha256[:24]=96 bits. 64 bits sits uncomfortably close to the birthday bound for palace-sized corpora; 96 bits keeps the collision probability negligible while preserving the existing <prefix>_<chunk> layout adapters rely on. 3. Fresh-schema KG columns (knowledge_graph.py). source_drawer_id and adapter_name now live in the canonical CREATE TABLE so new palaces don't take an ALTER round-trip on first open. _migrate_schema stays for legacy palaces (SQLite has no ADD COLUMN IF NOT EXISTS, so PRAGMA introspection is still needed there). 4. Identity-shim comment (transforms.py). Comment said the adapter-specific transforms "raise if invoked without adapter context" but they return the input unchanged. Updated the comment to match the actual identity- shim behavior Copilot suggested. 5. Test docstring (test_sources.py). Comment mentioned default_factory=list but SourceRef.options uses default_factory=dict. Corrected. Tests: 1020 passed (up from 1018), +2 new tests for the sha256 id shape and the fresh-schema column presence on new palaces.
This commit is contained in:
@@ -83,6 +83,8 @@ class KnowledgeGraph:
|
||||
confidence REAL DEFAULT 1.0,
|
||||
source_closet TEXT,
|
||||
source_file TEXT,
|
||||
source_drawer_id TEXT,
|
||||
adapter_name TEXT,
|
||||
extracted_at TEXT DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (subject) REFERENCES entities(id),
|
||||
FOREIGN KEY (object) REFERENCES entities(id)
|
||||
@@ -99,11 +101,12 @@ class KnowledgeGraph:
|
||||
def _migrate_schema(self, conn):
|
||||
"""Backwards-compatible schema migration for older triples tables.
|
||||
|
||||
RFC 002 §5.5 adds two optional provenance columns so adapter-written
|
||||
triples can be traced back to (a) the specific drawer that produced
|
||||
them and (b) the adapter that authored them. Older palaces predate
|
||||
these columns; SQLite has no ``ADD COLUMN IF NOT EXISTS``, so we
|
||||
introspect the schema first and only issue the ALTER if needed.
|
||||
Fresh palaces get ``source_drawer_id`` / ``adapter_name`` (RFC 002 §5.5)
|
||||
directly from the canonical ``CREATE TABLE`` above, so this path is a
|
||||
no-op on new installs. It exists for palaces that were created before
|
||||
those columns were added: SQLite has no ``ADD COLUMN IF NOT EXISTS``,
|
||||
so we introspect the schema and only issue the ALTER when the column
|
||||
is missing.
|
||||
"""
|
||||
existing = {row["name"] for row in conn.execute("PRAGMA table_info(triples)")}
|
||||
if "source_drawer_id" not in existing:
|
||||
|
||||
@@ -126,15 +126,17 @@ class PalaceContext:
|
||||
|
||||
|
||||
def _build_drawer_id(record: DrawerRecord) -> str:
|
||||
"""Deterministic drawer id: ``<sha1(source_file)>_<chunk_index>``.
|
||||
"""Deterministic drawer id: ``<sha256(source_file)[:24]>_<chunk_index>``.
|
||||
|
||||
Matches the shape existing miners rely on (``source_file`` + chunk index
|
||||
pair) while keeping the id chroma-safe (no separators that collide with
|
||||
existing metadata values). Adapters that need a different id scheme can
|
||||
bypass :meth:`PalaceContext.upsert_drawer` and write through
|
||||
``drawer_collection.upsert`` directly.
|
||||
existing metadata values). 96-bit SHA-256 prefix keeps collision risk
|
||||
negligible across corpora the size of a palace (sha1@64 bits was too
|
||||
close to the birthday bound for large ingests). Adapters that need a
|
||||
different id scheme can bypass :meth:`PalaceContext.upsert_drawer` and
|
||||
write through ``drawer_collection.upsert`` directly.
|
||||
"""
|
||||
import hashlib
|
||||
|
||||
digest = hashlib.sha1(record.source_file.encode("utf-8")).hexdigest()[:16]
|
||||
digest = hashlib.sha256(record.source_file.encode("utf-8")).hexdigest()[:24]
|
||||
return f"{digest}_{record.chunk_index}"
|
||||
|
||||
@@ -24,7 +24,19 @@ conformance suite can locate and apply them.
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Callable
|
||||
from typing import Protocol, Union
|
||||
|
||||
|
||||
class Transformation(Protocol):
|
||||
"""Callable signature every reserved transformation conforms to.
|
||||
|
||||
Accepts the current stage of the pipeline — ``bytes`` on input
|
||||
(``utf8_replace_invalid``) or ``str`` after decoding — and returns ``str``.
|
||||
Adapters compose them in declaration order; the first step operates on the
|
||||
original source bytes, every subsequent step on the prior step's output.
|
||||
"""
|
||||
|
||||
def __call__(self, data: Union[bytes, str], /) -> str: ...
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -90,12 +102,14 @@ def blank_line_drop(text: str) -> str:
|
||||
|
||||
# The following reserved transformations are declared in the spec but are
|
||||
# deeply adapter-specific. Rather than guess a single reference implementation
|
||||
# now, we provide identity shims that raise if invoked without adapter-supplied
|
||||
# context. Adapters that declare these MUST either override with a concrete
|
||||
# implementation or provide a namespaced reference under
|
||||
# now, we provide identity shims that leave the input unchanged when no
|
||||
# adapter-specific implementation is available. Adapters that declare these
|
||||
# MUST either override with a concrete implementation or provide a namespaced
|
||||
# reference under
|
||||
# ``mempalace.sources.transforms.<adapter_name>_<transform_name>`` (per the
|
||||
# module docstring). The conformance suite looks up the adapter-specific
|
||||
# implementation first, falling back to these only when none exists.
|
||||
# implementation first, falling back to these identity shims only when none
|
||||
# exists.
|
||||
|
||||
|
||||
def strip_tool_chrome(text: str) -> str:
|
||||
@@ -146,7 +160,10 @@ def speaker_role_assignment(text: str) -> str:
|
||||
|
||||
# Reserved transformation name → reference implementation.
|
||||
# Adapters look up by name to compose a round-trip pipeline during testing.
|
||||
RESERVED_TRANSFORMATIONS: dict[str, Callable[..., str]] = {
|
||||
# The value conforms to the :class:`Transformation` protocol above; we type
|
||||
# it as that Protocol rather than a concrete ``Callable`` so static checkers
|
||||
# accept both the bytes→str (``utf8_replace_invalid``) and str→str shapes.
|
||||
RESERVED_TRANSFORMATIONS: dict[str, Transformation] = {
|
||||
"utf8_replace_invalid": utf8_replace_invalid,
|
||||
"newline_normalize": newline_normalize,
|
||||
"whitespace_trim": whitespace_trim,
|
||||
@@ -163,7 +180,7 @@ RESERVED_TRANSFORMATIONS: dict[str, Callable[..., str]] = {
|
||||
}
|
||||
|
||||
|
||||
def get_transformation(name: str) -> Callable[..., str]:
|
||||
def get_transformation(name: str) -> Transformation:
|
||||
"""Resolve a reserved transformation by name.
|
||||
|
||||
Raises :class:`KeyError` if the name is neither reserved nor registered as
|
||||
|
||||
+41
-1
@@ -103,7 +103,7 @@ def test_source_summary_default_uses_adapter_name():
|
||||
|
||||
|
||||
def test_source_ref_options_default_is_empty_dict():
|
||||
# Frozen dataclass must not share default_factory=list instance across instances.
|
||||
# Frozen dataclass must not share a default_factory=dict instance across instances.
|
||||
a = SourceRef(uri="a")
|
||||
b = SourceRef(uri="b")
|
||||
a.options["touched"] = True
|
||||
@@ -280,6 +280,27 @@ def test_palace_context_upsert_drawer_stamps_adapter_metadata():
|
||||
assert meta["chunk_index"] == 2
|
||||
|
||||
|
||||
def test_palace_context_drawer_id_is_sha256_prefix_not_sha1():
|
||||
"""Guards against the pre-review sha1[:16]=64-bit id scheme.
|
||||
|
||||
64-bit ids sit close to the birthday bound for palace-sized corpora.
|
||||
The helper uses sha256[:24]=96 bits so collision risk stays negligible.
|
||||
"""
|
||||
import hashlib
|
||||
|
||||
from mempalace.sources.context import _build_drawer_id
|
||||
|
||||
src = "/an/absolute/path/to/a/file.txt"
|
||||
record = DrawerRecord(content="x", source_file=src, chunk_index=3)
|
||||
drawer_id = _build_drawer_id(record)
|
||||
|
||||
expected_prefix = hashlib.sha256(src.encode("utf-8")).hexdigest()[:24]
|
||||
assert drawer_id == f"{expected_prefix}_3"
|
||||
# Negative: the old sha1 scheme MUST NOT produce the same id.
|
||||
sha1_prefix = hashlib.sha1(src.encode("utf-8")).hexdigest()[:16]
|
||||
assert drawer_id != f"{sha1_prefix}_3"
|
||||
|
||||
|
||||
def test_palace_context_skip_current_item_sets_flag():
|
||||
ctx = PalaceContext(
|
||||
drawer_collection=_FakeCollection(),
|
||||
@@ -355,6 +376,25 @@ def test_knowledge_graph_add_triple_accepts_source_drawer_id_and_adapter_name(tm
|
||||
kg.close()
|
||||
|
||||
|
||||
def test_knowledge_graph_fresh_schema_includes_new_columns(tmp_path):
|
||||
"""Brand-new palaces should get source_drawer_id / adapter_name directly
|
||||
from CREATE TABLE, not via a post-hoc ALTER. _migrate_schema exists only
|
||||
for legacy palaces."""
|
||||
import sqlite3
|
||||
|
||||
from mempalace.knowledge_graph import KnowledgeGraph
|
||||
|
||||
kg = KnowledgeGraph(db_path=str(tmp_path / "fresh.sqlite3"))
|
||||
try:
|
||||
conn = sqlite3.connect(str(tmp_path / "fresh.sqlite3"))
|
||||
cols = {row[1] for row in conn.execute("PRAGMA table_info(triples)")}
|
||||
conn.close()
|
||||
assert "source_drawer_id" in cols
|
||||
assert "adapter_name" in cols
|
||||
finally:
|
||||
kg.close()
|
||||
|
||||
|
||||
def test_knowledge_graph_migration_adds_missing_columns_to_old_schema(tmp_path):
|
||||
"""An old-schema triples table (pre-RFC 002) should auto-migrate on open."""
|
||||
import sqlite3
|
||||
|
||||
Reference in New Issue
Block a user