fix(sources): address Copilot review on #1014
Five findings from the automated review, fixed with targeted tests where behavior changed: 1. Transformation Protocol (transforms.py). The registry mixed a bytes-to-str transform (utf8_replace_invalid) with str-to-str transforms under a single Callable[..., str] type, misleading static type checkers and adapter authors. Introduced a Transformation Protocol with __call__(data: bytes|str) -> str and retyped the registry + get_transformation return. 2. Drawer-id collision risk (context.py). Switched _build_drawer_id from sha1[:16]=64 bits to sha256[:24]=96 bits. 64 bits sits uncomfortably close to the birthday bound for palace-sized corpora; 96 bits keeps the collision probability negligible while preserving the existing <prefix>_<chunk> layout adapters rely on. 3. Fresh-schema KG columns (knowledge_graph.py). source_drawer_id and adapter_name now live in the canonical CREATE TABLE so new palaces don't take an ALTER round-trip on first open. _migrate_schema stays for legacy palaces (SQLite has no ADD COLUMN IF NOT EXISTS, so PRAGMA introspection is still needed there). 4. Identity-shim comment (transforms.py). Comment said the adapter-specific transforms "raise if invoked without adapter context" but they return the input unchanged. Updated the comment to match the actual identity- shim behavior Copilot suggested. 5. Test docstring (test_sources.py). Comment mentioned default_factory=list but SourceRef.options uses default_factory=dict. Corrected. Tests: 1020 passed (up from 1018), +2 new tests for the sha256 id shape and the fresh-schema column presence on new palaces.
This commit is contained in:
@@ -83,6 +83,8 @@ class KnowledgeGraph:
|
|||||||
confidence REAL DEFAULT 1.0,
|
confidence REAL DEFAULT 1.0,
|
||||||
source_closet TEXT,
|
source_closet TEXT,
|
||||||
source_file TEXT,
|
source_file TEXT,
|
||||||
|
source_drawer_id TEXT,
|
||||||
|
adapter_name TEXT,
|
||||||
extracted_at TEXT DEFAULT CURRENT_TIMESTAMP,
|
extracted_at TEXT DEFAULT CURRENT_TIMESTAMP,
|
||||||
FOREIGN KEY (subject) REFERENCES entities(id),
|
FOREIGN KEY (subject) REFERENCES entities(id),
|
||||||
FOREIGN KEY (object) REFERENCES entities(id)
|
FOREIGN KEY (object) REFERENCES entities(id)
|
||||||
@@ -99,11 +101,12 @@ class KnowledgeGraph:
|
|||||||
def _migrate_schema(self, conn):
|
def _migrate_schema(self, conn):
|
||||||
"""Backwards-compatible schema migration for older triples tables.
|
"""Backwards-compatible schema migration for older triples tables.
|
||||||
|
|
||||||
RFC 002 §5.5 adds two optional provenance columns so adapter-written
|
Fresh palaces get ``source_drawer_id`` / ``adapter_name`` (RFC 002 §5.5)
|
||||||
triples can be traced back to (a) the specific drawer that produced
|
directly from the canonical ``CREATE TABLE`` above, so this path is a
|
||||||
them and (b) the adapter that authored them. Older palaces predate
|
no-op on new installs. It exists for palaces that were created before
|
||||||
these columns; SQLite has no ``ADD COLUMN IF NOT EXISTS``, so we
|
those columns were added: SQLite has no ``ADD COLUMN IF NOT EXISTS``,
|
||||||
introspect the schema first and only issue the ALTER if needed.
|
so we introspect the schema and only issue the ALTER when the column
|
||||||
|
is missing.
|
||||||
"""
|
"""
|
||||||
existing = {row["name"] for row in conn.execute("PRAGMA table_info(triples)")}
|
existing = {row["name"] for row in conn.execute("PRAGMA table_info(triples)")}
|
||||||
if "source_drawer_id" not in existing:
|
if "source_drawer_id" not in existing:
|
||||||
|
|||||||
@@ -126,15 +126,17 @@ class PalaceContext:
|
|||||||
|
|
||||||
|
|
||||||
def _build_drawer_id(record: DrawerRecord) -> str:
|
def _build_drawer_id(record: DrawerRecord) -> str:
|
||||||
"""Deterministic drawer id: ``<sha1(source_file)>_<chunk_index>``.
|
"""Deterministic drawer id: ``<sha256(source_file)[:24]>_<chunk_index>``.
|
||||||
|
|
||||||
Matches the shape existing miners rely on (``source_file`` + chunk index
|
Matches the shape existing miners rely on (``source_file`` + chunk index
|
||||||
pair) while keeping the id chroma-safe (no separators that collide with
|
pair) while keeping the id chroma-safe (no separators that collide with
|
||||||
existing metadata values). Adapters that need a different id scheme can
|
existing metadata values). 96-bit SHA-256 prefix keeps collision risk
|
||||||
bypass :meth:`PalaceContext.upsert_drawer` and write through
|
negligible across corpora the size of a palace (sha1@64 bits was too
|
||||||
``drawer_collection.upsert`` directly.
|
close to the birthday bound for large ingests). Adapters that need a
|
||||||
|
different id scheme can bypass :meth:`PalaceContext.upsert_drawer` and
|
||||||
|
write through ``drawer_collection.upsert`` directly.
|
||||||
"""
|
"""
|
||||||
import hashlib
|
import hashlib
|
||||||
|
|
||||||
digest = hashlib.sha1(record.source_file.encode("utf-8")).hexdigest()[:16]
|
digest = hashlib.sha256(record.source_file.encode("utf-8")).hexdigest()[:24]
|
||||||
return f"{digest}_{record.chunk_index}"
|
return f"{digest}_{record.chunk_index}"
|
||||||
|
|||||||
@@ -24,7 +24,19 @@ conformance suite can locate and apply them.
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import re
|
import re
|
||||||
from typing import Callable
|
from typing import Protocol, Union
|
||||||
|
|
||||||
|
|
||||||
|
class Transformation(Protocol):
|
||||||
|
"""Callable signature every reserved transformation conforms to.
|
||||||
|
|
||||||
|
Accepts the current stage of the pipeline — ``bytes`` on input
|
||||||
|
(``utf8_replace_invalid``) or ``str`` after decoding — and returns ``str``.
|
||||||
|
Adapters compose them in declaration order; the first step operates on the
|
||||||
|
original source bytes, every subsequent step on the prior step's output.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __call__(self, data: Union[bytes, str], /) -> str: ...
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -90,12 +102,14 @@ def blank_line_drop(text: str) -> str:
|
|||||||
|
|
||||||
# The following reserved transformations are declared in the spec but are
|
# The following reserved transformations are declared in the spec but are
|
||||||
# deeply adapter-specific. Rather than guess a single reference implementation
|
# deeply adapter-specific. Rather than guess a single reference implementation
|
||||||
# now, we provide identity shims that raise if invoked without adapter-supplied
|
# now, we provide identity shims that leave the input unchanged when no
|
||||||
# context. Adapters that declare these MUST either override with a concrete
|
# adapter-specific implementation is available. Adapters that declare these
|
||||||
# implementation or provide a namespaced reference under
|
# MUST either override with a concrete implementation or provide a namespaced
|
||||||
|
# reference under
|
||||||
# ``mempalace.sources.transforms.<adapter_name>_<transform_name>`` (per the
|
# ``mempalace.sources.transforms.<adapter_name>_<transform_name>`` (per the
|
||||||
# module docstring). The conformance suite looks up the adapter-specific
|
# module docstring). The conformance suite looks up the adapter-specific
|
||||||
# implementation first, falling back to these only when none exists.
|
# implementation first, falling back to these identity shims only when none
|
||||||
|
# exists.
|
||||||
|
|
||||||
|
|
||||||
def strip_tool_chrome(text: str) -> str:
|
def strip_tool_chrome(text: str) -> str:
|
||||||
@@ -146,7 +160,10 @@ def speaker_role_assignment(text: str) -> str:
|
|||||||
|
|
||||||
# Reserved transformation name → reference implementation.
|
# Reserved transformation name → reference implementation.
|
||||||
# Adapters look up by name to compose a round-trip pipeline during testing.
|
# Adapters look up by name to compose a round-trip pipeline during testing.
|
||||||
RESERVED_TRANSFORMATIONS: dict[str, Callable[..., str]] = {
|
# The value conforms to the :class:`Transformation` protocol above; we type
|
||||||
|
# it as that Protocol rather than a concrete ``Callable`` so static checkers
|
||||||
|
# accept both the bytes→str (``utf8_replace_invalid``) and str→str shapes.
|
||||||
|
RESERVED_TRANSFORMATIONS: dict[str, Transformation] = {
|
||||||
"utf8_replace_invalid": utf8_replace_invalid,
|
"utf8_replace_invalid": utf8_replace_invalid,
|
||||||
"newline_normalize": newline_normalize,
|
"newline_normalize": newline_normalize,
|
||||||
"whitespace_trim": whitespace_trim,
|
"whitespace_trim": whitespace_trim,
|
||||||
@@ -163,7 +180,7 @@ RESERVED_TRANSFORMATIONS: dict[str, Callable[..., str]] = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def get_transformation(name: str) -> Callable[..., str]:
|
def get_transformation(name: str) -> Transformation:
|
||||||
"""Resolve a reserved transformation by name.
|
"""Resolve a reserved transformation by name.
|
||||||
|
|
||||||
Raises :class:`KeyError` if the name is neither reserved nor registered as
|
Raises :class:`KeyError` if the name is neither reserved nor registered as
|
||||||
|
|||||||
+41
-1
@@ -103,7 +103,7 @@ def test_source_summary_default_uses_adapter_name():
|
|||||||
|
|
||||||
|
|
||||||
def test_source_ref_options_default_is_empty_dict():
|
def test_source_ref_options_default_is_empty_dict():
|
||||||
# Frozen dataclass must not share default_factory=list instance across instances.
|
# Frozen dataclass must not share a default_factory=dict instance across instances.
|
||||||
a = SourceRef(uri="a")
|
a = SourceRef(uri="a")
|
||||||
b = SourceRef(uri="b")
|
b = SourceRef(uri="b")
|
||||||
a.options["touched"] = True
|
a.options["touched"] = True
|
||||||
@@ -280,6 +280,27 @@ def test_palace_context_upsert_drawer_stamps_adapter_metadata():
|
|||||||
assert meta["chunk_index"] == 2
|
assert meta["chunk_index"] == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_palace_context_drawer_id_is_sha256_prefix_not_sha1():
|
||||||
|
"""Guards against the pre-review sha1[:16]=64-bit id scheme.
|
||||||
|
|
||||||
|
64-bit ids sit close to the birthday bound for palace-sized corpora.
|
||||||
|
The helper uses sha256[:24]=96 bits so collision risk stays negligible.
|
||||||
|
"""
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
from mempalace.sources.context import _build_drawer_id
|
||||||
|
|
||||||
|
src = "/an/absolute/path/to/a/file.txt"
|
||||||
|
record = DrawerRecord(content="x", source_file=src, chunk_index=3)
|
||||||
|
drawer_id = _build_drawer_id(record)
|
||||||
|
|
||||||
|
expected_prefix = hashlib.sha256(src.encode("utf-8")).hexdigest()[:24]
|
||||||
|
assert drawer_id == f"{expected_prefix}_3"
|
||||||
|
# Negative: the old sha1 scheme MUST NOT produce the same id.
|
||||||
|
sha1_prefix = hashlib.sha1(src.encode("utf-8")).hexdigest()[:16]
|
||||||
|
assert drawer_id != f"{sha1_prefix}_3"
|
||||||
|
|
||||||
|
|
||||||
def test_palace_context_skip_current_item_sets_flag():
|
def test_palace_context_skip_current_item_sets_flag():
|
||||||
ctx = PalaceContext(
|
ctx = PalaceContext(
|
||||||
drawer_collection=_FakeCollection(),
|
drawer_collection=_FakeCollection(),
|
||||||
@@ -355,6 +376,25 @@ def test_knowledge_graph_add_triple_accepts_source_drawer_id_and_adapter_name(tm
|
|||||||
kg.close()
|
kg.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_knowledge_graph_fresh_schema_includes_new_columns(tmp_path):
|
||||||
|
"""Brand-new palaces should get source_drawer_id / adapter_name directly
|
||||||
|
from CREATE TABLE, not via a post-hoc ALTER. _migrate_schema exists only
|
||||||
|
for legacy palaces."""
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
from mempalace.knowledge_graph import KnowledgeGraph
|
||||||
|
|
||||||
|
kg = KnowledgeGraph(db_path=str(tmp_path / "fresh.sqlite3"))
|
||||||
|
try:
|
||||||
|
conn = sqlite3.connect(str(tmp_path / "fresh.sqlite3"))
|
||||||
|
cols = {row[1] for row in conn.execute("PRAGMA table_info(triples)")}
|
||||||
|
conn.close()
|
||||||
|
assert "source_drawer_id" in cols
|
||||||
|
assert "adapter_name" in cols
|
||||||
|
finally:
|
||||||
|
kg.close()
|
||||||
|
|
||||||
|
|
||||||
def test_knowledge_graph_migration_adds_missing_columns_to_old_schema(tmp_path):
|
def test_knowledge_graph_migration_adds_missing_columns_to_old_schema(tmp_path):
|
||||||
"""An old-schema triples table (pre-RFC 002) should auto-migrate on open."""
|
"""An old-schema triples table (pre-RFC 002) should auto-migrate on open."""
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
|||||||
Reference in New Issue
Block a user