89904ed03f
Five findings from the automated review, fixed with targeted tests where behavior changed: 1. Transformation Protocol (transforms.py). The registry mixed a bytes-to-str transform (utf8_replace_invalid) with str-to-str transforms under a single Callable[..., str] type, misleading static type checkers and adapter authors. Introduced a Transformation Protocol with __call__(data: bytes|str) -> str and retyped the registry + get_transformation return. 2. Drawer-id collision risk (context.py). Switched _build_drawer_id from sha1[:16]=64 bits to sha256[:24]=96 bits. 64 bits sits uncomfortably close to the birthday bound for palace-sized corpora; 96 bits keeps the collision probability negligible while preserving the existing <prefix>_<chunk> layout adapters rely on. 3. Fresh-schema KG columns (knowledge_graph.py). source_drawer_id and adapter_name now live in the canonical CREATE TABLE so new palaces don't take an ALTER round-trip on first open. _migrate_schema stays for legacy palaces (SQLite has no ADD COLUMN IF NOT EXISTS, so PRAGMA introspection is still needed there). 4. Identity-shim comment (transforms.py). Comment said the adapter-specific transforms "raise if invoked without adapter context" but they return the input unchanged. Updated the comment to match the actual identity-shim behavior Copilot suggested. 5. Test docstring (test_sources.py). Comment mentioned default_factory=list but SourceRef.options uses default_factory=dict. Corrected. Tests: 1020 passed (up from 1018), +2 new tests for the sha256 id shape and the fresh-schema column presence on new palaces.
197 lines
7.7 KiB
Python
197 lines
7.7 KiB
Python
"""Reference implementations of the reserved content transformations (RFC 002 §1.4).
|
|
|
|
Every source adapter declares the set of transformations it applies to source
|
|
bytes via ``declared_transformations``. The conformance suite then verifies
|
|
that the adapter's output can be reproduced from the source bytes by applying
|
|
*only* the declared transformations in declaration order, using these
|
|
reference implementations.
|
|
|
|
Each transformation is a pure function on strings (text content after UTF-8
|
|
decoding). ``utf8_replace_invalid`` is the one that operates on bytes.
|
|
|
|
The invariant the spec enforces: **no transformation is applied that is not
|
|
declared in the adapter's set**. Adapters with an empty set are byte-preserving
|
|
end-to-end (modulo the initial UTF-8 decode itself, which is captured by
|
|
``utf8_replace_invalid`` when applicable).
|
|
|
|
Adapters MAY add custom transformations beyond the reserved set; third-party
|
|
names SHOULD be prefixed with the adapter name (``cursor.composer_ordering``).
|
|
Custom transformations MUST expose a reference implementation under
|
|
``mempalace.sources.transforms.<adapter_name>_<transform_name>`` so the
|
|
conformance suite can locate and apply them.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import Protocol, Union
|
|
|
|
|
|
class Transformation(Protocol):
    """Structural type shared by every reserved transformation.

    A transformation is a callable taking a single positional-only argument:
    the pipeline's current value, which is ``bytes`` for the decoding step
    (``utf8_replace_invalid``) and ``str`` once the content has been decoded.
    It always returns ``str``.

    Adapters chain transformations in declaration order: the first one
    receives the original source bytes, and each later one receives the
    result of the step before it.
    """

    # NOTE(review): type checkers compare callable parameters
    # contravariantly, so a function annotated to accept only ``bytes`` or
    # only ``str`` may not be accepted as a ``Transformation`` by a strict
    # checker; confirm against the project's mypy/pyright configuration.
    def __call__(self, data: Union[bytes, str], /) -> str:
        """Apply the transformation to ``data`` and return the resulting text."""
        ...
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Reserved transformations
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def utf8_replace_invalid(raw: bytes) -> str:
    """Decode ``raw`` as UTF-8, substituting U+FFFD for undecodable bytes.

    This is the single reserved transformation whose input is ``bytes``
    instead of already-decoded text; it never raises on malformed input
    because invalid sequences are replaced rather than rejected.

    Args:
        raw: The source bytes to decode.

    Returns:
        The decoded text, with U+FFFD wherever a byte sequence was not
        valid UTF-8.
    """
    return raw.decode(encoding="utf-8", errors="replace")
|
|
|
|
|
|
def newline_normalize(text: str) -> str:
    """Rewrite every CRLF pair and every lone CR as a single LF.

    Args:
        text: Decoded text that may mix line-ending conventions.

    Returns:
        The same text using ``\\n`` as its only line terminator.
    """
    # CRLF pairs are handled first so that each pair yields one LF, not two.
    without_crlf = text.replace("\r\n", "\n")
    return without_crlf.replace("\r", "\n")
|
|
|
|
|
|
def whitespace_trim(text: str) -> str:
    """Remove whitespace from the two ends of the record.

    Whitespace in the interior of ``text`` is left exactly as it was.

    Args:
        text: The record text to trim.

    Returns:
        ``text`` without its leading and trailing whitespace.
    """
    return text.lstrip().rstrip()
|
|
|
|
|
|
# Matches four or more consecutive newlines (each optionally followed by
# spaces or tabs), i.e. the separator spanning three or more blank lines.
_RUN_OF_THREE_OR_MORE_BLANK = re.compile(r"(?:\n[ \t]*){3,}\n")


def whitespace_collapse_internal(text: str) -> str:
    """Shrink any run of three or more blank lines down to two blank lines.

    A line counts as blank when it holds nothing but whitespace. Runs of one
    or two blank lines keep their length. As part of the process every
    whitespace-only line is emptied, including those in short runs.

    Args:
        text: Decoded text using ``\\n`` line endings.

    Returns:
        The text with long blank-line runs collapsed.
    """
    # Empty out whitespace-only lines first so that a run of blank lines is
    # a plain sequence of newlines by the time the pattern is applied.
    emptied = ["" if not row.strip() else row for row in text.split("\n")]
    return _RUN_OF_THREE_OR_MORE_BLANK.sub("\n\n\n", "\n".join(emptied))
|
|
|
|
|
|
def line_trim(text: str) -> str:
    """Trim whitespace from both ends of every line, one line at a time.

    Lines are delimited by ``\\n``; the number of lines is unchanged.

    Args:
        text: Decoded text using ``\\n`` line endings.

    Returns:
        The text with each line stripped of leading and trailing whitespace.
    """
    trimmed_lines = map(str.strip, text.split("\n"))
    return "\n".join(trimmed_lines)
|
|
|
|
|
|
def line_join_spaces(text: str) -> str:
    """Merge the lines of each paragraph into one space-separated line.

    Paragraphs are delimited by a newline, optional spaces or tabs, and a
    second newline. Within a paragraph every line is stripped, empty lines
    are discarded, and the rest are joined with single spaces. Paragraphs
    are then rejoined with one empty line between them.

    Args:
        text: Decoded text using ``\\n`` line endings.

    Returns:
        The text with each paragraph folded onto a single line.
    """
    merged = []
    for block in re.split(r"\n[ \t]*\n", text):
        pieces = [segment.strip() for segment in block.split("\n")]
        merged.append(" ".join(piece for piece in pieces if piece))
    return "\n\n".join(merged)
|
|
|
|
|
|
def blank_line_drop(text: str) -> str:
    """Remove every blank line, leaving only lines with visible content.

    A line is blank when stripping its whitespace leaves nothing. Surviving
    lines are kept verbatim, including their own leading and trailing
    whitespace.

    Args:
        text: Decoded text using ``\\n`` line endings.

    Returns:
        The non-blank lines joined with ``\\n``.
    """
    # ``str.strip`` yields an empty (falsy) string exactly for blank lines.
    kept = filter(str.strip, text.split("\n"))
    return "\n".join(kept)
|
|
|
|
|
|
# The following reserved transformations are declared in the spec but are
|
|
# deeply adapter-specific. Rather than guess a single reference implementation
|
|
# now, we provide identity shims that leave the input unchanged when no
|
|
# adapter-specific implementation is available. Adapters that declare these
|
|
# MUST either override with a concrete implementation or provide a namespaced
|
|
# reference under
|
|
# ``mempalace.sources.transforms.<adapter_name>_<transform_name>`` (per the
|
|
# module docstring). The conformance suite looks up the adapter-specific
|
|
# implementation first, falling back to these identity shims only when none
|
|
# exists.
|
|
|
|
|
|
def strip_tool_chrome(text: str) -> str:
    """Identity shim for the adapter-supplied ``strip_tool_chrome`` step.

    The reserved transformation removes system tags, hook output and tool UI
    chrome. Those noise patterns vary by transcript format (Claude Code,
    Codex, ChatGPT, Slack), so this reference deliberately does nothing. Once
    migrated, the conversations adapter is expected to register its concrete
    reference as
    ``mempalace.sources.transforms.conversations_strip_tool_chrome``.

    Args:
        text: Decoded transcript text.

    Returns:
        ``text`` unchanged.
    """
    return text
|
|
|
|
|
|
def tool_result_truncate(text: str) -> str:
    """Identity shim for the adapter-supplied ``tool_result_truncate`` step.

    The reserved transformation keeps a head/tail window of tool output with
    a marker in the middle; this reference leaves the input untouched.

    Args:
        text: Decoded text.

    Returns:
        ``text`` unchanged.
    """
    return text
|
|
|
|
|
|
def tool_result_omitted(text: str) -> str:
    """Identity shim for the adapter-supplied ``tool_result_omitted`` step.

    The reserved transformation drops some tool outputs entirely (for
    example Read/Edit/Write); this reference leaves the input untouched.

    Args:
        text: Decoded text.

    Returns:
        ``text`` unchanged.
    """
    return text
|
|
|
|
|
|
def spellcheck_user(text: str) -> str:
    """Identity shim for the adapter-supplied ``spellcheck_user`` step.

    The reserved transformation rewrites user turns through autocorrect. It
    needs the optional ``spellcheck`` extra and a tokenizer, and the spec
    does not fix a particular language model, so the real reference belongs
    to the adapter; this one leaves the input untouched.

    Args:
        text: Decoded text.

    Returns:
        ``text`` unchanged.
    """
    return text
|
|
|
|
|
|
def synthesized_marker(text: str) -> str:
    """Identity shim for the adapter-supplied ``synthesized_marker`` step.

    The reserved transformation covers strings the adapter inserts itself
    (for example ``'[N lines omitted]'``); this reference leaves the input
    untouched.

    Args:
        text: Decoded text.

    Returns:
        ``text`` unchanged.
    """
    return text
|
|
|
|
|
|
def speaker_role_assignment(text: str) -> str:
    """Identity shim for the adapter-supplied ``speaker_role_assignment`` step.

    The reserved transformation assigns multi-party speakers alternately to
    the user and assistant roles; this reference leaves the input untouched.

    Args:
        text: Decoded text.

    Returns:
        ``text`` unchanged.
    """
    return text
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Registry
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Reserved transformation name → reference implementation.
|
|
# Adapters look up by name to compose a round-trip pipeline during testing.
|
|
# The value conforms to the :class:`Transformation` protocol above; we type
|
|
# it as that Protocol rather than a concrete ``Callable`` so static checkers
|
|
# accept both the bytes→str (``utf8_replace_invalid``) and str→str shapes.
|
|
RESERVED_TRANSFORMATIONS: dict[str, Transformation] = {
    # Decoding step: the only entry whose input is ``bytes``.
    "utf8_replace_invalid": utf8_replace_invalid,
    # Whitespace and line-structure steps operating on decoded text.
    "newline_normalize": newline_normalize,
    "whitespace_trim": whitespace_trim,
    "whitespace_collapse_internal": whitespace_collapse_internal,
    "line_trim": line_trim,
    "line_join_spaces": line_join_spaces,
    "blank_line_drop": blank_line_drop,
    # Adapter-specific steps; the references here are identity shims.
    "strip_tool_chrome": strip_tool_chrome,
    "tool_result_truncate": tool_result_truncate,
    "tool_result_omitted": tool_result_omitted,
    "spellcheck_user": spellcheck_user,
    "synthesized_marker": synthesized_marker,
    "speaker_role_assignment": speaker_role_assignment,
}
|
|
|
|
|
|
def get_transformation(name: str) -> Transformation:
    """Look up the reference implementation of a reserved transformation.

    Only the names in ``RESERVED_TRANSFORMATIONS`` are resolved here.
    Adapter-namespaced references (``<adapter>_<transform>``) are not
    consulted, so callers that want one SHOULD ``getattr`` on this module
    before falling back to this helper.

    Args:
        name: The reserved transformation name.

    Returns:
        The callable registered under ``name``.

    Raises:
        KeyError: If ``name`` is not a reserved transformation name. The
            message lists the reserved names, and the original lookup
            failure is chained as the cause.
    """
    try:
        transformation = RESERVED_TRANSFORMATIONS[name]
    except KeyError as lookup_error:
        message = (
            f"unknown transformation {name!r}; reserved names: {sorted(RESERVED_TRANSFORMATIONS)}"
        )
        raise KeyError(message) from lookup_error
    return transformation
|