180 lines
6.9 KiB
Python
180 lines
6.9 KiB
Python
|
|
"""Reference implementations of the reserved content transformations (RFC 002 §1.4).
|
||
|
|
|
||
|
|
Every source adapter declares the set of transformations it applies to source
|
||
|
|
bytes via ``declared_transformations``. The conformance suite then verifies
|
||
|
|
that the adapter's output can be reproduced from the source bytes by applying
|
||
|
|
*only* the declared transformations in declaration order, using these
|
||
|
|
reference implementations.
|
||
|
|
|
||
|
|
Each transformation is a pure function on strings (text content after UTF-8
|
||
|
|
decoding). ``utf8_replace_invalid`` is the one that operates on bytes.
|
||
|
|
|
||
|
|
The invariant the spec enforces: **no transformation is applied that is not
|
||
|
|
declared in the adapter's set**. Adapters with an empty set are byte-preserving
|
||
|
|
end-to-end (modulo the initial UTF-8 decode itself, which is captured by
|
||
|
|
``utf8_replace_invalid`` when applicable).
|
||
|
|
|
||
|
|
Adapters MAY add custom transformations beyond the reserved set; third-party
|
||
|
|
names SHOULD be prefixed with the adapter name (``cursor.composer_ordering``).
|
||
|
|
Custom transformations MUST expose a reference implementation under
|
||
|
|
``mempalace.sources.transforms.<adapter_name>_<transform_name>`` so the
|
||
|
|
conformance suite can locate and apply them.
|
||
|
|
"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import re
|
||
|
|
from typing import Callable
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Reserved transformations
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
|
||
|
|
def utf8_replace_invalid(raw: bytes) -> str:
    """Turn raw bytes into text, substituting U+FFFD for undecodable input.

    This is the only reserved transformation whose input is ``bytes``; all
    the others take already-decoded text.

    Args:
        raw: The source bytes to decode as UTF-8.

    Returns:
        The decoded string, with each invalid sequence replaced by the
        Unicode replacement character (the ``"replace"`` error handler).
    """
    decoded = raw.decode(encoding="utf-8", errors="replace")
    return decoded
|
||
|
|
|
||
|
|
|
||
|
|
def newline_normalize(text: str) -> str:
    """Rewrite every CRLF pair and every lone CR as a single LF.

    Args:
        text: Decoded text that may mix ``\\r\\n``, ``\\r`` and ``\\n``.

    Returns:
        The same text with ``\\n`` as the only line terminator.
    """
    # CRLF pairs must go first so they yield one LF rather than two.
    without_crlf = text.replace("\r\n", "\n")
    # Whatever CR is left is a bare CR; swap each one for LF.
    return "\n".join(without_crlf.split("\r"))
|
||
|
|
|
||
|
|
|
||
|
|
def whitespace_trim(text: str) -> str:
    """Remove whitespace from both ends of the record; leave the interior alone.

    Args:
        text: The full record text.

    Returns:
        ``text`` without its leading and trailing whitespace.
    """
    left_trimmed = text.lstrip()
    return left_trimmed.rstrip()
|
||
|
|
|
||
|
|
|
||
|
|
# Three or more "newline + optional spaces/tabs" groups followed by one more
# newline, i.e. at least four newlines with nothing but spaces/tabs between.
_RUN_OF_THREE_OR_MORE_BLANK = re.compile(r"(?:\n[ \t]*){3,}\n")


def whitespace_collapse_internal(text: str) -> str:
    """Squeeze long runs of blank lines down to exactly two blank lines.

    Every whitespace-only line is first rewritten as an empty line, so the
    whitespace on such lines is dropped even when the line is not part of a
    long run. After that, any stretch of four or more consecutive newlines
    (three or more blank lines between two lines) is replaced by three
    newlines (two blank lines). Shorter runs keep their length.

    Args:
        text: Decoded text using ``\\n`` line endings.

    Returns:
        The text with whitespace-only lines emptied and long blank runs
        collapsed.
    """
    emptied = []
    for row in text.split("\n"):
        # ``strip()`` is falsy for both empty and whitespace-only rows.
        emptied.append(row if row.strip() else "")
    flattened = "\n".join(emptied)
    return _RUN_OF_THREE_OR_MORE_BLANK.sub("\n\n\n", flattened)
|
||
|
|
|
||
|
|
|
||
|
|
def line_trim(text: str) -> str:
    """Trim whitespace from both ends of every line, keeping the line count.

    Args:
        text: Decoded text; lines are delimited by ``\\n``.

    Returns:
        The text with each ``\\n``-separated line individually stripped.
    """
    rows = text.split("\n")
    return "\n".join(map(str.strip, rows))
|
||
|
|
|
||
|
|
|
||
|
|
def line_join_spaces(text: str) -> str:
    """Reflow each paragraph onto one line, with words of adjacent lines spaced.

    A paragraph boundary is a newline, optional spaces/tabs, and another
    newline. Inside a paragraph every non-blank line is stripped and the
    pieces are joined by a single space; paragraphs are then re-joined with
    one empty line between them.

    Args:
        text: Decoded text using ``\\n`` line endings.

    Returns:
        The reflowed text, one line per paragraph, paragraphs separated by
        ``"\\n\\n"``.
    """
    reflowed = []
    for block in re.split(r"\n[ \t]*\n", text):
        pieces = []
        for row in block.split("\n"):
            trimmed = row.strip()
            # Whitespace-only rows contribute nothing to the joined line.
            if trimmed:
                pieces.append(trimmed)
        reflowed.append(" ".join(pieces))
    return "\n\n".join(reflowed)
|
||
|
|
|
||
|
|
|
||
|
|
def blank_line_drop(text: str) -> str:
    """Remove every empty or whitespace-only line from the text.

    Lines that are kept are not modified; only the lines themselves are
    filtered.

    Args:
        text: Decoded text using ``\\n`` line endings.

    Returns:
        The surviving lines joined by ``\\n``.
    """
    kept = []
    for row in text.split("\n"):
        if not row.strip():
            continue  # empty or whitespace-only: drop it
        kept.append(row)
    return "\n".join(kept)
|
||
|
|
|
||
|
|
|
||
|
|
# The following reserved transformations are declared in the spec but are
|
||
|
|
# deeply adapter-specific. Rather than guess a single reference implementation
|
||
|
|
# now, we provide identity shims that return the text unchanged, with no adapter-supplied
|
||
|
|
# context. Adapters that declare these MUST either override with a concrete
|
||
|
|
# implementation or provide a namespaced reference under
|
||
|
|
# ``mempalace.sources.transforms.<adapter_name>_<transform_name>`` (per the
|
||
|
|
# module docstring). The conformance suite looks up the adapter-specific
|
||
|
|
# implementation first, falling back to these only when none exists.
|
||
|
|
|
||
|
|
|
||
|
|
def strip_tool_chrome(text: str) -> str:
    """Placeholder for removing system tags, hook output and tool UI chrome.

    This reserved transformation is adapter-supplied. The shim here is
    deliberately a no-op: noise patterns vary by transcript format (Claude
    Code, Codex, ChatGPT, Slack), so no single reference applies. Once the
    conversations adapter is migrated it is expected to register a concrete
    reference under
    ``mempalace.sources.transforms.conversations_strip_tool_chrome``.

    Args:
        text: The decoded text.

    Returns:
        ``text`` exactly as received.
    """
    passthrough = text
    return passthrough
|
||
|
|
|
||
|
|
|
||
|
|
def tool_result_truncate(text: str) -> str:
    """Placeholder for a head/tail window on tool output with a middle marker.

    This reserved transformation is adapter-supplied; the shim here performs
    no truncation.

    Args:
        text: The decoded text.

    Returns:
        ``text`` exactly as received.
    """
    passthrough = text
    return passthrough
|
||
|
|
|
||
|
|
|
||
|
|
def tool_result_omitted(text: str) -> str:
    """Placeholder for fully omitting some tool outputs (e.g., Read/Edit/Write).

    This reserved transformation is adapter-supplied; the shim here omits
    nothing.

    Args:
        text: The decoded text.

    Returns:
        ``text`` exactly as received.
    """
    passthrough = text
    return passthrough
|
||
|
|
|
||
|
|
|
||
|
|
def spellcheck_user(text: str) -> str:
    """Placeholder for rewriting user turns via autocorrect.

    This reserved transformation is adapter-supplied. A real implementation
    needs the optional ``spellcheck`` extra and a tokenizer, and the spec
    does not mandate a specific language model, so the reference is owned by
    the adapter. The shim here corrects nothing.

    Args:
        text: The decoded text.

    Returns:
        ``text`` exactly as received.
    """
    passthrough = text
    return passthrough
|
||
|
|
|
||
|
|
|
||
|
|
def synthesized_marker(text: str) -> str:
    """Placeholder for adapter-inserted strings (e.g., '[N lines omitted]').

    This reserved transformation is adapter-supplied; the shim here inserts
    nothing.

    Args:
        text: The decoded text.

    Returns:
        ``text`` exactly as received.
    """
    passthrough = text
    return passthrough
|
||
|
|
|
||
|
|
|
||
|
|
def speaker_role_assignment(text: str) -> str:
    """Placeholder for assigning multi-party speakers alternately to user/assistant.

    This reserved transformation is adapter-supplied; the shim here
    reassigns nothing.

    Args:
        text: The decoded text.

    Returns:
        ``text`` exactly as received.
    """
    passthrough = text
    return passthrough
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Registry
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
|
||
|
|
# Registry mapping each reserved transformation name to its reference
# implementation. Lookups are by name, so that a round-trip pipeline can be
# composed from an adapter's declared transformations during testing.
# Entries are listed as (name, implementation) pairs; insertion order follows
# the order of the pairs below.
RESERVED_TRANSFORMATIONS: dict[str, Callable[..., str]] = dict(
    (
        ("utf8_replace_invalid", utf8_replace_invalid),
        ("newline_normalize", newline_normalize),
        ("whitespace_trim", whitespace_trim),
        ("whitespace_collapse_internal", whitespace_collapse_internal),
        ("line_trim", line_trim),
        ("line_join_spaces", line_join_spaces),
        ("blank_line_drop", blank_line_drop),
        ("strip_tool_chrome", strip_tool_chrome),
        ("tool_result_truncate", tool_result_truncate),
        ("tool_result_omitted", tool_result_omitted),
        ("spellcheck_user", spellcheck_user),
        ("synthesized_marker", synthesized_marker),
        ("speaker_role_assignment", speaker_role_assignment),
    )
)
|
||
|
|
|
||
|
|
|
||
|
|
def get_transformation(name: str) -> Callable[..., str]:
    """Look up the reference implementation of a reserved transformation.

    Only the names in ``RESERVED_TRANSFORMATIONS`` are consulted. Callers
    that want an adapter-namespaced reference (``<adapter>_<transform>``)
    SHOULD try ``getattr`` on this module before falling back to this helper.

    Args:
        name: The reserved transformation name.

    Returns:
        The callable registered under ``name``.

    Raises:
        KeyError: If ``name`` is not a reserved name. The message lists the
            reserved names in sorted order, and the original lookup failure
            is chained as the cause.
    """
    try:
        implementation = RESERVED_TRANSFORMATIONS[name]
    except KeyError as missing:
        message = (
            f"unknown transformation {name!r}; reserved names: {sorted(RESERVED_TRANSFORMATIONS)}"
        )
        raise KeyError(message) from missing
    else:
        return implementation
|