refactor(stdio): extract Windows UTF-8 reconfigure into shared helper

Both cli.py and fact_checker.py carried identical 28-line Windows stdio
reconfigure helpers; pull the loop into mempalace/_stdio.py so the same
machinery drives the CLI, the fact_checker --stdin entry point, and the
MCP server. The thin per-call-site wrappers stay so existing tests keep
importing _reconfigure_stdio_utf8_on_windows from the same module they
always have.

CLI / fact_checker policy unchanged: stdin=surrogateescape (don't crash
on a malformed redirected file), stdout/stderr=replace (don't crash
mid-print on a surrogate half round-tripped from a filename).
This commit is contained in:
mvalentsev
2026-05-03 22:25:31 +05:00
parent 03643eb507
commit 285b3b4f2e
3 changed files with 88 additions and 67 deletions
+71
View File
@@ -0,0 +1,71 @@
"""Stdio UTF-8 reconfiguration helper for Windows entry points.
Python on Windows defaults stdio to the system ANSI codepage
(cp1252/cp1251/cp950 depending on locale), which mojibakes UTF-8 input
or output the moment a non-Latin character shows up. Every console
entry point that touches stdio needs to fix this on Windows -- the MCP
server, the CLI, the fact_checker `--stdin` mode -- so the
reconfigure code lives here in one place to keep the per-stream
errors policies aligned across them.
Per-stream errors policy is caller-chosen:
* MCP server uses ``strict`` on stdout/stderr because everything written
there is server-controlled JSON-RPC; any encode failure is a real bug
the operator wants loud.
* CLI / fact_checker use ``replace`` on stdout/stderr because they print
verbatim drawer text that may contain surrogate halves round-tripped
from filenames -- ``strict`` would crash mid-print.
* All callers use ``surrogateescape`` on stdin so a malformed byte from
a redirected file or a misbehaving client survives as a lone surrogate
the consumer's parser surfaces, instead of ``UnicodeDecodeError``
killing the read loop on the first bad byte.
"""
from __future__ import annotations
import sys
from typing import Callable, Optional
def reconfigure_stdio_utf8_on_windows(
    *,
    stdin_errors: str = "surrogateescape",
    stdout_errors: str = "strict",
    stderr_errors: str = "strict",
    on_failure: Optional[Callable[[str, BaseException], None]] = None,
) -> None:
    """Reconfigure stdio to UTF-8 on Windows. No-op elsewhere.

    Each of ``sys.stdin``, ``sys.stdout`` and ``sys.stderr`` that exposes a
    ``reconfigure`` method is switched to ``encoding="utf-8"`` with the
    caller-chosen ``errors`` policy. Streams that are missing, ``None``, or
    have no ``reconfigure`` attribute are skipped silently.

    Args:
        stdin_errors: errors= policy for stdin.reconfigure().
        stdout_errors: errors= policy for stdout.reconfigure().
        stderr_errors: errors= policy for stderr.reconfigure().
        on_failure: optional ``(stream_name, exc) -> None`` callback for
            streams whose ``reconfigure`` raises (e.g. Jupyter-replaced
            streams that lack the method-shape we expect). When omitted, a
            ``WARNING:`` line is printed to the current ``sys.stderr``; if
            ``sys.stderr`` is ``None`` the warning is dropped.
    """
    if sys.platform != "win32":
        return

    policies = (
        ("stdin", stdin_errors),
        ("stdout", stdout_errors),
        ("stderr", stderr_errors),
    )
    for name, errors in policies:
        stream = getattr(sys, name, None)
        # A stream may be None (no console attached) or a replacement object
        # without ``reconfigure``; both cases are skipped.
        reconfigure = getattr(stream, "reconfigure", None)
        if reconfigure is None:
            continue
        try:
            reconfigure(encoding="utf-8", errors=errors)
        except Exception as exc:  # noqa: BLE001 -- last-resort guard
            if on_failure is not None:
                on_failure(name, exc)
                continue
            # print(file=None) falls back to sys.stdout, which a caller may
            # reserve for protocol output (e.g. JSON-RPC). Only emit the
            # warning when a real stderr stream exists.
            warn_stream = getattr(sys, "stderr", None)
            if warn_stream is not None:
                print(
                    f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
                    file=warn_stream,
                )
+10 -35
View File
@@ -938,42 +938,17 @@ def cmd_compress(args):
def _reconfigure_stdio_utf8_on_windows(): def _reconfigure_stdio_utf8_on_windows():
"""Decode stdio as UTF-8 on Windows for the primary `mempalace` CLI. """Decode stdio as UTF-8 on Windows for the primary `mempalace` CLI.
Without this, Python defaults stdio to the system ANSI codepage Thin wrapper around the shared helper in ``mempalace._stdio``. The CLI
(cp1252/cp1251/cp950 depending on locale). That mojibakes non-ASCII overrides stdout/stderr to ``replace`` because ``mempalace search``
content piped in (`mempalace search ... < query.txt`) or piped out prints verbatim drawer text that may carry surrogate halves
(`mempalace search "..." > out.txt`) when verbatim drawer text or round-tripped from filenames -- ``strict`` would crash mid-print and
wing/room names contain non-Latin characters. lose the rest of the search result block. stdin keeps the default
``surrogateescape`` so a redirected non-UTF-8 file does not kill the
Per-stream errors policy: read on the first bad byte.
stdin -- surrogateescape: malformed bytes from a redirected file
survive as lone surrogates instead of crashing the read.
stdout -- replace: ``mempalace search`` prints verbatim drawer
text. A drawer that round-tripped a filename through
surrogateescape can hold a lone surrogate, which would
otherwise raise ``UnicodeEncodeError`` mid-print and
lose the rest of the search result block.
stderr -- replace: same hazard for logger output that quotes
user-supplied path or content.
""" """
if sys.platform != "win32": from ._stdio import reconfigure_stdio_utf8_on_windows
return
policies = ( reconfigure_stdio_utf8_on_windows(stdout_errors="replace", stderr_errors="replace")
("stdin", "surrogateescape"),
("stdout", "replace"),
("stderr", "replace"),
)
for name, errors in policies:
stream = getattr(sys, name, None)
reconfigure = getattr(stream, "reconfigure", None)
if reconfigure is None:
continue
try:
reconfigure(encoding="utf-8", errors=errors)
except Exception as exc:
print(
f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
file=sys.stderr,
)
def main(): def main():
+7 -32
View File
@@ -306,40 +306,15 @@ def _edit_distance(s1: str, s2: str) -> int:
def _reconfigure_stdio_utf8_on_windows(): def _reconfigure_stdio_utf8_on_windows():
"""Decode --stdin payload as UTF-8 on Windows. """Decode --stdin payload as UTF-8 on Windows.
Without this, Python defaults stdio to the system ANSI codepage Thin wrapper around the shared helper in ``mempalace._stdio``. Mirrors
(cp1252/cp1251/cp950 depending on locale), which mojibakes the primary CLI policy: stdout/stderr use ``replace`` because
non-ASCII fact text before pattern parsing sees it. extracted fact text can include surrogate halves round-tripped from
filenames -- ``strict`` would raise UnicodeEncodeError mid-print.
Per-stream errors policy mirrors the primary CLI helper in stdin keeps the default ``surrogateescape``.
``mempalace/cli.py``:
stdin -- surrogateescape: malformed input bytes survive as lone
surrogates instead of crashing the read.
stdout -- replace: extracted fact text can include surrogate
halves round-tripped from filenames; replace prevents
a UnicodeEncodeError mid-print.
stderr -- replace: same protection for warning lines.
""" """
import sys from ._stdio import reconfigure_stdio_utf8_on_windows
if sys.platform != "win32": reconfigure_stdio_utf8_on_windows(stdout_errors="replace", stderr_errors="replace")
return
policies = (
("stdin", "surrogateescape"),
("stdout", "replace"),
("stderr", "replace"),
)
for name, errors in policies:
stream = getattr(sys, name, None)
reconfigure = getattr(stream, "reconfigure", None)
if reconfigure is None:
continue
try:
reconfigure(encoding="utf-8", errors=errors)
except Exception as exc:
print(
f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
file=sys.stderr,
)
if __name__ == "__main__": if __name__ == "__main__":