refactor(stdio): extract Windows UTF-8 reconfigure into shared helper
Both cli.py and fact_checker.py carried identical 28-line Windows stdio reconfigure helpers; pull the loop into mempalace/_stdio.py so the same machinery drives the CLI, the fact_checker --stdin entry point, and the MCP server. The thin per-call-site wrappers stay so existing tests keep importing _reconfigure_stdio_utf8_on_windows from the same module they always have. CLI / fact_checker policy unchanged: stdin=surrogateescape (don't crash on a malformed redirected file), stdout/stderr=replace (don't crash mid-print on a surrogate half round-tripped from a filename).
This commit is contained in:
@@ -0,0 +1,71 @@
|
||||
"""Stdio UTF-8 reconfiguration helper for Windows entry points.

Python on Windows defaults stdio to the system ANSI codepage
(cp1252/cp1251/cp950 depending on locale), which mojibakes UTF-8 input
or output the moment a non-Latin character shows up. Every console
entry point that touches stdio needs to fix this on Windows -- the MCP
server, the CLI, the fact_checker `--stdin` mode -- so the
reconfigure code lives here in one place to keep the per-stream
errors policies aligned across them.

Per-stream errors policy is caller-chosen:

* MCP server uses ``strict`` on stdout/stderr because everything written
  there is server-controlled JSON-RPC; any encode failure is a real bug
  the operator wants loud.
* CLI / fact_checker use ``replace`` on stdout/stderr because they print
  verbatim drawer text that may contain surrogate halves round-tripped
  from filenames -- ``strict`` would crash mid-print.
* All callers use ``surrogateescape`` on stdin so a malformed byte from
  a redirected file or a misbehaving client survives as a lone surrogate
  the consumer's parser surfaces, instead of ``UnicodeDecodeError``
  killing the read loop on the first bad byte.
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from typing import Callable, Optional
|
||||
|
||||
|
||||
def reconfigure_stdio_utf8_on_windows(
    *,
    stdin_errors: str = "surrogateescape",
    stdout_errors: str = "strict",
    stderr_errors: str = "strict",
    on_failure: Optional[Callable[[str, BaseException], None]] = None,
) -> None:
    """Reconfigure stdio to UTF-8 on Windows. No-op elsewhere.

    Each of ``sys.stdin``, ``sys.stdout`` and ``sys.stderr`` is looked up
    at call time. A stream that is ``None`` or has no ``reconfigure``
    attribute is skipped silently; the others get
    ``reconfigure(encoding="utf-8", errors=<policy>)``.

    Args:
        stdin_errors: errors= policy for stdin.reconfigure().
        stdout_errors: errors= policy for stdout.reconfigure().
        stderr_errors: errors= policy for stderr.reconfigure().
        on_failure: optional ``(stream_name, exc) -> None`` callback for
            streams whose ``reconfigure`` raises (e.g. Jupyter-replaced
            streams that lack the method-shape we expect). When omitted,
            a ``WARNING:`` line is written to the current ``sys.stderr``
            on a best-effort basis: the warning is dropped if
            ``sys.stderr`` is ``None`` or if writing to it fails.

    Exceptions raised by ``on_failure`` itself are not caught and
    propagate to the caller.
    """
    if sys.platform != "win32":
        return

    policies = (
        ("stdin", stdin_errors),
        ("stdout", stdout_errors),
        ("stderr", stderr_errors),
    )
    for name, errors in policies:
        stream = getattr(sys, name, None)
        reconfigure = getattr(stream, "reconfigure", None)
        if reconfigure is None:
            # Stream is None or a replacement object without reconfigure().
            continue
        try:
            reconfigure(encoding="utf-8", errors=errors)
        except Exception as exc:  # noqa: BLE001 -- last-resort guard
            if on_failure is not None:
                on_failure(name, exc)
                continue

            warn_stream = sys.stderr
            if warn_stream is None:
                # print(file=None) falls back to sys.stdout, which would
                # put the warning into the stdout data channel. Drop it.
                continue
            try:
                print(
                    f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
                    file=warn_stream,
                )
            except Exception:  # noqa: BLE001 -- warning is best-effort
                # stderr may be the very stream that is closed/broken;
                # failing to warn must not abort the remaining streams.
                pass
|
||||
+10
-35
@@ -938,42 +938,17 @@ def cmd_compress(args):
|
||||
def _reconfigure_stdio_utf8_on_windows() -> None:
    """Decode stdio as UTF-8 on Windows for the primary `mempalace` CLI.

    Thin wrapper around the shared helper in ``mempalace._stdio``. The CLI
    overrides stdout/stderr to ``replace`` because ``mempalace search``
    prints verbatim drawer text that may carry surrogate halves
    round-tripped from filenames -- ``strict`` would crash mid-print and
    lose the rest of the search result block. stdin keeps the default
    ``surrogateescape`` so a redirected non-UTF-8 file does not kill the
    read on the first bad byte.
    """
    # The shared helper owns the platform check and the per-stream loop;
    # this wrapper only pins the CLI's errors= policy.
    from ._stdio import reconfigure_stdio_utf8_on_windows

    reconfigure_stdio_utf8_on_windows(stdout_errors="replace", stderr_errors="replace")
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
@@ -306,40 +306,15 @@ def _edit_distance(s1: str, s2: str) -> int:
|
||||
def _reconfigure_stdio_utf8_on_windows() -> None:
    """Decode --stdin payload as UTF-8 on Windows.

    Thin wrapper around the shared helper in ``mempalace._stdio``. Mirrors
    the primary CLI policy: stdout/stderr use ``replace`` because
    extracted fact text can include surrogate halves round-tripped from
    filenames -- ``strict`` would raise UnicodeEncodeError mid-print.
    stdin keeps the default ``surrogateescape``.
    """
    # The shared helper owns the platform check and the per-stream loop;
    # this wrapper only pins the errors= policy for this entry point.
    from ._stdio import reconfigure_stdio_utf8_on_windows

    reconfigure_stdio_utf8_on_windows(stdout_errors="replace", stderr_errors="replace")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user