Merge pull request #1282 from mvalentsev/fix/fact-checker-stdio-utf8
fix(cli, fact-checker): reconfigure stdio to UTF-8 on Windows
This commit is contained in:
@@ -0,0 +1,71 @@
|
|||||||
|
"""Stdio UTF-8 reconfiguration helper for Windows entry points.
|
||||||
|
|
||||||
|
Python on Windows defaults stdio to the system ANSI codepage
|
||||||
|
(cp1252/cp1251/cp950 depending on locale), which mojibakes UTF-8 input
|
||||||
|
or output the moment a non-ASCII character shows up. Every console
|
||||||
|
entry point that touches stdio needs to fix this on Windows -- the MCP
|
||||||
|
server, the CLI, the fact_checker `--stdin` mode -- so the
|
||||||
|
reconfigure code lives here in one place to keep the per-stream
|
||||||
|
errors policies aligned across them.
|
||||||
|
|
||||||
|
Per-stream errors policy is caller-chosen:
|
||||||
|
|
||||||
|
* MCP server uses ``strict`` on stdout/stderr because everything written
|
||||||
|
there is server-controlled JSON-RPC; any encode failure is a real bug
|
||||||
|
the operator wants loud.
|
||||||
|
* CLI / fact_checker use ``replace`` on stdout/stderr because they print
|
||||||
|
verbatim drawer text that may contain surrogate halves round-tripped
|
||||||
|
from filenames -- ``strict`` would crash mid-print.
|
||||||
|
* All callers use ``surrogateescape`` on stdin so a malformed byte from
|
||||||
|
a redirected file or a misbehaving client survives as a lone surrogate
|
||||||
|
the consumer's parser surfaces, instead of ``UnicodeDecodeError``
|
||||||
|
killing the read loop on the first bad byte.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from typing import Callable, Optional
|
||||||
|
|
||||||
|
|
||||||
|
def reconfigure_stdio_utf8_on_windows(
    *,
    stdin_errors: str = "surrogateescape",
    stdout_errors: str = "strict",
    stderr_errors: str = "strict",
    on_failure: Optional[Callable[[str, BaseException], None]] = None,
) -> None:
    """Reconfigure stdio to UTF-8 on Windows. No-op elsewhere.

    Streams that are missing (``None``) or expose no ``reconfigure``
    attribute are skipped silently. A stream whose ``reconfigure`` raises
    is reported and the remaining streams are still processed.

    Args:
        stdin_errors: errors= policy for stdin.reconfigure().
        stdout_errors: errors= policy for stdout.reconfigure().
        stderr_errors: errors= policy for stderr.reconfigure().
        on_failure: optional ``(stream_name, exc) -> None`` callback for
            streams whose ``reconfigure`` raises (e.g. Jupyter-replaced
            streams that lack the method-shape we expect). Exceptions
            raised by the callback itself propagate to the caller. When
            omitted, a ``WARNING:`` line is written to ``sys.stderr``; if
            ``sys.stderr`` is ``None`` or cannot be written to, the
            warning is dropped.
    """
    if sys.platform != "win32":
        return

    policies = (
        ("stdin", stdin_errors),
        ("stdout", stdout_errors),
        ("stderr", stderr_errors),
    )
    for name, errors in policies:
        stream = getattr(sys, name, None)
        reconfigure = getattr(stream, "reconfigure", None)
        if reconfigure is None:
            continue
        try:
            reconfigure(encoding="utf-8", errors=errors)
        except Exception as exc:  # noqa: BLE001 -- last-resort guard
            if on_failure is not None:
                on_failure(name, exc)
                continue

            warn_stream = sys.stderr
            if warn_stream is None:
                # print(file=None) would fall back to sys.stdout, which
                # must never receive diagnostics (it may carry protocol
                # output), so drop the warning instead.
                continue
            try:
                print(
                    f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
                    file=warn_stream,
                )
            except (OSError, ValueError):
                # Deliberate best-effort: a closed or otherwise unwritable
                # stderr must not turn a diagnostic into a startup crash.
                pass
|
||||||
@@ -935,7 +935,25 @@ def cmd_compress(args):
|
|||||||
print(" (dry run -- nothing stored)")
|
print(" (dry run -- nothing stored)")
|
||||||
|
|
||||||
|
|
||||||
|
def _reconfigure_stdio_utf8_on_windows():
    """Switch the primary `mempalace` CLI's stdio to UTF-8 on Windows.

    Delegates to the shared helper in ``mempalace._stdio``. stdout and
    stderr are set to ``replace`` because ``mempalace search`` prints
    verbatim drawer text that may carry surrogate halves round-tripped
    from filenames -- ``strict`` would crash mid-print and lose the rest
    of the search result block. stdin is left on the helper's default
    (``surrogateescape``) so a redirected non-UTF-8 file does not kill
    the read on the first bad byte.
    """
    from ._stdio import reconfigure_stdio_utf8_on_windows as _reconfigure

    # One lossy policy shared by both output streams.
    output_policy = "replace"
    _reconfigure(stdout_errors=output_policy, stderr_errors=output_policy)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
_reconfigure_stdio_utf8_on_windows()
|
||||||
|
|
||||||
version_label = f"MemPalace {__version__}"
|
version_label = f"MemPalace {__version__}"
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="MemPalace — Give your AI a memory. No API key required.",
|
description="MemPalace — Give your AI a memory. No API key required.",
|
||||||
|
|||||||
@@ -303,11 +303,27 @@ def _edit_distance(s1: str, s2: str) -> int:
|
|||||||
return prev[-1]
|
return prev[-1]
|
||||||
|
|
||||||
|
|
||||||
|
def _reconfigure_stdio_utf8_on_windows():
    """Switch fact_checker stdio to UTF-8 on Windows for ``--stdin`` use.

    Delegates to the shared helper in ``mempalace._stdio`` with the same
    policy as the primary CLI: stdout/stderr use ``replace`` because
    extracted fact text can include surrogate halves round-tripped from
    filenames -- ``strict`` would raise UnicodeEncodeError mid-print.
    stdin is left on the helper's default (``surrogateescape``).
    """
    from ._stdio import reconfigure_stdio_utf8_on_windows as _reconfigure

    # One lossy policy shared by both output streams.
    output_policy = "replace"
    _reconfigure(stdout_errors=output_policy, stderr_errors=output_policy)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
_reconfigure_stdio_utf8_on_windows()
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Check text against known facts in the MemPalace palace.",
|
description="Check text against known facts in the MemPalace palace.",
|
||||||
epilog="Exits 0 when no issues found, 1 when one or more issues detected.",
|
epilog="Exits 0 when no issues found, 1 when one or more issues detected.",
|
||||||
|
|||||||
@@ -1042,3 +1042,58 @@ def test_cmd_repair_trailing_slash_does_not_recurse():
|
|||||||
palace_path = os.path.expanduser(args.palace).rstrip(os.sep)
|
palace_path = os.path.expanduser(args.palace).rstrip(os.sep)
|
||||||
backup_path = palace_path + ".backup"
|
backup_path = palace_path + ".backup"
|
||||||
assert not backup_path.startswith(palace_path + os.sep)
|
assert not backup_path.startswith(palace_path + os.sep)
|
||||||
|
|
||||||
|
|
||||||
|
# ── stdio reconfigure on Windows ─────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class _ReconfigurableStringIO:
|
||||||
|
def __init__(self):
|
||||||
|
self.reconfigure_calls = []
|
||||||
|
|
||||||
|
def reconfigure(self, **kwargs):
|
||||||
|
self.reconfigure_calls.append(kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def test_reconfigures_stdio_to_utf8_on_windows():
    """The `mempalace` CLI must switch all three stdio streams to UTF-8 on Windows.

    Otherwise piped non-ASCII input (`mempalace search ... < q.txt`) or
    piped non-ASCII output (`mempalace search "..." > out.txt`) goes
    through the system ANSI codepage (cp1252/cp1251/cp950) and comes out
    as mojibake.
    """
    from mempalace.cli import _reconfigure_stdio_utf8_on_windows

    fake_in = _ReconfigurableStringIO()
    fake_out = _ReconfigurableStringIO()
    fake_err = _ReconfigurableStringIO()

    def utf8_call(policy):
        # The single reconfigure() call each stream is expected to record.
        return [{"encoding": "utf-8", "errors": policy}]

    with (
        patch.object(sys, "platform", "win32"),
        patch.object(sys, "stdin", fake_in),
        patch.object(sys, "stdout", fake_out),
        patch.object(sys, "stderr", fake_err),
    ):
        _reconfigure_stdio_utf8_on_windows()

    # stdin: surrogateescape, so a redirected non-UTF-8 file does not
    # crash the read.
    assert fake_in.reconfigure_calls == utf8_call("surrogateescape")
    # stdout/stderr: replace, so a drawer carrying a round-tripped
    # surrogate half does not crash mid-print.
    assert fake_out.reconfigure_calls == utf8_call("replace")
    assert fake_err.reconfigure_calls == utf8_call("replace")
|
||||||
|
|
||||||
|
|
||||||
|
def test_reconfigure_stdio_is_noop_off_windows():
    """Off Windows the CLI helper must leave the streams alone."""
    from mempalace.cli import _reconfigure_stdio_utf8_on_windows

    untouched = _ReconfigurableStringIO()
    platform_patch = patch.object(sys, "platform", "linux")
    stdin_patch = patch.object(sys, "stdin", untouched)
    with platform_patch, stdin_patch:
        _reconfigure_stdio_utf8_on_windows()

    # No reconfigure() call may have been recorded on a non-win32 platform.
    assert untouched.reconfigure_calls == []
|
||||||
|
|||||||
@@ -286,3 +286,66 @@ class TestCLI:
|
|||||||
assert "similar_name" in out
|
assert "similar_name" in out
|
||||||
# Silence unused import warning.
|
# Silence unused import warning.
|
||||||
_ = (MagicMock, patch, fact_checker)
|
_ = (MagicMock, patch, fact_checker)
|
||||||
|
|
||||||
|
def test_reconfigures_stdio_to_utf8_on_windows(self):
    """fact_checker must switch all three stdio streams to UTF-8 on Windows.

    Otherwise the ``--stdin`` payload is decoded with the system ANSI
    codepage (cp1252/cp1251/cp950) and non-ASCII text is mojibaked
    before pattern parsing sees it.
    """
    import io
    import sys

    from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows

    class _RecordingStream(io.StringIO):
        """StringIO that records each ``reconfigure`` call's keyword arguments."""

        def __init__(self, initial_value=""):
            super().__init__(initial_value)
            self.reconfigure_calls = []

        def reconfigure(self, **options):
            self.reconfigure_calls.append(options)

    def utf8_call(policy):
        # The single reconfigure() call each stream is expected to record.
        return [{"encoding": "utf-8", "errors": policy}]

    fake_in = _RecordingStream()
    fake_out = _RecordingStream()
    fake_err = _RecordingStream()
    with (
        patch.object(sys, "platform", "win32"),
        patch.object(sys, "stdin", fake_in),
        patch.object(sys, "stdout", fake_out),
        patch.object(sys, "stderr", fake_err),
    ):
        _reconfigure_stdio_utf8_on_windows()

    # stdin: surrogateescape, so a stray malformed byte from a redirected
    # file does not crash the read.
    assert fake_in.reconfigure_calls == utf8_call("surrogateescape")
    # stdout/stderr: replace, so an extracted fact carrying a surrogate
    # half does not crash mid-print.
    assert fake_out.reconfigure_calls == utf8_call("replace")
    assert fake_err.reconfigure_calls == utf8_call("replace")
|
||||||
|
|
||||||
|
def test_reconfigure_stdio_is_noop_off_windows(self):
    """Off Windows the fact_checker helper must leave the streams alone."""
    import io
    import sys

    from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows

    class _RecordingStream(io.StringIO):
        """StringIO that records each ``reconfigure`` call's keyword arguments."""

        def __init__(self):
            super().__init__()
            self.reconfigure_calls = []

        def reconfigure(self, **options):
            self.reconfigure_calls.append(options)

    untouched = _RecordingStream()
    platform_patch = patch.object(sys, "platform", "linux")
    stdin_patch = patch.object(sys, "stdin", untouched)
    with platform_patch, stdin_patch:
        _reconfigure_stdio_utf8_on_windows()

    # No reconfigure() call may have been recorded on a non-win32 platform.
    assert untouched.reconfigure_calls == []
|
||||||
|
|||||||
Reference in New Issue
Block a user