fix(fact-checker): reconfigure stdio to UTF-8 on Windows
The `python -m mempalace.fact_checker --stdin` entry point reads non-ASCII text through the system ANSI codepage (cp1252/cp1251/cp950) on Windows, which mojibakes characters before claim-extraction sees them. Reconfigure stdin/stdout/stderr to UTF-8 with `errors="strict"`, wrapped in try/except so a replaced stream (Jupyter, test harness) logs a warning rather than crashing the CLI. Mirrors the same fix shipped for `mcp_server.py:main()` (#400) and `hooks_cli.py:run_hook()` (#1280) -- this is the third and last stdin-reading entry point in the package.
This commit is contained in:
@@ -303,11 +303,38 @@ def _edit_distance(s1: str, s2: str) -> int:
|
|||||||
return prev[-1]
|
return prev[-1]
|
||||||
|
|
||||||
|
|
||||||
|
def _reconfigure_stdio_utf8_on_windows() -> None:
    """Decode --stdin payload as UTF-8 on Windows.

    Without this, Python defaults stdio to the system ANSI codepage
    (cp1252/cp1251/cp950 depending on locale), which mojibakes
    non-ASCII fact text before pattern parsing sees it.

    On ``win32`` each of ``sys.stdin``, ``sys.stdout`` and ``sys.stderr``
    that exposes a ``reconfigure`` method is switched to UTF-8 with
    ``errors="strict"``. Streams that are ``None`` or that lack
    ``reconfigure`` (replaced streams) are skipped. On every other
    platform the function returns without touching anything.

    A failing ``reconfigure`` call does not propagate: it is reported as a
    one-line warning on ``sys.stderr``. The warning itself is best-effort --
    it is dropped when ``sys.stderr`` is ``None`` or cannot be written to,
    so that this helper never turns an encoding nicety into a CLI crash.
    """
    import sys

    if sys.platform != "win32":
        return

    for name in ("stdin", "stdout", "stderr"):
        stream = getattr(sys, name, None)
        reconfigure = getattr(stream, "reconfigure", None)
        if reconfigure is None:
            # Stream is None or a replacement object without reconfigure().
            continue
        try:
            reconfigure(encoding="utf-8", errors="strict")
        except Exception as exc:  # deliberate best-effort boundary
            err = sys.stderr
            if err is None:
                # print(file=None) would fall back to sys.stdout and
                # pollute the tool's real output, so stay silent instead.
                continue
            try:
                print(
                    f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
                    file=err,
                )
            except (OSError, ValueError):
                # stderr is closed/broken (ValueError also covers encode
                # errors); there is nowhere left to report to.
                pass
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
_reconfigure_stdio_utf8_on_windows()
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Check text against known facts in the MemPalace palace.",
|
description="Check text against known facts in the MemPalace palace.",
|
||||||
epilog="Exits 0 when no issues found, 1 when one or more issues detected.",
|
epilog="Exits 0 when no issues found, 1 when one or more issues detected.",
|
||||||
|
|||||||
@@ -286,3 +286,63 @@ class TestCLI:
|
|||||||
assert "similar_name" in out
|
assert "similar_name" in out
|
||||||
# Silence unused import warning.
|
# Silence unused import warning.
|
||||||
_ = (MagicMock, patch, fact_checker)
|
_ = (MagicMock, patch, fact_checker)
|
||||||
|
|
||||||
|
def test_reconfigures_stdio_to_utf8_on_windows(self):
    """On win32 the helper must switch stdin, stdout and stderr to strict UTF-8.

    Python otherwise decodes stdio through the system ANSI codepage
    (cp1252/cp1251/cp950), so non-ASCII text arrives as mojibake before
    pattern parsing ever runs.
    """
    import io
    import sys

    from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows

    class _ReconfigurableStringIO(io.StringIO):
        """In-memory stream that records the kwargs of each reconfigure() call."""

        def __init__(self, initial_value=""):
            super().__init__(initial_value)
            self.reconfigure_calls = []

        def reconfigure(self, **kwargs):
            self.reconfigure_calls.append(kwargs)

    stream_names = ("stdin", "stdout", "stderr")
    fakes = {name: _ReconfigurableStringIO() for name in stream_names}

    with (
        patch.object(sys, "platform", "win32"),
        patch.object(sys, "stdin", fakes["stdin"]),
        patch.object(sys, "stdout", fakes["stdout"]),
        patch.object(sys, "stderr", fakes["stderr"]),
    ):
        _reconfigure_stdio_utf8_on_windows()

    # Exactly one call per stream, each with the strict UTF-8 settings.
    wanted = [{"encoding": "utf-8", "errors": "strict"}]
    recorded = [fakes[name].reconfigure_calls for name in stream_names]
    assert recorded == [wanted, wanted, wanted]
|
||||||
|
|
||||||
|
def test_reconfigure_stdio_is_noop_off_windows(self):
    """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams.

    All three stdio streams are replaced with recording fakes, so a
    regression that reconfigures stdout or stderr (not just stdin) on a
    non-Windows platform is caught as well.
    """
    import io
    import sys

    from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows

    class _ReconfigurableStringIO(io.StringIO):
        """In-memory stream that records the kwargs of each reconfigure() call."""

        def __init__(self):
            super().__init__()
            self.reconfigure_calls = []

        def reconfigure(self, **kwargs):
            self.reconfigure_calls.append(kwargs)

    stdin = _ReconfigurableStringIO()
    stdout = _ReconfigurableStringIO()
    stderr = _ReconfigurableStringIO()
    with (
        patch.object(sys, "platform", "linux"),
        patch.object(sys, "stdin", stdin),
        patch.object(sys, "stdout", stdout),
        patch.object(sys, "stderr", stderr),
    ):
        _reconfigure_stdio_utf8_on_windows()

    assert stdin.reconfigure_calls == []
    assert stdout.reconfigure_calls == []
    assert stderr.reconfigure_calls == []
|
||||||
|
|||||||
Reference in New Issue
Block a user