def _reconfigure_stdio_utf8_on_windows() -> None:
    """Switch stdin, stdout and stderr to UTF-8 when running on Windows.

    Without this, Python defaults stdio to the system ANSI codepage
    (cp1252/cp1251/cp950 depending on locale), which mojibakes
    non-ASCII fact text before pattern parsing sees it.

    The helper is best-effort: it does nothing on other platforms, skips
    streams that are missing or have no ``reconfigure`` method, and reports
    a failed reconfiguration as a warning on stderr instead of raising.
    """
    import sys

    if sys.platform != "win32":
        return

    def _warn(message: str) -> None:
        # print(file=None) silently falls back to sys.stdout, which would mix
        # the warning into the program's real output. Drop it when there is
        # no stderr to write to.
        err = sys.stderr
        if err is None:
            return
        try:
            print(message, file=err)
        except (OSError, ValueError):
            # stderr is closed or unusable; a diagnostic must never abort
            # startup of the CLI.
            pass

    for name in ("stdin", "stdout", "stderr"):
        stream = getattr(sys, name, None)
        reconfigure = getattr(stream, "reconfigure", None)
        if not callable(reconfigure):
            # Stream is None or a replacement object without reconfigure().
            continue
        try:
            reconfigure(encoding="utf-8", errors="strict")
        except Exception as exc:  # deliberate: best-effort at process start
            _warn(f"WARNING: Could not reconfigure {name} to UTF-8: {exc}")
+ """ + import io + import sys + + from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows + + class _ReconfigurableStringIO(io.StringIO): + def __init__(self, initial_value=""): + super().__init__(initial_value) + self.reconfigure_calls = [] + + def reconfigure(self, **kwargs): + self.reconfigure_calls.append(kwargs) + + stdin = _ReconfigurableStringIO() + stdout = _ReconfigurableStringIO() + stderr = _ReconfigurableStringIO() + with ( + patch.object(sys, "platform", "win32"), + patch.object(sys, "stdin", stdin), + patch.object(sys, "stdout", stdout), + patch.object(sys, "stderr", stderr), + ): + _reconfigure_stdio_utf8_on_windows() + + expected = {"encoding": "utf-8", "errors": "strict"} + assert stdin.reconfigure_calls == [expected] + assert stdout.reconfigure_calls == [expected] + assert stderr.reconfigure_calls == [expected] + + def test_reconfigure_stdio_is_noop_off_windows(self): + """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams.""" + import io + import sys + + from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows + + class _ReconfigurableStringIO(io.StringIO): + def __init__(self): + super().__init__() + self.reconfigure_calls = [] + + def reconfigure(self, **kwargs): + self.reconfigure_calls.append(kwargs) + + stdin = _ReconfigurableStringIO() + with ( + patch.object(sys, "platform", "linux"), + patch.object(sys, "stdin", stdin), + ): + _reconfigure_stdio_utf8_on_windows() + + assert stdin.reconfigure_calls == []