From 03643eb507e4ba81c65d50b519fcfb4dfb3c769f Mon Sep 17 00:00:00 2001 From: mvalentsev Date: Sun, 3 May 2026 21:37:12 +0500 Subject: [PATCH] fix(cli, fact-checker): per-stream stdio errors policy on Windows Previously all three streams were reconfigured to UTF-8 with errors='strict'. That kills 'mempalace search' the moment a drawer carrying a surrogate half (round-tripped from a filename via surrogateescape) hits print(), losing the rest of the result block. Same hazard for warning lines on stderr. Split the policy: stdin -> surrogateescape (malformed bytes from a redirected file survive as lone surrogates instead of crashing the read) stdout -> replace (drawer text with a stray surrogate is written as '?' instead of raising UnicodeEncodeError mid-print; on encoding the 'replace' handler emits '?', U+FFFD is only used on decoding) stderr -> replace (same protection for logger / warning paths) Applied identically in the cli.py and fact_checker.py helpers; the DRY extraction into a shared module is a separate cleanup ask, kept out of this fix to keep the diff narrow. Tests updated for the new per-stream assertion. --- mempalace/cli.py | 20 ++++++++++++++++++-- mempalace/fact_checker.py | 18 ++++++++++++++++-- tests/test_cli.py | 11 +++++++---- tests/test_fact_checker.py | 11 +++++++---- 4 files changed, 48 insertions(+), 12 deletions(-) diff --git a/mempalace/cli.py b/mempalace/cli.py index 7372cd7..7052e1f 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -943,16 +943,32 @@ def _reconfigure_stdio_utf8_on_windows(): content piped in (`mempalace search ... < query.txt`) or piped out (`mempalace search "..." > out.txt`) when verbatim drawer text or wing/room names contain non-Latin characters. + + Per-stream errors policy: + stdin -- surrogateescape: malformed bytes from a redirected file + survive as lone surrogates instead of crashing the read. + stdout -- replace: ``mempalace search`` prints verbatim drawer + text. 
A drawer that round-tripped a filename through + surrogateescape can hold a lone surrogate, which would + otherwise raise ``UnicodeEncodeError`` mid-print and + lose the rest of the search result block. + stderr -- replace: same hazard for logger output that quotes + user-supplied path or content. """ if sys.platform != "win32": return - for name in ("stdin", "stdout", "stderr"): + policies = ( + ("stdin", "surrogateescape"), + ("stdout", "replace"), + ("stderr", "replace"), + ) + for name, errors in policies: stream = getattr(sys, name, None) reconfigure = getattr(stream, "reconfigure", None) if reconfigure is None: continue try: - reconfigure(encoding="utf-8", errors="strict") + reconfigure(encoding="utf-8", errors=errors) except Exception as exc: print( f"WARNING: Could not reconfigure {name} to UTF-8: {exc}", diff --git a/mempalace/fact_checker.py b/mempalace/fact_checker.py index c894859..1844c45 100644 --- a/mempalace/fact_checker.py +++ b/mempalace/fact_checker.py @@ -309,18 +309,32 @@ def _reconfigure_stdio_utf8_on_windows(): Without this, Python defaults stdio to the system ANSI codepage (cp1252/cp1251/cp950 depending on locale), which mojibakes non-ASCII fact text before pattern parsing sees it. + + Per-stream errors policy mirrors the primary CLI helper in + ``mempalace/cli.py``: + stdin -- surrogateescape: malformed input bytes survive as lone + surrogates instead of crashing the read. + stdout -- replace: extracted fact text can include surrogate + halves round-tripped from filenames; replace prevents + a UnicodeEncodeError mid-print. + stderr -- replace: same protection for warning lines. 
""" import sys if sys.platform != "win32": return - for name in ("stdin", "stdout", "stderr"): + policies = ( + ("stdin", "surrogateescape"), + ("stdout", "replace"), + ("stderr", "replace"), + ) + for name, errors in policies: stream = getattr(sys, name, None) reconfigure = getattr(stream, "reconfigure", None) if reconfigure is None: continue try: - reconfigure(encoding="utf-8", errors="strict") + reconfigure(encoding="utf-8", errors=errors) except Exception as exc: print( f"WARNING: Could not reconfigure {name} to UTF-8: {exc}", diff --git a/tests/test_cli.py b/tests/test_cli.py index 4836d69..6b4b7b3 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1076,10 +1076,13 @@ def test_reconfigures_stdio_to_utf8_on_windows(): ): _reconfigure_stdio_utf8_on_windows() - expected = {"encoding": "utf-8", "errors": "strict"} - assert stdin.reconfigure_calls == [expected] - assert stdout.reconfigure_calls == [expected] - assert stderr.reconfigure_calls == [expected] + # Per-stream errors policy: stdin survives bad bytes via + # surrogateescape so a redirected non-UTF-8 file does not crash + # the read; stdout/stderr use replace so a drawer carrying a + # round-tripped surrogate half does not crash mid-print. 
+ assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}] + assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}] + assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}] def test_reconfigure_stdio_is_noop_off_windows(): diff --git a/tests/test_fact_checker.py b/tests/test_fact_checker.py index 9db370e..89d8366 100644 --- a/tests/test_fact_checker.py +++ b/tests/test_fact_checker.py @@ -318,10 +318,13 @@ class TestCLI: ): _reconfigure_stdio_utf8_on_windows() - expected = {"encoding": "utf-8", "errors": "strict"} - assert stdin.reconfigure_calls == [expected] - assert stdout.reconfigure_calls == [expected] - assert stderr.reconfigure_calls == [expected] + # Per-stream errors policy: stdin uses surrogateescape so a stray + # malformed byte from a redirected file does not crash the read, + # stdout/stderr use replace so an extracted fact carrying a + # surrogate half does not crash mid-print. + assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}] + assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}] + assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}] def test_reconfigure_stdio_is_noop_off_windows(self): """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams."""