fix(cli, fact-checker): per-stream stdio errors policy on Windows

Previously all three streams were reconfigured to UTF-8 with errors='strict'.
That kills 'mempalace search' the moment a drawer carrying a surrogate
half (round-tripped from a filename via surrogateescape) hits print(),
losing the rest of the result block. Same hazard for warning lines on
stderr.

Split the policy:
  stdin  -> surrogateescape (malformed bytes from a redirected file
            survive as lone surrogates instead of crashing the read)
  stdout -> replace (drawer text with a stray surrogate is emitted as '?'
            instead of raising UnicodeEncodeError mid-print)
  stderr -> replace (same protection for logger / warning paths)

Applied identically in the cli.py and fact_checker.py helpers; the DRY
extraction into a shared module is a separate cleanup ask, kept out of
this fix to keep the diff narrow.

Tests updated to assert the new per-stream errors policy.
This commit is contained in:
mvalentsev
2026-05-03 21:37:12 +05:00
parent 32f4dfa26d
commit 03643eb507
4 changed files with 48 additions and 12 deletions
+18 -2
View File
@@ -943,16 +943,32 @@ def _reconfigure_stdio_utf8_on_windows():
content piped in (`mempalace search ... < query.txt`) or piped out content piped in (`mempalace search ... < query.txt`) or piped out
(`mempalace search "..." > out.txt`) when verbatim drawer text or (`mempalace search "..." > out.txt`) when verbatim drawer text or
wing/room names contain non-Latin characters. wing/room names contain non-Latin characters.
Per-stream errors policy:
stdin -- surrogateescape: malformed bytes from a redirected file
survive as lone surrogates instead of crashing the read.
stdout -- replace: ``mempalace search`` prints verbatim drawer
text. A drawer that round-tripped a filename through
surrogateescape can hold a lone surrogate, which would
otherwise raise ``UnicodeEncodeError`` mid-print and
lose the rest of the search result block.
stderr -- replace: same hazard for logger output that quotes
user-supplied path or content.
""" """
if sys.platform != "win32": if sys.platform != "win32":
return return
for name in ("stdin", "stdout", "stderr"): policies = (
("stdin", "surrogateescape"),
("stdout", "replace"),
("stderr", "replace"),
)
for name, errors in policies:
stream = getattr(sys, name, None) stream = getattr(sys, name, None)
reconfigure = getattr(stream, "reconfigure", None) reconfigure = getattr(stream, "reconfigure", None)
if reconfigure is None: if reconfigure is None:
continue continue
try: try:
reconfigure(encoding="utf-8", errors="strict") reconfigure(encoding="utf-8", errors=errors)
except Exception as exc: except Exception as exc:
print( print(
f"WARNING: Could not reconfigure {name} to UTF-8: {exc}", f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
+16 -2
View File
@@ -309,18 +309,32 @@ def _reconfigure_stdio_utf8_on_windows():
Without this, Python defaults stdio to the system ANSI codepage Without this, Python defaults stdio to the system ANSI codepage
(cp1252/cp1251/cp950 depending on locale), which mojibakes (cp1252/cp1251/cp950 depending on locale), which mojibakes
non-ASCII fact text before pattern parsing sees it. non-ASCII fact text before pattern parsing sees it.
Per-stream errors policy mirrors the primary CLI helper in
``mempalace/cli.py``:
stdin -- surrogateescape: malformed input bytes survive as lone
surrogates instead of crashing the read.
stdout -- replace: extracted fact text can include surrogate
halves round-tripped from filenames; replace prevents
a UnicodeEncodeError mid-print.
stderr -- replace: same protection for warning lines.
""" """
import sys import sys
if sys.platform != "win32": if sys.platform != "win32":
return return
for name in ("stdin", "stdout", "stderr"): policies = (
("stdin", "surrogateescape"),
("stdout", "replace"),
("stderr", "replace"),
)
for name, errors in policies:
stream = getattr(sys, name, None) stream = getattr(sys, name, None)
reconfigure = getattr(stream, "reconfigure", None) reconfigure = getattr(stream, "reconfigure", None)
if reconfigure is None: if reconfigure is None:
continue continue
try: try:
reconfigure(encoding="utf-8", errors="strict") reconfigure(encoding="utf-8", errors=errors)
except Exception as exc: except Exception as exc:
print( print(
f"WARNING: Could not reconfigure {name} to UTF-8: {exc}", f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
+7 -4
View File
@@ -1076,10 +1076,13 @@ def test_reconfigures_stdio_to_utf8_on_windows():
): ):
_reconfigure_stdio_utf8_on_windows() _reconfigure_stdio_utf8_on_windows()
expected = {"encoding": "utf-8", "errors": "strict"} # Per-stream errors policy: stdin survives bad bytes via
assert stdin.reconfigure_calls == [expected] # surrogateescape so a redirected non-UTF-8 file does not crash
assert stdout.reconfigure_calls == [expected] # the read; stdout/stderr use replace so a drawer carrying a
assert stderr.reconfigure_calls == [expected] # round-tripped surrogate half does not crash mid-print.
assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}]
assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
def test_reconfigure_stdio_is_noop_off_windows(): def test_reconfigure_stdio_is_noop_off_windows():
+7 -4
View File
@@ -318,10 +318,13 @@ class TestCLI:
): ):
_reconfigure_stdio_utf8_on_windows() _reconfigure_stdio_utf8_on_windows()
expected = {"encoding": "utf-8", "errors": "strict"} # Per-stream errors policy: stdin uses surrogateescape so a stray
assert stdin.reconfigure_calls == [expected] # malformed byte from a redirected file does not crash the read,
assert stdout.reconfigure_calls == [expected] # stdout/stderr use replace so an extracted fact carrying a
assert stderr.reconfigure_calls == [expected] # surrogate half does not crash mid-print.
assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}]
assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
def test_reconfigure_stdio_is_noop_off_windows(self): def test_reconfigure_stdio_is_noop_off_windows(self):
"""Linux/macOS already default to UTF-8 stdio -- helper must not touch streams.""" """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams."""