fix(cli, fact-checker): per-stream stdio errors policy on Windows
Previously all three streams were reconfigured to UTF-8 with errors='strict'.
That kills 'mempalace search' the moment a drawer carrying a surrogate
half (round-tripped from a filename via surrogateescape) hits print(),
losing the rest of the result block. Same hazard for warning lines on
stderr.
Split the policy:
stdin -> surrogateescape (malformed bytes from a redirected file
survive as lone surrogates instead of crashing the read)
stdout -> replace (drawer text with a stray surrogate is written as '?'
instead of raising UnicodeEncodeError mid-print)
stderr -> replace (same protection for logger / warning paths)
Applied identically in the cli.py and fact_checker.py helpers; extracting
them into a shared module (DRY) is a separate cleanup request, kept out of
this fix to keep the diff narrow.
Tests updated to assert the new per-stream errors policy.
This commit is contained in:
+18
-2
@@ -943,16 +943,32 @@ def _reconfigure_stdio_utf8_on_windows():
|
||||
content piped in (`mempalace search ... < query.txt`) or piped out
|
||||
(`mempalace search "..." > out.txt`) when verbatim drawer text or
|
||||
wing/room names contain non-Latin characters.
|
||||
|
||||
Per-stream errors policy:
|
||||
stdin -- surrogateescape: malformed bytes from a redirected file
|
||||
survive as lone surrogates instead of crashing the read.
|
||||
stdout -- replace: ``mempalace search`` prints verbatim drawer
|
||||
text. A drawer that round-tripped a filename through
|
||||
surrogateescape can hold a lone surrogate, which would
|
||||
otherwise raise ``UnicodeEncodeError`` mid-print and
|
||||
lose the rest of the search result block.
|
||||
stderr -- replace: same hazard for logger output that quotes
|
||||
user-supplied path or content.
|
||||
"""
|
||||
if sys.platform != "win32":
|
||||
return
|
||||
for name in ("stdin", "stdout", "stderr"):
|
||||
policies = (
|
||||
("stdin", "surrogateescape"),
|
||||
("stdout", "replace"),
|
||||
("stderr", "replace"),
|
||||
)
|
||||
for name, errors in policies:
|
||||
stream = getattr(sys, name, None)
|
||||
reconfigure = getattr(stream, "reconfigure", None)
|
||||
if reconfigure is None:
|
||||
continue
|
||||
try:
|
||||
reconfigure(encoding="utf-8", errors="strict")
|
||||
reconfigure(encoding="utf-8", errors=errors)
|
||||
except Exception as exc:
|
||||
print(
|
||||
f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
|
||||
|
||||
@@ -309,18 +309,32 @@ def _reconfigure_stdio_utf8_on_windows():
|
||||
Without this, Python defaults stdio to the system ANSI codepage
|
||||
(cp1252/cp1251/cp950 depending on locale), which mojibakes
|
||||
non-ASCII fact text before pattern parsing sees it.
|
||||
|
||||
Per-stream errors policy mirrors the primary CLI helper in
|
||||
``mempalace/cli.py``:
|
||||
stdin -- surrogateescape: malformed input bytes survive as lone
|
||||
surrogates instead of crashing the read.
|
||||
stdout -- replace: extracted fact text can include surrogate
|
||||
halves round-tripped from filenames; replace prevents
|
||||
a UnicodeEncodeError mid-print.
|
||||
stderr -- replace: same protection for warning lines.
|
||||
"""
|
||||
import sys
|
||||
|
||||
if sys.platform != "win32":
|
||||
return
|
||||
for name in ("stdin", "stdout", "stderr"):
|
||||
policies = (
|
||||
("stdin", "surrogateescape"),
|
||||
("stdout", "replace"),
|
||||
("stderr", "replace"),
|
||||
)
|
||||
for name, errors in policies:
|
||||
stream = getattr(sys, name, None)
|
||||
reconfigure = getattr(stream, "reconfigure", None)
|
||||
if reconfigure is None:
|
||||
continue
|
||||
try:
|
||||
reconfigure(encoding="utf-8", errors="strict")
|
||||
reconfigure(encoding="utf-8", errors=errors)
|
||||
except Exception as exc:
|
||||
print(
|
||||
f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
|
||||
|
||||
+7
-4
@@ -1076,10 +1076,13 @@ def test_reconfigures_stdio_to_utf8_on_windows():
|
||||
):
|
||||
_reconfigure_stdio_utf8_on_windows()
|
||||
|
||||
expected = {"encoding": "utf-8", "errors": "strict"}
|
||||
assert stdin.reconfigure_calls == [expected]
|
||||
assert stdout.reconfigure_calls == [expected]
|
||||
assert stderr.reconfigure_calls == [expected]
|
||||
# Per-stream errors policy: stdin survives bad bytes via
|
||||
# surrogateescape so a redirected non-UTF-8 file does not crash
|
||||
# the read; stdout/stderr use replace so a drawer carrying a
|
||||
# round-tripped surrogate half does not crash mid-print.
|
||||
assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}]
|
||||
assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
|
||||
assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
|
||||
|
||||
|
||||
def test_reconfigure_stdio_is_noop_off_windows():
|
||||
|
||||
@@ -318,10 +318,13 @@ class TestCLI:
|
||||
):
|
||||
_reconfigure_stdio_utf8_on_windows()
|
||||
|
||||
expected = {"encoding": "utf-8", "errors": "strict"}
|
||||
assert stdin.reconfigure_calls == [expected]
|
||||
assert stdout.reconfigure_calls == [expected]
|
||||
assert stderr.reconfigure_calls == [expected]
|
||||
# Per-stream errors policy: stdin uses surrogateescape so a stray
|
||||
# malformed byte from a redirected file does not crash the read,
|
||||
# stdout/stderr use replace so an extracted fact carrying a
|
||||
# surrogate half does not crash mid-print.
|
||||
assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}]
|
||||
assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
|
||||
assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
|
||||
|
||||
def test_reconfigure_stdio_is_noop_off_windows(self):
|
||||
"""Linux/macOS already default to UTF-8 stdio -- helper must not touch streams."""
|
||||
|
||||
Reference in New Issue
Block a user