fix(cli, fact-checker): per-stream stdio errors policy on Windows
Previously all three streams were reconfigured to UTF-8 with errors='strict'.
That kills 'mempalace search' the moment a drawer carrying a surrogate
half (round-tripped from a filename via surrogateescape) hits print(),
losing the rest of the result block. Same hazard for warning lines on
stderr.
Split the policy:
stdin -> surrogateescape (malformed bytes from a redirected file
survive as lone surrogates instead of crashing the read)
stdout -> replace (drawer text with a stray surrogate is written as '?'
instead of raising UnicodeEncodeError mid-print)
stderr -> replace (same protection for logger / warning paths)
Applied identically in the cli.py and fact_checker.py helpers; the DRY
extraction into a shared module is a separate cleanup ask, kept out of
this fix to keep the diff narrow.
Tests updated to assert the new per-stream errors policy.
This commit is contained in:
+18
-2
@@ -943,16 +943,32 @@ def _reconfigure_stdio_utf8_on_windows():
|
|||||||
content piped in (`mempalace search ... < query.txt`) or piped out
|
content piped in (`mempalace search ... < query.txt`) or piped out
|
||||||
(`mempalace search "..." > out.txt`) when verbatim drawer text or
|
(`mempalace search "..." > out.txt`) when verbatim drawer text or
|
||||||
wing/room names contain non-Latin characters.
|
wing/room names contain non-Latin characters.
|
||||||
|
|
||||||
|
Per-stream errors policy:
|
||||||
|
stdin -- surrogateescape: malformed bytes from a redirected file
|
||||||
|
survive as lone surrogates instead of crashing the read.
|
||||||
|
stdout -- replace: ``mempalace search`` prints verbatim drawer
|
||||||
|
text. A drawer that round-tripped a filename through
|
||||||
|
surrogateescape can hold a lone surrogate, which would
|
||||||
|
otherwise raise ``UnicodeEncodeError`` mid-print and
|
||||||
|
lose the rest of the search result block.
|
||||||
|
stderr -- replace: same hazard for logger output that quotes
|
||||||
|
user-supplied path or content.
|
||||||
"""
|
"""
|
||||||
if sys.platform != "win32":
|
if sys.platform != "win32":
|
||||||
return
|
return
|
||||||
for name in ("stdin", "stdout", "stderr"):
|
policies = (
|
||||||
|
("stdin", "surrogateescape"),
|
||||||
|
("stdout", "replace"),
|
||||||
|
("stderr", "replace"),
|
||||||
|
)
|
||||||
|
for name, errors in policies:
|
||||||
stream = getattr(sys, name, None)
|
stream = getattr(sys, name, None)
|
||||||
reconfigure = getattr(stream, "reconfigure", None)
|
reconfigure = getattr(stream, "reconfigure", None)
|
||||||
if reconfigure is None:
|
if reconfigure is None:
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
reconfigure(encoding="utf-8", errors="strict")
|
reconfigure(encoding="utf-8", errors=errors)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
print(
|
print(
|
||||||
f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
|
f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
|
||||||
|
|||||||
@@ -309,18 +309,32 @@ def _reconfigure_stdio_utf8_on_windows():
|
|||||||
Without this, Python defaults stdio to the system ANSI codepage
|
Without this, Python defaults stdio to the system ANSI codepage
|
||||||
(cp1252/cp1251/cp950 depending on locale), which mojibakes
|
(cp1252/cp1251/cp950 depending on locale), which mojibakes
|
||||||
non-ASCII fact text before pattern parsing sees it.
|
non-ASCII fact text before pattern parsing sees it.
|
||||||
|
|
||||||
|
Per-stream errors policy mirrors the primary CLI helper in
|
||||||
|
``mempalace/cli.py``:
|
||||||
|
stdin -- surrogateescape: malformed input bytes survive as lone
|
||||||
|
surrogates instead of crashing the read.
|
||||||
|
stdout -- replace: extracted fact text can include surrogate
|
||||||
|
halves round-tripped from filenames; replace prevents
|
||||||
|
a UnicodeEncodeError mid-print.
|
||||||
|
stderr -- replace: same protection for warning lines.
|
||||||
"""
|
"""
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
if sys.platform != "win32":
|
if sys.platform != "win32":
|
||||||
return
|
return
|
||||||
for name in ("stdin", "stdout", "stderr"):
|
policies = (
|
||||||
|
("stdin", "surrogateescape"),
|
||||||
|
("stdout", "replace"),
|
||||||
|
("stderr", "replace"),
|
||||||
|
)
|
||||||
|
for name, errors in policies:
|
||||||
stream = getattr(sys, name, None)
|
stream = getattr(sys, name, None)
|
||||||
reconfigure = getattr(stream, "reconfigure", None)
|
reconfigure = getattr(stream, "reconfigure", None)
|
||||||
if reconfigure is None:
|
if reconfigure is None:
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
reconfigure(encoding="utf-8", errors="strict")
|
reconfigure(encoding="utf-8", errors=errors)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
print(
|
print(
|
||||||
f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
|
f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
|
||||||
|
|||||||
+7
-4
@@ -1076,10 +1076,13 @@ def test_reconfigures_stdio_to_utf8_on_windows():
|
|||||||
):
|
):
|
||||||
_reconfigure_stdio_utf8_on_windows()
|
_reconfigure_stdio_utf8_on_windows()
|
||||||
|
|
||||||
expected = {"encoding": "utf-8", "errors": "strict"}
|
# Per-stream errors policy: stdin survives bad bytes via
|
||||||
assert stdin.reconfigure_calls == [expected]
|
# surrogateescape so a redirected non-UTF-8 file does not crash
|
||||||
assert stdout.reconfigure_calls == [expected]
|
# the read; stdout/stderr use replace so a drawer carrying a
|
||||||
assert stderr.reconfigure_calls == [expected]
|
# round-tripped surrogate half does not crash mid-print.
|
||||||
|
assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}]
|
||||||
|
assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
|
||||||
|
assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
|
||||||
|
|
||||||
|
|
||||||
def test_reconfigure_stdio_is_noop_off_windows():
|
def test_reconfigure_stdio_is_noop_off_windows():
|
||||||
|
|||||||
@@ -318,10 +318,13 @@ class TestCLI:
|
|||||||
):
|
):
|
||||||
_reconfigure_stdio_utf8_on_windows()
|
_reconfigure_stdio_utf8_on_windows()
|
||||||
|
|
||||||
expected = {"encoding": "utf-8", "errors": "strict"}
|
# Per-stream errors policy: stdin uses surrogateescape so a stray
|
||||||
assert stdin.reconfigure_calls == [expected]
|
# malformed byte from a redirected file does not crash the read,
|
||||||
assert stdout.reconfigure_calls == [expected]
|
# stdout/stderr use replace so an extracted fact carrying a
|
||||||
assert stderr.reconfigure_calls == [expected]
|
# surrogate half does not crash mid-print.
|
||||||
|
assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}]
|
||||||
|
assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
|
||||||
|
assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
|
||||||
|
|
||||||
def test_reconfigure_stdio_is_noop_off_windows(self):
|
def test_reconfigure_stdio_is_noop_off_windows(self):
|
||||||
"""Linux/macOS already default to UTF-8 stdio -- helper must not touch streams."""
|
"""Linux/macOS already default to UTF-8 stdio -- helper must not touch streams."""
|
||||||
|
|||||||
Reference in New Issue
Block a user