6f33d52681
Mirrors the miner.py fix in this same branch. convo_miner.py had the exact same 10 MB cap at line 58 that silently dropped long transcripts via continue. Long Claude Code sessions, multi-year ChatGPT exports, and lifetime Slack dumps all exceed 10 MB. Same silent-drop pattern, different file. Raised to 500 MB to match miner.py for consistency; downstream chunking means source file size does not affect storage or embedding cost. Tests: tests/test_convo_miner_size_cap.py (1 test) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
32 lines
1.3 KiB
Python
32 lines
1.3 KiB
Python
"""TDD: convo_miner.py must not silently drop transcripts larger than 10 MB.

Mirrors the miner.py fix shipped in the same PR family (see
test_miner_jsonl_visibility.py). Long Claude Code sessions, ChatGPT
exports, and multi-year Slack dumps routinely exceed 10 MB. The cap
silently `continue`s past them at convo_miner.py:~289, the same
silent-drop pattern as in the project miner.

Written BEFORE the fix.
"""
|
|
|
|
from mempalace.convo_miner import MAX_FILE_SIZE
|
|
|
|
|
|
class TestConvoMinerSizeCap:
    """Checks on the MAX_FILE_SIZE constant imported from convo_miner."""

    def test_max_file_size_accommodates_long_transcripts(self):
        """MAX_FILE_SIZE must leave ample headroom for real transcripts.

        Lengthy sessions and lifetime exports go past 10 MB. The cap is
        intended as a sanity rail against pathological binaries rather
        than a ceiling on legitimate text; since chunking happens
        downstream, source size is not expected to affect storage or
        embedding cost.
        """
        # Lowest acceptable cap: 100 MB, expressed in bytes.
        required_minimum = 100 * 1024 * 1024

        def failure_report():
            # Called only from the assert's message slot, so the text is
            # built solely when the check fails.
            pieces = (
                f"convo_miner.MAX_FILE_SIZE is {MAX_FILE_SIZE} bytes ",
                f"({MAX_FILE_SIZE / 1024 / 1024:.0f} MB). Same silent-drop ",
                "bug as miner.py's old 10 MB cap — long transcripts get ",
                "filtered out at convo_miner.py:~289 with `continue`. ",
                "Raise to at least 100 MB (match miner.py at 500 MB for ",
                "consistency across both miners).",
            )
            return "".join(pieces)

        assert MAX_FILE_SIZE >= required_minimum, failure_report()
|