Files
mempalace/tests/test_miner_jsonl_visibility.py
Igor Lins e Silva 29ce7c7135 Harden sweeper for production: verbatim tool blocks, full session_id, logged failures
Four changes on top of the proposal's initial sweeper draft, driven by
the CLAUDE.md design principles:

1. Drop the 500-char truncation on tool_use / tool_result content in
   _flatten_content. The "verbatim always" principle forbids lossy
   compression of user-adjacent data; a long code-edit diff handed to
   the assistant must round-trip intact. Unknown block types now also
   serialize their full payload instead of just a type marker. New test
   test_parse_preserves_tool_blocks_verbatim covers a 5000-char input.

2. Use the full session_id in drawer IDs (not session_id[:12]). Rules
   out cross-session collisions if a transcript source ever uses
   non-UUID session identifiers or shared prefixes.

3. Replace silent `except Exception: return None` in get_palace_cursor
   with a logger.warning — the exact anti-pattern this PR otherwise
   criticizes in miner.py. The fallback behavior is still safe
   (deterministic IDs make a missed cursor recover on the next run),
   but the failure is now discoverable.

4. sweep_directory now collects per-file failures into the result dict
   and the CLI exits non-zero when any file failed, so a partial-sweep
   outcome is visible rather than swallowed.

Co-Authored-By: MSL <232237854+milla-jovovich@users.noreply.github.com>
2026-04-18 13:14:32 -03:00

127 lines
5.5 KiB
Python

"""TDD: miner.py must not silently drop .jsonl files.
The project miner (mempalace.miner.scan_project) walks a directory and
keeps only files whose suffix is in READABLE_EXTENSIONS. The whitelist
contains `.json` but NOT `.jsonl`. Every ChatGPT export, Claude Code
transcript, or any other jsonl transcript dumped into a project
directory is silently dropped with no user-visible output.
Two possible ways to fix this (only the first is tested here):
1. READABLE_EXTENSIONS must include `.jsonl` so the file is at least
readable as text (jsonl is line-delimited JSON — each line is
already valid text for embedding).
2. OR scan_project must surface skipped .jsonl files to the user so
they know to use `--mode convos`.
We test (1) — include .jsonl in READABLE_EXTENSIONS. This matches how
`.json` is already handled: the miner doesn't care what the structure
is, it chunks the text.
Written BEFORE the fix.
"""
import tempfile
from pathlib import Path
from unittest.mock import patch
from mempalace.miner import MAX_FILE_SIZE, READABLE_EXTENSIONS, scan_project
class TestJsonlNotSilentlySkipped:
    """Regression tests: the project miner must not silently drop `.jsonl` files.

    Covers both ways a transcript can be filtered out by
    ``mempalace.miner.scan_project``: the extension whitelist
    (``READABLE_EXTENSIONS``) and the file-size cap (``MAX_FILE_SIZE``).
    """

    def test_jsonl_in_readable_extensions(self):
        """`.jsonl` must be in the readable-extensions whitelist.

        `.json` is already there (see mempalace/miner.py:30). `.jsonl`
        is conceptually the same thing — line-delimited JSON — and all
        of Claude Code's transcripts, ChatGPT exports, and similar
        tooling writes `.jsonl`. Excluding it silently drops user data.
        """
        assert ".jsonl" in READABLE_EXTENSIONS, (
            "mempalace/miner.py:READABLE_EXTENSIONS contains `.json` "
            "but NOT `.jsonl`. Every jsonl file in a mined project is "
            "silently skipped at miner.py:722 "
            "(`if filepath.suffix.lower() not in READABLE_EXTENSIONS: "
            "continue`). This causes the 'convos not being saved' bug "
            "reported by users — the hook fires `mempalace mine`, the "
            "miner walks the directory, skips every .jsonl file, exits "
            "cleanly. No warning, no log line, user sees nothing wrong. "
            "Add `.jsonl` to READABLE_EXTENSIONS."
        )

    def test_scan_project_picks_up_jsonl_file(self):
        """scan_project should find .jsonl files in the target dir."""
        with tempfile.TemporaryDirectory() as tmp:
            tmpdir = Path(tmp)
            jsonl_path = tmpdir / "transcript.jsonl"
            # Explicit encoding keeps the fixture independent of the
            # platform's default locale encoding.
            jsonl_path.write_text(
                '{"role": "user", "content": "hello"}\n'
                '{"role": "assistant", "content": "hi there"}\n'
                '{"role": "user", "content": "how do I install this"}\n'
                '{"role": "assistant", "content": "pip install mempalace"}\n',
                encoding="utf-8",
            )

            found = scan_project(str(tmpdir))
            found_names = [p.name for p in found]

            assert "transcript.jsonl" in found_names, (
                "scan_project silently dropped transcript.jsonl. "
                f"Returned: {found_names}. Users placing transcript "
                "exports in a project directory expect them to be mined."
            )

    def test_large_jsonl_not_silently_dropped_by_size_cap(self):
        """Long sessions produce >10 MB transcripts. They must still mine.

        The legacy cap was 10 MB, which is smaller than a long Claude Code
        session's transcript. Users hitting the cap lost their entire
        conversation to a silent `if size > MAX: continue` at miner.py:732.
        Raise the cap well above any realistic transcript size.
        """
        # The 10 MB cap was a silent failure — real Claude Code long
        # sessions exceed it. The cap must accommodate them.
        assert MAX_FILE_SIZE >= 100 * 1024 * 1024, (
            f"MAX_FILE_SIZE is {MAX_FILE_SIZE} bytes "
            f"({MAX_FILE_SIZE / 1024 / 1024:.0f} MB). Long Claude Code "
            "sessions produce transcripts larger than 10 MB and get "
            "silently dropped. Raise to at least 100 MB — chunking "
            "at 800 chars per drawer means source file size doesn't "
            "matter for downstream storage."
        )

    def test_scan_project_picks_up_50mb_jsonl(self):
        """A 50 MB .jsonl must not be filtered out by the size cap.

        We don't actually write 50 MB (slow test). Instead, we mock
        ``Path.stat`` so the file reports a 50 MB ``st_size`` and confirm
        scan_project still includes it.

        NOTE(review): this only exercises the cap if scan_project reads
        the size through ``Path.stat`` — confirm against miner.py.
        """
        with tempfile.TemporaryDirectory() as tmp:
            tmpdir = Path(tmp)
            big_jsonl = tmpdir / "big_transcript.jsonl"
            # Write a small real file so the existence / extension / text
            # checks pass; then mock its reported size.
            big_jsonl.write_text(
                '{"role": "user", "content": "hi"}\n', encoding="utf-8"
            )

            fake_size = 50 * 1024 * 1024  # 50 MB
            real_stat = Path.stat

            def fake_stat(path, *args, **kwargs):
                """Return the real stat result, inflating st_size for the target file."""
                real_result = real_stat(path, *args, **kwargs)
                if path.name != "big_transcript.jsonl":
                    return real_result

                class _FakeStat:
                    """Real stat result with only ``st_size`` overridden."""

                    st_size = fake_size

                    def __getattr__(self, attr):
                        # Only reached for attributes not defined on the
                        # class, i.e. every stat field except st_size
                        # (st_mode, st_mtime, ...). Delegating keeps the
                        # stub usable whatever fields the miner reads.
                        return getattr(real_result, attr)

                return _FakeStat()

            with patch.object(Path, "stat", fake_stat):
                found = scan_project(str(tmpdir))

            found_names = [p.name for p in found]
            assert "big_transcript.jsonl" in found_names, (
                "50 MB .jsonl was dropped by size cap (MAX_FILE_SIZE="
                f"{MAX_FILE_SIZE}). Returned: {found_names}."
            )