fix: prevent convo_miner from re-processing 0-chunk files on every run (#654) (#732)

* fix: register 0-chunk files to prevent re-processing on every mine (#654)

mine_convos() has three early-exit paths (OSError, content too short,
zero chunks) that skip writing anything to ChromaDB. Since
file_already_mined() checks for the presence of a document with a
matching source_file, these files are re-read and re-processed on
every subsequent run.

Add _register_file() that upserts a lightweight sentinel document
(room="_registry", ingest_mode="registry") so file_already_mined()
returns True on future runs.

Note: Bug 2 from the issue (drawers_added counter always 0) was
already resolved upstream via the switch from collection.add() to
collection.upsert().

* fix: resolve macOS path symlink in test + remove unused variable
This commit is contained in:
Mikhail Valentsev
2026-04-13 02:25:34 +05:00
committed by GitHub
parent 9b60c6edd7
commit 87e8bafad8
2 changed files with 81 additions and 0 deletions
+51
View File
@@ -1,8 +1,12 @@
import os
import tempfile
import shutil
from pathlib import Path
import chromadb
from mempalace.convo_miner import mine_convos
from mempalace.palace import file_already_mined
def test_convo_mining():
@@ -24,3 +28,50 @@ def test_convo_mining():
assert len(results["documents"][0]) > 0
shutil.rmtree(tmpdir, ignore_errors=True)
def test_mine_convos_does_not_reprocess_short_files(capsys):
    """A file too short to chunk is registered once and skipped on re-run.

    The first mine_convos() pass is expected to leave a sentinel entry for
    the file, so that file_already_mined() reports it as known and the
    second pass counts it under "Files skipped (already filed)".
    """
    workdir = tempfile.mkdtemp()
    tiny_file = os.path.join(workdir, "tiny.txt")
    palace_dir = os.path.join(workdir, "palace")
    try:
        # Two characters of content: too short to produce any chunks.
        with open(tiny_file, "w") as handle:
            handle.write("hi")

        # Pass 1: the file is processed; discard whatever was printed so the
        # final assertion only inspects the second pass.
        mine_convos(workdir, palace_dir, wing="test")
        capsys.readouterr()

        # The lookup uses the resolved path because on macOS the temp dir
        # lives behind a symlink (/var -> /private/var).
        expected_source = str(Path(workdir).resolve() / "tiny.txt")
        client = chromadb.PersistentClient(path=palace_dir)
        drawers = client.get_collection("mempalace_drawers")
        assert file_already_mined(drawers, expected_source)

        # Pass 2: the same file must now be reported as skipped.
        mine_convos(workdir, palace_dir, wing="test")
        second_pass = capsys.readouterr()
        assert "Files skipped (already filed): 1" in second_pass.out
    finally:
        shutil.rmtree(workdir, ignore_errors=True)
def test_mine_convos_does_not_reprocess_empty_chunk_files(capsys):
    """Files that normalize but produce 0 exchange chunks get a sentinel.

    Runs mine_convos() twice over the same directory and checks that the
    second run reports the file as already filed instead of processing it
    again.
    """
    tmpdir = tempfile.mkdtemp()
    try:
        # Content long enough to pass MIN_CHUNK_SIZE but with no exchange markers
        # (no "> " lines), so chunk_exchanges returns []
        with open(os.path.join(tmpdir, "no_exchanges.txt"), "w") as f:
            f.write("This is a plain paragraph without any exchange markers. " * 5)

        palace_path = os.path.join(tmpdir, "palace")

        # First run -- file is processed (sentinel written)
        mine_convos(tmpdir, palace_path, wing="test")
        # Drain the first run's output; otherwise out2 would contain both
        # runs and the assertion could be satisfied by the first one.
        capsys.readouterr()

        # Second run -- file should be skipped
        mine_convos(tmpdir, palace_path, wing="test")
        out2 = capsys.readouterr().out
        assert "Files skipped (already filed): 1" in out2
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)