fix(diary): rebuild closets on hash change + backfill legacy state
Address Copilot review on #925:
- Full closet rebuild whenever the content hash differs from prior state, not only on entry-count growth. Without this, an in-place edit (same entry count, different body) updated the drawer but left the closet/search index stale, which defeats the verbatim guarantee at the search layer even if the drawer is correct.
- Legacy size-only skip path now records the computed content_hash back into state, so subsequent runs use the strict hash check instead of remaining on the size-only path indefinitely.
- Test updates: the typo direction in the regression test now matches the comment (typo "Teh" → fix "The"), the assertion now also checks that the closet collection reflects the edit, and a new test exercises the legacy-state backfill path.
This commit is contained in:
@@ -133,10 +133,16 @@ def ingest_diaries(
|
|||||||
if curr_hash == prev_hash:
|
if curr_hash == prev_hash:
|
||||||
continue
|
continue
|
||||||
elif curr_size == prev_size and prev_size > 0:
|
elif curr_size == prev_size and prev_size > 0:
|
||||||
# Legacy state without content_hash: keep size-based skip so a
|
# Legacy state without content_hash: keep size-based skip but
|
||||||
# post-upgrade run doesn't re-ingest every untouched diary.
|
# backfill the hash so future runs use the strict check.
|
||||||
|
state[state_key] = {**prev_entry, "content_hash": curr_hash}
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# An in-place edit (same entry count, different content) means existing
|
||||||
|
# closets are stale. Force a full rebuild whenever the hash changes,
|
||||||
|
# not only on entry-count growth.
|
||||||
|
content_changed = prev_hash is not None and curr_hash != prev_hash
|
||||||
|
|
||||||
now_iso = datetime.now(timezone.utc).isoformat()
|
now_iso = datetime.now(timezone.utc).isoformat()
|
||||||
drawer_id = _diary_drawer_id(wing, date_str)
|
drawer_id = _diary_drawer_id(wing, date_str)
|
||||||
entities = _extract_entities_for_metadata(text)
|
entities = _extract_entities_for_metadata(text)
|
||||||
@@ -163,7 +169,8 @@ def ingest_diaries(
|
|||||||
|
|
||||||
entries = _split_entries(text)
|
entries = _split_entries(text)
|
||||||
prev_entry_count = state.get(state_key, {}).get("entry_count", 0)
|
prev_entry_count = state.get(state_key, {}).get("entry_count", 0)
|
||||||
new_entries = entries if force else entries[prev_entry_count:]
|
full_rebuild = force or content_changed
|
||||||
|
new_entries = entries if full_rebuild else entries[prev_entry_count:]
|
||||||
|
|
||||||
if new_entries:
|
if new_entries:
|
||||||
all_lines = []
|
all_lines = []
|
||||||
@@ -185,9 +192,9 @@ def ingest_diaries(
|
|||||||
}
|
}
|
||||||
if entities:
|
if entities:
|
||||||
closet_meta["entities"] = entities
|
closet_meta["entities"] = entities
|
||||||
# On a force rebuild, wipe any leftover numbered closets
|
# On any full rebuild (force or detected content edit),
|
||||||
# from a longer prior run before re-writing.
|
# wipe leftover closets from a prior run before re-writing.
|
||||||
if force:
|
if full_rebuild:
|
||||||
purge_file_closets(closets_col, source_file)
|
purge_file_closets(closets_col, source_file)
|
||||||
n = upsert_closet_lines(closets_col, closet_id_base, all_lines, closet_meta)
|
n = upsert_closet_lines(closets_col, closet_id_base, all_lines, closet_meta)
|
||||||
closets_created += n
|
closets_created += n
|
||||||
|
|||||||
+60
-4
@@ -23,6 +23,7 @@ Coverage map:
|
|||||||
cross-diary collisions, force=True purges leftover closets.
|
cross-diary collisions, force=True purges leftover closets.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import os
|
import os
|
||||||
@@ -607,13 +608,16 @@ class TestDiaryIngest:
|
|||||||
|
|
||||||
def test_ingest_detects_same_size_content_edit(self, tmp_path):
|
def test_ingest_detects_same_size_content_edit(self, tmp_path):
|
||||||
# Regression #925: the prior skip-check compared byte length only, so
|
# Regression #925: the prior skip-check compared byte length only, so
|
||||||
# any in-place edit preserving total length (e.g. typo fix "teh"→"the")
|
# any in-place edit preserving total length (typo fix "teh"→"the",
|
||||||
# was silently dropped. Content-hash check must catch it.
|
# word swap, character reorder) was silently dropped. Content-hash
|
||||||
|
# check must catch the change AND rebuild the searchable closet so
|
||||||
|
# the index does not stay stale while the drawer updates.
|
||||||
diary_dir = tmp_path / "diaries"
|
diary_dir = tmp_path / "diaries"
|
||||||
diary_dir.mkdir()
|
diary_dir.mkdir()
|
||||||
diary_file = diary_dir / "2026-04-13.md"
|
diary_file = diary_dir / "2026-04-13.md"
|
||||||
original = "# 2026-04-13\n\n## 10:00 — Test\n\nThe quick brown fox jumps over.\n"
|
# Original has the typo "Teh"; the edit fixes it to "The" — same length.
|
||||||
edited = "# 2026-04-13\n\n## 10:00 — Test\n\nTeh quick brown fox jumps over.\n"
|
original = "# 2026-04-13\n\n## 10:00 — Test\n\nTeh elaborate jakarta postgres bug.\n"
|
||||||
|
edited = "# 2026-04-13\n\n## 10:00 — Test\n\nThe elaborate jakarta postgres bug.\n"
|
||||||
assert len(original) == len(edited), "test setup: edited content must be same length"
|
assert len(original) == len(edited), "test setup: edited content must be same length"
|
||||||
diary_file.write_text(original)
|
diary_file.write_text(original)
|
||||||
palace_dir = tmp_path / "palace"
|
palace_dir = tmp_path / "palace"
|
||||||
@@ -625,6 +629,58 @@ class TestDiaryIngest:
|
|||||||
result = ingest_diaries(str(diary_dir), str(palace_dir))
|
result = ingest_diaries(str(diary_dir), str(palace_dir))
|
||||||
assert result["days_updated"] == 1, "same-size content edit must trigger re-ingest"
|
assert result["days_updated"] == 1, "same-size content edit must trigger re-ingest"
|
||||||
|
|
||||||
|
# Drawer must hold the corrected text.
|
||||||
|
drawers = get_collection(str(palace_dir)).get(where={"source_file": str(diary_file)})
|
||||||
|
joined_drawers = "\n".join(drawers["documents"])
|
||||||
|
assert "The elaborate" in joined_drawers
|
||||||
|
assert "Teh elaborate" not in joined_drawers, "drawer still holds pre-edit content"
|
||||||
|
|
||||||
|
# And the closet (search index) must reflect the edit too — not just the
|
||||||
|
# drawer. Otherwise searches would surface stale text.
|
||||||
|
closets = get_closets_collection(str(palace_dir)).get(
|
||||||
|
where={"source_file": str(diary_file)}
|
||||||
|
)
|
||||||
|
joined_closets = "\n".join(closets["documents"])
|
||||||
|
assert "Teh elaborate" not in joined_closets, "closet index still holds stale content"
|
||||||
|
|
||||||
|
def test_legacy_state_backfills_content_hash(self, tmp_path):
|
||||||
|
# Upgraded users can carry legacy state entries without ``content_hash``.
|
||||||
|
# Same-size skip is preserved for that one run, but the hash must be
|
||||||
|
# recorded so the strict check engages on subsequent runs.
|
||||||
|
diary_dir = tmp_path / "diaries"
|
||||||
|
diary_dir.mkdir()
|
||||||
|
diary_file = diary_dir / "2026-04-13.md"
|
||||||
|
text = "# 2026-04-13\n\n## 10:00 — Test\n\nUnchanged body content here.\n"
|
||||||
|
diary_file.write_text(text)
|
||||||
|
palace_dir = tmp_path / "palace"
|
||||||
|
|
||||||
|
from mempalace.diary_ingest import _state_file_for, ingest_diaries
|
||||||
|
|
||||||
|
# Simulate a legacy state file: only size + entry_count, no content_hash.
|
||||||
|
state_file = _state_file_for(str(palace_dir), diary_dir.resolve())
|
||||||
|
state_file.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
state_file.write_text(
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
f"diary|{diary_file.name}": {
|
||||||
|
"size": len(text),
|
||||||
|
"entry_count": 1,
|
||||||
|
"ingested_at": "2026-04-12T00:00:00+00:00",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Run with no force — size matches, so this should skip ingest.
|
||||||
|
result = ingest_diaries(str(diary_dir), str(palace_dir))
|
||||||
|
assert result["days_updated"] == 0
|
||||||
|
|
||||||
|
# Hash must have been backfilled into state for the next run's strict check.
|
||||||
|
persisted = json.loads(state_file.read_text())
|
||||||
|
entry = persisted[f"diary|{diary_file.name}"]
|
||||||
|
assert "content_hash" in entry, "legacy skip path must record the hash"
|
||||||
|
assert entry["content_hash"] == hashlib.sha256(text.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
def test_state_file_lives_outside_diary_dir(self, tmp_path):
|
def test_state_file_lives_outside_diary_dir(self, tmp_path):
|
||||||
# Regression: the original implementation wrote
|
# Regression: the original implementation wrote
|
||||||
# ``.diary_ingest_state.json`` *inside* the user's diary directory,
|
# ``.diary_ingest_state.json`` *inside* the user's diary directory,
|
||||||
|
|||||||
Reference in New Issue
Block a user