From 0d1c1fbcaab751a1e7f8d992debaf0c5976904de Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Thu, 7 May 2026 12:42:02 -0300 Subject: [PATCH 1/3] fix(diary): detect same-size edits via content hash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The skip-if-unchanged check compared byte length only, so any in-place edit preserving total length (typo fix "teh"→"the", word swap) was silently dropped — a verbatim-storage violation: the user's actual words never reached the palace. Switch the gate to sha256(text). State entries gain a "content_hash" field; the legacy size-only path is preserved when prev_hash is missing so a post-upgrade run does not re-ingest every untouched diary. Closes #925 --- mempalace/diary_ingest.py | 19 +++++++++++++++---- tests/test_closets.py | 20 ++++++++++++++++++++ 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/mempalace/diary_ingest.py b/mempalace/diary_ingest.py index 503f0c0..7939dda 100644 --- a/mempalace/diary_ingest.py +++ b/mempalace/diary_ingest.py @@ -120,12 +120,22 @@ def ingest_diaries( continue date_str = date_match.group(1) - # Skip if content hasn't changed + # Skip if content hasn't changed. Hash-based — size alone false-negatives + # on same-length edits (e.g. "teh" → "the"), silently dropping real edits. state_key = f"{wing}|{diary_path.name}" - prev_size = state.get(state_key, {}).get("size", 0) + prev_entry = state.get(state_key, {}) + prev_hash = prev_entry.get("content_hash") + prev_size = prev_entry.get("size", 0) curr_size = len(text) - if curr_size == prev_size and not force: - continue + curr_hash = hashlib.sha256(text.encode("utf-8")).hexdigest() + if not force: + if prev_hash is not None: + if curr_hash == prev_hash: + continue + elif curr_size == prev_size and prev_size > 0: + # Legacy state without content_hash: keep size-based skip so a + # post-upgrade run doesn't re-ingest every untouched diary. 
+ continue now_iso = datetime.now(timezone.utc).isoformat() drawer_id = _diary_drawer_id(wing, date_str) @@ -184,6 +194,7 @@ def ingest_diaries( state[state_key] = { "size": curr_size, + "content_hash": curr_hash, "entry_count": len(entries), "ingested_at": now_iso, } diff --git a/tests/test_closets.py b/tests/test_closets.py index 976086d..bd996e1 100644 --- a/tests/test_closets.py +++ b/tests/test_closets.py @@ -605,6 +605,26 @@ class TestDiaryIngest: result = ingest_diaries(str(diary_dir), str(palace_dir)) assert result["days_updated"] == 0 + def test_ingest_detects_same_size_content_edit(self, tmp_path): + # Regression #925: the prior skip-check compared byte length only, so + # any in-place edit preserving total length (e.g. typo fix "teh"→"the") + # was silently dropped. Content-hash check must catch it. + diary_dir = tmp_path / "diaries" + diary_dir.mkdir() + diary_file = diary_dir / "2026-04-13.md" + original = "# 2026-04-13\n\n## 10:00 — Test\n\nThe quick brown fox jumps over.\n" + edited = "# 2026-04-13\n\n## 10:00 — Test\n\nTeh quick brown fox jumps over.\n" + assert len(original) == len(edited), "test setup: edited content must be same length" + diary_file.write_text(original) + palace_dir = tmp_path / "palace" + + from mempalace.diary_ingest import ingest_diaries + + ingest_diaries(str(diary_dir), str(palace_dir), force=True) + diary_file.write_text(edited) + result = ingest_diaries(str(diary_dir), str(palace_dir)) + assert result["days_updated"] == 1, "same-size content edit must trigger re-ingest" + def test_state_file_lives_outside_diary_dir(self, tmp_path): # Regression: the original implementation wrote # ``.diary_ingest_state.json`` *inside* the user's diary directory, From 2ff6283b32f2ed5825122a1e3284cee2fee8b85c Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Thu, 7 May 2026 12:54:09 -0300 Subject: [PATCH 2/3] fix(diary): rebuild closets on hash change + backfill legacy state MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address Copilot review on #925: - Full closet rebuild whenever the content hash differs from prior state, not only on entry-count growth. Without this, an in-place edit (same entry count, different body) updated the drawer but left the closet/search index stale — defeats the verbatim guarantee at the search layer even if the drawer is correct. - Legacy size-only skip path now records the computed content_hash back into state so subsequent runs use the strict hash check instead of remaining on the size-only path indefinitely. - Test updates: typo direction in the regression test now matches the comment (typo "Teh" → fix "The"), assertion now also checks the closet collection reflects the edit, and a new test exercises the legacy-state backfill path. --- mempalace/diary_ingest.py | 19 ++++++++---- tests/test_closets.py | 64 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 73 insertions(+), 10 deletions(-) diff --git a/mempalace/diary_ingest.py b/mempalace/diary_ingest.py index 7939dda..e6ffe42 100644 --- a/mempalace/diary_ingest.py +++ b/mempalace/diary_ingest.py @@ -133,10 +133,16 @@ def ingest_diaries( if curr_hash == prev_hash: continue elif curr_size == prev_size and prev_size > 0: - # Legacy state without content_hash: keep size-based skip so a - # post-upgrade run doesn't re-ingest every untouched diary. + # Legacy state without content_hash: keep size-based skip but + # backfill the hash so future runs use the strict check. + state[state_key] = {**prev_entry, "content_hash": curr_hash} continue + # An in-place edit (same entry count, different content) means existing + # closets are stale. Force a full rebuild whenever the hash changes, + # not only on entry-count growth. 
+ content_changed = prev_hash is not None and curr_hash != prev_hash + now_iso = datetime.now(timezone.utc).isoformat() drawer_id = _diary_drawer_id(wing, date_str) entities = _extract_entities_for_metadata(text) @@ -163,7 +169,8 @@ def ingest_diaries( entries = _split_entries(text) prev_entry_count = state.get(state_key, {}).get("entry_count", 0) - new_entries = entries if force else entries[prev_entry_count:] + full_rebuild = force or content_changed + new_entries = entries if full_rebuild else entries[prev_entry_count:] if new_entries: all_lines = [] @@ -185,9 +192,9 @@ def ingest_diaries( } if entities: closet_meta["entities"] = entities - # On a force rebuild, wipe any leftover numbered closets - # from a longer prior run before re-writing. - if force: + # On any full rebuild (force or detected content edit), + # wipe leftover closets from a prior run before re-writing. + if full_rebuild: purge_file_closets(closets_col, source_file) n = upsert_closet_lines(closets_col, closet_id_base, all_lines, closet_meta) closets_created += n diff --git a/tests/test_closets.py b/tests/test_closets.py index bd996e1..37a78f4 100644 --- a/tests/test_closets.py +++ b/tests/test_closets.py @@ -23,6 +23,7 @@ Coverage map: cross-diary collisions, force=True purges leftover closets. """ +import hashlib import json import multiprocessing import os @@ -607,13 +608,16 @@ class TestDiaryIngest: def test_ingest_detects_same_size_content_edit(self, tmp_path): # Regression #925: the prior skip-check compared byte length only, so - # any in-place edit preserving total length (e.g. typo fix "teh"→"the") - # was silently dropped. Content-hash check must catch it. + # any in-place edit preserving total length (typo fix "teh"→"the", + # word swap, character reorder) was silently dropped. Content-hash + # check must catch the change AND rebuild the searchable closet so + # the index does not stay stale while the drawer updates. 
diary_dir = tmp_path / "diaries" diary_dir.mkdir() diary_file = diary_dir / "2026-04-13.md" - original = "# 2026-04-13\n\n## 10:00 — Test\n\nThe quick brown fox jumps over.\n" - edited = "# 2026-04-13\n\n## 10:00 — Test\n\nTeh quick brown fox jumps over.\n" + # Original has the typo "Teh"; the edit fixes it to "The" — same length. + original = "# 2026-04-13\n\n## 10:00 — Test\n\nTeh elaborate jakarta postgres bug.\n" + edited = "# 2026-04-13\n\n## 10:00 — Test\n\nThe elaborate jakarta postgres bug.\n" assert len(original) == len(edited), "test setup: edited content must be same length" diary_file.write_text(original) palace_dir = tmp_path / "palace" @@ -625,6 +629,58 @@ class TestDiaryIngest: result = ingest_diaries(str(diary_dir), str(palace_dir)) assert result["days_updated"] == 1, "same-size content edit must trigger re-ingest" + # Drawer must hold the corrected text. + drawers = get_collection(str(palace_dir)).get(where={"source_file": str(diary_file)}) + joined_drawers = "\n".join(drawers["documents"]) + assert "The elaborate" in joined_drawers + assert "Teh elaborate" not in joined_drawers, "drawer still holds pre-edit content" + + # And the closet (search index) must reflect the edit too — not just the + # drawer. Otherwise searches would surface stale text. + closets = get_closets_collection(str(palace_dir)).get( + where={"source_file": str(diary_file)} + ) + joined_closets = "\n".join(closets["documents"]) + assert "Teh elaborate" not in joined_closets, "closet index still holds stale content" + + def test_legacy_state_backfills_content_hash(self, tmp_path): + # Upgraded users can carry legacy state entries without ``content_hash``. + # Same-size skip is preserved for that one run, but the hash must be + # recorded so the strict check engages on subsequent runs. 
+ diary_dir = tmp_path / "diaries" + diary_dir.mkdir() + diary_file = diary_dir / "2026-04-13.md" + text = "# 2026-04-13\n\n## 10:00 — Test\n\nUnchanged body content here.\n" + diary_file.write_text(text) + palace_dir = tmp_path / "palace" + + from mempalace.diary_ingest import _state_file_for, ingest_diaries + + # Simulate a legacy state file: only size + entry_count, no content_hash. + state_file = _state_file_for(str(palace_dir), diary_dir.resolve()) + state_file.parent.mkdir(parents=True, exist_ok=True) + state_file.write_text( + json.dumps( + { + f"diary|{diary_file.name}": { + "size": len(text), + "entry_count": 1, + "ingested_at": "2026-04-12T00:00:00+00:00", + } + } + ) + ) + + # Run with no force — size matches, so this should skip ingest. + result = ingest_diaries(str(diary_dir), str(palace_dir)) + assert result["days_updated"] == 0 + + # Hash must have been backfilled into state for the next run's strict check. + persisted = json.loads(state_file.read_text()) + entry = persisted[f"diary|{diary_file.name}"] + assert "content_hash" in entry, "legacy skip path must record the hash" + assert entry["content_hash"] == hashlib.sha256(text.encode("utf-8")).hexdigest() + def test_state_file_lives_outside_diary_dir(self, tmp_path): # Regression: the original implementation wrote # ``.diary_ingest_state.json`` *inside* the user's diary directory, From 26bc3d4f912fd27ede8a4618e8e8b8a54438a953 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Thu, 7 May 2026 17:41:19 -0300 Subject: [PATCH 3/3] test(diary): write fixture with explicit utf-8 to fix Windows hash assert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_legacy_state_backfills_content_hash failed on test-windows because Path.write_text without an encoding uses the system locale (cp1252 on Windows). 
The em dash was written as the single byte 0x97, which is not valid UTF-8; diary_ingest reads the file as UTF-8 with errors=replace, so that byte decoded to U+FFFD instead of the em dash. The ingested text therefore differed from the in-Python literal, and the assertion comparing the persisted hash to sha256(text.encode("utf-8")) failed. Pin the write side to encoding=utf-8 so the on-disk bytes match what diary_ingest decodes. No production change. --- tests/test_closets.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_closets.py b/tests/test_closets.py index 37a78f4..e016d83 100644 --- a/tests/test_closets.py +++ b/tests/test_closets.py @@ -650,8 +650,11 @@ class TestDiaryIngest: diary_dir = tmp_path / "diaries" diary_dir.mkdir() diary_file = diary_dir / "2026-04-13.md" + # Write explicit UTF-8 so the round-trip matches how diary_ingest reads. + # Windows' default text-mode encoding is cp1252; without this the em + # dash would round-trip lossy and the hash assertion below would fail. text = "# 2026-04-13\n\n## 10:00 — Test\n\nUnchanged body content here.\n" - diary_file.write_text(text) + diary_file.write_text(text, encoding="utf-8") palace_dir = tmp_path / "palace" from mempalace.diary_ingest import _state_file_for, ingest_diaries