Merge pull request #1406 from MemPalace/fix/925-diary-content-hash

fix(diary): detect same-size edits via content hash (#925)
2026-05-07 17:53:24 -03:00
parent ba30ab6951 26bc3d4f91
commit ea36a00f5f
2 changed files with 105 additions and 8 deletions
@@ -120,12 +120,28 @@ def ingest_diaries(
            continue
        date_str = date_match.group(1)

-        # Skip if content hasn't changed
+        # Skip if content hasn't changed. Hash-based — a size-only check misses
+        # same-length edits (e.g. "teh" → "the"), silently dropping real edits.
        state_key = f"{wing}|{diary_path.name}"
-        prev_size = state.get(state_key, {}).get("size", 0)
+        prev_entry = state.get(state_key, {})
+        prev_hash = prev_entry.get("content_hash")
+        prev_size = prev_entry.get("size", 0)
        curr_size = len(text)
-        if curr_size == prev_size and not force:
-            continue
+        curr_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()
+        if not force:
+            if prev_hash is not None:
+                if curr_hash == prev_hash:
+                    continue
+            elif curr_size == prev_size and prev_size > 0:
+                # Legacy state without content_hash: keep size-based skip but
+                # backfill the hash so future runs use the strict check.
+                state[state_key] = {**prev_entry, "content_hash": curr_hash}
+                continue
+
+        # An in-place edit (same entry count, different content) means existing
+        # closets are stale. Force a full rebuild whenever the hash changes,
+        # not only on entry-count growth.
+        content_changed = prev_hash is not None and curr_hash != prev_hash

        now_iso = datetime.now(timezone.utc).isoformat()
        drawer_id = _diary_drawer_id(wing, date_str)
@@ -153,7 +169,8 @@ def ingest_diaries(

            entries = _split_entries(text)
            prev_entry_count = state.get(state_key, {}).get("entry_count", 0)
-            new_entries = entries if force else entries[prev_entry_count:]
+            full_rebuild = force or content_changed
+            new_entries = entries if full_rebuild else entries[prev_entry_count:]

            if new_entries:
                all_lines = []
@@ -175,15 +192,16 @@ def ingest_diaries(
                    }
                    if entities:
                        closet_meta["entities"] = entities
-                    # On a force rebuild, wipe any leftover numbered closets
-                    # from a longer prior run before re-writing.
-                    if force:
+                    # On any full rebuild (force or detected content edit),
+                    # wipe leftover closets from a prior run before re-writing.
+                    if full_rebuild:
                        purge_file_closets(closets_col, source_file)
                    n = upsert_closet_lines(closets_col, closet_id_base, all_lines, closet_meta)
                    closets_created += n

            state[state_key] = {
                "size": curr_size,
+                "content_hash": curr_hash,
                "entry_count": len(entries),
                "ingested_at": now_iso,
            }
@@ -23,6 +23,7 @@ Coverage map:
    cross-diary collisions, force=True purges leftover closets.
 """

+import hashlib
 import json
 import multiprocessing
 import os
@@ -605,6 +606,84 @@ class TestDiaryIngest:
        result = ingest_diaries(str(diary_dir), str(palace_dir))
        assert result["days_updated"] == 0

+    def test_ingest_detects_same_size_content_edit(self, tmp_path):
+        # Regression #925: the prior skip-check compared text length only, so
+        # any in-place edit preserving total length (typo fix "teh"→"the",
+        # word swap, character reorder) was silently dropped. The content hash
+        # must catch the change and the closet must not keep the stale text.
+        # Files are written as explicit UTF-8: the heading has an em dash.
+        diary_dir = tmp_path / "diaries"
+        diary_dir.mkdir()
+        diary_file = diary_dir / "2026-04-13.md"
+        # Original has the typo "Teh"; the edit fixes it to "The" — same length.
+        original = "# 2026-04-13\n\n## 10:00 — Test\n\nTeh elaborate jakarta postgres bug.\n"
+        edited = "# 2026-04-13\n\n## 10:00 — Test\n\nThe elaborate jakarta postgres bug.\n"
+        assert len(original) == len(edited), "test setup: edited content must be same length"
+        diary_file.write_text(original, encoding="utf-8")
+        palace_dir = tmp_path / "palace"
+
+        from mempalace.diary_ingest import ingest_diaries
+
+        ingest_diaries(str(diary_dir), str(palace_dir), force=True)
+        diary_file.write_text(edited, encoding="utf-8")
+        result = ingest_diaries(str(diary_dir), str(palace_dir))
+        assert result["days_updated"] == 1, "same-size content edit must trigger re-ingest"
+
+        # Drawer must hold the corrected text.
+        drawers = get_collection(str(palace_dir)).get(where={"source_file": str(diary_file)})
+        joined_drawers = "\n".join(drawers["documents"])
+        assert "The elaborate" in joined_drawers
+        assert "Teh elaborate" not in joined_drawers, "drawer still holds pre-edit content"
+
+        # And the closet (search index) must reflect the edit too — not just the
+        # drawer. Otherwise searches would surface stale text.
+        closets = get_closets_collection(str(palace_dir)).get(
+            where={"source_file": str(diary_file)}
+        )
+        joined_closets = "\n".join(closets["documents"])
+        assert "Teh elaborate" not in joined_closets, "closet index still holds stale content"
+
+    def test_legacy_state_backfills_content_hash(self, tmp_path):
+        # Upgraded users can carry legacy state entries without ``content_hash``.
+        # The size-based skip is kept for that one run, but the hash must be
+        # written to state so the strict hash check engages on later runs.
+        diary_dir = tmp_path / "diaries"
+        diary_dir.mkdir()
+        diary_file = diary_dir / "2026-04-13.md"
+        # Write explicit UTF-8 so the round-trip matches how diary_ingest reads.
+        # Windows' default text-mode encoding is cp1252; without this the em
+        # dash would round-trip lossy and the hash assertion below would fail.
+        text = "# 2026-04-13\n\n## 10:00 — Test\n\nUnchanged body content here.\n"
+        diary_file.write_text(text, encoding="utf-8")
+        palace_dir = tmp_path / "palace"
+
+        from mempalace.diary_ingest import _state_file_for, ingest_diaries
+
+        # Simulate a legacy entry: size, entry_count, ingested_at; no content_hash.
+        state_file = _state_file_for(str(palace_dir), diary_dir.resolve())
+        state_file.parent.mkdir(parents=True, exist_ok=True)
+        state_file.write_text(
+            json.dumps(
+                {
+                    f"diary|{diary_file.name}": {  # "<wing>|<file name>"; "diary" presumably the default wing
+                        "size": len(text),  # character count of the text, not bytes on disk
+                        "entry_count": 1,
+                        "ingested_at": "2026-04-12T00:00:00+00:00",
+                    }
+                }
+            )
+        )
+
+        # No force, size matches, no stored hash: the legacy size-based skip applies.
+        result = ingest_diaries(str(diary_dir), str(palace_dir))
+        assert result["days_updated"] == 0
+
+        # Hash must have been backfilled into state for the next run's strict check.
+        persisted = json.loads(state_file.read_text())
+        entry = persisted[f"diary|{diary_file.name}"]
+        assert "content_hash" in entry, "legacy skip path must record the hash"
+        assert entry["content_hash"] == hashlib.sha256(text.encode("utf-8")).hexdigest()
+
    def test_state_file_lives_outside_diary_dir(self, tmp_path):
        # Regression: the original implementation wrote
        # ``.diary_ingest_state.json`` *inside* the user's diary directory,