From 0d1c1fbcaab751a1e7f8d992debaf0c5976904de Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Thu, 7 May 2026 12:42:02 -0300 Subject: [PATCH] fix(diary): detect same-size edits via content hash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The skip-if-unchanged check compared byte length only, so any in-place edit preserving total length (typo fix "teh"→"the", word swap) was silently dropped — a verbatim-storage violation: the user's actual words never reached the palace. Switch the gate to sha256(text). State entries gain a "content_hash" field; the legacy size-only path is preserved when prev_hash is missing so a post-upgrade run does not re-ingest every untouched diary. Closes #925 --- mempalace/diary_ingest.py | 19 +++++++++++++++---- tests/test_closets.py | 20 ++++++++++++++++++++ 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/mempalace/diary_ingest.py b/mempalace/diary_ingest.py index 503f0c0..7939dda 100644 --- a/mempalace/diary_ingest.py +++ b/mempalace/diary_ingest.py @@ -120,12 +120,22 @@ def ingest_diaries( continue date_str = date_match.group(1) - # Skip if content hasn't changed + # Skip if content hasn't changed. Hash-based — size alone false-negatives + # on same-length edits (e.g. "teh" → "the"), silently dropping real edits. state_key = f"{wing}|{diary_path.name}" - prev_size = state.get(state_key, {}).get("size", 0) + prev_entry = state.get(state_key, {}) + prev_hash = prev_entry.get("content_hash") + prev_size = prev_entry.get("size", 0) curr_size = len(text) - if curr_size == prev_size and not force: - continue + curr_hash = hashlib.sha256(text.encode("utf-8")).hexdigest() + if not force: + if prev_hash is not None: + if curr_hash == prev_hash: + continue + elif curr_size == prev_size and prev_size > 0: + # Legacy state without content_hash: keep size-based skip so a + # post-upgrade run doesn't re-ingest every untouched diary. + continue now_iso = datetime.now(timezone.utc).isoformat() drawer_id = _diary_drawer_id(wing, date_str) @@ -184,6 +194,7 @@ def ingest_diaries( state[state_key] = { "size": curr_size, + "content_hash": curr_hash, "entry_count": len(entries), "ingested_at": now_iso, } diff --git a/tests/test_closets.py b/tests/test_closets.py index 976086d..bd996e1 100644 --- a/tests/test_closets.py +++ b/tests/test_closets.py @@ -605,6 +605,26 @@ class TestDiaryIngest: result = ingest_diaries(str(diary_dir), str(palace_dir)) assert result["days_updated"] == 0 + def test_ingest_detects_same_size_content_edit(self, tmp_path): + # Regression #925: the prior skip-check compared byte length only, so + # any in-place edit preserving total length (e.g. typo fix "teh"→"the") + # was silently dropped. Content-hash check must catch it. + diary_dir = tmp_path / "diaries" + diary_dir.mkdir() + diary_file = diary_dir / "2026-04-13.md" + original = "# 2026-04-13\n\n## 10:00 — Test\n\nThe quick brown fox jumps over.\n" + edited = "# 2026-04-13\n\n## 10:00 — Test\n\nTeh quick brown fox jumps over.\n" + assert len(original) == len(edited), "test setup: edited content must be same length" + diary_file.write_text(original) + palace_dir = tmp_path / "palace" + + from mempalace.diary_ingest import ingest_diaries + + ingest_diaries(str(diary_dir), str(palace_dir), force=True) + diary_file.write_text(edited) + result = ingest_diaries(str(diary_dir), str(palace_dir)) + assert result["days_updated"] == 1, "same-size content edit must trigger re-ingest" + def test_state_file_lives_outside_diary_dir(self, tmp_path): # Regression: the original implementation wrote # ``.diary_ingest_state.json`` *inside* the user's diary directory,