fix(diary): detect same-size edits via content hash
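The skip-if-unchanged check compared only the text length (len(text), a character count), so any in-place edit preserving total length (typo fix "teh"→"the", word swap) was silently dropped — a verbatim-storage violation: the user's actual words never reached the palace. Switch the gate to sha256(text). State entries gain a "content_hash" field; the legacy size-only path is preserved when prev_hash is missing so a post-upgrade run does not re-ingest every untouched diary. Note: a legacy entry skipped through the size-only path is not rewritten on that run, so it appears to stay without a content_hash (and same-length edits to it stay undetected) until a later run re-ingests it. Closes #925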
This commit is contained in:
@@ -120,11 +120,21 @@ def ingest_diaries(
|
|||||||
continue
|
continue
|
||||||
date_str = date_match.group(1)
|
date_str = date_match.group(1)
|
||||||
|
|
||||||
# Skip if content hasn't changed
|
# Skip if content hasn't changed. Hash-based — size alone false-negatives
|
||||||
|
# on same-length edits (e.g. "teh" → "the"), silently dropping real edits.
|
||||||
state_key = f"{wing}|{diary_path.name}"
|
state_key = f"{wing}|{diary_path.name}"
|
||||||
prev_size = state.get(state_key, {}).get("size", 0)
|
prev_entry = state.get(state_key, {})
|
||||||
|
prev_hash = prev_entry.get("content_hash")
|
||||||
|
prev_size = prev_entry.get("size", 0)
|
||||||
curr_size = len(text)
|
curr_size = len(text)
|
||||||
if curr_size == prev_size and not force:
|
curr_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()
|
||||||
|
if not force:
|
||||||
|
if prev_hash is not None:
|
||||||
|
if curr_hash == prev_hash:
|
||||||
|
continue
|
||||||
|
elif curr_size == prev_size and prev_size > 0:
|
||||||
|
# Legacy state without content_hash: keep size-based skip so a
|
||||||
|
# post-upgrade run doesn't re-ingest every untouched diary.
|
||||||
continue
|
continue
|
||||||
|
|
||||||
now_iso = datetime.now(timezone.utc).isoformat()
|
now_iso = datetime.now(timezone.utc).isoformat()
|
||||||
@@ -184,6 +194,7 @@ def ingest_diaries(
|
|||||||
|
|
||||||
state[state_key] = {
|
state[state_key] = {
|
||||||
"size": curr_size,
|
"size": curr_size,
|
||||||
|
"content_hash": curr_hash,
|
||||||
"entry_count": len(entries),
|
"entry_count": len(entries),
|
||||||
"ingested_at": now_iso,
|
"ingested_at": now_iso,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -605,6 +605,26 @@ class TestDiaryIngest:
|
|||||||
result = ingest_diaries(str(diary_dir), str(palace_dir))
|
result = ingest_diaries(str(diary_dir), str(palace_dir))
|
||||||
assert result["days_updated"] == 0
|
assert result["days_updated"] == 0
|
||||||
|
|
||||||
|
def test_ingest_detects_same_size_content_edit(self, tmp_path):
|
||||||
|
# Regression #925: the prior skip-check compared text length (len(text)) only, so
|
||||||
|
# any in-place edit preserving total length (e.g. typo fix "teh"→"the")
|
||||||
|
# was silently dropped. Content-hash check must catch it.
|
||||||
|
diary_dir = tmp_path / "diaries"
|
||||||
|
diary_dir.mkdir()
|
||||||
|
diary_file = diary_dir / "2026-04-13.md"
|
||||||
|
original = "# 2026-04-13\n\n## 10:00 — Test\n\nThe quick brown fox jumps over.\n"
|
||||||
|
edited = "# 2026-04-13\n\n## 10:00 — Test\n\nTeh quick brown fox jumps over.\n"
|
||||||
|
assert len(original) == len(edited), "test setup: edited content must be same length"
|
||||||
|
diary_file.write_text(original)
|
||||||
|
palace_dir = tmp_path / "palace"
|
||||||
|
|
||||||
|
from mempalace.diary_ingest import ingest_diaries
|
||||||
|
|
||||||
|
ingest_diaries(str(diary_dir), str(palace_dir), force=True)
|
||||||
|
diary_file.write_text(edited)
|
||||||
|
result = ingest_diaries(str(diary_dir), str(palace_dir))
|
||||||
|
assert result["days_updated"] == 1, "same-size content edit must trigger re-ingest"
|
||||||
|
|
||||||
def test_state_file_lives_outside_diary_dir(self, tmp_path):
|
def test_state_file_lives_outside_diary_dir(self, tmp_path):
|
||||||
# Regression: the original implementation wrote
|
# Regression: the original implementation wrote
|
||||||
# ``.diary_ingest_state.json`` *inside* the user's diary directory,
|
# ``.diary_ingest_state.json`` *inside* the user's diary directory,
|
||||||
|
|||||||
Reference in New Issue
Block a user