fix(diary): detect same-size edits via content hash

The skip-if-unchanged check compared text length only (len(text), a
character count), so any in-place edit that preserved the total length
(e.g. the typo fix "teh"→"the", or a word swap) was silently dropped —
a verbatim-storage violation: the user's actual words never reached
the palace.

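Switch the gate to a SHA-256 hash of the UTF-8-encoded text. State
entries gain a "content_hash" field; the legacy size-only path is kept
for entries that have no stored hash (and a non-zero stored size), so a
post-upgrade run does not re-ingest every untouched diary. Such legacy
entries only gain a "content_hash" the next time they are actually
re-ingested (size change or force), so a same-size edit to one of them
is still skipped until then.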

Closes #925
This commit is contained in:
Igor Lins e Silva
2026-05-07 12:42:02 -03:00
parent 03ed4c45cf
commit 0d1c1fbcaa
2 changed files with 35 additions and 4 deletions
+15 -4
View File
@@ -120,12 +120,22 @@ def ingest_diaries(
continue
date_str = date_match.group(1)
# Skip if content hasn't changed
# Skip if content hasn't changed. Hash-based — size alone false-negatives
# on same-length edits (e.g. "teh" → "the"), silently dropping real edits.
state_key = f"{wing}|{diary_path.name}"
prev_size = state.get(state_key, {}).get("size", 0)
prev_entry = state.get(state_key, {})
prev_hash = prev_entry.get("content_hash")
prev_size = prev_entry.get("size", 0)
curr_size = len(text)
if curr_size == prev_size and not force:
continue
curr_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()
if not force:
if prev_hash is not None:
if curr_hash == prev_hash:
continue
elif curr_size == prev_size and prev_size > 0:
# Legacy state without content_hash: keep size-based skip so a
# post-upgrade run doesn't re-ingest every untouched diary.
continue
now_iso = datetime.now(timezone.utc).isoformat()
drawer_id = _diary_drawer_id(wing, date_str)
@@ -184,6 +194,7 @@ def ingest_diaries(
state[state_key] = {
"size": curr_size,
"content_hash": curr_hash,
"entry_count": len(entries),
"ingested_at": now_iso,
}