merge: develop (#784 file-locking, #820 version sync)

Non-trivial merge in convo_miner.py: this branch's _file_convo_chunks
(purge stale + upsert with normalize_version) and develop's
_file_chunks_locked (mine_lock + double-checked file_already_mined)
both touched the same critical section. Combined into a single
_file_chunks_locked helper that does lock → double-check → purge →
upsert, preserving both the multi-agent safety guarantee from #784
and the schema-rebuild contract from this PR.

Also folds develop's mine_lock import into both miner.py and
convo_miner.py alongside NORMALIZE_VERSION.

707/707 tests pass, ruff + format clean under CI-pinned 0.4.x.
This commit is contained in:
Igor Lins e Silva
2026-04-13 16:29:50 -03:00
4 changed files with 141 additions and 66 deletions
+62 -42
View File
@@ -16,7 +16,13 @@ from datetime import datetime
from collections import defaultdict from collections import defaultdict
from .normalize import normalize from .normalize import normalize
from .palace import NORMALIZE_VERSION, SKIP_DIRS, file_already_mined, get_collection from .palace import (
NORMALIZE_VERSION,
SKIP_DIRS,
file_already_mined,
get_collection,
mine_lock,
)
# File types that might contain conversations # File types that might contain conversations
@@ -273,50 +279,60 @@ def scan_convos(convo_dir: str) -> list:
# ============================================================================= # =============================================================================
def _file_convo_chunks(collection, source_file, chunks, wing, room, agent, extract_mode): def _file_chunks_locked(collection, source_file, chunks, wing, room, agent, extract_mode):
"""Purge stale drawers for ``source_file`` then upsert fresh chunks. """Lock the source file, purge stale drawers, and upsert fresh chunks.
Returns (drawers_added, room_counts_delta). Combines the per-file serialization that prevents concurrent agents from
duplicating work (via mine_lock) with the normalize-version rebuild
contract (purge-before-insert so pre-v2 drawers don't survive).
Returns (drawers_added, room_counts_delta, skipped).
""" """
# Purge stale drawers first. When the normalize schema bumps,
# file_already_mined() returns False for pre-v2 drawers and we land
# here — clean them out so the source doesn't end up with a mix of
# old-noise and new-clean drawers.
try:
collection.delete(where={"source_file": source_file})
except Exception:
pass
room_counts_delta: dict = defaultdict(int) room_counts_delta: dict = defaultdict(int)
drawers_added = 0 drawers_added = 0
for chunk in chunks: with mine_lock(source_file):
chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room # Re-check after lock — another agent may have just finished this file
if extract_mode == "general": # at the current schema. A stale-version hit here returns False, so we
room_counts_delta[chunk_room] += 1 # still fall through to the purge+rebuild path below.
drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}" if file_already_mined(collection, source_file):
return 0, room_counts_delta, True
# Purge stale drawers first. When the normalize schema bumps,
# file_already_mined() returned False for pre-v2 drawers — clean
# them out so the source doesn't end up with mixed old/new drawers.
try: try:
collection.upsert( collection.delete(where={"source_file": source_file})
documents=[chunk["content"]], except Exception:
ids=[drawer_id], pass
metadatas=[
{ for chunk in chunks:
"wing": wing, chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
"room": chunk_room, if extract_mode == "general":
"source_file": source_file, room_counts_delta[chunk_room] += 1
"chunk_index": chunk["chunk_index"], drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
"added_by": agent, try:
"filed_at": datetime.now().isoformat(), collection.upsert(
"ingest_mode": "convos", documents=[chunk["content"]],
"extract_mode": extract_mode, ids=[drawer_id],
"normalize_version": NORMALIZE_VERSION, metadatas=[
} {
], "wing": wing,
) "room": chunk_room,
drawers_added += 1 "source_file": source_file,
except Exception as e: "chunk_index": chunk["chunk_index"],
if "already exists" not in str(e).lower(): "added_by": agent,
raise "filed_at": datetime.now().isoformat(),
return drawers_added, room_counts_delta "ingest_mode": "convos",
"extract_mode": extract_mode,
"normalize_version": NORMALIZE_VERSION,
}
],
)
drawers_added += 1
except Exception as e:
if "already exists" not in str(e).lower():
raise
return drawers_added, room_counts_delta, False
def mine_convos( def mine_convos(
@@ -422,10 +438,14 @@ def mine_convos(
if extract_mode != "general": if extract_mode != "general":
room_counts[room] += 1 room_counts[room] += 1
# Purge stale drawers + file fresh chunks. # Lock + purge stale + file fresh chunks. Lock serializes concurrent
drawers_added, room_delta = _file_convo_chunks( # agents; purge removes pre-v2 drawers so the schema bump applies.
drawers_added, room_delta, skipped = _file_chunks_locked(
collection, source_file, chunks, wing, room, agent, extract_mode collection, source_file, chunks, wing, room, agent, extract_mode
) )
if skipped:
files_skipped += 1
continue
for r, n in room_delta.items(): for r, n in room_delta.items():
room_counts[r] += n room_counts[r] += n
+37 -23
View File
@@ -15,7 +15,13 @@ from pathlib import Path
from datetime import datetime from datetime import datetime
from collections import defaultdict from collections import defaultdict
from .palace import NORMALIZE_VERSION, SKIP_DIRS, file_already_mined, get_collection from .palace import (
NORMALIZE_VERSION,
SKIP_DIRS,
file_already_mined,
get_collection,
mine_lock,
)
READABLE_EXTENSIONS = { READABLE_EXTENSIONS = {
".txt", ".txt",
@@ -435,29 +441,37 @@ def process_file(
print(f" [DRY RUN] {filepath.name} → room:{room} ({len(chunks)} drawers)") print(f" [DRY RUN] {filepath.name} → room:{room} ({len(chunks)} drawers)")
return len(chunks), room return len(chunks), room
# Purge stale drawers for this file before re-inserting the fresh chunks. # Lock this file so concurrent agents don't interleave delete+insert.
# Converts modified-file re-mines from upsert-over-existing-IDs (which hits # Without the lock, two agents can both pass file_already_mined(),
# hnswlib's thread-unsafe updatePoint path and can segfault on macOS ARM # both delete, and both insert — creating duplicates or losing data.
# with chromadb 0.6.3) into a clean delete+insert, bypassing the update with mine_lock(source_file):
# path entirely. # Re-check after acquiring lock — another agent may have just finished
try: if file_already_mined(collection, source_file, check_mtime=True):
collection.delete(where={"source_file": source_file}) return 0, room
except Exception:
pass
drawers_added = 0 # Purge stale drawers for this file before re-inserting the fresh chunks.
for chunk in chunks: # Converts modified-file re-mines from upsert-over-existing-IDs (which hits
added = add_drawer( # hnswlib's thread-unsafe updatePoint path and can segfault on macOS ARM
collection=collection, # with chromadb 0.6.3) into a clean delete+insert, bypassing the update
wing=wing, # path entirely.
room=room, try:
content=chunk["content"], collection.delete(where={"source_file": source_file})
source_file=source_file, except Exception:
chunk_index=chunk["chunk_index"], pass
agent=agent,
) drawers_added = 0
if added: for chunk in chunks:
drawers_added += 1 added = add_drawer(
collection=collection,
wing=wing,
room=room,
content=chunk["content"],
source_file=source_file,
chunk_index=chunk["chunk_index"],
agent=agent,
)
if added:
drawers_added += 1
return drawers_added, room return drawers_added, room
+41
View File
@@ -4,6 +4,8 @@ palace.py — Shared palace operations.
Consolidates collection access patterns used by both miners and the MCP server. Consolidates collection access patterns used by both miners and the MCP server.
""" """
import contextlib
import hashlib
import os import os
from .backends.chroma import ChromaBackend from .backends.chroma import ChromaBackend
@@ -60,6 +62,45 @@ def get_collection(
) )
@contextlib.contextmanager
def mine_lock(source_file: str):
"""Cross-platform file lock for mine operations.
Prevents multiple agents from mining the same file simultaneously,
which causes duplicate drawers when the delete+insert cycle interleaves.
"""
lock_dir = os.path.join(os.path.expanduser("~"), ".mempalace", "locks")
os.makedirs(lock_dir, exist_ok=True)
lock_path = os.path.join(
lock_dir, hashlib.sha256(source_file.encode()).hexdigest()[:16] + ".lock"
)
lf = open(lock_path, "w")
try:
if os.name == "nt":
import msvcrt
msvcrt.locking(lf.fileno(), msvcrt.LK_LOCK, 1)
else:
import fcntl
fcntl.flock(lf, fcntl.LOCK_EX)
yield
finally:
try:
if os.name == "nt":
import msvcrt
msvcrt.locking(lf.fileno(), msvcrt.LK_UNLCK, 1)
else:
import fcntl
fcntl.flock(lf, fcntl.LOCK_UN)
except Exception:
pass
lf.close()
def file_already_mined(collection, source_file: str, check_mtime: bool = False) -> bool: def file_already_mined(collection, source_file: str, check_mtime: bool = False) -> bool:
"""Check if a file has already been filed in the palace. """Check if a file has already been filed in the palace.
+1 -1
View File
@@ -1,3 +1,3 @@
"""Single source of truth for the MemPalace package version.""" """Single source of truth for the MemPalace package version."""
__version__ = "3.1.0" __version__ = "3.2.0"