merge: develop (#784 file-locking, #820 version sync)

Non-trivial merge in convo_miner.py: this branch's _file_convo_chunks
(purge stale + upsert with normalize_version) and develop's
_file_chunks_locked (mine_lock + double-checked file_already_mined)
both touched the same critical section. Combined into a single
_file_chunks_locked helper that does lock → double-check → purge →
upsert, preserving both the multi-agent safety guarantee from #784
and the schema-rebuild contract from this PR.

Also folds develop's mine_lock import into both miner.py and
convo_miner.py alongside NORMALIZE_VERSION.

707/707 tests pass, ruff + format clean under CI-pinned 0.4.x.
This commit is contained in:
Igor Lins e Silva
2026-04-13 16:29:50 -03:00
4 changed files with 141 additions and 66 deletions
+32 -12
View File
@@ -16,7 +16,13 @@ from datetime import datetime
from collections import defaultdict from collections import defaultdict
from .normalize import normalize from .normalize import normalize
from .palace import NORMALIZE_VERSION, SKIP_DIRS, file_already_mined, get_collection from .palace import (
NORMALIZE_VERSION,
SKIP_DIRS,
file_already_mined,
get_collection,
mine_lock,
)
# File types that might contain conversations # File types that might contain conversations
@@ -273,22 +279,32 @@ def scan_convos(convo_dir: str) -> list:
# ============================================================================= # =============================================================================
def _file_convo_chunks(collection, source_file, chunks, wing, room, agent, extract_mode): def _file_chunks_locked(collection, source_file, chunks, wing, room, agent, extract_mode):
"""Purge stale drawers for ``source_file`` then upsert fresh chunks. """Lock the source file, purge stale drawers, and upsert fresh chunks.
Returns (drawers_added, room_counts_delta). Combines the per-file serialization that prevents concurrent agents from
duplicating work (via mine_lock) with the normalize-version rebuild
contract (purge-before-insert so pre-v2 drawers don't survive).
Returns (drawers_added, room_counts_delta, skipped).
""" """
room_counts_delta: dict = defaultdict(int)
drawers_added = 0
with mine_lock(source_file):
# Re-check after lock — another agent may have just finished this file
# at the current schema. A stale-version hit here returns False, so we
# still fall through to the purge+rebuild path below.
if file_already_mined(collection, source_file):
return 0, room_counts_delta, True
# Purge stale drawers first. When the normalize schema bumps, # Purge stale drawers first. When the normalize schema bumps,
# file_already_mined() returns False for pre-v2 drawers and we land # file_already_mined() returned False for pre-v2 drawers — clean
# here — clean them out so the source doesn't end up with a mix of # them out so the source doesn't end up with mixed old/new drawers.
# old-noise and new-clean drawers.
try: try:
collection.delete(where={"source_file": source_file}) collection.delete(where={"source_file": source_file})
except Exception: except Exception:
pass pass
room_counts_delta: dict = defaultdict(int)
drawers_added = 0
for chunk in chunks: for chunk in chunks:
chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
if extract_mode == "general": if extract_mode == "general":
@@ -316,7 +332,7 @@ def _file_convo_chunks(collection, source_file, chunks, wing, room, agent, extra
except Exception as e: except Exception as e:
if "already exists" not in str(e).lower(): if "already exists" not in str(e).lower():
raise raise
return drawers_added, room_counts_delta return drawers_added, room_counts_delta, False
def mine_convos( def mine_convos(
@@ -422,10 +438,14 @@ def mine_convos(
if extract_mode != "general": if extract_mode != "general":
room_counts[room] += 1 room_counts[room] += 1
# Purge stale drawers + file fresh chunks. # Lock + purge stale + file fresh chunks. Lock serializes concurrent
drawers_added, room_delta = _file_convo_chunks( # agents; purge removes pre-v2 drawers so the schema bump applies.
drawers_added, room_delta, skipped = _file_chunks_locked(
collection, source_file, chunks, wing, room, agent, extract_mode collection, source_file, chunks, wing, room, agent, extract_mode
) )
if skipped:
files_skipped += 1
continue
for r, n in room_delta.items(): for r, n in room_delta.items():
room_counts[r] += n room_counts[r] += n
+15 -1
View File
@@ -15,7 +15,13 @@ from pathlib import Path
from datetime import datetime from datetime import datetime
from collections import defaultdict from collections import defaultdict
from .palace import NORMALIZE_VERSION, SKIP_DIRS, file_already_mined, get_collection from .palace import (
NORMALIZE_VERSION,
SKIP_DIRS,
file_already_mined,
get_collection,
mine_lock,
)
READABLE_EXTENSIONS = { READABLE_EXTENSIONS = {
".txt", ".txt",
@@ -435,6 +441,14 @@ def process_file(
print(f" [DRY RUN] {filepath.name} → room:{room} ({len(chunks)} drawers)") print(f" [DRY RUN] {filepath.name} → room:{room} ({len(chunks)} drawers)")
return len(chunks), room return len(chunks), room
# Lock this file so concurrent agents don't interleave delete+insert.
# Without the lock, two agents can both pass file_already_mined(),
# both delete, and both insert — creating duplicates or losing data.
with mine_lock(source_file):
# Re-check after acquiring lock — another agent may have just finished
if file_already_mined(collection, source_file, check_mtime=True):
return 0, room
# Purge stale drawers for this file before re-inserting the fresh chunks. # Purge stale drawers for this file before re-inserting the fresh chunks.
# Converts modified-file re-mines from upsert-over-existing-IDs (which hits # Converts modified-file re-mines from upsert-over-existing-IDs (which hits
# hnswlib's thread-unsafe updatePoint path and can segfault on macOS ARM # hnswlib's thread-unsafe updatePoint path and can segfault on macOS ARM
+41
View File
@@ -4,6 +4,8 @@ palace.py — Shared palace operations.
Consolidates collection access patterns used by both miners and the MCP server. Consolidates collection access patterns used by both miners and the MCP server.
""" """
import contextlib
import hashlib
import os import os
from .backends.chroma import ChromaBackend from .backends.chroma import ChromaBackend
@@ -60,6 +62,45 @@ def get_collection(
) )
@contextlib.contextmanager
def mine_lock(source_file: str):
    """Hold an exclusive cross-process lock for ``source_file`` while mining.

    Prevents multiple agents from mining the same file simultaneously,
    which causes duplicate drawers when the delete+insert cycle interleaves.

    The lock is a file under ``~/.mempalace/locks`` named after the first
    16 hex characters of the SHA-256 of ``source_file``. The call blocks
    until the lock is acquired; the lock is released and the handle closed
    when the ``with`` body exits, whether normally or by exception.

    Args:
        source_file: Path (or any string key) identifying the file being
            mined. Agents passing the same string contend for the same lock.

    Raises:
        OSError: If the lock directory or lock file cannot be created, or
            if locking fails for a reason other than contention.
    """
    import errno

    lock_dir = os.path.join(os.path.expanduser("~"), ".mempalace", "locks")
    os.makedirs(lock_dir, exist_ok=True)

    # "surrogatepass" yields the same bytes as a plain UTF-8 encode for
    # ordinary paths, but does not raise on filenames that carry lone
    # surrogates (e.g. undecodable bytes surfaced by os.walk on POSIX).
    digest = hashlib.sha256(source_file.encode("utf-8", "surrogatepass")).hexdigest()
    lock_path = os.path.join(lock_dir, digest[:16] + ".lock")

    # The ``with`` guarantees the handle is closed on every exit path;
    # closing also drops any lock the OS still associates with it.
    with open(lock_path, "w") as lf:
        if os.name == "nt":
            import msvcrt

            # LK_LOCK is not truly blocking: it retries once per second and
            # raises OSError after about ten attempts. Keep waiting while the
            # failure is contention, so a slow peer does not crash this agent.
            while True:
                try:
                    msvcrt.locking(lf.fileno(), msvcrt.LK_LOCK, 1)
                    break
                except OSError as exc:
                    if exc.errno not in (errno.EACCES, errno.EDEADLK):
                        raise
            try:
                yield
            finally:
                try:
                    # Unlock must target the same byte range that was locked.
                    lf.seek(0)
                    msvcrt.locking(lf.fileno(), msvcrt.LK_UNLCK, 1)
                except (OSError, ValueError):
                    # Best effort: closing the handle releases the lock anyway.
                    pass
        else:
            import fcntl

            fcntl.flock(lf, fcntl.LOCK_EX)
            try:
                yield
            finally:
                try:
                    fcntl.flock(lf, fcntl.LOCK_UN)
                except (OSError, ValueError):
                    # Best effort: closing the handle releases the lock anyway.
                    pass
def file_already_mined(collection, source_file: str, check_mtime: bool = False) -> bool: def file_already_mined(collection, source_file: str, check_mtime: bool = False) -> bool:
"""Check if a file has already been filed in the palace. """Check if a file has already been filed in the palace.
+1 -1
View File
@@ -1,3 +1,3 @@
"""Single source of truth for the MemPalace package version.""" """Single source of truth for the MemPalace package version."""
__version__ = "3.1.0" __version__ = "3.2.0"