Non-trivial merge in convo_miner.py: this branch's _file_convo_chunks (purge stale + upsert with normalize_version) and develop's _file_chunks_locked (mine_lock + double-checked file_already_mined) both touched the same critical section. Combined into a single _file_chunks_locked helper that does lock → double-check → purge → upsert, preserving both the multi-agent safety guarantee from #784 and the schema-rebuild contract from this PR. Also folds develop's mine_lock import into both miner.py and convo_miner.py alongside NORMALIZE_VERSION. 707/707 tests pass, ruff + format clean under CI-pinned 0.4.x.
This commit is contained in:
+62
-42
@@ -16,7 +16,13 @@ from datetime import datetime
|
|||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
from .normalize import normalize
|
from .normalize import normalize
|
||||||
from .palace import NORMALIZE_VERSION, SKIP_DIRS, file_already_mined, get_collection
|
from .palace import (
|
||||||
|
NORMALIZE_VERSION,
|
||||||
|
SKIP_DIRS,
|
||||||
|
file_already_mined,
|
||||||
|
get_collection,
|
||||||
|
mine_lock,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# File types that might contain conversations
|
# File types that might contain conversations
|
||||||
@@ -273,50 +279,60 @@ def scan_convos(convo_dir: str) -> list:
|
|||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
def _file_convo_chunks(collection, source_file, chunks, wing, room, agent, extract_mode):
|
def _file_chunks_locked(collection, source_file, chunks, wing, room, agent, extract_mode):
|
||||||
"""Purge stale drawers for ``source_file`` then upsert fresh chunks.
|
"""Lock the source file, purge stale drawers, and upsert fresh chunks.
|
||||||
|
|
||||||
Returns (drawers_added, room_counts_delta).
|
Combines the per-file serialization that prevents concurrent agents from
|
||||||
|
duplicating work (via mine_lock) with the normalize-version rebuild
|
||||||
|
contract (purge-before-insert so pre-v2 drawers don't survive).
|
||||||
|
|
||||||
|
Returns (drawers_added, room_counts_delta, skipped).
|
||||||
"""
|
"""
|
||||||
# Purge stale drawers first. When the normalize schema bumps,
|
|
||||||
# file_already_mined() returns False for pre-v2 drawers and we land
|
|
||||||
# here — clean them out so the source doesn't end up with a mix of
|
|
||||||
# old-noise and new-clean drawers.
|
|
||||||
try:
|
|
||||||
collection.delete(where={"source_file": source_file})
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
room_counts_delta: dict = defaultdict(int)
|
room_counts_delta: dict = defaultdict(int)
|
||||||
drawers_added = 0
|
drawers_added = 0
|
||||||
for chunk in chunks:
|
with mine_lock(source_file):
|
||||||
chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
|
# Re-check after lock — another agent may have just finished this file
|
||||||
if extract_mode == "general":
|
# at the current schema. A stale-version hit here returns False, so we
|
||||||
room_counts_delta[chunk_room] += 1
|
# still fall through to the purge+rebuild path below.
|
||||||
drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
|
if file_already_mined(collection, source_file):
|
||||||
|
return 0, room_counts_delta, True
|
||||||
|
|
||||||
|
# Purge stale drawers first. When the normalize schema bumps,
|
||||||
|
# file_already_mined() returned False for pre-v2 drawers — clean
|
||||||
|
# them out so the source doesn't end up with mixed old/new drawers.
|
||||||
try:
|
try:
|
||||||
collection.upsert(
|
collection.delete(where={"source_file": source_file})
|
||||||
documents=[chunk["content"]],
|
except Exception:
|
||||||
ids=[drawer_id],
|
pass
|
||||||
metadatas=[
|
|
||||||
{
|
for chunk in chunks:
|
||||||
"wing": wing,
|
chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
|
||||||
"room": chunk_room,
|
if extract_mode == "general":
|
||||||
"source_file": source_file,
|
room_counts_delta[chunk_room] += 1
|
||||||
"chunk_index": chunk["chunk_index"],
|
drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
|
||||||
"added_by": agent,
|
try:
|
||||||
"filed_at": datetime.now().isoformat(),
|
collection.upsert(
|
||||||
"ingest_mode": "convos",
|
documents=[chunk["content"]],
|
||||||
"extract_mode": extract_mode,
|
ids=[drawer_id],
|
||||||
"normalize_version": NORMALIZE_VERSION,
|
metadatas=[
|
||||||
}
|
{
|
||||||
],
|
"wing": wing,
|
||||||
)
|
"room": chunk_room,
|
||||||
drawers_added += 1
|
"source_file": source_file,
|
||||||
except Exception as e:
|
"chunk_index": chunk["chunk_index"],
|
||||||
if "already exists" not in str(e).lower():
|
"added_by": agent,
|
||||||
raise
|
"filed_at": datetime.now().isoformat(),
|
||||||
return drawers_added, room_counts_delta
|
"ingest_mode": "convos",
|
||||||
|
"extract_mode": extract_mode,
|
||||||
|
"normalize_version": NORMALIZE_VERSION,
|
||||||
|
}
|
||||||
|
],
|
||||||
|
)
|
||||||
|
drawers_added += 1
|
||||||
|
except Exception as e:
|
||||||
|
if "already exists" not in str(e).lower():
|
||||||
|
raise
|
||||||
|
return drawers_added, room_counts_delta, False
|
||||||
|
|
||||||
|
|
||||||
def mine_convos(
|
def mine_convos(
|
||||||
@@ -422,10 +438,14 @@ def mine_convos(
|
|||||||
if extract_mode != "general":
|
if extract_mode != "general":
|
||||||
room_counts[room] += 1
|
room_counts[room] += 1
|
||||||
|
|
||||||
# Purge stale drawers + file fresh chunks.
|
# Lock + purge stale + file fresh chunks. Lock serializes concurrent
|
||||||
drawers_added, room_delta = _file_convo_chunks(
|
# agents; purge removes pre-v2 drawers so the schema bump applies.
|
||||||
|
drawers_added, room_delta, skipped = _file_chunks_locked(
|
||||||
collection, source_file, chunks, wing, room, agent, extract_mode
|
collection, source_file, chunks, wing, room, agent, extract_mode
|
||||||
)
|
)
|
||||||
|
if skipped:
|
||||||
|
files_skipped += 1
|
||||||
|
continue
|
||||||
for r, n in room_delta.items():
|
for r, n in room_delta.items():
|
||||||
room_counts[r] += n
|
room_counts[r] += n
|
||||||
|
|
||||||
|
|||||||
+37
-23
@@ -15,7 +15,13 @@ from pathlib import Path
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
from .palace import NORMALIZE_VERSION, SKIP_DIRS, file_already_mined, get_collection
|
from .palace import (
|
||||||
|
NORMALIZE_VERSION,
|
||||||
|
SKIP_DIRS,
|
||||||
|
file_already_mined,
|
||||||
|
get_collection,
|
||||||
|
mine_lock,
|
||||||
|
)
|
||||||
|
|
||||||
READABLE_EXTENSIONS = {
|
READABLE_EXTENSIONS = {
|
||||||
".txt",
|
".txt",
|
||||||
@@ -435,29 +441,37 @@ def process_file(
|
|||||||
print(f" [DRY RUN] {filepath.name} → room:{room} ({len(chunks)} drawers)")
|
print(f" [DRY RUN] {filepath.name} → room:{room} ({len(chunks)} drawers)")
|
||||||
return len(chunks), room
|
return len(chunks), room
|
||||||
|
|
||||||
# Purge stale drawers for this file before re-inserting the fresh chunks.
|
# Lock this file so concurrent agents don't interleave delete+insert.
|
||||||
# Converts modified-file re-mines from upsert-over-existing-IDs (which hits
|
# Without the lock, two agents can both pass file_already_mined(),
|
||||||
# hnswlib's thread-unsafe updatePoint path and can segfault on macOS ARM
|
# both delete, and both insert — creating duplicates or losing data.
|
||||||
# with chromadb 0.6.3) into a clean delete+insert, bypassing the update
|
with mine_lock(source_file):
|
||||||
# path entirely.
|
# Re-check after acquiring lock — another agent may have just finished
|
||||||
try:
|
if file_already_mined(collection, source_file, check_mtime=True):
|
||||||
collection.delete(where={"source_file": source_file})
|
return 0, room
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
drawers_added = 0
|
# Purge stale drawers for this file before re-inserting the fresh chunks.
|
||||||
for chunk in chunks:
|
# Converts modified-file re-mines from upsert-over-existing-IDs (which hits
|
||||||
added = add_drawer(
|
# hnswlib's thread-unsafe updatePoint path and can segfault on macOS ARM
|
||||||
collection=collection,
|
# with chromadb 0.6.3) into a clean delete+insert, bypassing the update
|
||||||
wing=wing,
|
# path entirely.
|
||||||
room=room,
|
try:
|
||||||
content=chunk["content"],
|
collection.delete(where={"source_file": source_file})
|
||||||
source_file=source_file,
|
except Exception:
|
||||||
chunk_index=chunk["chunk_index"],
|
pass
|
||||||
agent=agent,
|
|
||||||
)
|
drawers_added = 0
|
||||||
if added:
|
for chunk in chunks:
|
||||||
drawers_added += 1
|
added = add_drawer(
|
||||||
|
collection=collection,
|
||||||
|
wing=wing,
|
||||||
|
room=room,
|
||||||
|
content=chunk["content"],
|
||||||
|
source_file=source_file,
|
||||||
|
chunk_index=chunk["chunk_index"],
|
||||||
|
agent=agent,
|
||||||
|
)
|
||||||
|
if added:
|
||||||
|
drawers_added += 1
|
||||||
|
|
||||||
return drawers_added, room
|
return drawers_added, room
|
||||||
|
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ palace.py — Shared palace operations.
|
|||||||
Consolidates collection access patterns used by both miners and the MCP server.
|
Consolidates collection access patterns used by both miners and the MCP server.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import contextlib
|
||||||
|
import hashlib
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from .backends.chroma import ChromaBackend
|
from .backends.chroma import ChromaBackend
|
||||||
@@ -60,6 +62,45 @@ def get_collection(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@contextlib.contextmanager
|
||||||
|
def mine_lock(source_file: str):
|
||||||
|
"""Cross-platform file lock for mine operations.
|
||||||
|
|
||||||
|
Prevents multiple agents from mining the same file simultaneously,
|
||||||
|
which causes duplicate drawers when the delete+insert cycle interleaves.
|
||||||
|
"""
|
||||||
|
lock_dir = os.path.join(os.path.expanduser("~"), ".mempalace", "locks")
|
||||||
|
os.makedirs(lock_dir, exist_ok=True)
|
||||||
|
lock_path = os.path.join(
|
||||||
|
lock_dir, hashlib.sha256(source_file.encode()).hexdigest()[:16] + ".lock"
|
||||||
|
)
|
||||||
|
|
||||||
|
lf = open(lock_path, "w")
|
||||||
|
try:
|
||||||
|
if os.name == "nt":
|
||||||
|
import msvcrt
|
||||||
|
|
||||||
|
msvcrt.locking(lf.fileno(), msvcrt.LK_LOCK, 1)
|
||||||
|
else:
|
||||||
|
import fcntl
|
||||||
|
|
||||||
|
fcntl.flock(lf, fcntl.LOCK_EX)
|
||||||
|
yield
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
if os.name == "nt":
|
||||||
|
import msvcrt
|
||||||
|
|
||||||
|
msvcrt.locking(lf.fileno(), msvcrt.LK_UNLCK, 1)
|
||||||
|
else:
|
||||||
|
import fcntl
|
||||||
|
|
||||||
|
fcntl.flock(lf, fcntl.LOCK_UN)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
lf.close()
|
||||||
|
|
||||||
|
|
||||||
def file_already_mined(collection, source_file: str, check_mtime: bool = False) -> bool:
|
def file_already_mined(collection, source_file: str, check_mtime: bool = False) -> bool:
|
||||||
"""Check if a file has already been filed in the palace.
|
"""Check if a file has already been filed in the palace.
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,3 @@
|
|||||||
"""Single source of truth for the MemPalace package version."""
|
"""Single source of truth for the MemPalace package version."""
|
||||||
|
|
||||||
__version__ = "3.1.0"
|
__version__ = "3.2.0"
|
||||||
|
|||||||
Reference in New Issue
Block a user