fix(mine): identify lock holder + exit non-zero on contention

When a `mempalace mine` collided with another writer (live mcp_server,
another mine, anything taking mine_palace_lock), the operator saw a
generic "another `mempalace mine` is already running" message and the
CLI exited 0 — making the contention invisible to nohup or scripts
checking $?. The reporter ran a `nohup mempalace mine ... & disown`
and got a 200-byte log with only the auto-defaults warning, no clue
that an MCP server was holding the store.

palace.py: the lock file now records the holder's PID + first three
argv tokens on acquire. A failed acquire reads the file and surfaces
"palace <path> is held by PID N (mempalace mcp_server); wait for it
to finish or stop the holder before retrying" in the
MineAlreadyRunning message. Open mode changes from "w" to "a+" so the
prior holder's identity survives long enough to be read.

miner.mine() now lets MineAlreadyRunning propagate. cmd_mine catches
it, prints the holder-aware message to stderr, and exits non-zero so
shell wrappers detect the contention.

Note: this is a behavior change for in-process callers that depended
on miner.mine() silently swallowing MineAlreadyRunning. The silent
swallow was the bug.

Closes #1264
This commit is contained in:
Igor Lins e Silva
2026-05-08 01:00:00 -03:00
parent ea36a00f5f
commit ef8d83cc8a
5 changed files with 213 additions and 46 deletions
+10
View File
@@ -500,6 +500,9 @@ def cmd_mine(args):
llm_provider=None,
)
from .palace import MineAlreadyRunning
try:
if args.mode == "convos":
from .convo_miner import mine_convos
@@ -525,6 +528,13 @@ def cmd_mine(args):
respect_gitignore=not args.no_gitignore,
include_ignored=include_ignored,
)
except MineAlreadyRunning as exc:
# A live MCP server or another mine is already writing to this
# palace. Surface the holder identity so the operator knows what
# to wait for (or stop), and exit non-zero so wrappers like
# nohup / scripts can detect the contention.
print(f"mempalace: {exc}", file=sys.stderr)
sys.exit(1)
def cmd_sweep(args):
+3 -9
View File
@@ -21,7 +21,6 @@ from typing import Optional
from .palace import (
NORMALIZE_VERSION,
SKIP_DIRS,
MineAlreadyRunning,
build_closet_lines,
file_already_mined,
get_closets_collection,
@@ -1035,7 +1034,9 @@ def mine(
files=files,
)
try:
# MineAlreadyRunning propagates so the CLI can render a clear holder-aware
# message and exit non-zero. In-process callers (tests, library users) that
# expect to coexist with another writer should handle the exception.
with mine_palace_lock(palace_path):
return _mine_impl(
project_dir,
@@ -1048,13 +1049,6 @@ def mine(
include_ignored=include_ignored,
files=files,
)
except MineAlreadyRunning:
print(
f"mempalace: another `mine` is already running against "
f"{palace_path} — exiting cleanly.",
file=sys.stderr,
)
return
def _mine_impl(
+48 -3
View File
@@ -9,6 +9,7 @@ import hashlib
import logging
import os
import re
import sys
import threading
from typing import Optional
@@ -364,6 +365,41 @@ def _mark_released(lock_key: str) -> None:
_holder_state().discard(lock_key)
def _format_lock_holder(content: str) -> str:
"""Render a lock-file body as 'PID N (cmdline)' for diagnostic messages."""
parts = content.split(maxsplit=1)
if not parts or not parts[0].isdigit():
return "another writer (identity not recorded)"
pid = parts[0]
if len(parts) > 1 and parts[1].strip():
return f"PID {pid} ({parts[1].strip()})"
return f"PID {pid}"
def _read_lock_holder(lock_file) -> str:
"""Read the prior holder's identity from the lock-file body, best-effort."""
try:
lock_file.seek(0)
content = lock_file.read().strip()
except OSError:
return "another writer (identity not recorded)"
if not content:
return "another writer (identity not recorded)"
return _format_lock_holder(content)
def _write_lock_holder(lock_file) -> None:
"""Record this process's identity in the lock-file body. Best-effort."""
try:
ident = f"{os.getpid()} {' '.join(sys.argv[:3])}".strip()
lock_file.seek(0)
lock_file.truncate()
lock_file.write(ident)
lock_file.flush()
except OSError:
pass
@contextlib.contextmanager
def mine_palace_lock(palace_path: str):
"""Per-palace non-blocking lock around the full `mine` pipeline.
@@ -407,7 +443,10 @@ def mine_palace_lock(palace_path: str):
yield
return
lf = open(lock_path, "w")
# "a+" preserves the prior holder's identity recorded inside the file so
# a failed acquire can name who is holding the lock (#1264). "w" mode
# would have truncated the file before we could read it.
lf = open(lock_path, "a+")
acquired = False
try:
if os.name == "nt":
@@ -417,8 +456,10 @@ def mine_palace_lock(palace_path: str):
msvcrt.locking(lf.fileno(), msvcrt.LK_NBLCK, 1)
acquired = True
except OSError as exc:
holder = _read_lock_holder(lf)
raise MineAlreadyRunning(
f"another `mempalace mine` is already running against {resolved}"
f"palace {resolved} is held by {holder}; "
"wait for it to finish or stop the holder before retrying"
) from exc
else:
import fcntl
@@ -427,9 +468,13 @@ def mine_palace_lock(palace_path: str):
fcntl.flock(lf, fcntl.LOCK_EX | fcntl.LOCK_NB)
acquired = True
except BlockingIOError as exc:
holder = _read_lock_holder(lf)
raise MineAlreadyRunning(
f"another `mempalace mine` is already running against {resolved}"
f"palace {resolved} is held by {holder}; "
"wait for it to finish or stop the holder before retrying"
) from exc
# Record our own identity for any later contender's diagnostic message.
_write_lock_holder(lf)
_mark_held(palace_key)
try:
yield
+39
View File
@@ -555,6 +555,45 @@ def test_cmd_mine_include_ignored_comma_split(mock_config_cls):
assert call_kwargs["include_ignored"] == ["a.txt", "b.txt", "c.txt"]
@patch("mempalace.cli.MempalaceConfig")
def test_cmd_mine_exits_nonzero_on_lock_holder(mock_config_cls, capsys):
"""Regression #1264: lock contention must exit non-zero with a clear message.
Before this fix the CLI silently returned 0 when another writer held
the palace lock — operators using nohup/scripts had no way to detect
the contention. The new behavior raises MineAlreadyRunning out of
miner.mine() and cmd_mine catches it, printing the holder identity
to stderr and exiting non-zero.
"""
from mempalace.palace import MineAlreadyRunning
mock_config_cls.return_value.palace_path = "/fake/palace"
args = argparse.Namespace(
dir="/src",
palace=None,
mode="projects",
wing=None,
agent="mempalace",
limit=0,
dry_run=False,
no_gitignore=False,
include_ignored=[],
extract="exchange",
)
with patch(
"mempalace.miner.mine",
side_effect=MineAlreadyRunning(
"palace /fake/palace is held by PID 12345 (mempalace mcp_server); wait for it to finish"
),
):
with pytest.raises(SystemExit) as excinfo:
cmd_mine(args)
assert excinfo.value.code == 1
captured = capsys.readouterr()
assert "PID 12345" in captured.err
assert "mcp_server" in captured.err
# ── cmd_wakeup ─────────────────────────────────────────────────────────
+79
View File
@@ -208,6 +208,85 @@ def _try_acquire_expect_busy(palace_path, result_q):
result_q.put("busy")
def _hold_lock_send_pid(palace_path: str, ready_flag: str, release_flag: str, pid_q) -> None:
"""Acquire the lock, push our PID + cmdline through the queue, then wait."""
import sys as _sys
try:
with mine_palace_lock(palace_path):
pid_q.put((os.getpid(), list(_sys.argv[:3])))
open(ready_flag, "w").close()
for _ in range(500):
if os.path.exists(release_flag):
return
time.sleep(0.01)
except MineAlreadyRunning:
pid_q.put(("error", "raised"))
def test_lock_failure_message_names_holder(tmp_path, monkeypatch):
"""Regression #1264: failed acquire must identify the holder by PID.
Before this fix, a `mempalace mine` colliding with another writer
(mine, MCP server, anything taking mine_palace_lock) saw a generic
"another `mempalace mine` is already running" message and exited
silently. The operator had no signal of which process to wait for
or stop. The new message includes ``PID N`` so the holder can be
identified directly.
"""
monkeypatch.setenv("HOME", str(tmp_path))
palace = str(tmp_path / "palace")
ready = str(tmp_path / "ready")
release = str(tmp_path / "release")
ctx = _get_mp_context()
pid_q = ctx.Queue()
holder = ctx.Process(target=_hold_lock_send_pid, args=(palace, ready, release, pid_q))
holder.start()
try:
for _ in range(500):
if os.path.exists(ready):
break
time.sleep(0.01)
assert os.path.exists(ready), "holder failed to acquire lock in time"
holder_pid, _holder_argv = pid_q.get(timeout=2)
with pytest.raises(MineAlreadyRunning) as excinfo:
with mine_palace_lock(palace):
pytest.fail("second acquire of same palace should have raised")
msg = str(excinfo.value)
assert (
f"PID {holder_pid}" in msg
), f"lock-failure message must name the holder PID; got: {msg!r}"
finally:
open(release, "w").close()
holder.join(timeout=5)
def test_lock_holder_identity_persists_across_release(tmp_path, monkeypatch):
"""The holder line is overwritten by each new acquirer, not appended.
Without explicit truncate the lock file would accumulate lines across
runs and grow without bound. Verify that re-acquire keeps the body
bounded.
"""
monkeypatch.setenv("HOME", str(tmp_path))
palace = str(tmp_path / "palace")
for _ in range(5):
with mine_palace_lock(palace):
pass
# Locate the lock file. The key derivation is internal but we can find
# it by scanning the mempalace locks dir for mine_palace_*.lock entries.
lock_dir = tmp_path / ".mempalace" / "locks"
lock_files = list(lock_dir.glob("mine_palace_*.lock"))
assert lock_files, "expected the palace lock file to exist after acquire/release"
body = lock_files[0].read_text()
# One identity line, no accumulation.
assert body.count("\n") <= 1, f"lock body must not grow across re-acquires; got {body!r}"
def test_mine_global_lock_is_alias_for_back_compat(tmp_path, monkeypatch):
"""Old callers of `mine_global_lock` should still work."""
monkeypatch.setenv("HOME", str(tmp_path))