fix(hooks): consolidate transcript ingest, harden shell parsers (#1231 review)
Address Copilot review on #1231: 1. Stop double-mining the transcript on the Python side. ``_get_mine_targets`` now returns only the ``MEMPAL_DIR`` projects target — the convos target for the transcript dir is dropped because ``_ingest_transcript`` already handles it on every hook fire. The duplicate spawn was using ``sys.executable`` (vs ``_mempalace_python()``) and a different ``--wing``, so each Stop/PreCompact event was writing the same transcript into two wings under asymmetric interpreters and overwriting the single ``_MINE_PID_FILE`` lock. 2. ``_maybe_auto_ingest`` and ``_mine_sync`` now spawn via ``_mempalace_python()`` so the resolved interpreter matches the venv that owns mempalace (matters under GUI-launched harnesses where ``sys.executable`` may resolve to a system Python without chromadb). 3. Replace ``eval $(...)`` in both shell hooks with a ``mapfile``-based reader. Sanitized values are still emitted by the same Python parser, but the shell now does plain variable assignment instead of executing the parser's stdout — smaller blast radius if the sanitizer is ever bypassed. 4. Mirror ``_validate_transcript_path`` in the shell hooks via a ``is_valid_transcript_path`` helper — extension + traversal-segment rejection, parity with the Python validator. The convos mine in each shell hook is now gated on the validator instead of bare ``-f``. 5. Tighten the ``..`` traversal test that previously exercised the suffix gate by mistake (``../../etc/passwd`` lacks ``.json[l]``). Use ``.jsonl`` paths with traversal segments to actually hit the ``..`` rejection branch. 6. README: add a one-liner pointing at ``mempalace sweep`` for users who want per-message recall on top of the file-level chunks the hooks produce. The sweeper was undiscoverable previously. Tests: 1418 passed, 1 skipped (full suite minus benchmarks).
This commit is contained in:
@@ -149,6 +149,10 @@ system prompt:
|
|||||||
Two Claude Code hooks save periodically and before context compression:
|
Two Claude Code hooks save periodically and before context compression:
|
||||||
[mempalaceofficial.com/guide/hooks](https://mempalaceofficial.com/guide/hooks.html).
|
[mempalaceofficial.com/guide/hooks](https://mempalaceofficial.com/guide/hooks.html).
|
||||||
|
|
||||||
|
For per-message recall on top of the file-level chunks the hooks produce,
|
||||||
|
run `mempalace sweep <transcript-dir>` periodically — it stores one
|
||||||
|
verbatim drawer per user/assistant message, idempotent and resume-safe.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
|||||||
@@ -65,30 +65,53 @@ fi
|
|||||||
# Read JSON input from stdin
|
# Read JSON input from stdin
|
||||||
INPUT=$(cat)
|
INPUT=$(cat)
|
||||||
|
|
||||||
# Parse session_id and transcript_path in one call. Sanitize both before
|
# Parse session_id and transcript_path in one call. Sanitize both, then
|
||||||
# interpolating into shell — same contract as mempal_save_hook.sh.
|
# read sanitized values from one-per-line stdout into shell variables —
|
||||||
eval $(echo "$INPUT" | "$MEMPAL_PYTHON_BIN" -c "
|
# avoids ``eval`` on generated code (#1231 review). Same contract as
|
||||||
|
# mempal_save_hook.sh.
|
||||||
|
mapfile -t _mempal_parsed < <(echo "$INPUT" | "$MEMPAL_PYTHON_BIN" -c "
|
||||||
import sys, json, re
|
import sys, json, re
|
||||||
data = json.load(sys.stdin)
|
data = json.load(sys.stdin)
|
||||||
sid = data.get('session_id', 'unknown')
|
sid = data.get('session_id', 'unknown')
|
||||||
tp = data.get('transcript_path', '')
|
tp = data.get('transcript_path', '')
|
||||||
safe = lambda s: re.sub(r'[^a-zA-Z0-9_/.\-~]', '', str(s))
|
safe = lambda s: re.sub(r'[^a-zA-Z0-9_/.\-~]', '', str(s))
|
||||||
print(f'SESSION_ID=\"{safe(sid)}\"')
|
print(safe(sid))
|
||||||
print(f'TRANSCRIPT_PATH=\"{safe(tp)}\"')
|
print(safe(tp))
|
||||||
" 2>/dev/null)
|
" 2>/dev/null)
|
||||||
|
SESSION_ID="${_mempal_parsed[0]:-unknown}"
|
||||||
|
TRANSCRIPT_PATH="${_mempal_parsed[1]:-}"
|
||||||
|
|
||||||
# Expand ~ in path
|
# Expand ~ in path
|
||||||
TRANSCRIPT_PATH="${TRANSCRIPT_PATH/#\~/$HOME}"
|
TRANSCRIPT_PATH="${TRANSCRIPT_PATH/#\~/$HOME}"
|
||||||
|
|
||||||
|
# Validate that TRANSCRIPT_PATH looks like a transcript file. Mirrors
|
||||||
|
# mempalace.hooks_cli._validate_transcript_path so the shell hook
|
||||||
|
# rejects the same shapes the Python hook rejects (#1231 review).
|
||||||
|
is_valid_transcript_path() {
|
||||||
|
local path="$1"
|
||||||
|
[ -n "$path" ] || return 1
|
||||||
|
case "$path" in
|
||||||
|
*.json|*.jsonl) ;;
|
||||||
|
*) return 1 ;;
|
||||||
|
esac
|
||||||
|
case "/$path/" in
|
||||||
|
*/../*) return 1 ;;
|
||||||
|
esac
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
echo "[$(date '+%H:%M:%S')] PRE-COMPACT triggered for session $SESSION_ID" >> "$STATE_DIR/hook.log"
|
echo "[$(date '+%H:%M:%S')] PRE-COMPACT triggered for session $SESSION_ID" >> "$STATE_DIR/hook.log"
|
||||||
|
|
||||||
# Run ingest synchronously so memories land before compaction. Two
|
# Run ingest synchronously so memories land before compaction. Two
|
||||||
# independent targets — both run if both are set:
|
# independent targets — both run if both are set:
|
||||||
# 1. TRANSCRIPT_PATH (from Claude Code) → parent dir, --mode convos
|
# 1. TRANSCRIPT_PATH (from Claude Code) → parent dir, --mode convos
|
||||||
# 2. MEMPAL_DIR → --mode projects
|
# 2. MEMPAL_DIR → --mode projects
|
||||||
if [ -n "$TRANSCRIPT_PATH" ] && [ -f "$TRANSCRIPT_PATH" ]; then
|
if is_valid_transcript_path "$TRANSCRIPT_PATH" && [ -f "$TRANSCRIPT_PATH" ]; then
|
||||||
mempalace mine "$(dirname "$TRANSCRIPT_PATH")" --mode convos \
|
mempalace mine "$(dirname "$TRANSCRIPT_PATH")" --mode convos \
|
||||||
>> "$STATE_DIR/hook.log" 2>&1
|
>> "$STATE_DIR/hook.log" 2>&1
|
||||||
|
elif [ -n "$TRANSCRIPT_PATH" ]; then
|
||||||
|
echo "[$(date '+%H:%M:%S')] Skipping invalid transcript path: $TRANSCRIPT_PATH" \
|
||||||
|
>> "$STATE_DIR/hook.log"
|
||||||
fi
|
fi
|
||||||
if [ -n "$MEMPAL_DIR" ] && [ -d "$MEMPAL_DIR" ]; then
|
if [ -n "$MEMPAL_DIR" ] && [ -d "$MEMPAL_DIR" ]; then
|
||||||
mempalace mine "$MEMPAL_DIR" --mode projects \
|
mempalace mine "$MEMPAL_DIR" --mode projects \
|
||||||
|
|||||||
@@ -83,9 +83,11 @@ fi
|
|||||||
INPUT=$(cat)
|
INPUT=$(cat)
|
||||||
|
|
||||||
# Parse all fields in a single Python call (3x faster than separate invocations)
|
# Parse all fields in a single Python call (3x faster than separate invocations)
|
||||||
# SECURITY: All values are sanitized before being interpolated into shell assignments.
|
# without invoking ``eval`` on generated code: Python prints one sanitized
|
||||||
# stop_hook_active is coerced to a strict True/False to prevent command injection via eval.
|
# value per line, the shell reads them via ``mapfile`` and does plain
|
||||||
eval $(echo "$INPUT" | "$MEMPAL_PYTHON_BIN" -c "
|
# variable assignment — same data, smaller blast radius if the sanitizer
|
||||||
|
# is ever bypassed (#1231 review).
|
||||||
|
mapfile -t _mempal_parsed < <(echo "$INPUT" | "$MEMPAL_PYTHON_BIN" -c "
|
||||||
import sys, json, re
|
import sys, json, re
|
||||||
data = json.load(sys.stdin)
|
data = json.load(sys.stdin)
|
||||||
sid = data.get('session_id', 'unknown')
|
sid = data.get('session_id', 'unknown')
|
||||||
@@ -95,14 +97,36 @@ tp = data.get('transcript_path', '')
|
|||||||
safe = lambda s: re.sub(r'[^a-zA-Z0-9_/.\-~]', '', str(s))
|
safe = lambda s: re.sub(r'[^a-zA-Z0-9_/.\-~]', '', str(s))
|
||||||
# Coerce stop_hook_active to strict boolean string
|
# Coerce stop_hook_active to strict boolean string
|
||||||
sha = 'True' if sha_raw is True or str(sha_raw).lower() in ('true', '1', 'yes') else 'False'
|
sha = 'True' if sha_raw is True or str(sha_raw).lower() in ('true', '1', 'yes') else 'False'
|
||||||
print(f'SESSION_ID=\"{safe(sid)}\"')
|
print(safe(sid))
|
||||||
print(f'STOP_HOOK_ACTIVE=\"{sha}\"')
|
print(sha)
|
||||||
print(f'TRANSCRIPT_PATH=\"{safe(tp)}\"')
|
print(safe(tp))
|
||||||
" 2>/dev/null)
|
" 2>/dev/null)
|
||||||
|
SESSION_ID="${_mempal_parsed[0]:-unknown}"
|
||||||
|
STOP_HOOK_ACTIVE="${_mempal_parsed[1]:-False}"
|
||||||
|
TRANSCRIPT_PATH="${_mempal_parsed[2]:-}"
|
||||||
|
|
||||||
# Expand ~ in path
|
# Expand ~ in path
|
||||||
TRANSCRIPT_PATH="${TRANSCRIPT_PATH/#\~/$HOME}"
|
TRANSCRIPT_PATH="${TRANSCRIPT_PATH/#\~/$HOME}"
|
||||||
|
|
||||||
|
# Validate that TRANSCRIPT_PATH looks like a transcript file:
|
||||||
|
# - non-empty
|
||||||
|
# - .jsonl or .json suffix
|
||||||
|
# - no traversal segments (.. components)
|
||||||
|
# Mirrors mempalace.hooks_cli._validate_transcript_path so the shell hook
|
||||||
|
# rejects the same shapes the Python hook rejects (#1231 review).
|
||||||
|
is_valid_transcript_path() {
|
||||||
|
local path="$1"
|
||||||
|
[ -n "$path" ] || return 1
|
||||||
|
case "$path" in
|
||||||
|
*.json|*.jsonl) ;;
|
||||||
|
*) return 1 ;;
|
||||||
|
esac
|
||||||
|
case "/$path/" in
|
||||||
|
*/../*) return 1 ;;
|
||||||
|
esac
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
# If we're already in a save cycle, let the AI stop normally
|
# If we're already in a save cycle, let the AI stop normally
|
||||||
# This is the infinite-loop prevention: block once → AI saves → tries to stop again → we let it through
|
# This is the infinite-loop prevention: block once → AI saves → tries to stop again → we let it through
|
||||||
if [ "$STOP_HOOK_ACTIVE" = "True" ] || [ "$STOP_HOOK_ACTIVE" = "true" ]; then
|
if [ "$STOP_HOOK_ACTIVE" = "True" ] || [ "$STOP_HOOK_ACTIVE" = "true" ]; then
|
||||||
@@ -165,9 +189,12 @@ if [ "$SINCE_LAST" -ge "$SAVE_INTERVAL" ] && [ "$EXCHANGE_COUNT" -gt 0 ]; then
|
|||||||
# (code, notes, docs)
|
# (code, notes, docs)
|
||||||
# MEMPAL_DIR is *additive*, not an override: a user with MEMPAL_DIR
|
# MEMPAL_DIR is *additive*, not an override: a user with MEMPAL_DIR
|
||||||
# pointed at their project still gets the active conversation mined.
|
# pointed at their project still gets the active conversation mined.
|
||||||
if [ -n "$TRANSCRIPT_PATH" ] && [ -f "$TRANSCRIPT_PATH" ]; then
|
if is_valid_transcript_path "$TRANSCRIPT_PATH" && [ -f "$TRANSCRIPT_PATH" ]; then
|
||||||
mempalace mine "$(dirname "$TRANSCRIPT_PATH")" --mode convos \
|
mempalace mine "$(dirname "$TRANSCRIPT_PATH")" --mode convos \
|
||||||
>> "$STATE_DIR/hook.log" 2>&1 &
|
>> "$STATE_DIR/hook.log" 2>&1 &
|
||||||
|
elif [ -n "$TRANSCRIPT_PATH" ]; then
|
||||||
|
echo "[$(date '+%H:%M:%S')] Skipping invalid transcript path: $TRANSCRIPT_PATH" \
|
||||||
|
>> "$STATE_DIR/hook.log"
|
||||||
fi
|
fi
|
||||||
if [ -n "$MEMPAL_DIR" ] && [ -d "$MEMPAL_DIR" ]; then
|
if [ -n "$MEMPAL_DIR" ] && [ -d "$MEMPAL_DIR" ]; then
|
||||||
mempalace mine "$MEMPAL_DIR" --mode projects \
|
mempalace mine "$MEMPAL_DIR" --mode projects \
|
||||||
|
|||||||
+30
-21
@@ -197,16 +197,15 @@ def _output(data: dict):
|
|||||||
sys.stdout.buffer.flush()
|
sys.stdout.buffer.flush()
|
||||||
|
|
||||||
|
|
||||||
def _get_mine_targets(transcript_path: str = "") -> list[tuple[str, str]]:
|
def _get_mine_targets() -> list[tuple[str, str]]:
|
||||||
"""Return the list of ``(dir, mode)`` targets for auto-ingest.
|
"""Return the list of ``(dir, mode)`` targets for auto-ingest.
|
||||||
|
|
||||||
MEMPAL_DIR (when set and resolvable) contributes a ``"projects"``
|
MEMPAL_DIR (when set and resolvable) contributes a ``"projects"``
|
||||||
target. A valid transcript JSONL contributes a ``"convos"`` target
|
target. Transcript ingestion is handled separately by
|
||||||
on its parent dir. Both may appear together — the hook runs one
|
``_ingest_transcript`` — emitting it here too would double-mine the
|
||||||
ingest per target, so a user with MEMPAL_DIR pointed at their
|
same JSONL into a different wing on every hook fire (#1231 review).
|
||||||
project dir still gets the active conversation mined verbatim.
|
|
||||||
|
|
||||||
An empty list means no ingest should run.
|
An empty list means no MEMPAL_DIR ingest should run.
|
||||||
"""
|
"""
|
||||||
targets: list[tuple[str, str]] = []
|
targets: list[tuple[str, str]] = []
|
||||||
mempal_dir = os.environ.get("MEMPAL_DIR", "")
|
mempal_dir = os.environ.get("MEMPAL_DIR", "")
|
||||||
@@ -214,9 +213,6 @@ def _get_mine_targets(transcript_path: str = "") -> list[tuple[str, str]]:
|
|||||||
resolved = Path(mempal_dir).expanduser().resolve()
|
resolved = Path(mempal_dir).expanduser().resolve()
|
||||||
if resolved.is_dir():
|
if resolved.is_dir():
|
||||||
targets.append((str(resolved), "projects"))
|
targets.append((str(resolved), "projects"))
|
||||||
path = _validate_transcript_path(transcript_path)
|
|
||||||
if path is not None and path.is_file():
|
|
||||||
targets.append((str(path.parent), "convos"))
|
|
||||||
return targets
|
return targets
|
||||||
|
|
||||||
|
|
||||||
@@ -274,9 +270,15 @@ def _spawn_mine(cmd: list) -> None:
|
|||||||
_MINE_PID_FILE.write_text(str(proc.pid))
|
_MINE_PID_FILE.write_text(str(proc.pid))
|
||||||
|
|
||||||
|
|
||||||
def _maybe_auto_ingest(transcript_path: str = ""):
|
def _maybe_auto_ingest():
|
||||||
"""Run mempalace mine in background for every available target."""
|
"""Background-mine MEMPAL_DIR (project files) if set.
|
||||||
targets = _get_mine_targets(transcript_path)
|
|
||||||
|
Transcript convos are ingested separately via ``_ingest_transcript``
|
||||||
|
in the hook handlers — this function does not handle them, to avoid
|
||||||
|
asymmetric interpreter handling and PID-file overwrite when both
|
||||||
|
targets fire from a single hook call (#1231 review).
|
||||||
|
"""
|
||||||
|
targets = _get_mine_targets()
|
||||||
if not targets:
|
if not targets:
|
||||||
return
|
return
|
||||||
if _mine_already_running():
|
if _mine_already_running():
|
||||||
@@ -284,14 +286,19 @@ def _maybe_auto_ingest(transcript_path: str = ""):
|
|||||||
return
|
return
|
||||||
for mine_dir, mode in targets:
|
for mine_dir, mode in targets:
|
||||||
try:
|
try:
|
||||||
_spawn_mine([sys.executable, "-m", "mempalace", "mine", mine_dir, "--mode", mode])
|
_spawn_mine([_mempalace_python(), "-m", "mempalace", "mine", mine_dir, "--mode", mode])
|
||||||
except OSError:
|
except OSError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def _mine_sync(transcript_path: str = ""):
|
def _mine_sync():
|
||||||
"""Run mempalace mine synchronously for every target (precompact)."""
|
"""Synchronously mine MEMPAL_DIR (precompact path).
|
||||||
targets = _get_mine_targets(transcript_path)
|
|
||||||
|
Transcript convos are ingested separately via ``_ingest_transcript``
|
||||||
|
in ``hook_precompact`` — keeping them out of this function avoids
|
||||||
|
timeout stacking against the harness 30s ceiling (#1231 review).
|
||||||
|
"""
|
||||||
|
targets = _get_mine_targets()
|
||||||
if not targets:
|
if not targets:
|
||||||
return
|
return
|
||||||
STATE_DIR.mkdir(parents=True, exist_ok=True)
|
STATE_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
@@ -301,7 +308,7 @@ def _mine_sync(transcript_path: str = ""):
|
|||||||
with open(log_path, "a") as log_f:
|
with open(log_path, "a") as log_f:
|
||||||
subprocess.run(
|
subprocess.run(
|
||||||
[
|
[
|
||||||
sys.executable,
|
_mempalace_python(),
|
||||||
"-m",
|
"-m",
|
||||||
"mempalace",
|
"mempalace",
|
||||||
"mine",
|
"mine",
|
||||||
@@ -613,7 +620,7 @@ def hook_stop(data: dict, harness: str):
|
|||||||
transcript_path, session_id, wing=project_wing, toast=toast
|
transcript_path, session_id, wing=project_wing, toast=toast
|
||||||
)
|
)
|
||||||
_ingest_transcript(transcript_path)
|
_ingest_transcript(transcript_path)
|
||||||
_maybe_auto_ingest(transcript_path)
|
_maybe_auto_ingest()
|
||||||
# Only advance save marker after successful save
|
# Only advance save marker after successful save
|
||||||
count = result.get("count", 0)
|
count = result.get("count", 0)
|
||||||
if count > 0:
|
if count > 0:
|
||||||
@@ -643,7 +650,7 @@ def hook_stop(data: dict, harness: str):
|
|||||||
pass
|
pass
|
||||||
if transcript_path:
|
if transcript_path:
|
||||||
_ingest_transcript(transcript_path)
|
_ingest_transcript(transcript_path)
|
||||||
_maybe_auto_ingest(transcript_path)
|
_maybe_auto_ingest()
|
||||||
reason = STOP_BLOCK_REASON + f" Write diary entry to wing={project_wing}."
|
reason = STOP_BLOCK_REASON + f" Write diary entry to wing={project_wing}."
|
||||||
_output({"decision": "block", "reason": reason})
|
_output({"decision": "block", "reason": reason})
|
||||||
else:
|
else:
|
||||||
@@ -676,8 +683,10 @@ def hook_precompact(data: dict, harness: str):
|
|||||||
if transcript_path:
|
if transcript_path:
|
||||||
_ingest_transcript(transcript_path)
|
_ingest_transcript(transcript_path)
|
||||||
|
|
||||||
# Mine synchronously so data lands before compaction proceeds
|
# Mine MEMPAL_DIR synchronously so project data lands before
|
||||||
_mine_sync(transcript_path)
|
# compaction proceeds. Transcript convos were already kicked off
|
||||||
|
# above via _ingest_transcript.
|
||||||
|
_mine_sync()
|
||||||
|
|
||||||
_output({})
|
_output({})
|
||||||
|
|
||||||
|
|||||||
+94
-90
@@ -450,35 +450,26 @@ def test_maybe_auto_ingest_with_env(tmp_path):
|
|||||||
assert cmd[cmd.index("--mode") + 1] == "projects"
|
assert cmd[cmd.index("--mode") + 1] == "projects"
|
||||||
|
|
||||||
|
|
||||||
def test_maybe_auto_ingest_with_transcript(tmp_path):
|
def test_maybe_auto_ingest_uses_mempalace_python(tmp_path):
|
||||||
"""Transcript fallback spawns mine in convos mode against the JSONL parent."""
|
"""Spawned mine command uses _mempalace_python(), not bare sys.executable.
|
||||||
transcript = tmp_path / "t.jsonl"
|
|
||||||
transcript.write_text("")
|
Hook subprocesses inherit the harness PATH which on GUI-launched
|
||||||
with patch.dict("os.environ", {}, clear=True):
|
Claude Code may resolve to a system Python without chromadb. The
|
||||||
|
interpreter used here must be the same one the hook itself runs
|
||||||
|
under (typically the venv that owns mempalace).
|
||||||
|
"""
|
||||||
|
mempal_dir = tmp_path / "project"
|
||||||
|
mempal_dir.mkdir()
|
||||||
|
with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}):
|
||||||
with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
|
with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
|
||||||
with patch("mempalace.hooks_cli._MINE_PID_FILE", tmp_path / "mine.pid"):
|
with patch("mempalace.hooks_cli._MINE_PID_FILE", tmp_path / "mine.pid"):
|
||||||
|
with patch(
|
||||||
|
"mempalace.hooks_cli._mempalace_python", return_value="/fake/venv/python"
|
||||||
|
):
|
||||||
with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
|
with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
|
||||||
_maybe_auto_ingest(str(transcript))
|
_maybe_auto_ingest()
|
||||||
mock_popen.assert_called_once()
|
|
||||||
cmd = mock_popen.call_args[0][0]
|
cmd = mock_popen.call_args[0][0]
|
||||||
assert "mine" in cmd
|
assert cmd[0] == "/fake/venv/python"
|
||||||
assert str(tmp_path.resolve()) in cmd
|
|
||||||
assert cmd[cmd.index("--mode") + 1] == "convos"
|
|
||||||
|
|
||||||
|
|
||||||
def test_mine_sync_with_transcript_uses_convos_mode(tmp_path):
|
|
||||||
"""Precompact sync path also picks convos mode for JSONL transcripts."""
|
|
||||||
transcript = tmp_path / "t.jsonl"
|
|
||||||
transcript.write_text("")
|
|
||||||
with patch.dict("os.environ", {}, clear=True):
|
|
||||||
with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
|
|
||||||
with patch("mempalace.hooks_cli.subprocess.run") as mock_run:
|
|
||||||
_mine_sync(str(transcript))
|
|
||||||
mock_run.assert_called_once()
|
|
||||||
cmd = mock_run.call_args[0][0]
|
|
||||||
assert "mine" in cmd
|
|
||||||
assert str(tmp_path.resolve()) in cmd
|
|
||||||
assert cmd[cmd.index("--mode") + 1] == "convos"
|
|
||||||
|
|
||||||
|
|
||||||
def test_mine_sync_with_env_uses_projects_mode(tmp_path):
|
def test_mine_sync_with_env_uses_projects_mode(tmp_path):
|
||||||
@@ -494,41 +485,55 @@ def test_mine_sync_with_env_uses_projects_mode(tmp_path):
|
|||||||
assert cmd[cmd.index("--mode") + 1] == "projects"
|
assert cmd[cmd.index("--mode") + 1] == "projects"
|
||||||
|
|
||||||
|
|
||||||
def test_maybe_auto_ingest_with_both_set(tmp_path):
|
def test_mine_sync_uses_mempalace_python(tmp_path):
|
||||||
"""When MEMPAL_DIR and a transcript are both set, BOTH spawns happen."""
|
"""Sync mine command uses _mempalace_python(), not bare sys.executable."""
|
||||||
mempal_dir = tmp_path / "project"
|
mempal_dir = tmp_path / "project"
|
||||||
mempal_dir.mkdir()
|
mempal_dir.mkdir()
|
||||||
|
with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}):
|
||||||
|
with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
|
||||||
|
with patch("mempalace.hooks_cli._mempalace_python", return_value="/fake/venv/python"):
|
||||||
|
with patch("mempalace.hooks_cli.subprocess.run") as mock_run:
|
||||||
|
_mine_sync()
|
||||||
|
cmd = mock_run.call_args[0][0]
|
||||||
|
assert cmd[0] == "/fake/venv/python"
|
||||||
|
|
||||||
|
|
||||||
|
def test_maybe_auto_ingest_ignores_transcript_arg_path(tmp_path):
|
||||||
|
"""_maybe_auto_ingest does NOT mine the transcript directory.
|
||||||
|
|
||||||
|
Transcript convos are handled by _ingest_transcript (called separately
|
||||||
|
in hook handlers). _maybe_auto_ingest only handles MEMPAL_DIR — even
|
||||||
|
when invoked in a context where a transcript is also being processed,
|
||||||
|
no second spawn for the transcript dir should appear here.
|
||||||
|
"""
|
||||||
convo_dir = tmp_path / "convos"
|
convo_dir = tmp_path / "convos"
|
||||||
convo_dir.mkdir()
|
convo_dir.mkdir()
|
||||||
transcript = convo_dir / "session.jsonl"
|
transcript = convo_dir / "session.jsonl"
|
||||||
transcript.write_text("")
|
transcript.write_text("")
|
||||||
with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}):
|
with patch.dict("os.environ", {}, clear=True):
|
||||||
with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
|
with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
|
||||||
with patch("mempalace.hooks_cli._MINE_PID_FILE", tmp_path / "mine.pid"):
|
with patch("mempalace.hooks_cli._MINE_PID_FILE", tmp_path / "mine.pid"):
|
||||||
with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
|
with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
|
||||||
_maybe_auto_ingest(str(transcript))
|
_maybe_auto_ingest()
|
||||||
assert mock_popen.call_count == 2
|
mock_popen.assert_not_called()
|
||||||
cmds = [call.args[0] for call in mock_popen.call_args_list]
|
|
||||||
modes = {cmd[cmd.index("--mode") + 1] for cmd in cmds}
|
|
||||||
assert modes == {"projects", "convos"}
|
|
||||||
|
|
||||||
|
|
||||||
def test_mine_sync_with_both_set(tmp_path):
|
def test_mine_sync_ignores_transcript(tmp_path):
|
||||||
"""Precompact sync runs BOTH mines when MEMPAL_DIR + transcript are set."""
|
"""_mine_sync does not run a convos mine for the transcript dir.
|
||||||
mempal_dir = tmp_path / "project"
|
|
||||||
mempal_dir.mkdir()
|
The precompact transcript ingest is the responsibility of
|
||||||
|
_ingest_transcript; routing it through _mine_sync would stack a
|
||||||
|
second 60s timeout against the harness 30s ceiling.
|
||||||
|
"""
|
||||||
convo_dir = tmp_path / "convos"
|
convo_dir = tmp_path / "convos"
|
||||||
convo_dir.mkdir()
|
convo_dir.mkdir()
|
||||||
transcript = convo_dir / "session.jsonl"
|
transcript = convo_dir / "session.jsonl"
|
||||||
transcript.write_text("")
|
transcript.write_text("")
|
||||||
with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}):
|
with patch.dict("os.environ", {}, clear=True):
|
||||||
with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
|
with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
|
||||||
with patch("mempalace.hooks_cli.subprocess.run") as mock_run:
|
with patch("mempalace.hooks_cli.subprocess.run") as mock_run:
|
||||||
_mine_sync(str(transcript))
|
_mine_sync()
|
||||||
assert mock_run.call_count == 2
|
mock_run.assert_not_called()
|
||||||
cmds = [call.args[0] for call in mock_run.call_args_list]
|
|
||||||
modes = {cmd[cmd.index("--mode") + 1] for cmd in cmds}
|
|
||||||
assert modes == {"projects", "convos"}
|
|
||||||
|
|
||||||
|
|
||||||
def test_maybe_auto_ingest_oserror(tmp_path):
|
def test_maybe_auto_ingest_oserror(tmp_path):
|
||||||
@@ -595,7 +600,7 @@ def test_get_mine_targets_mempal_dir_only(tmp_path):
|
|||||||
mempal_dir = tmp_path / "project"
|
mempal_dir = tmp_path / "project"
|
||||||
mempal_dir.mkdir()
|
mempal_dir.mkdir()
|
||||||
with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}):
|
with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}):
|
||||||
targets = _get_mine_targets("")
|
targets = _get_mine_targets()
|
||||||
assert len(targets) == 1
|
assert len(targets) == 1
|
||||||
assert Path(targets[0][0]).resolve() == mempal_dir.resolve()
|
assert Path(targets[0][0]).resolve() == mempal_dir.resolve()
|
||||||
assert targets[0][1] == "projects"
|
assert targets[0][1] == "projects"
|
||||||
@@ -612,65 +617,53 @@ def test_get_mine_targets_mempal_dir_tilde(tmp_path):
|
|||||||
pytest.skip("tmp_path is not under home, cannot build ~-relative path")
|
pytest.skip("tmp_path is not under home, cannot build ~-relative path")
|
||||||
tilde_path = "~/" + str(rel)
|
tilde_path = "~/" + str(rel)
|
||||||
with patch.dict("os.environ", {"MEMPAL_DIR": tilde_path}):
|
with patch.dict("os.environ", {"MEMPAL_DIR": tilde_path}):
|
||||||
targets = _get_mine_targets("")
|
targets = _get_mine_targets()
|
||||||
assert len(targets) == 1
|
assert len(targets) == 1
|
||||||
assert Path(targets[0][0]).resolve() == mempal_dir.resolve()
|
assert Path(targets[0][0]).resolve() == mempal_dir.resolve()
|
||||||
assert targets[0][1] == "projects"
|
assert targets[0][1] == "projects"
|
||||||
|
|
||||||
|
|
||||||
def test_get_mine_targets_transcript_only(tmp_path):
|
def test_get_mine_targets_no_transcript_target(tmp_path):
|
||||||
"""A valid transcript JSONL alone yields a single convos target."""
|
"""_get_mine_targets does not emit a convos target for the transcript path.
|
||||||
|
|
||||||
|
Transcript ingestion is owned by _ingest_transcript; emitting it
|
||||||
|
here too would double-mine the same JSONL into a different wing on
|
||||||
|
every hook fire (#1231 review).
|
||||||
|
"""
|
||||||
transcript = tmp_path / "t.jsonl"
|
transcript = tmp_path / "t.jsonl"
|
||||||
transcript.write_text("")
|
transcript.write_text("")
|
||||||
with patch.dict("os.environ", {}, clear=True):
|
with patch.dict("os.environ", {}, clear=True):
|
||||||
targets = _get_mine_targets(str(transcript))
|
targets = _get_mine_targets()
|
||||||
assert len(targets) == 1
|
assert targets == []
|
||||||
assert Path(targets[0][0]).resolve() == tmp_path.resolve()
|
|
||||||
assert targets[0][1] == "convos"
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_mine_targets_both_set(tmp_path):
|
def test_get_mine_targets_only_returns_mempal_dir(tmp_path):
|
||||||
"""When MEMPAL_DIR and a valid transcript are both set, BOTH targets appear.
|
"""When MEMPAL_DIR is set, exactly one projects target — never a convos target."""
|
||||||
|
|
||||||
This is the regression that motivates the additive-targets design:
|
|
||||||
users who set MEMPAL_DIR previously had their conversations silently
|
|
||||||
skipped because MEMPAL_DIR overrode the transcript path.
|
|
||||||
"""
|
|
||||||
mempal_dir = tmp_path / "project"
|
mempal_dir = tmp_path / "project"
|
||||||
mempal_dir.mkdir()
|
mempal_dir.mkdir()
|
||||||
convo_dir = tmp_path / "convos"
|
|
||||||
convo_dir.mkdir()
|
|
||||||
transcript = convo_dir / "session.jsonl"
|
|
||||||
transcript.write_text("")
|
|
||||||
with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}):
|
with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}):
|
||||||
targets = _get_mine_targets(str(transcript))
|
targets = _get_mine_targets()
|
||||||
modes = {mode for _, mode in targets}
|
assert len(targets) == 1
|
||||||
assert modes == {"projects", "convos"}
|
assert targets[0][1] == "projects"
|
||||||
by_mode = {mode: Path(d) for d, mode in targets}
|
|
||||||
assert by_mode["projects"].resolve() == mempal_dir.resolve()
|
|
||||||
assert by_mode["convos"].resolve() == convo_dir.resolve()
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_mine_targets_transcript_path_traversal_rejected(tmp_path):
|
def test_validate_transcript_path_traversal_rejected_jsonl(tmp_path):
|
||||||
"""Transcript paths with '..' components are rejected (no convos target)."""
|
"""Path traversal is rejected even when the path has a .jsonl suffix.
|
||||||
with patch.dict("os.environ", {}, clear=True):
|
|
||||||
targets = _get_mine_targets("../../etc/passwd")
|
|
||||||
assert targets == []
|
|
||||||
|
|
||||||
|
The pre-fix test used "../../etc/passwd" which lacks an extension and
|
||||||
def test_get_mine_targets_transcript_non_jsonl_rejected(tmp_path):
|
so was rejected by the suffix gate before the traversal check ever
|
||||||
"""Transcript paths without .jsonl/.json extension are rejected."""
|
fired (Copilot review on #1231). Use a .jsonl path with `..`
|
||||||
bad = tmp_path / "notes.txt"
|
segments to exercise the traversal guard specifically.
|
||||||
bad.write_text("content")
|
"""
|
||||||
with patch.dict("os.environ", {}, clear=True):
|
assert _validate_transcript_path("../t.jsonl") is None
|
||||||
targets = _get_mine_targets(str(bad))
|
assert _validate_transcript_path("a/../b.jsonl") is None
|
||||||
assert targets == []
|
assert _validate_transcript_path("/tmp/../etc/t.jsonl") is None
|
||||||
|
|
||||||
|
|
||||||
def test_get_mine_targets_empty():
|
def test_get_mine_targets_empty():
|
||||||
"""Returns empty list when nothing is available."""
|
"""Returns empty list when MEMPAL_DIR is unset or invalid."""
|
||||||
with patch.dict("os.environ", {}, clear=True):
|
with patch.dict("os.environ", {}, clear=True):
|
||||||
assert _get_mine_targets("") == []
|
assert _get_mine_targets() == []
|
||||||
|
|
||||||
|
|
||||||
# --- _parse_harness_input ---
|
# --- _parse_harness_input ---
|
||||||
@@ -790,10 +783,19 @@ def test_precompact_with_timeout(tmp_path):
|
|||||||
|
|
||||||
|
|
||||||
def test_precompact_mines_transcript_dir(tmp_path, monkeypatch):
|
def test_precompact_mines_transcript_dir(tmp_path, monkeypatch):
|
||||||
"""Precompact mines transcript directory when no MEMPAL_DIR."""
|
"""Precompact ingests the active transcript via _ingest_transcript.
|
||||||
|
|
||||||
|
With no MEMPAL_DIR, _mine_sync is a no-op; the transcript ingest is
|
||||||
|
the only mining that should fire, and it goes through Popen
|
||||||
|
(background) inside _ingest_transcript. Pre-#1231-review this test
|
||||||
|
asserted against subprocess.run, which corresponded to the
|
||||||
|
duplicate-mine path that has now been removed.
|
||||||
|
"""
|
||||||
transcript = tmp_path / "t.jsonl"
|
transcript = tmp_path / "t.jsonl"
|
||||||
transcript.write_text("")
|
# _ingest_transcript skips files smaller than 100 bytes, so pad it.
|
||||||
|
transcript.write_text("x" * 200)
|
||||||
monkeypatch.delenv("MEMPAL_DIR", raising=False)
|
monkeypatch.delenv("MEMPAL_DIR", raising=False)
|
||||||
|
with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
|
||||||
with patch("mempalace.hooks_cli.subprocess.run") as mock_run:
|
with patch("mempalace.hooks_cli.subprocess.run") as mock_run:
|
||||||
result = _capture_hook_output(
|
result = _capture_hook_output(
|
||||||
hook_precompact,
|
hook_precompact,
|
||||||
@@ -801,11 +803,13 @@ def test_precompact_mines_transcript_dir(tmp_path, monkeypatch):
|
|||||||
state_dir=tmp_path,
|
state_dir=tmp_path,
|
||||||
)
|
)
|
||||||
assert result == {}
|
assert result == {}
|
||||||
mock_run.assert_called_once()
|
mock_run.assert_not_called()
|
||||||
# Mine dir is the transcript's parent (resolved) and mode is convos for JSONL.
|
mock_popen.assert_called_once()
|
||||||
call_args = mock_run.call_args[0][0]
|
cmd = mock_popen.call_args[0][0]
|
||||||
assert str(tmp_path.resolve()) in call_args
|
# Mines the transcript's parent dir as convos, into wing "sessions".
|
||||||
assert call_args[call_args.index("--mode") + 1] == "convos"
|
assert str(tmp_path) in cmd
|
||||||
|
assert cmd[cmd.index("--mode") + 1] == "convos"
|
||||||
|
assert cmd[cmd.index("--wing") + 1] == "sessions"
|
||||||
|
|
||||||
|
|
||||||
# --- run_hook ---
|
# --- run_hook ---
|
||||||
|
|||||||
@@ -61,3 +61,65 @@ class TestSaveHookAutoMines:
|
|||||||
'MEMPAL_DIR defaults to "" which silently disables mining. '
|
'MEMPAL_DIR defaults to "" which silently disables mining. '
|
||||||
"Either set a default path or add transcript-based mining."
|
"Either set a default path or add transcript-based mining."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestShellHookTranscriptValidation:
|
||||||
|
"""Both shell hooks must validate transcript paths before mining them.
|
||||||
|
|
||||||
|
Mirrors mempalace.hooks_cli._validate_transcript_path so unsafe paths
|
||||||
|
(no extension, traversal segments) are rejected at the shell layer
|
||||||
|
too — added in #1231 review (Copilot #7, #8).
|
||||||
|
"""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _hook_src(name: str) -> str:
|
||||||
|
path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "hooks", name)
|
||||||
|
return open(path).read()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _strip_comments(src: str) -> str:
|
||||||
|
return "\n".join(line for line in src.splitlines() if not line.lstrip().startswith("#"))
|
||||||
|
|
||||||
|
def test_save_hook_defines_and_uses_validator(self):
|
||||||
|
src = self._strip_comments(self._hook_src("mempal_save_hook.sh"))
|
||||||
|
assert "is_valid_transcript_path() {" in src, "validator function must be defined"
|
||||||
|
assert (
|
||||||
|
'is_valid_transcript_path "$TRANSCRIPT_PATH"' in src
|
||||||
|
), "validator must be invoked against TRANSCRIPT_PATH before mining"
|
||||||
|
|
||||||
|
def test_precompact_hook_defines_and_uses_validator(self):
|
||||||
|
src = self._strip_comments(self._hook_src("mempal_precompact_hook.sh"))
|
||||||
|
assert "is_valid_transcript_path() {" in src, "validator function must be defined"
|
||||||
|
assert (
|
||||||
|
'is_valid_transcript_path "$TRANSCRIPT_PATH"' in src
|
||||||
|
), "validator must be invoked against TRANSCRIPT_PATH before mining"
|
||||||
|
|
||||||
|
def test_validators_run_via_bash(self, tmp_path):
|
||||||
|
"""Source the validator out of each hook and exercise it directly."""
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
for name in ("mempal_save_hook.sh", "mempal_precompact_hook.sh"):
|
||||||
|
src = self._hook_src(name)
|
||||||
|
# Extract just the function definition (first occurrence).
|
||||||
|
start = src.index("is_valid_transcript_path() {")
|
||||||
|
end = src.index("\n}\n", start) + 2
|
||||||
|
func_src = src[start:end]
|
||||||
|
script = tmp_path / "v.sh"
|
||||||
|
script.write_text(
|
||||||
|
f"{func_src}\n" 'is_valid_transcript_path "$1" && echo OK || echo NO\n'
|
||||||
|
)
|
||||||
|
|
||||||
|
def run(arg: str) -> str:
|
||||||
|
return subprocess.run(
|
||||||
|
["bash", str(script), arg],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
check=False,
|
||||||
|
).stdout.strip()
|
||||||
|
|
||||||
|
assert run("/tmp/sessions/abc.jsonl") == "OK"
|
||||||
|
assert run("/tmp/sessions/abc.json") == "OK"
|
||||||
|
assert run("") == "NO"
|
||||||
|
assert run("/tmp/notes.txt") == "NO"
|
||||||
|
assert run("../etc/passwd.jsonl") == "NO"
|
||||||
|
assert run("/tmp/../etc/t.jsonl") == "NO"
|
||||||
|
|||||||
Reference in New Issue
Block a user