From fe5679776274a23f175f23bd1bdb57ed42094e6d Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Mon, 27 Apr 2026 02:26:53 -0300 Subject: [PATCH] fix(hooks): consolidate transcript ingest, harden shell parsers (#1231 review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address Copilot review on #1231: 1. Stop double-mining the transcript on the Python side. ``_get_mine_targets`` now returns only the ``MEMPAL_DIR`` projects target — the convos target for the transcript dir is dropped because ``_ingest_transcript`` already handles it on every hook fire. The duplicate spawn was using ``sys.executable`` (vs ``_mempalace_python()``) and a different ``--wing``, so each Stop/PreCompact event was writing the same transcript into two wings under asymmetric interpreters and overwriting the single ``_MINE_PID_FILE`` lock. 2. ``_maybe_auto_ingest`` and ``_mine_sync`` now spawn via ``_mempalace_python()`` so the resolved interpreter matches the venv that owns mempalace (matters under GUI-launched harnesses where ``sys.executable`` may resolve to a system Python without chromadb). 3. Replace ``eval $(...)`` in both shell hooks with a ``mapfile``-based reader. Sanitized values are still emitted by the same Python parser, but the shell now does plain variable assignment instead of executing the parser's stdout — smaller blast radius if the sanitizer is ever bypassed. 4. Mirror ``_validate_transcript_path`` in the shell hooks via a ``is_valid_transcript_path`` helper — extension + traversal-segment rejection, parity with the Python validator. The convos mine in each shell hook is now gated on the validator instead of bare ``-f``. 5. Tighten the ``..`` traversal test that previously exercised the suffix gate by mistake (``../../etc/passwd`` lacks ``.json[l]``). Use ``.jsonl`` paths with traversal segments to actually hit the ``..`` rejection branch. 6. README: add a one-liner pointing at ``mempalace sweep`` for users who want per-message recall on top of the file-level chunks the hooks produce. The sweeper was undiscoverable previously. Tests: 1418 passed, 1 skipped (full suite minus benchmarks). --- README.md | 4 + hooks/mempal_precompact_hook.sh | 35 +++++- hooks/mempal_save_hook.sh | 41 +++++-- mempalace/hooks_cli.py | 51 ++++---- tests/test_hooks_cli.py | 200 ++++++++++++++++---------------- tests/test_save_hook_mines.py | 62 ++++++++++ 6 files changed, 261 insertions(+), 132 deletions(-) diff --git a/README.md b/README.md index acbeb14..8157fca 100644 --- a/README.md +++ b/README.md @@ -149,6 +149,10 @@ system prompt: Two Claude Code hooks save periodically and before context compression: [mempalaceofficial.com/guide/hooks](https://mempalaceofficial.com/guide/hooks.html). +For per-message recall on top of the file-level chunks the hooks produce, +run `mempalace sweep ` periodically — it stores one +verbatim drawer per user/assistant message, idempotent and resume-safe. + --- ## Requirements diff --git a/hooks/mempal_precompact_hook.sh b/hooks/mempal_precompact_hook.sh index 70377f6..e811d65 100755 --- a/hooks/mempal_precompact_hook.sh +++ b/hooks/mempal_precompact_hook.sh @@ -65,30 +65,53 @@ fi # Read JSON input from stdin INPUT=$(cat) -# Parse session_id and transcript_path in one call. Sanitize both before -# interpolating into shell — same contract as mempal_save_hook.sh. -eval $(echo "$INPUT" | "$MEMPAL_PYTHON_BIN" -c " +# Parse session_id and transcript_path in one call. Sanitize both, then +# read sanitized values from one-per-line stdout into shell variables — +# avoids ``eval`` on generated code (#1231 review). Same contract as +# mempal_save_hook.sh. +mapfile -t _mempal_parsed < <(echo "$INPUT" | "$MEMPAL_PYTHON_BIN" -c " import sys, json, re data = json.load(sys.stdin) sid = data.get('session_id', 'unknown') tp = data.get('transcript_path', '') safe = lambda s: re.sub(r'[^a-zA-Z0-9_/.\-~]', '', str(s)) -print(f'SESSION_ID=\"{safe(sid)}\"') -print(f'TRANSCRIPT_PATH=\"{safe(tp)}\"') +print(safe(sid)) +print(safe(tp)) " 2>/dev/null) +SESSION_ID="${_mempal_parsed[0]:-unknown}" +TRANSCRIPT_PATH="${_mempal_parsed[1]:-}" # Expand ~ in path TRANSCRIPT_PATH="${TRANSCRIPT_PATH/#\~/$HOME}" +# Validate that TRANSCRIPT_PATH looks like a transcript file. Mirrors +# mempalace.hooks_cli._validate_transcript_path so the shell hook +# rejects the same shapes the Python hook rejects (#1231 review). +is_valid_transcript_path() { + local path="$1" + [ -n "$path" ] || return 1 + case "$path" in + *.json|*.jsonl) ;; + *) return 1 ;; + esac + case "/$path/" in + */../*) return 1 ;; + esac + return 0 +} + echo "[$(date '+%H:%M:%S')] PRE-COMPACT triggered for session $SESSION_ID" >> "$STATE_DIR/hook.log" # Run ingest synchronously so memories land before compaction. Two # independent targets — both run if both are set: # 1. TRANSCRIPT_PATH (from Claude Code) → parent dir, --mode convos # 2. MEMPAL_DIR → --mode projects -if [ -n "$TRANSCRIPT_PATH" ] && [ -f "$TRANSCRIPT_PATH" ]; then +if is_valid_transcript_path "$TRANSCRIPT_PATH" && [ -f "$TRANSCRIPT_PATH" ]; then mempalace mine "$(dirname "$TRANSCRIPT_PATH")" --mode convos \ >> "$STATE_DIR/hook.log" 2>&1 +elif [ -n "$TRANSCRIPT_PATH" ]; then + echo "[$(date '+%H:%M:%S')] Skipping invalid transcript path: $TRANSCRIPT_PATH" \ + >> "$STATE_DIR/hook.log" fi if [ -n "$MEMPAL_DIR" ] && [ -d "$MEMPAL_DIR" ]; then mempalace mine "$MEMPAL_DIR" --mode projects \ diff --git a/hooks/mempal_save_hook.sh b/hooks/mempal_save_hook.sh index d69ae85..5efd157 100755 --- a/hooks/mempal_save_hook.sh +++ b/hooks/mempal_save_hook.sh @@ -83,9 +83,11 @@ fi INPUT=$(cat) # Parse all fields in a single Python call (3x faster than separate invocations) -# SECURITY: All values are sanitized before being interpolated into shell assignments. -# stop_hook_active is coerced to a strict True/False to prevent command injection via eval. -eval $(echo "$INPUT" | "$MEMPAL_PYTHON_BIN" -c " +# without invoking ``eval`` on generated code: Python prints one sanitized +# value per line, the shell reads them via ``mapfile`` and does plain +# variable assignment — same data, smaller blast radius if the sanitizer +# is ever bypassed (#1231 review). +mapfile -t _mempal_parsed < <(echo "$INPUT" | "$MEMPAL_PYTHON_BIN" -c " import sys, json, re data = json.load(sys.stdin) sid = data.get('session_id', 'unknown') @@ -95,14 +97,36 @@ tp = data.get('transcript_path', '') safe = lambda s: re.sub(r'[^a-zA-Z0-9_/.\-~]', '', str(s)) # Coerce stop_hook_active to strict boolean string sha = 'True' if sha_raw is True or str(sha_raw).lower() in ('true', '1', 'yes') else 'False' -print(f'SESSION_ID=\"{safe(sid)}\"') -print(f'STOP_HOOK_ACTIVE=\"{sha}\"') -print(f'TRANSCRIPT_PATH=\"{safe(tp)}\"') +print(safe(sid)) +print(sha) +print(safe(tp)) " 2>/dev/null) +SESSION_ID="${_mempal_parsed[0]:-unknown}" +STOP_HOOK_ACTIVE="${_mempal_parsed[1]:-False}" +TRANSCRIPT_PATH="${_mempal_parsed[2]:-}" # Expand ~ in path TRANSCRIPT_PATH="${TRANSCRIPT_PATH/#\~/$HOME}" +# Validate that TRANSCRIPT_PATH looks like a transcript file: +# - non-empty +# - .jsonl or .json suffix +# - no traversal segments (.. components) +# Mirrors mempalace.hooks_cli._validate_transcript_path so the shell hook +# rejects the same shapes the Python hook rejects (#1231 review). +is_valid_transcript_path() { + local path="$1" + [ -n "$path" ] || return 1 + case "$path" in + *.json|*.jsonl) ;; + *) return 1 ;; + esac + case "/$path/" in + */../*) return 1 ;; + esac + return 0 +} + # If we're already in a save cycle, let the AI stop normally # This is the infinite-loop prevention: block once → AI saves → tries to stop again → we let it through if [ "$STOP_HOOK_ACTIVE" = "True" ] || [ "$STOP_HOOK_ACTIVE" = "true" ]; then @@ -165,9 +189,12 @@ if [ "$SINCE_LAST" -ge "$SAVE_INTERVAL" ] && [ "$EXCHANGE_COUNT" -gt 0 ]; then # (code, notes, docs) # MEMPAL_DIR is *additive*, not an override: a user with MEMPAL_DIR # pointed at their project still gets the active conversation mined. - if [ -n "$TRANSCRIPT_PATH" ] && [ -f "$TRANSCRIPT_PATH" ]; then + if is_valid_transcript_path "$TRANSCRIPT_PATH" && [ -f "$TRANSCRIPT_PATH" ]; then mempalace mine "$(dirname "$TRANSCRIPT_PATH")" --mode convos \ >> "$STATE_DIR/hook.log" 2>&1 & + elif [ -n "$TRANSCRIPT_PATH" ]; then + echo "[$(date '+%H:%M:%S')] Skipping invalid transcript path: $TRANSCRIPT_PATH" \ + >> "$STATE_DIR/hook.log" fi if [ -n "$MEMPAL_DIR" ] && [ -d "$MEMPAL_DIR" ]; then mempalace mine "$MEMPAL_DIR" --mode projects \ diff --git a/mempalace/hooks_cli.py b/mempalace/hooks_cli.py index 0645c41..d4f8317 100644 --- a/mempalace/hooks_cli.py +++ b/mempalace/hooks_cli.py @@ -197,16 +197,15 @@ def _output(data: dict): sys.stdout.buffer.flush() -def _get_mine_targets(transcript_path: str = "") -> list[tuple[str, str]]: +def _get_mine_targets() -> list[tuple[str, str]]: """Return the list of ``(dir, mode)`` targets for auto-ingest. MEMPAL_DIR (when set and resolvable) contributes a ``"projects"`` - target. A valid transcript JSONL contributes a ``"convos"`` target - on its parent dir. Both may appear together — the hook runs one - ingest per target, so a user with MEMPAL_DIR pointed at their - project dir still gets the active conversation mined verbatim. + target. Transcript ingestion is handled separately by + ``_ingest_transcript`` — emitting it here too would double-mine the + same JSONL into a different wing on every hook fire (#1231 review). - An empty list means no ingest should run. + An empty list means no MEMPAL_DIR ingest should run. """ targets: list[tuple[str, str]] = [] mempal_dir = os.environ.get("MEMPAL_DIR", "") @@ -214,9 +213,6 @@ def _get_mine_targets(transcript_path: str = "") -> list[tuple[str, str]]: resolved = Path(mempal_dir).expanduser().resolve() if resolved.is_dir(): targets.append((str(resolved), "projects")) - path = _validate_transcript_path(transcript_path) - if path is not None and path.is_file(): - targets.append((str(path.parent), "convos")) return targets @@ -274,9 +270,15 @@ def _spawn_mine(cmd: list) -> None: _MINE_PID_FILE.write_text(str(proc.pid)) -def _maybe_auto_ingest(transcript_path: str = ""): - """Run mempalace mine in background for every available target.""" - targets = _get_mine_targets(transcript_path) +def _maybe_auto_ingest(): + """Background-mine MEMPAL_DIR (project files) if set. + + Transcript convos are ingested separately via ``_ingest_transcript`` + in the hook handlers — this function does not handle them, to avoid + asymmetric interpreter handling and PID-file overwrite when both + targets fire from a single hook call (#1231 review). + """ + targets = _get_mine_targets() if not targets: return if _mine_already_running(): @@ -284,14 +286,19 @@ def _maybe_auto_ingest(transcript_path: str = ""): return for mine_dir, mode in targets: try: - _spawn_mine([sys.executable, "-m", "mempalace", "mine", mine_dir, "--mode", mode]) + _spawn_mine([_mempalace_python(), "-m", "mempalace", "mine", mine_dir, "--mode", mode]) except OSError: pass -def _mine_sync(transcript_path: str = ""): - """Run mempalace mine synchronously for every target (precompact).""" - targets = _get_mine_targets(transcript_path) +def _mine_sync(): + """Synchronously mine MEMPAL_DIR (precompact path). + + Transcript convos are ingested separately via ``_ingest_transcript`` + in ``hook_precompact`` — keeping them out of this function avoids + timeout stacking against the harness 30s ceiling (#1231 review). + """ + targets = _get_mine_targets() if not targets: return STATE_DIR.mkdir(parents=True, exist_ok=True) @@ -301,7 +308,7 @@ def _mine_sync(transcript_path: str = ""): with open(log_path, "a") as log_f: subprocess.run( [ - sys.executable, + _mempalace_python(), "-m", "mempalace", "mine", @@ -613,7 +620,7 @@ def hook_stop(data: dict, harness: str): transcript_path, session_id, wing=project_wing, toast=toast ) _ingest_transcript(transcript_path) - _maybe_auto_ingest(transcript_path) + _maybe_auto_ingest() # Only advance save marker after successful save count = result.get("count", 0) if count > 0: @@ -643,7 +650,7 @@ def hook_stop(data: dict, harness: str): pass if transcript_path: _ingest_transcript(transcript_path) - _maybe_auto_ingest(transcript_path) + _maybe_auto_ingest() reason = STOP_BLOCK_REASON + f" Write diary entry to wing={project_wing}." _output({"decision": "block", "reason": reason}) else: @@ -676,8 +683,10 @@ def hook_precompact(data: dict, harness: str): if transcript_path: _ingest_transcript(transcript_path) - # Mine synchronously so data lands before compaction proceeds - _mine_sync(transcript_path) + # Mine MEMPAL_DIR synchronously so project data lands before + # compaction proceeds. Transcript convos were already kicked off + # above via _ingest_transcript. + _mine_sync() _output({}) diff --git a/tests/test_hooks_cli.py b/tests/test_hooks_cli.py index c105eae..1ceb530 100644 --- a/tests/test_hooks_cli.py +++ b/tests/test_hooks_cli.py @@ -450,35 +450,26 @@ def test_maybe_auto_ingest_with_env(tmp_path): assert cmd[cmd.index("--mode") + 1] == "projects" -def test_maybe_auto_ingest_with_transcript(tmp_path): - """Transcript fallback spawns mine in convos mode against the JSONL parent.""" - transcript = tmp_path / "t.jsonl" - transcript.write_text("") - with patch.dict("os.environ", {}, clear=True): +def test_maybe_auto_ingest_uses_mempalace_python(tmp_path): + """Spawned mine command uses _mempalace_python(), not bare sys.executable. + + Hook subprocesses inherit the harness PATH which on GUI-launched + Claude Code may resolve to a system Python without chromadb. The + interpreter used here must be the same one the hook itself runs + under (typically the venv that owns mempalace). + """ + mempal_dir = tmp_path / "project" + mempal_dir.mkdir() + with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}): with patch("mempalace.hooks_cli.STATE_DIR", tmp_path): with patch("mempalace.hooks_cli._MINE_PID_FILE", tmp_path / "mine.pid"): - with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen: - _maybe_auto_ingest(str(transcript)) - mock_popen.assert_called_once() - cmd = mock_popen.call_args[0][0] - assert "mine" in cmd - assert str(tmp_path.resolve()) in cmd - assert cmd[cmd.index("--mode") + 1] == "convos" - - -def test_mine_sync_with_transcript_uses_convos_mode(tmp_path): - """Precompact sync path also picks convos mode for JSONL transcripts.""" - transcript = tmp_path / "t.jsonl" - transcript.write_text("") - with patch.dict("os.environ", {}, clear=True): - with patch("mempalace.hooks_cli.STATE_DIR", tmp_path): - with patch("mempalace.hooks_cli.subprocess.run") as mock_run: - _mine_sync(str(transcript)) - mock_run.assert_called_once() - cmd = mock_run.call_args[0][0] - assert "mine" in cmd - assert str(tmp_path.resolve()) in cmd - assert cmd[cmd.index("--mode") + 1] == "convos" + with patch( + "mempalace.hooks_cli._mempalace_python", return_value="/fake/venv/python" + ): + with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen: + _maybe_auto_ingest() + cmd = mock_popen.call_args[0][0] + assert cmd[0] == "/fake/venv/python" def test_mine_sync_with_env_uses_projects_mode(tmp_path): @@ -494,41 +485,55 @@ def test_mine_sync_with_env_uses_projects_mode(tmp_path): assert cmd[cmd.index("--mode") + 1] == "projects" -def test_maybe_auto_ingest_with_both_set(tmp_path): - """When MEMPAL_DIR and a transcript are both set, BOTH spawns happen.""" +def test_mine_sync_uses_mempalace_python(tmp_path): + """Sync mine command uses _mempalace_python(), not bare sys.executable.""" mempal_dir = tmp_path / "project" mempal_dir.mkdir() + with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}): + with patch("mempalace.hooks_cli.STATE_DIR", tmp_path): + with patch("mempalace.hooks_cli._mempalace_python", return_value="/fake/venv/python"): + with patch("mempalace.hooks_cli.subprocess.run") as mock_run: + _mine_sync() + cmd = mock_run.call_args[0][0] + assert cmd[0] == "/fake/venv/python" + + +def test_maybe_auto_ingest_ignores_transcript_arg_path(tmp_path): + """_maybe_auto_ingest does NOT mine the transcript directory. + + Transcript convos are handled by _ingest_transcript (called separately + in hook handlers). _maybe_auto_ingest only handles MEMPAL_DIR — even + when invoked in a context where a transcript is also being processed, + no second spawn for the transcript dir should appear here. + """ convo_dir = tmp_path / "convos" convo_dir.mkdir() transcript = convo_dir / "session.jsonl" transcript.write_text("") - with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}): + with patch.dict("os.environ", {}, clear=True): with patch("mempalace.hooks_cli.STATE_DIR", tmp_path): with patch("mempalace.hooks_cli._MINE_PID_FILE", tmp_path / "mine.pid"): with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen: - _maybe_auto_ingest(str(transcript)) - assert mock_popen.call_count == 2 - cmds = [call.args[0] for call in mock_popen.call_args_list] - modes = {cmd[cmd.index("--mode") + 1] for cmd in cmds} - assert modes == {"projects", "convos"} + _maybe_auto_ingest() + mock_popen.assert_not_called() -def test_mine_sync_with_both_set(tmp_path): - """Precompact sync runs BOTH mines when MEMPAL_DIR + transcript are set.""" - mempal_dir = tmp_path / "project" - mempal_dir.mkdir() +def test_mine_sync_ignores_transcript(tmp_path): + """_mine_sync does not run a convos mine for the transcript dir. + + The precompact transcript ingest is the responsibility of + _ingest_transcript; routing it through _mine_sync would stack a + second 60s timeout against the harness 30s ceiling. + """ convo_dir = tmp_path / "convos" convo_dir.mkdir() transcript = convo_dir / "session.jsonl" transcript.write_text("") - with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}): + with patch.dict("os.environ", {}, clear=True): with patch("mempalace.hooks_cli.STATE_DIR", tmp_path): with patch("mempalace.hooks_cli.subprocess.run") as mock_run: - _mine_sync(str(transcript)) - assert mock_run.call_count == 2 - cmds = [call.args[0] for call in mock_run.call_args_list] - modes = {cmd[cmd.index("--mode") + 1] for cmd in cmds} - assert modes == {"projects", "convos"} + _mine_sync() + mock_run.assert_not_called() def test_maybe_auto_ingest_oserror(tmp_path): @@ -595,7 +600,7 @@ def test_get_mine_targets_mempal_dir_only(tmp_path): mempal_dir = tmp_path / "project" mempal_dir.mkdir() with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}): - targets = _get_mine_targets("") + targets = _get_mine_targets() assert len(targets) == 1 assert Path(targets[0][0]).resolve() == mempal_dir.resolve() assert targets[0][1] == "projects" @@ -612,65 +617,53 @@ def test_get_mine_targets_mempal_dir_tilde(tmp_path): pytest.skip("tmp_path is not under home, cannot build ~-relative path") tilde_path = "~/" + str(rel) with patch.dict("os.environ", {"MEMPAL_DIR": tilde_path}): - targets = _get_mine_targets("") + targets = _get_mine_targets() assert len(targets) == 1 assert Path(targets[0][0]).resolve() == mempal_dir.resolve() assert targets[0][1] == "projects" -def test_get_mine_targets_transcript_only(tmp_path): - """A valid transcript JSONL alone yields a single convos target.""" +def test_get_mine_targets_no_transcript_target(tmp_path): + """_get_mine_targets does not emit a convos target for the transcript path. + + Transcript ingestion is owned by _ingest_transcript; emitting it + here too would double-mine the same JSONL into a different wing on + every hook fire (#1231 review). + """ transcript = tmp_path / "t.jsonl" transcript.write_text("") with patch.dict("os.environ", {}, clear=True): - targets = _get_mine_targets(str(transcript)) - assert len(targets) == 1 - assert Path(targets[0][0]).resolve() == tmp_path.resolve() - assert targets[0][1] == "convos" + targets = _get_mine_targets() + assert targets == [] -def test_get_mine_targets_both_set(tmp_path): - """When MEMPAL_DIR and a valid transcript are both set, BOTH targets appear. - - This is the regression that motivates the additive-targets design: - users who set MEMPAL_DIR previously had their conversations silently - skipped because MEMPAL_DIR overrode the transcript path. - """ +def test_get_mine_targets_only_returns_mempal_dir(tmp_path): + """When MEMPAL_DIR is set, exactly one projects target — never a convos target.""" mempal_dir = tmp_path / "project" mempal_dir.mkdir() - convo_dir = tmp_path / "convos" - convo_dir.mkdir() - transcript = convo_dir / "session.jsonl" - transcript.write_text("") with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}): - targets = _get_mine_targets(str(transcript)) - modes = {mode for _, mode in targets} - assert modes == {"projects", "convos"} - by_mode = {mode: Path(d) for d, mode in targets} - assert by_mode["projects"].resolve() == mempal_dir.resolve() - assert by_mode["convos"].resolve() == convo_dir.resolve() + targets = _get_mine_targets() + assert len(targets) == 1 + assert targets[0][1] == "projects" -def test_get_mine_targets_transcript_path_traversal_rejected(tmp_path): - """Transcript paths with '..' components are rejected (no convos target).""" - with patch.dict("os.environ", {}, clear=True): - targets = _get_mine_targets("../../etc/passwd") - assert targets == [] +def test_validate_transcript_path_traversal_rejected_jsonl(tmp_path): + """Path traversal is rejected even when the path has a .jsonl suffix. - -def test_get_mine_targets_transcript_non_jsonl_rejected(tmp_path): - """Transcript paths without .jsonl/.json extension are rejected.""" - bad = tmp_path / "notes.txt" - bad.write_text("content") - with patch.dict("os.environ", {}, clear=True): - targets = _get_mine_targets(str(bad)) - assert targets == [] + The pre-fix test used "../../etc/passwd" which lacks an extension and + so was rejected by the suffix gate before the traversal check ever + fired (Copilot review on #1231). Use a .jsonl path with `..` + segments to exercise the traversal guard specifically. + """ + assert _validate_transcript_path("../t.jsonl") is None + assert _validate_transcript_path("a/../b.jsonl") is None + assert _validate_transcript_path("/tmp/../etc/t.jsonl") is None def test_get_mine_targets_empty(): - """Returns empty list when nothing is available.""" + """Returns empty list when MEMPAL_DIR is unset or invalid.""" with patch.dict("os.environ", {}, clear=True): - assert _get_mine_targets("") == [] + assert _get_mine_targets() == [] # --- _parse_harness_input --- @@ -790,22 +783,33 @@ def test_precompact_with_timeout(tmp_path): def test_precompact_mines_transcript_dir(tmp_path, monkeypatch): - """Precompact mines transcript directory when no MEMPAL_DIR.""" + """Precompact ingests the active transcript via _ingest_transcript. + + With no MEMPAL_DIR, _mine_sync is a no-op; the transcript ingest is + the only mining that should fire, and it goes through Popen + (background) inside _ingest_transcript. Pre-#1231-review this test + asserted against subprocess.run, which corresponded to the + duplicate-mine path that has now been removed. + """ transcript = tmp_path / "t.jsonl" - transcript.write_text("") + # _ingest_transcript skips files smaller than 100 bytes, so pad it. + transcript.write_text("x" * 200) monkeypatch.delenv("MEMPAL_DIR", raising=False) - with patch("mempalace.hooks_cli.subprocess.run") as mock_run: - result = _capture_hook_output( - hook_precompact, - {"session_id": "test", "transcript_path": str(transcript)}, - state_dir=tmp_path, - ) + with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen: + with patch("mempalace.hooks_cli.subprocess.run") as mock_run: + result = _capture_hook_output( + hook_precompact, + {"session_id": "test", "transcript_path": str(transcript)}, + state_dir=tmp_path, + ) assert result == {} - mock_run.assert_called_once() - # Mine dir is the transcript's parent (resolved) and mode is convos for JSONL. - call_args = mock_run.call_args[0][0] - assert str(tmp_path.resolve()) in call_args - assert call_args[call_args.index("--mode") + 1] == "convos" + mock_run.assert_not_called() + mock_popen.assert_called_once() + cmd = mock_popen.call_args[0][0] + # Mines the transcript's parent dir as convos, into wing "sessions". + assert str(tmp_path) in cmd + assert cmd[cmd.index("--mode") + 1] == "convos" + assert cmd[cmd.index("--wing") + 1] == "sessions" # --- run_hook --- diff --git a/tests/test_save_hook_mines.py b/tests/test_save_hook_mines.py index 93a51f2..d9f40e8 100644 --- a/tests/test_save_hook_mines.py +++ b/tests/test_save_hook_mines.py @@ -61,3 +61,65 @@ class TestSaveHookAutoMines: 'MEMPAL_DIR defaults to "" which silently disables mining. ' "Either set a default path or add transcript-based mining." ) + + +class TestShellHookTranscriptValidation: + """Both shell hooks must validate transcript paths before mining them. + + Mirrors mempalace.hooks_cli._validate_transcript_path so unsafe paths + (no extension, traversal segments) are rejected at the shell layer + too — added in #1231 review (Copilot #7, #8). + """ + + @staticmethod + def _hook_src(name: str) -> str: + path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "hooks", name) + return open(path).read() + + @staticmethod + def _strip_comments(src: str) -> str: + return "\n".join(line for line in src.splitlines() if not line.lstrip().startswith("#")) + + def test_save_hook_defines_and_uses_validator(self): + src = self._strip_comments(self._hook_src("mempal_save_hook.sh")) + assert "is_valid_transcript_path() {" in src, "validator function must be defined" + assert ( + 'is_valid_transcript_path "$TRANSCRIPT_PATH"' in src + ), "validator must be invoked against TRANSCRIPT_PATH before mining" + + def test_precompact_hook_defines_and_uses_validator(self): + src = self._strip_comments(self._hook_src("mempal_precompact_hook.sh")) + assert "is_valid_transcript_path() {" in src, "validator function must be defined" + assert ( + 'is_valid_transcript_path "$TRANSCRIPT_PATH"' in src + ), "validator must be invoked against TRANSCRIPT_PATH before mining" + + def test_validators_run_via_bash(self, tmp_path): + """Source the validator out of each hook and exercise it directly.""" + import subprocess + + for name in ("mempal_save_hook.sh", "mempal_precompact_hook.sh"): + src = self._hook_src(name) + # Extract just the function definition (first occurrence). + start = src.index("is_valid_transcript_path() {") + end = src.index("\n}\n", start) + 2 + func_src = src[start:end] + script = tmp_path / "v.sh" + script.write_text( + f"{func_src}\n" 'is_valid_transcript_path "$1" && echo OK || echo NO\n' + ) + + def run(arg: str) -> str: + return subprocess.run( + ["bash", str(script), arg], + capture_output=True, + text=True, + check=False, + ).stdout.strip() + + assert run("/tmp/sessions/abc.jsonl") == "OK" + assert run("/tmp/sessions/abc.json") == "OK" + assert run("") == "NO" + assert run("/tmp/notes.txt") == "NO" + assert run("../etc/passwd.jsonl") == "NO" + assert run("/tmp/../etc/t.jsonl") == "NO"