From eb4de0433962df4c42ac081dc22611b005096b66 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Mon, 27 Apr 2026 00:32:35 -0300 Subject: [PATCH 1/3] fix(hooks): always mine the active transcript as convos, additive to MEMPAL_DIR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #1230 fixed --mode convos for the case where MEMPAL_DIR was unset, but left two configurations broken: - MEMPAL_DIR set to a project dir: convos never mined (MEMPAL_DIR overrode the transcript path); only project files were ingested. - MEMPAL_DIR set to a conversations dir per the old hooks/README: the projects miner ran on JSONL — same wrong-miner behaviour. The shell hooks (mempal_save_hook.sh, mempal_precompact_hook.sh) had the same MEMPAL_DIR-overrides-transcript bug AND were missing --mode on every spawned `mempalace mine` call. Make the auto-ingest *additive*. _get_mine_dir → _get_mine_targets, returning a list of (dir, mode) pairs: - MEMPAL_DIR (when valid) contributes (dir, "projects") - A valid transcript JSONL contributes (parent, "convos") - Both can appear together; the hook spawns one ingest per target Same change applied to the shell save and precompact hooks. Precompact also gained transcript_path parsing so it can run the convos mine synchronously before context is compressed. hooks/README.md updated to describe MEMPAL_DIR as a project-files target, never a convos target. --- hooks/README.md | 2 +- hooks/mempal_precompact_hook.sh | 44 ++++++++---- hooks/mempal_save_hook.sh | 36 +++++----- mempalace/hooks_cli.py | 78 ++++++++++++--------- tests/test_hooks_cli.py | 119 +++++++++++++++++++++++--------- tests/test_save_hook_mines.py | 31 ++++----- 6 files changed, 197 insertions(+), 113 deletions(-) diff --git a/hooks/README.md b/hooks/README.md index 7794527..469d231 100644 --- a/hooks/README.md +++ b/hooks/README.md @@ -67,7 +67,7 @@ Edit `mempal_save_hook.sh` to change: - **`SAVE_INTERVAL=15`** — How many human messages between saves. Lower = more frequent saves, higher = less interruption. - **`STATE_DIR`** — Where hook state is stored (defaults to `~/.mempalace/hook_state/`) -- **`MEMPAL_DIR`** — Optional. Set to a conversations directory to auto-run `mempalace mine ` on each save trigger. Leave blank (default) to let the AI handle saving via the block reason message. +- **`MEMPAL_DIR`** — Optional **project directory** (code, notes, docs) to also mine on each save trigger, with `--mode projects`. The hook ALWAYS mines the active conversation transcript automatically with `--mode convos` — `MEMPAL_DIR` is purely additive, never an override. Leave blank if you don't want to ingest project files. - **`MEMPALACE_PYTHON`** — Optional env var. Python interpreter with mempalace + chromadb installed. Auto-detects: `MEMPALACE_PYTHON` env var → repo `venv/bin/python3` → system `python3`. Set this if your venv is in a non-standard location. ### mempalace CLI diff --git a/hooks/mempal_precompact_hook.sh b/hooks/mempal_precompact_hook.sh index a14a0d0..70377f6 100755 --- a/hooks/mempal_precompact_hook.sh +++ b/hooks/mempal_precompact_hook.sh @@ -41,17 +41,18 @@ # to save everything. After the AI saves, compaction proceeds normally. # # === MEMPALACE CLI === -# This repo uses: mempalace mine -# or: mempalace mine --mode convos -# Set MEMPAL_DIR below if you want the hook to auto-ingest before compaction. -# Leave blank to rely on the AI's own save instructions. +# The hook ALWAYS mines the active conversation transcript synchronously +# before compaction (via `mempalace mine --mode convos`). +# MEMPAL_DIR is an *additional*, optional target for project files — it +# does not replace the conversation mine. STATE_DIR="$HOME/.mempalace/hook_state" mkdir -p "$STATE_DIR" -# Optional: set to the directory you want auto-ingested before compaction. -# Example: MEMPAL_DIR="$HOME/conversations" -# Leave empty to skip auto-ingest (AI handles saving via the block reason). +# Optional: project directory (code / notes / docs) to also mine before +# compaction. Mined with `--mode projects`. The conversation transcript +# is always mined regardless — this is purely additive. +# Example: MEMPAL_DIR="$HOME/projects/my_app" MEMPAL_DIR="" # Resolve the Python interpreter. Same contract as mempal_save_hook.sh: @@ -64,15 +65,34 @@ fi # Read JSON input from stdin INPUT=$(cat) -SESSION_ID=$(echo "$INPUT" | "$MEMPAL_PYTHON_BIN" -c "import sys,json; print(json.load(sys.stdin).get('session_id','unknown'))" 2>/dev/null) +# Parse session_id and transcript_path in one call. Sanitize both before +# interpolating into shell — same contract as mempal_save_hook.sh. +eval $(echo "$INPUT" | "$MEMPAL_PYTHON_BIN" -c " +import sys, json, re +data = json.load(sys.stdin) +sid = data.get('session_id', 'unknown') +tp = data.get('transcript_path', '') +safe = lambda s: re.sub(r'[^a-zA-Z0-9_/.\-~]', '', str(s)) +print(f'SESSION_ID=\"{safe(sid)}\"') +print(f'TRANSCRIPT_PATH=\"{safe(tp)}\"') +" 2>/dev/null) + +# Expand ~ in path +TRANSCRIPT_PATH="${TRANSCRIPT_PATH/#\~/$HOME}" echo "[$(date '+%H:%M:%S')] PRE-COMPACT triggered for session $SESSION_ID" >> "$STATE_DIR/hook.log" -# Optional: run mempalace ingest synchronously so memories land before compaction +# Run ingest synchronously so memories land before compaction. Two +# independent targets — both run if both are set: +# 1. TRANSCRIPT_PATH (from Claude Code) → parent dir, --mode convos +# 2. MEMPAL_DIR → --mode projects +if [ -n "$TRANSCRIPT_PATH" ] && [ -f "$TRANSCRIPT_PATH" ]; then + mempalace mine "$(dirname "$TRANSCRIPT_PATH")" --mode convos \ + >> "$STATE_DIR/hook.log" 2>&1 +fi if [ -n "$MEMPAL_DIR" ] && [ -d "$MEMPAL_DIR" ]; then - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - REPO_DIR="$(dirname "$SCRIPT_DIR")" - mempalace mine "$MEMPAL_DIR" >> "$STATE_DIR/hook.log" 2>&1 + mempalace mine "$MEMPAL_DIR" --mode projects \ + >> "$STATE_DIR/hook.log" 2>&1 fi # Silent: return empty JSON to not block. "decision": "allow" is invalid — diff --git a/hooks/mempal_save_hook.sh b/hooks/mempal_save_hook.sh index 228b41b..d69ae85 100755 --- a/hooks/mempal_save_hook.sh +++ b/hooks/mempal_save_hook.sh @@ -45,10 +45,10 @@ # stop_hook_active=true so we let it through. No infinite loop. # # === MEMPALACE CLI === -# This repo uses: mempalace mine -# or: mempalace mine --mode convos -# Set MEMPAL_DIR below if you want the hook to auto-ingest after blocking. -# Leave blank to rely on the AI's own save instructions. +# The hook ALWAYS mines the active conversation transcript automatically +# (via `mempalace mine --mode convos`). MEMPAL_DIR is an +# *additional*, optional target for project files — it does not replace +# the conversation mine. # # === CONFIGURATION === @@ -56,9 +56,10 @@ SAVE_INTERVAL=15 # Save every N human messages (adjust to taste) STATE_DIR="$HOME/.mempalace/hook_state" mkdir -p "$STATE_DIR" -# Optional: set to the directory you want auto-ingested on each save trigger. -# Example: MEMPAL_DIR="$HOME/conversations" -# Leave empty to skip auto-ingest (AI handles saving via the block reason). +# Optional: project directory (code / notes / docs) to also mine each +# save trigger. Mined with `--mode projects`. The conversation transcript +# is always mined regardless — this is purely additive. +# Example: MEMPAL_DIR="$HOME/projects/my_app" MEMPAL_DIR="" # Resolve the Python interpreter the hook should use. @@ -157,19 +158,20 @@ if [ "$SINCE_LAST" -ge "$SAVE_INTERVAL" ] && [ "$EXCHANGE_COUNT" -gt 0 ]; then echo "[$(date '+%H:%M:%S')] TRIGGERING SAVE at exchange $EXCHANGE_COUNT" >> "$STATE_DIR/hook.log" - # Auto-mine the transcript. Two paths: - # 1. TRANSCRIPT_PATH (from Claude Code) — mine the directory it lives in - # 2. MEMPAL_DIR (user-configured) — mine that directory - # At least one should work. If neither is set, nothing mines. - MINE_DIR="" + # Auto-mine. Two independent targets — both run if both are set: + # 1. TRANSCRIPT_PATH (from Claude Code) → parent dir, --mode convos + # (Claude Code session JSONL — must use the convo miner) + # 2. MEMPAL_DIR (user-configured project) → --mode projects + # (code, notes, docs) + # MEMPAL_DIR is *additive*, not an override: a user with MEMPAL_DIR + # pointed at their project still gets the active conversation mined. if [ -n "$TRANSCRIPT_PATH" ] && [ -f "$TRANSCRIPT_PATH" ]; then - MINE_DIR="$(dirname "$TRANSCRIPT_PATH")" + mempalace mine "$(dirname "$TRANSCRIPT_PATH")" --mode convos \ + >> "$STATE_DIR/hook.log" 2>&1 & fi if [ -n "$MEMPAL_DIR" ] && [ -d "$MEMPAL_DIR" ]; then - MINE_DIR="$MEMPAL_DIR" - fi - if [ -n "$MINE_DIR" ]; then - mempalace mine "$MINE_DIR" >> "$STATE_DIR/hook.log" 2>&1 & + mempalace mine "$MEMPAL_DIR" --mode projects \ + >> "$STATE_DIR/hook.log" 2>&1 & fi # MEMPAL_VERBOSE toggle: diff --git a/mempalace/hooks_cli.py b/mempalace/hooks_cli.py index 49b77e2..0645c41 100644 --- a/mempalace/hooks_cli.py +++ b/mempalace/hooks_cli.py @@ -197,27 +197,27 @@ def _output(data: dict): sys.stdout.buffer.flush() -def _get_mine_dir(transcript_path: str = "") -> tuple[str, str]: - """Determine directory to mine and the miner mode to use. +def _get_mine_targets(transcript_path: str = "") -> list[tuple[str, str]]: + """Return the list of ``(dir, mode)`` targets for auto-ingest. - Returns ``(dir, mode)`` where ``mode`` is ``"projects"`` or ``"convos"``. - Empty ``dir`` means no ingest should run. + MEMPAL_DIR (when set and resolvable) contributes a ``"projects"`` + target. A valid transcript JSONL contributes a ``"convos"`` target + on its parent dir. Both may appear together — the hook runs one + ingest per target, so a user with MEMPAL_DIR pointed at their + project dir still gets the active conversation mined verbatim. - MEMPAL_DIR is treated as a project directory ("projects" mode). The - transcript-path fallback resolves to the parent of a Claude Code - session JSONL, which must be mined with the conversation miner — - running the projects miner there ingests JSONL as if it were source - code. + An empty list means no ingest should run. """ + targets: list[tuple[str, str]] = [] mempal_dir = os.environ.get("MEMPAL_DIR", "") if mempal_dir: resolved = Path(mempal_dir).expanduser().resolve() if resolved.is_dir(): - return str(resolved), "projects" + targets.append((str(resolved), "projects")) path = _validate_transcript_path(transcript_path) if path is not None and path.is_file(): - return str(path.parent), "convos" - return "", "projects" + targets.append((str(path.parent), "convos")) + return targets _MINE_PID_FILE = STATE_DIR / "mine.pid" @@ -275,36 +275,46 @@ def _spawn_mine(cmd: list) -> None: def _maybe_auto_ingest(transcript_path: str = ""): - """Run mempalace mine in background if a mine directory is available.""" - mine_dir, mode = _get_mine_dir(transcript_path) - if not mine_dir: + """Run mempalace mine in background for every available target.""" + targets = _get_mine_targets(transcript_path) + if not targets: return if _mine_already_running(): _log("Skipping auto-ingest: mine already running") return - try: - _spawn_mine([sys.executable, "-m", "mempalace", "mine", mine_dir, "--mode", mode]) - except OSError: - pass + for mine_dir, mode in targets: + try: + _spawn_mine([sys.executable, "-m", "mempalace", "mine", mine_dir, "--mode", mode]) + except OSError: + pass def _mine_sync(transcript_path: str = ""): - """Run mempalace mine synchronously (for precompact -- data must land first).""" - mine_dir, mode = _get_mine_dir(transcript_path) - if not mine_dir: + """Run mempalace mine synchronously for every target (precompact).""" + targets = _get_mine_targets(transcript_path) + if not targets: return - try: - STATE_DIR.mkdir(parents=True, exist_ok=True) - log_path = STATE_DIR / "hook.log" - with open(log_path, "a") as log_f: - subprocess.run( - [sys.executable, "-m", "mempalace", "mine", mine_dir, "--mode", mode], - stdout=log_f, - stderr=log_f, - timeout=60, - ) - except (OSError, subprocess.TimeoutExpired): - pass + STATE_DIR.mkdir(parents=True, exist_ok=True) + log_path = STATE_DIR / "hook.log" + for mine_dir, mode in targets: + try: + with open(log_path, "a") as log_f: + subprocess.run( + [ + sys.executable, + "-m", + "mempalace", + "mine", + mine_dir, + "--mode", + mode, + ], + stdout=log_f, + stderr=log_f, + timeout=60, + ) + except (OSError, subprocess.TimeoutExpired): + pass def _desktop_toast(body: str, title: str = "MemPalace"): diff --git a/tests/test_hooks_cli.py b/tests/test_hooks_cli.py index 6763439..c105eae 100644 --- a/tests/test_hooks_cli.py +++ b/tests/test_hooks_cli.py @@ -12,7 +12,7 @@ from mempalace.hooks_cli import ( SAVE_INTERVAL, _count_human_messages, _extract_recent_messages, - _get_mine_dir, + _get_mine_targets, _log, _maybe_auto_ingest, _mempalace_python, @@ -494,6 +494,43 @@ def test_mine_sync_with_env_uses_projects_mode(tmp_path): assert cmd[cmd.index("--mode") + 1] == "projects" +def test_maybe_auto_ingest_with_both_set(tmp_path): + """When MEMPAL_DIR and a transcript are both set, BOTH spawns happen.""" + mempal_dir = tmp_path / "project" + mempal_dir.mkdir() + convo_dir = tmp_path / "convos" + convo_dir.mkdir() + transcript = convo_dir / "session.jsonl" + transcript.write_text("") + with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}): + with patch("mempalace.hooks_cli.STATE_DIR", tmp_path): + with patch("mempalace.hooks_cli._MINE_PID_FILE", tmp_path / "mine.pid"): + with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen: + _maybe_auto_ingest(str(transcript)) + assert mock_popen.call_count == 2 + cmds = [call.args[0] for call in mock_popen.call_args_list] + modes = {cmd[cmd.index("--mode") + 1] for cmd in cmds} + assert modes == {"projects", "convos"} + + +def test_mine_sync_with_both_set(tmp_path): + """Precompact sync runs BOTH mines when MEMPAL_DIR + transcript are set.""" + mempal_dir = tmp_path / "project" + mempal_dir.mkdir() + convo_dir = tmp_path / "convos" + convo_dir.mkdir() + transcript = convo_dir / "session.jsonl" + transcript.write_text("") + with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}): + with patch("mempalace.hooks_cli.STATE_DIR", tmp_path): + with patch("mempalace.hooks_cli.subprocess.run") as mock_run: + _mine_sync(str(transcript)) + assert mock_run.call_count == 2 + cmds = [call.args[0] for call in mock_run.call_args_list] + modes = {cmd[cmd.index("--mode") + 1] for cmd in cmds} + assert modes == {"projects", "convos"} + + def test_maybe_auto_ingest_oserror(tmp_path): """OSError during subprocess spawn is silenced.""" mempal_dir = tmp_path / "project" @@ -550,70 +587,90 @@ def test_mine_already_running_corrupt_file(tmp_path): assert _mine_already_running() is False -# --- _get_mine_dir --- +# --- _get_mine_targets --- -def test_get_mine_dir_mempal_dir(tmp_path): - """MEMPAL_DIR takes priority, is expanded/resolved, and is treated as projects mode.""" +def test_get_mine_targets_mempal_dir_only(tmp_path): + """MEMPAL_DIR alone yields a single projects target, expanded/resolved.""" mempal_dir = tmp_path / "project" mempal_dir.mkdir() - transcript = tmp_path / "t.jsonl" - transcript.write_text("") with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}): - result_dir, result_mode = _get_mine_dir(str(transcript)) - assert Path(result_dir).resolve() == mempal_dir.resolve() - assert result_mode == "projects" + targets = _get_mine_targets("") + assert len(targets) == 1 + assert Path(targets[0][0]).resolve() == mempal_dir.resolve() + assert targets[0][1] == "projects" -def test_get_mine_dir_mempal_dir_tilde(tmp_path): +def test_get_mine_targets_mempal_dir_tilde(tmp_path): """MEMPAL_DIR with a tilde prefix is expanded correctly.""" mempal_dir = tmp_path / "project" mempal_dir.mkdir() home = Path.home() - # Build a ~-relative path only if tmp_path is inside home try: rel = mempal_dir.relative_to(home) except ValueError: pytest.skip("tmp_path is not under home, cannot build ~-relative path") tilde_path = "~/" + str(rel) with patch.dict("os.environ", {"MEMPAL_DIR": tilde_path}): - result_dir, result_mode = _get_mine_dir("") - assert Path(result_dir).resolve() == mempal_dir.resolve() - assert result_mode == "projects" + targets = _get_mine_targets("") + assert len(targets) == 1 + assert Path(targets[0][0]).resolve() == mempal_dir.resolve() + assert targets[0][1] == "projects" -def test_get_mine_dir_transcript_fallback(tmp_path): - """Transcript fallback resolves to its parent dir in convos mode.""" +def test_get_mine_targets_transcript_only(tmp_path): + """A valid transcript JSONL alone yields a single convos target.""" transcript = tmp_path / "t.jsonl" transcript.write_text("") with patch.dict("os.environ", {}, clear=True): - result_dir, result_mode = _get_mine_dir(str(transcript)) - assert Path(result_dir).resolve() == tmp_path.resolve() - assert result_mode == "convos" + targets = _get_mine_targets(str(transcript)) + assert len(targets) == 1 + assert Path(targets[0][0]).resolve() == tmp_path.resolve() + assert targets[0][1] == "convos" -def test_get_mine_dir_transcript_path_traversal_rejected(tmp_path): - """Transcript paths with '..' components are rejected and return no dir.""" +def test_get_mine_targets_both_set(tmp_path): + """When MEMPAL_DIR and a valid transcript are both set, BOTH targets appear. + + This is the regression that motivates the additive-targets design: + users who set MEMPAL_DIR previously had their conversations silently + skipped because MEMPAL_DIR overrode the transcript path. + """ + mempal_dir = tmp_path / "project" + mempal_dir.mkdir() + convo_dir = tmp_path / "convos" + convo_dir.mkdir() + transcript = convo_dir / "session.jsonl" + transcript.write_text("") + with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}): + targets = _get_mine_targets(str(transcript)) + modes = {mode for _, mode in targets} + assert modes == {"projects", "convos"} + by_mode = {mode: Path(d) for d, mode in targets} + assert by_mode["projects"].resolve() == mempal_dir.resolve() + assert by_mode["convos"].resolve() == convo_dir.resolve() + + +def test_get_mine_targets_transcript_path_traversal_rejected(tmp_path): + """Transcript paths with '..' components are rejected (no convos target).""" with patch.dict("os.environ", {}, clear=True): - result_dir, result_mode = _get_mine_dir("../../etc/passwd") - assert result_dir == "" - assert result_mode == "projects" + targets = _get_mine_targets("../../etc/passwd") + assert targets == [] -def test_get_mine_dir_transcript_non_jsonl_rejected(tmp_path): +def test_get_mine_targets_transcript_non_jsonl_rejected(tmp_path): """Transcript paths without .jsonl/.json extension are rejected.""" bad = tmp_path / "notes.txt" bad.write_text("content") with patch.dict("os.environ", {}, clear=True): - result_dir, result_mode = _get_mine_dir(str(bad)) - assert result_dir == "" - assert result_mode == "projects" + targets = _get_mine_targets(str(bad)) + assert targets == [] -def test_get_mine_dir_empty(): - """Returns empty dir when nothing is available.""" +def test_get_mine_targets_empty(): + """Returns empty list when nothing is available.""" with patch.dict("os.environ", {}, clear=True): - assert _get_mine_dir("") == ("", "projects") + assert _get_mine_targets("") == [] # --- _parse_harness_input --- diff --git a/tests/test_save_hook_mines.py b/tests/test_save_hook_mines.py index a702a42..93a51f2 100644 --- a/tests/test_save_hook_mines.py +++ b/tests/test_save_hook_mines.py @@ -16,7 +16,8 @@ class TestSaveHookAutoMines: def test_hook_mines_transcript_path(self): """The hook receives TRANSCRIPT_PATH from Claude Code. - It should use that to mine the conversation, not depend on MEMPAL_DIR.""" + It should use that to mine the conversation as --mode convos, + independently of MEMPAL_DIR (which is for project files only).""" hook_path = os.path.join( os.path.dirname(os.path.dirname(__file__)), "hooks", @@ -24,23 +25,17 @@ class TestSaveHookAutoMines: ) src = open(hook_path).read() - # The hook ALREADY receives TRANSCRIPT_PATH in the JSON input. - # It should use this to mine the current session's transcript - # regardless of whether MEMPAL_DIR is set. - # The hook must have a path that uses TRANSCRIPT_PATH to determine - # what to mine, separate from the MEMPAL_DIR path. - uses_transcript = "TRANSCRIPT_PATH" in src - has_mine = "mempalace mine" in src - # TRANSCRIPT_PATH must appear in the mining logic, not just the parse block - transcript_drives_mine = "MINE_DIR" in src and "dirname" in src and "TRANSCRIPT_PATH" in src - - assert uses_transcript and has_mine and transcript_drives_mine, ( - "Save hook only mines when MEMPAL_DIR is set (defaults to empty). " - "The hook receives TRANSCRIPT_PATH from Claude Code — it should " - "mine that file automatically so conversations are saved without " - "the user setting an env var. Currently the hook says 'saved in " - "background' but nothing actually saves." - ) + # The hook must drive the conversation mine off TRANSCRIPT_PATH, + # using `dirname` to derive the parent dir, and tagging it with + # `--mode convos` so the convo miner runs (not the projects miner). + assert "TRANSCRIPT_PATH" in src, "hook must read transcript_path" + assert "mempalace mine" in src, "hook must invoke `mempalace mine`" + assert ( + 'dirname "$TRANSCRIPT_PATH"' in src + ), "hook must mine the transcript's parent directory" + assert ( + "--mode convos" in src + ), "transcript mine must use --mode convos, not the projects miner" def test_mempal_dir_default_not_empty(self): """If MEMPAL_DIR is still used, it should have a sensible default, From fe5679776274a23f175f23bd1bdb57ed42094e6d Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Mon, 27 Apr 2026 02:26:53 -0300 Subject: [PATCH 2/3] fix(hooks): consolidate transcript ingest, harden shell parsers (#1231 review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address Copilot review on #1231: 1. Stop double-mining the transcript on the Python side. ``_get_mine_targets`` now returns only the ``MEMPAL_DIR`` projects target — the convos target for the transcript dir is dropped because ``_ingest_transcript`` already handles it on every hook fire. The duplicate spawn was using ``sys.executable`` (vs ``_mempalace_python()``) and a different ``--wing``, so each Stop/PreCompact event was writing the same transcript into two wings under asymmetric interpreters and overwriting the single ``_MINE_PID_FILE`` lock. 2. ``_maybe_auto_ingest`` and ``_mine_sync`` now spawn via ``_mempalace_python()`` so the resolved interpreter matches the venv that owns mempalace (matters under GUI-launched harnesses where ``sys.executable`` may resolve to a system Python without chromadb). 3. Replace ``eval $(...)`` in both shell hooks with a ``mapfile``-based reader. Sanitized values are still emitted by the same Python parser, but the shell now does plain variable assignment instead of executing the parser's stdout — smaller blast radius if the sanitizer is ever bypassed. 4. Mirror ``_validate_transcript_path`` in the shell hooks via a ``is_valid_transcript_path`` helper — extension + traversal-segment rejection, parity with the Python validator. The convos mine in each shell hook is now gated on the validator instead of bare ``-f``. 5. Tighten the ``..`` traversal test that previously exercised the suffix gate by mistake (``../../etc/passwd`` lacks ``.json[l]``). Use ``.jsonl`` paths with traversal segments to actually hit the ``..`` rejection branch. 6. README: add a one-liner pointing at ``mempalace sweep`` for users who want per-message recall on top of the file-level chunks the hooks produce. The sweeper was undiscoverable previously. Tests: 1418 passed, 1 skipped (full suite minus benchmarks). --- README.md | 4 + hooks/mempal_precompact_hook.sh | 35 +++++- hooks/mempal_save_hook.sh | 41 +++++-- mempalace/hooks_cli.py | 51 ++++---- tests/test_hooks_cli.py | 200 ++++++++++++++++---------------- tests/test_save_hook_mines.py | 62 ++++++++++ 6 files changed, 261 insertions(+), 132 deletions(-) diff --git a/README.md b/README.md index acbeb14..8157fca 100644 --- a/README.md +++ b/README.md @@ -149,6 +149,10 @@ system prompt: Two Claude Code hooks save periodically and before context compression: [mempalaceofficial.com/guide/hooks](https://mempalaceofficial.com/guide/hooks.html). +For per-message recall on top of the file-level chunks the hooks produce, +run `mempalace sweep ` periodically — it stores one +verbatim drawer per user/assistant message, idempotent and resume-safe. + --- ## Requirements diff --git a/hooks/mempal_precompact_hook.sh b/hooks/mempal_precompact_hook.sh index 70377f6..e811d65 100755 --- a/hooks/mempal_precompact_hook.sh +++ b/hooks/mempal_precompact_hook.sh @@ -65,30 +65,53 @@ fi # Read JSON input from stdin INPUT=$(cat) -# Parse session_id and transcript_path in one call. Sanitize both before -# interpolating into shell — same contract as mempal_save_hook.sh. -eval $(echo "$INPUT" | "$MEMPAL_PYTHON_BIN" -c " +# Parse session_id and transcript_path in one call. Sanitize both, then +# read sanitized values from one-per-line stdout into shell variables — +# avoids ``eval`` on generated code (#1231 review). Same contract as +# mempal_save_hook.sh. +mapfile -t _mempal_parsed < <(echo "$INPUT" | "$MEMPAL_PYTHON_BIN" -c " import sys, json, re data = json.load(sys.stdin) sid = data.get('session_id', 'unknown') tp = data.get('transcript_path', '') safe = lambda s: re.sub(r'[^a-zA-Z0-9_/.\-~]', '', str(s)) -print(f'SESSION_ID=\"{safe(sid)}\"') -print(f'TRANSCRIPT_PATH=\"{safe(tp)}\"') +print(safe(sid)) +print(safe(tp)) " 2>/dev/null) +SESSION_ID="${_mempal_parsed[0]:-unknown}" +TRANSCRIPT_PATH="${_mempal_parsed[1]:-}" # Expand ~ in path TRANSCRIPT_PATH="${TRANSCRIPT_PATH/#\~/$HOME}" +# Validate that TRANSCRIPT_PATH looks like a transcript file. Mirrors +# mempalace.hooks_cli._validate_transcript_path so the shell hook +# rejects the same shapes the Python hook rejects (#1231 review). +is_valid_transcript_path() { + local path="$1" + [ -n "$path" ] || return 1 + case "$path" in + *.json|*.jsonl) ;; + *) return 1 ;; + esac + case "/$path/" in + */../*) return 1 ;; + esac + return 0 +} + echo "[$(date '+%H:%M:%S')] PRE-COMPACT triggered for session $SESSION_ID" >> "$STATE_DIR/hook.log" # Run ingest synchronously so memories land before compaction. Two # independent targets — both run if both are set: # 1. TRANSCRIPT_PATH (from Claude Code) → parent dir, --mode convos # 2. MEMPAL_DIR → --mode projects -if [ -n "$TRANSCRIPT_PATH" ] && [ -f "$TRANSCRIPT_PATH" ]; then +if is_valid_transcript_path "$TRANSCRIPT_PATH" && [ -f "$TRANSCRIPT_PATH" ]; then mempalace mine "$(dirname "$TRANSCRIPT_PATH")" --mode convos \ >> "$STATE_DIR/hook.log" 2>&1 +elif [ -n "$TRANSCRIPT_PATH" ]; then + echo "[$(date '+%H:%M:%S')] Skipping invalid transcript path: $TRANSCRIPT_PATH" \ + >> "$STATE_DIR/hook.log" fi if [ -n "$MEMPAL_DIR" ] && [ -d "$MEMPAL_DIR" ]; then mempalace mine "$MEMPAL_DIR" --mode projects \ diff --git a/hooks/mempal_save_hook.sh b/hooks/mempal_save_hook.sh index d69ae85..5efd157 100755 --- a/hooks/mempal_save_hook.sh +++ b/hooks/mempal_save_hook.sh @@ -83,9 +83,11 @@ fi INPUT=$(cat) # Parse all fields in a single Python call (3x faster than separate invocations) -# SECURITY: All values are sanitized before being interpolated into shell assignments. -# stop_hook_active is coerced to a strict True/False to prevent command injection via eval. -eval $(echo "$INPUT" | "$MEMPAL_PYTHON_BIN" -c " +# without invoking ``eval`` on generated code: Python prints one sanitized +# value per line, the shell reads them via ``mapfile`` and does plain +# variable assignment — same data, smaller blast radius if the sanitizer +# is ever bypassed (#1231 review). +mapfile -t _mempal_parsed < <(echo "$INPUT" | "$MEMPAL_PYTHON_BIN" -c " import sys, json, re data = json.load(sys.stdin) sid = data.get('session_id', 'unknown') @@ -95,14 +97,36 @@ tp = data.get('transcript_path', '') safe = lambda s: re.sub(r'[^a-zA-Z0-9_/.\-~]', '', str(s)) # Coerce stop_hook_active to strict boolean string sha = 'True' if sha_raw is True or str(sha_raw).lower() in ('true', '1', 'yes') else 'False' -print(f'SESSION_ID=\"{safe(sid)}\"') -print(f'STOP_HOOK_ACTIVE=\"{sha}\"') -print(f'TRANSCRIPT_PATH=\"{safe(tp)}\"') +print(safe(sid)) +print(sha) +print(safe(tp)) " 2>/dev/null) +SESSION_ID="${_mempal_parsed[0]:-unknown}" +STOP_HOOK_ACTIVE="${_mempal_parsed[1]:-False}" +TRANSCRIPT_PATH="${_mempal_parsed[2]:-}" # Expand ~ in path TRANSCRIPT_PATH="${TRANSCRIPT_PATH/#\~/$HOME}" +# Validate that TRANSCRIPT_PATH looks like a transcript file: +# - non-empty +# - .jsonl or .json suffix +# - no traversal segments (.. components) +# Mirrors mempalace.hooks_cli._validate_transcript_path so the shell hook +# rejects the same shapes the Python hook rejects (#1231 review). +is_valid_transcript_path() { + local path="$1" + [ -n "$path" ] || return 1 + case "$path" in + *.json|*.jsonl) ;; + *) return 1 ;; + esac + case "/$path/" in + */../*) return 1 ;; + esac + return 0 +} + # If we're already in a save cycle, let the AI stop normally # This is the infinite-loop prevention: block once → AI saves → tries to stop again → we let it through if [ "$STOP_HOOK_ACTIVE" = "True" ] || [ "$STOP_HOOK_ACTIVE" = "true" ]; then @@ -165,9 +189,12 @@ if [ "$SINCE_LAST" -ge "$SAVE_INTERVAL" ] && [ "$EXCHANGE_COUNT" -gt 0 ]; then # (code, notes, docs) # MEMPAL_DIR is *additive*, not an override: a user with MEMPAL_DIR # pointed at their project still gets the active conversation mined. - if [ -n "$TRANSCRIPT_PATH" ] && [ -f "$TRANSCRIPT_PATH" ]; then + if is_valid_transcript_path "$TRANSCRIPT_PATH" && [ -f "$TRANSCRIPT_PATH" ]; then mempalace mine "$(dirname "$TRANSCRIPT_PATH")" --mode convos \ >> "$STATE_DIR/hook.log" 2>&1 & + elif [ -n "$TRANSCRIPT_PATH" ]; then + echo "[$(date '+%H:%M:%S')] Skipping invalid transcript path: $TRANSCRIPT_PATH" \ + >> "$STATE_DIR/hook.log" fi if [ -n "$MEMPAL_DIR" ] && [ -d "$MEMPAL_DIR" ]; then mempalace mine "$MEMPAL_DIR" --mode projects \ diff --git a/mempalace/hooks_cli.py b/mempalace/hooks_cli.py index 0645c41..d4f8317 100644 --- a/mempalace/hooks_cli.py +++ b/mempalace/hooks_cli.py @@ -197,16 +197,15 @@ def _output(data: dict): sys.stdout.buffer.flush() -def _get_mine_targets(transcript_path: str = "") -> list[tuple[str, str]]: +def _get_mine_targets() -> list[tuple[str, str]]: """Return the list of ``(dir, mode)`` targets for auto-ingest. MEMPAL_DIR (when set and resolvable) contributes a ``"projects"`` - target. A valid transcript JSONL contributes a ``"convos"`` target - on its parent dir. Both may appear together — the hook runs one - ingest per target, so a user with MEMPAL_DIR pointed at their - project dir still gets the active conversation mined verbatim. + target. Transcript ingestion is handled separately by + ``_ingest_transcript`` — emitting it here too would double-mine the + same JSONL into a different wing on every hook fire (#1231 review). - An empty list means no ingest should run. + An empty list means no MEMPAL_DIR ingest should run. """ targets: list[tuple[str, str]] = [] mempal_dir = os.environ.get("MEMPAL_DIR", "") @@ -214,9 +213,6 @@ def _get_mine_targets(transcript_path: str = "") -> list[tuple[str, str]]: resolved = Path(mempal_dir).expanduser().resolve() if resolved.is_dir(): targets.append((str(resolved), "projects")) - path = _validate_transcript_path(transcript_path) - if path is not None and path.is_file(): - targets.append((str(path.parent), "convos")) return targets @@ -274,9 +270,15 @@ def _spawn_mine(cmd: list) -> None: _MINE_PID_FILE.write_text(str(proc.pid)) -def _maybe_auto_ingest(transcript_path: str = ""): - """Run mempalace mine in background for every available target.""" - targets = _get_mine_targets(transcript_path) +def _maybe_auto_ingest(): + """Background-mine MEMPAL_DIR (project files) if set. + + Transcript convos are ingested separately via ``_ingest_transcript`` + in the hook handlers — this function does not handle them, to avoid + asymmetric interpreter handling and PID-file overwrite when both + targets fire from a single hook call (#1231 review). + """ + targets = _get_mine_targets() if not targets: return if _mine_already_running(): @@ -284,14 +286,19 @@ def _maybe_auto_ingest(transcript_path: str = ""): return for mine_dir, mode in targets: try: - _spawn_mine([sys.executable, "-m", "mempalace", "mine", mine_dir, "--mode", mode]) + _spawn_mine([_mempalace_python(), "-m", "mempalace", "mine", mine_dir, "--mode", mode]) except OSError: pass -def _mine_sync(transcript_path: str = ""): - """Run mempalace mine synchronously for every target (precompact).""" - targets = _get_mine_targets(transcript_path) +def _mine_sync(): + """Synchronously mine MEMPAL_DIR (precompact path). + + Transcript convos are ingested separately via ``_ingest_transcript`` + in ``hook_precompact`` — keeping them out of this function avoids + timeout stacking against the harness 30s ceiling (#1231 review). + """ + targets = _get_mine_targets() if not targets: return STATE_DIR.mkdir(parents=True, exist_ok=True) @@ -301,7 +308,7 @@ def _mine_sync(transcript_path: str = ""): with open(log_path, "a") as log_f: subprocess.run( [ - sys.executable, + _mempalace_python(), "-m", "mempalace", "mine", @@ -613,7 +620,7 @@ def hook_stop(data: dict, harness: str): transcript_path, session_id, wing=project_wing, toast=toast ) _ingest_transcript(transcript_path) - _maybe_auto_ingest(transcript_path) + _maybe_auto_ingest() # Only advance save marker after successful save count = result.get("count", 0) if count > 0: @@ -643,7 +650,7 @@ def hook_stop(data: dict, harness: str): pass if transcript_path: _ingest_transcript(transcript_path) - _maybe_auto_ingest(transcript_path) + _maybe_auto_ingest() reason = STOP_BLOCK_REASON + f" Write diary entry to wing={project_wing}." _output({"decision": "block", "reason": reason}) else: @@ -676,8 +683,10 @@ def hook_precompact(data: dict, harness: str): if transcript_path: _ingest_transcript(transcript_path) - # Mine synchronously so data lands before compaction proceeds - _mine_sync(transcript_path) + # Mine MEMPAL_DIR synchronously so project data lands before + # compaction proceeds. Transcript convos were already kicked off + # above via _ingest_transcript. + _mine_sync() _output({}) diff --git a/tests/test_hooks_cli.py b/tests/test_hooks_cli.py index c105eae..1ceb530 100644 --- a/tests/test_hooks_cli.py +++ b/tests/test_hooks_cli.py @@ -450,35 +450,26 @@ def test_maybe_auto_ingest_with_env(tmp_path): assert cmd[cmd.index("--mode") + 1] == "projects" -def test_maybe_auto_ingest_with_transcript(tmp_path): - """Transcript fallback spawns mine in convos mode against the JSONL parent.""" - transcript = tmp_path / "t.jsonl" - transcript.write_text("") - with patch.dict("os.environ", {}, clear=True): +def test_maybe_auto_ingest_uses_mempalace_python(tmp_path): + """Spawned mine command uses _mempalace_python(), not bare sys.executable. + + Hook subprocesses inherit the harness PATH which on GUI-launched + Claude Code may resolve to a system Python without chromadb. The + interpreter used here must be the same one the hook itself runs + under (typically the venv that owns mempalace). + """ + mempal_dir = tmp_path / "project" + mempal_dir.mkdir() + with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}): with patch("mempalace.hooks_cli.STATE_DIR", tmp_path): with patch("mempalace.hooks_cli._MINE_PID_FILE", tmp_path / "mine.pid"): - with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen: - _maybe_auto_ingest(str(transcript)) - mock_popen.assert_called_once() - cmd = mock_popen.call_args[0][0] - assert "mine" in cmd - assert str(tmp_path.resolve()) in cmd - assert cmd[cmd.index("--mode") + 1] == "convos" - - -def test_mine_sync_with_transcript_uses_convos_mode(tmp_path): - """Precompact sync path also picks convos mode for JSONL transcripts.""" - transcript = tmp_path / "t.jsonl" - transcript.write_text("") - with patch.dict("os.environ", {}, clear=True): - with patch("mempalace.hooks_cli.STATE_DIR", tmp_path): - with patch("mempalace.hooks_cli.subprocess.run") as mock_run: - _mine_sync(str(transcript)) - mock_run.assert_called_once() - cmd = mock_run.call_args[0][0] - assert "mine" in cmd - assert str(tmp_path.resolve()) in cmd - assert cmd[cmd.index("--mode") + 1] == "convos" + with patch( + "mempalace.hooks_cli._mempalace_python", return_value="/fake/venv/python" + ): + with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen: + _maybe_auto_ingest() + cmd = mock_popen.call_args[0][0] + assert cmd[0] == "/fake/venv/python" def test_mine_sync_with_env_uses_projects_mode(tmp_path): @@ -494,41 +485,55 @@ def test_mine_sync_with_env_uses_projects_mode(tmp_path): assert cmd[cmd.index("--mode") + 1] == "projects" -def test_maybe_auto_ingest_with_both_set(tmp_path): - """When MEMPAL_DIR and a transcript are both set, BOTH spawns happen.""" +def test_mine_sync_uses_mempalace_python(tmp_path): + """Sync mine command uses _mempalace_python(), not bare sys.executable.""" mempal_dir = tmp_path / "project" mempal_dir.mkdir() + with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}): + with patch("mempalace.hooks_cli.STATE_DIR", tmp_path): + with patch("mempalace.hooks_cli._mempalace_python", return_value="/fake/venv/python"): + with patch("mempalace.hooks_cli.subprocess.run") as mock_run: + _mine_sync() + cmd = mock_run.call_args[0][0] + assert cmd[0] == "/fake/venv/python" + + +def test_maybe_auto_ingest_ignores_transcript_arg_path(tmp_path): + """_maybe_auto_ingest does NOT mine the transcript directory. + + Transcript convos are handled by _ingest_transcript (called separately + in hook handlers). _maybe_auto_ingest only handles MEMPAL_DIR — even + when invoked in a context where a transcript is also being processed, + no second spawn for the transcript dir should appear here. + """ convo_dir = tmp_path / "convos" convo_dir.mkdir() transcript = convo_dir / "session.jsonl" transcript.write_text("") - with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}): + with patch.dict("os.environ", {}, clear=True): with patch("mempalace.hooks_cli.STATE_DIR", tmp_path): with patch("mempalace.hooks_cli._MINE_PID_FILE", tmp_path / "mine.pid"): with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen: - _maybe_auto_ingest(str(transcript)) - assert mock_popen.call_count == 2 - cmds = [call.args[0] for call in mock_popen.call_args_list] - modes = {cmd[cmd.index("--mode") + 1] for cmd in cmds} - assert modes == {"projects", "convos"} + _maybe_auto_ingest() + mock_popen.assert_not_called() -def test_mine_sync_with_both_set(tmp_path): - """Precompact sync runs BOTH mines when MEMPAL_DIR + transcript are set.""" - mempal_dir = tmp_path / "project" - mempal_dir.mkdir() +def test_mine_sync_ignores_transcript(tmp_path): + """_mine_sync does not run a convos mine for the transcript dir. + + The precompact transcript ingest is the responsibility of + _ingest_transcript; routing it through _mine_sync would stack a + second 60s timeout against the harness 30s ceiling. + """ convo_dir = tmp_path / "convos" convo_dir.mkdir() transcript = convo_dir / "session.jsonl" transcript.write_text("") - with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}): + with patch.dict("os.environ", {}, clear=True): with patch("mempalace.hooks_cli.STATE_DIR", tmp_path): with patch("mempalace.hooks_cli.subprocess.run") as mock_run: - _mine_sync(str(transcript)) - assert mock_run.call_count == 2 - cmds = [call.args[0] for call in mock_run.call_args_list] - modes = {cmd[cmd.index("--mode") + 1] for cmd in cmds} - assert modes == {"projects", "convos"} + _mine_sync() + mock_run.assert_not_called() def test_maybe_auto_ingest_oserror(tmp_path): @@ -595,7 +600,7 @@ def test_get_mine_targets_mempal_dir_only(tmp_path): mempal_dir = tmp_path / "project" mempal_dir.mkdir() with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}): - targets = _get_mine_targets("") + targets = _get_mine_targets() assert len(targets) == 1 assert Path(targets[0][0]).resolve() == mempal_dir.resolve() assert targets[0][1] == "projects" @@ -612,65 +617,53 @@ def test_get_mine_targets_mempal_dir_tilde(tmp_path): pytest.skip("tmp_path is not under home, cannot build ~-relative path") tilde_path = "~/" + str(rel) with patch.dict("os.environ", {"MEMPAL_DIR": tilde_path}): - targets = _get_mine_targets("") + targets = _get_mine_targets() assert len(targets) == 1 assert Path(targets[0][0]).resolve() == mempal_dir.resolve() assert targets[0][1] == "projects" -def test_get_mine_targets_transcript_only(tmp_path): - """A valid transcript JSONL alone yields a single convos target.""" +def test_get_mine_targets_no_transcript_target(tmp_path): + """_get_mine_targets does not emit a convos target for the transcript path. + + Transcript ingestion is owned by _ingest_transcript; emitting it + here too would double-mine the same JSONL into a different wing on + every hook fire (#1231 review). + """ transcript = tmp_path / "t.jsonl" transcript.write_text("") with patch.dict("os.environ", {}, clear=True): - targets = _get_mine_targets(str(transcript)) - assert len(targets) == 1 - assert Path(targets[0][0]).resolve() == tmp_path.resolve() - assert targets[0][1] == "convos" + targets = _get_mine_targets() + assert targets == [] -def test_get_mine_targets_both_set(tmp_path): - """When MEMPAL_DIR and a valid transcript are both set, BOTH targets appear. - - This is the regression that motivates the additive-targets design: - users who set MEMPAL_DIR previously had their conversations silently - skipped because MEMPAL_DIR overrode the transcript path. - """ +def test_get_mine_targets_only_returns_mempal_dir(tmp_path): + """When MEMPAL_DIR is set, exactly one projects target — never a convos target.""" mempal_dir = tmp_path / "project" mempal_dir.mkdir() - convo_dir = tmp_path / "convos" - convo_dir.mkdir() - transcript = convo_dir / "session.jsonl" - transcript.write_text("") with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}): - targets = _get_mine_targets(str(transcript)) - modes = {mode for _, mode in targets} - assert modes == {"projects", "convos"} - by_mode = {mode: Path(d) for d, mode in targets} - assert by_mode["projects"].resolve() == mempal_dir.resolve() - assert by_mode["convos"].resolve() == convo_dir.resolve() + targets = _get_mine_targets() + assert len(targets) == 1 + assert targets[0][1] == "projects" -def test_get_mine_targets_transcript_path_traversal_rejected(tmp_path): - """Transcript paths with '..' components are rejected (no convos target).""" - with patch.dict("os.environ", {}, clear=True): - targets = _get_mine_targets("../../etc/passwd") - assert targets == [] +def test_validate_transcript_path_traversal_rejected_jsonl(tmp_path): + """Path traversal is rejected even when the path has a .jsonl suffix. - -def test_get_mine_targets_transcript_non_jsonl_rejected(tmp_path): - """Transcript paths without .jsonl/.json extension are rejected.""" - bad = tmp_path / "notes.txt" - bad.write_text("content") - with patch.dict("os.environ", {}, clear=True): - targets = _get_mine_targets(str(bad)) - assert targets == [] + The pre-fix test used "../../etc/passwd" which lacks an extension and + so was rejected by the suffix gate before the traversal check ever + fired (Copilot review on #1231). Use a .jsonl path with `..` + segments to exercise the traversal guard specifically. + """ + assert _validate_transcript_path("../t.jsonl") is None + assert _validate_transcript_path("a/../b.jsonl") is None + assert _validate_transcript_path("/tmp/../etc/t.jsonl") is None def test_get_mine_targets_empty(): - """Returns empty list when nothing is available.""" + """Returns empty list when MEMPAL_DIR is unset or invalid.""" with patch.dict("os.environ", {}, clear=True): - assert _get_mine_targets("") == [] + assert _get_mine_targets() == [] # --- _parse_harness_input --- @@ -790,22 +783,33 @@ def test_precompact_with_timeout(tmp_path): def test_precompact_mines_transcript_dir(tmp_path, monkeypatch): - """Precompact mines transcript directory when no MEMPAL_DIR.""" + """Precompact ingests the active transcript via _ingest_transcript. + + With no MEMPAL_DIR, _mine_sync is a no-op; the transcript ingest is + the only mining that should fire, and it goes through Popen + (background) inside _ingest_transcript. Pre-#1231-review this test + asserted against subprocess.run, which corresponded to the + duplicate-mine path that has now been removed. + """ transcript = tmp_path / "t.jsonl" - transcript.write_text("") + # _ingest_transcript skips files smaller than 100 bytes, so pad it. + transcript.write_text("x" * 200) monkeypatch.delenv("MEMPAL_DIR", raising=False) - with patch("mempalace.hooks_cli.subprocess.run") as mock_run: - result = _capture_hook_output( - hook_precompact, - {"session_id": "test", "transcript_path": str(transcript)}, - state_dir=tmp_path, - ) + with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen: + with patch("mempalace.hooks_cli.subprocess.run") as mock_run: + result = _capture_hook_output( + hook_precompact, + {"session_id": "test", "transcript_path": str(transcript)}, + state_dir=tmp_path, + ) assert result == {} - mock_run.assert_called_once() - # Mine dir is the transcript's parent (resolved) and mode is convos for JSONL. - call_args = mock_run.call_args[0][0] - assert str(tmp_path.resolve()) in call_args - assert call_args[call_args.index("--mode") + 1] == "convos" + mock_run.assert_not_called() + mock_popen.assert_called_once() + cmd = mock_popen.call_args[0][0] + # Mines the transcript's parent dir as convos, into wing "sessions". + assert str(tmp_path) in cmd + assert cmd[cmd.index("--mode") + 1] == "convos" + assert cmd[cmd.index("--wing") + 1] == "sessions" # --- run_hook --- diff --git a/tests/test_save_hook_mines.py b/tests/test_save_hook_mines.py index 93a51f2..d9f40e8 100644 --- a/tests/test_save_hook_mines.py +++ b/tests/test_save_hook_mines.py @@ -61,3 +61,65 @@ class TestSaveHookAutoMines: 'MEMPAL_DIR defaults to "" which silently disables mining. ' "Either set a default path or add transcript-based mining." ) + + +class TestShellHookTranscriptValidation: + """Both shell hooks must validate transcript paths before mining them. + + Mirrors mempalace.hooks_cli._validate_transcript_path so unsafe paths + (no extension, traversal segments) are rejected at the shell layer + too — added in #1231 review (Copilot #7, #8). + """ + + @staticmethod + def _hook_src(name: str) -> str: + path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "hooks", name) + return open(path).read() + + @staticmethod + def _strip_comments(src: str) -> str: + return "\n".join(line for line in src.splitlines() if not line.lstrip().startswith("#")) + + def test_save_hook_defines_and_uses_validator(self): + src = self._strip_comments(self._hook_src("mempal_save_hook.sh")) + assert "is_valid_transcript_path() {" in src, "validator function must be defined" + assert ( + 'is_valid_transcript_path "$TRANSCRIPT_PATH"' in src + ), "validator must be invoked against TRANSCRIPT_PATH before mining" + + def test_precompact_hook_defines_and_uses_validator(self): + src = self._strip_comments(self._hook_src("mempal_precompact_hook.sh")) + assert "is_valid_transcript_path() {" in src, "validator function must be defined" + assert ( + 'is_valid_transcript_path "$TRANSCRIPT_PATH"' in src + ), "validator must be invoked against TRANSCRIPT_PATH before mining" + + def test_validators_run_via_bash(self, tmp_path): + """Source the validator out of each hook and exercise it directly.""" + import subprocess + + for name in ("mempal_save_hook.sh", "mempal_precompact_hook.sh"): + src = self._hook_src(name) + # Extract just the function definition (first occurrence). + start = src.index("is_valid_transcript_path() {") + end = src.index("\n}\n", start) + 2 + func_src = src[start:end] + script = tmp_path / "v.sh" + script.write_text( + f"{func_src}\n" 'is_valid_transcript_path "$1" && echo OK || echo NO\n' + ) + + def run(arg: str) -> str: + return subprocess.run( + ["bash", str(script), arg], + capture_output=True, + text=True, + check=False, + ).stdout.strip() + + assert run("/tmp/sessions/abc.jsonl") == "OK" + assert run("/tmp/sessions/abc.json") == "OK" + assert run("") == "NO" + assert run("/tmp/notes.txt") == "NO" + assert run("../etc/passwd.jsonl") == "NO" + assert run("/tmp/../etc/t.jsonl") == "NO" From 3deebfed19f255348399ac7f985cc8fbe1555d43 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Mon, 27 Apr 2026 02:45:04 -0300 Subject: [PATCH 3/3] test(hooks): skip bash subprocess validator test on Windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `bash` on the Windows CI runner resolves to `wsl.exe` which fails with "Windows Subsystem for Linux has no installed distributions." The shell hooks themselves are POSIX-only — Windows users use the Python entry point — so the bash-subprocess exercise is non-applicable on win32. The static-grep validator tests still run on every platform, so the shell-side validation is still asserted under Windows; only the live bash invocation is skipped. --- tests/test_save_hook_mines.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_save_hook_mines.py b/tests/test_save_hook_mines.py index d9f40e8..e11234a 100644 --- a/tests/test_save_hook_mines.py +++ b/tests/test_save_hook_mines.py @@ -9,6 +9,9 @@ Written BEFORE the fix. """ import os +import sys + +import pytest class TestSaveHookAutoMines: @@ -94,6 +97,10 @@ class TestShellHookTranscriptValidation: 'is_valid_transcript_path "$TRANSCRIPT_PATH"' in src ), "validator must be invoked against TRANSCRIPT_PATH before mining" + @pytest.mark.skipif( + sys.platform == "win32", + reason="shell hooks are POSIX-only; Windows CI bash maps to wsl.exe with no distro", + ) def test_validators_run_via_bash(self, tmp_path): """Source the validator out of each hook and exercise it directly.""" import subprocess