035fe6d658
Addresses issues found while reviewing the initial phase-2 implementation against real data: **Bug: uncertain bucket starved from the LLM.** `discover_entities` was dropping the regex-uncertain bucket whenever real git/manifest signal existed — which is exactly when `--llm` is most useful for cleaning up prose noise. The uncertain candidates never reached the refinement step. Fixed: only drop when `llm_provider is None`. **Context collection: word boundaries, not substring.** `_collect_contexts` used substring matching on lower-cased lines, so the name "Go" matched "good", "going", "forgot". Switched to a `(?<!\w)…(?!\w)` regex so short names only match at token boundaries. **Authoritative-source detection replaces confidence threshold.** Previously the refinement step skipped entries with `confidence >= 0.95` to avoid second-guessing manifest-backed projects. That threshold was fragile — the regex detector produces 0.99 confidence for things like `code file reference (5x)` on framework names (OpenAPI, etc.), so those skipped the LLM despite being regex-only noise. New helpers `_is_authoritative_person` / `_is_authoritative_project` look at the actual signal strings (commits, package.json, etc.) to decide. **Now also refines regex-derived people.** After #1148's high-pronoun-signal fix, the regex detector can promote non-people to the `people` bucket (e.g. a capitalized common noun that happened to appear near pronouns). The LLM now gets a chance to clean those up, while git-authored people are still skipped. **Robust JSON extraction.** Small local models routinely wrap JSON output in prose ("Sure, here's the classification: {…}"). The previous code-fence stripper failed on that. 
`_extract_json_candidates` now does balanced-bracket extraction with string-aware quote handling, so it recovers JSON from: raw responses; markdown fenced blocks; JSON embedded inside surrounding text; and multiple candidate objects/arrays. **Prompt guidance for frameworks vs user projects.** Added an explicit instruction: frameworks, runtimes, APIs, cloud services, and third-party vendors (Angular, OpenAPI, Terraform, Bun, Google, etc.) are TOPIC unless the context clearly says it's the user's own codebase. Directly addresses a false-positive pattern observed during dev runs. **Defensive mtime.** `convo_scanner._safe_mtime` catches OSError during `stat()` — permission changes, filesystem races, broken symlinks — and sorts the affected file to the end of the newest-first order rather than crashing the scan. **Cosmetic:** merged two adjacent f-strings on the same line in `backends/chroma.py` and `llm_client.py` (no behaviour change). 15 new tests cover the OSError fallback, word-boundary matching, JSON extraction variants, authoritative-source helpers, refining high-confidence regex projects, and end-to-end LLM refinement preserving the uncertain bucket.
219 lines
7.7 KiB
Python
219 lines
7.7 KiB
Python
"""Tests for mempalace.convo_scanner."""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
from mempalace.convo_scanner import (
|
|
_decode_slug_fallback,
|
|
_extract_cwd_from_session,
|
|
_resolve_project_name,
|
|
_safe_mtime,
|
|
is_claude_projects_root,
|
|
scan_claude_projects,
|
|
)
|
|
|
|
|
|
# ── is_claude_projects_root ─────────────────────────────────────────────
|
|
|
|
|
|
def test_is_claude_projects_root_true(tmp_path):
    """A dash-prefixed subdirectory containing a .jsonl file is accepted as a projects root."""
    encoded_dir = tmp_path / "-home-user-dev-foo"
    encoded_dir.mkdir()
    session_file = encoded_dir / "abc.jsonl"
    session_file.write_text("{}\n")

    detected = is_claude_projects_root(tmp_path)
    assert detected
|
|
|
|
|
|
def test_is_claude_projects_root_false_no_dash_prefix(tmp_path):
    """A subdirectory whose name lacks the leading dash does not qualify, even with a .jsonl inside."""
    plain_dir = tmp_path / "normal-folder"
    plain_dir.mkdir()
    session_file = plain_dir / "abc.jsonl"
    session_file.write_text("{}\n")

    detected = is_claude_projects_root(tmp_path)
    assert not detected
|
|
|
|
|
|
def test_is_claude_projects_root_false_no_jsonl(tmp_path):
    """A dash-prefixed subdirectory with no .jsonl file does not qualify."""
    encoded_dir = tmp_path / "-home-user-foo"
    encoded_dir.mkdir()
    stray_file = encoded_dir / "other.txt"
    stray_file.write_text("hello")

    detected = is_claude_projects_root(tmp_path)
    assert not detected
|
|
|
|
|
|
def test_is_claude_projects_root_false_empty(tmp_path):
    """An empty directory is not a projects root."""
    detected = is_claude_projects_root(tmp_path)
    assert not detected
|
|
|
|
|
|
def test_is_claude_projects_root_false_nonexistent(tmp_path):
    """A path that does not exist is not a projects root."""
    missing_dir = tmp_path / "does-not-exist"
    detected = is_claude_projects_root(missing_dir)
    assert not detected
|
|
|
|
|
|
# ── cwd extraction ──────────────────────────────────────────────────────
|
|
|
|
|
|
def test_extract_cwd_from_session(tmp_path):
    """The cwd is picked up from the second record when the first record carries none."""
    records = (
        {"type": "file-history-snapshot", "messageId": "x"},
        {"type": "user", "cwd": "/home/user/dev/myproj", "content": "hi"},
    )
    session_path = tmp_path / "session.jsonl"
    # One JSON document per line, each newline-terminated.
    session_path.write_text("".join(json.dumps(record) + "\n" for record in records))

    extracted = _extract_cwd_from_session(session_path)
    assert extracted == "/home/user/dev/myproj"
|
|
|
|
|
|
def test_extract_cwd_from_session_skips_malformed(tmp_path):
    """An unparseable first line does not prevent reading cwd from the next valid line."""
    session_path = tmp_path / "session.jsonl"
    valid_record = json.dumps({"type": "user", "cwd": "/home/user/dev/good"})
    # The trailing empty element yields the final newline after the valid record.
    payload = "\n".join(["{not valid json", valid_record, ""])
    session_path.write_text(payload)

    extracted = _extract_cwd_from_session(session_path)
    assert extracted == "/home/user/dev/good"
|
|
|
|
|
|
def test_extract_cwd_from_session_none_if_absent(tmp_path):
    """None is returned when no record in the session carries a cwd key."""
    session_path = tmp_path / "session.jsonl"
    record_without_cwd = json.dumps({"type": "x", "messageId": "y"})
    session_path.write_text(record_without_cwd + "\n")

    extracted = _extract_cwd_from_session(session_path)
    assert extracted is None
|
|
|
|
|
|
def test_extract_cwd_from_session_none_if_file_missing(tmp_path):
    """None is returned, rather than an exception raised, for a session file that does not exist."""
    absent_path = tmp_path / "missing.jsonl"
    extracted = _extract_cwd_from_session(absent_path)
    assert extracted is None
|
|
|
|
|
|
# ── slug fallback ───────────────────────────────────────────────────────
|
|
|
|
|
|
def test_decode_slug_fallback_last_segment():
    """The final dash-separated segment of the slug is returned."""
    decoded = _decode_slug_fallback("-home-user-dev-foo")
    assert decoded == "foo"
|
|
|
|
|
|
def test_decode_slug_fallback_double_dash():
    """A doubled dash before the last segment still yields that segment."""
    decoded = _decode_slug_fallback("-home-user--bentokit")
    assert decoded == "bentokit"
|
|
|
|
|
|
def test_decode_slug_fallback_empty():
    """An empty slug decodes to an empty string."""
    decoded = _decode_slug_fallback("")
    assert decoded == ""
|
|
|
|
|
|
def test_decode_slug_fallback_only_dashes():
    """A slug made only of dashes is returned unchanged."""
    dashes_only = "---"
    decoded = _decode_slug_fallback(dashes_only)
    assert decoded == "---"
|
|
|
|
|
|
# ── safe metadata helpers ───────────────────────────────────────────────
|
|
|
|
|
|
def test_safe_mtime_returns_zero_on_stat_error(tmp_path, monkeypatch):
    """_safe_mtime returns 0.0 when stat() on the target file raises OSError.

    ``Path.stat`` is patched class-wide for the duration of the test, so the
    replacement fails only for the file under test and delegates every other
    path to the real implementation.
    """
    f = tmp_path / "session.jsonl"
    f.write_text("{}\n")
    original_stat = Path.stat

    def fail_stat(self, *args, **kwargs):
        # Accept and forward any arguments: Path.stat takes a keyword-only
        # ``follow_symlinks`` parameter on newer Python versions, and a stub
        # that rejects it would raise TypeError instead of the intended
        # OSError (or the real stat result) for callers that pass it.
        if self == f:
            raise OSError("permission denied")
        return original_stat(self, *args, **kwargs)

    monkeypatch.setattr(Path, "stat", fail_stat)
    assert _safe_mtime(f) == 0.0
|
|
|
|
|
|
# ── _resolve_project_name ───────────────────────────────────────────────
|
|
|
|
|
|
def test_resolve_project_name_uses_cwd(tmp_path):
    """The project name comes from the session's cwd basename, not from the encoded directory name."""
    project_dir = tmp_path / "-home-user-dev-coolproj"
    project_dir.mkdir()

    record = json.dumps({"type": "user", "cwd": "/home/user/dev/cool-proj-real"})
    (project_dir / "a.jsonl").write_text(record + "\n")

    resolved = _resolve_project_name(project_dir)
    assert resolved == "cool-proj-real"
|
|
|
|
|
|
def test_resolve_project_name_falls_back_when_no_cwd(tmp_path):
    """With no cwd in any session record, the name is derived from the directory slug."""
    project_dir = tmp_path / "-home-user-dev-foo"
    project_dir.mkdir()

    record_without_cwd = json.dumps({"type": "x"})
    session_path = project_dir / "a.jsonl"
    session_path.write_text(record_without_cwd + "\n")

    resolved = _resolve_project_name(project_dir)
    assert resolved == "foo"
|
|
|
|
|
|
def test_resolve_project_name_prefers_newer_session(tmp_path):
    """The cwd of the most recently modified session decides the name.

    Exercises the scenario where the user renamed the project directory
    between sessions.
    """
    import os

    project_dir = tmp_path / "-home-user-dev-old"
    project_dir.mkdir()

    stale = project_dir / "old.jsonl"
    stale_record = json.dumps({"type": "user", "cwd": "/home/user/dev/old"})
    stale.write_text(stale_record + "\n")
    # Backdate the stale session by 100 seconds so the two mtimes cannot tie.
    backdated = stale.stat().st_mtime - 100
    os.utime(stale, (backdated, backdated))

    fresh = project_dir / "new.jsonl"
    fresh_record = json.dumps({"type": "user", "cwd": "/home/user/dev/new-name"})
    fresh.write_text(fresh_record + "\n")

    resolved = _resolve_project_name(project_dir)
    assert resolved == "new-name"
|
|
|
|
|
|
# ── scan_claude_projects ────────────────────────────────────────────────
|
|
|
|
|
|
def test_scan_claude_projects_empty_dir(tmp_path):
    """Scanning an empty directory yields an empty list."""
    found = scan_claude_projects(tmp_path)
    assert found == []
|
|
|
|
|
|
def test_scan_claude_projects_not_a_projects_root(tmp_path):
    """An empty list comes back when the directory does not resemble .claude/projects/."""
    unrelated_dir = tmp_path / "some-folder"
    unrelated_dir.mkdir()
    (unrelated_dir / "readme.md").write_text("hi")

    found = scan_claude_projects(tmp_path)
    assert found == []
|
|
|
|
|
|
def test_scan_claude_projects_finds_projects(tmp_path):
    """Each encoded project directory is reported, with its session count in user_commits."""
    # encoded directory name -> (cwd recorded in each session, session file names)
    layout = {
        "-home-user-dev-alpha": ("/home/user/dev/alpha", ("a.jsonl", "b.jsonl")),
        "-home-user-dev-beta": ("/home/user/dev/beta", ("x.jsonl",)),
    }
    for dirname, (cwd, session_names) in layout.items():
        project_dir = tmp_path / dirname
        project_dir.mkdir()
        for session_name in session_names:
            record = json.dumps({"type": "user", "cwd": cwd})
            (project_dir / session_name).write_text(record + "\n")

    projects = scan_claude_projects(tmp_path)

    found_names = [project.name for project in projects]
    assert "alpha" in found_names
    assert "beta" in found_names

    # Two sessions were written for alpha and one for beta.
    alpha_entry = next(project for project in projects if project.name == "alpha")
    beta_entry = next(project for project in projects if project.name == "beta")
    assert alpha_entry.user_commits == 2
    assert beta_entry.user_commits == 1
|
|
|
|
|
|
def test_scan_claude_projects_ignores_dirs_without_jsonl(tmp_path):
    """A dash-prefixed directory that holds no .jsonl sessions produces no project entry."""
    sessionless_dir = tmp_path / "-home-user-dev-empty"
    sessionless_dir.mkdir()
    notes_file = sessionless_dir / "notes.md"
    notes_file.write_text("hi")

    found = scan_claude_projects(tmp_path)
    assert found == []
|
|
|
|
|
|
def test_scan_claude_projects_marks_as_mine(tmp_path):
    """The single discovered project is flagged with is_mine set to True."""
    project_dir = tmp_path / "-home-user-dev-owned"
    project_dir.mkdir()
    record = json.dumps({"type": "user", "cwd": "/home/user/dev/owned"})
    (project_dir / "s.jsonl").write_text(record + "\n")

    projects = scan_claude_projects(tmp_path)

    assert len(projects) == 1
    only_project = projects[0]
    assert only_project.is_mine is True
|
|
|
|
|
|
def test_scan_claude_projects_dedup_by_name(tmp_path):
    """Two encoded directories that resolve to one project name collapse into a single entry."""
    # encoded directory name -> (cwd recorded in each session, session file names)
    layout = {
        "-home-user-a-proj": ("/home/user/a/proj", ("s.jsonl", "t.jsonl")),
        "-home-user-b-proj": ("/home/user/b/proj", ("u.jsonl",)),
    }
    for dirname, (cwd, session_names) in layout.items():
        project_dir = tmp_path / dirname
        project_dir.mkdir()
        for session_name in session_names:
            record = json.dumps({"type": "user", "cwd": cwd})
            (project_dir / session_name).write_text(record + "\n")

    projects = scan_claude_projects(tmp_path)

    # Both directories resolve to "proj"; the surviving entry is the one with two sessions.
    assert len(projects) == 1
    survivor = projects[0]
    assert survivor.name == "proj"
    assert survivor.user_commits == 2
|