diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index 1a171c1..3a0d2c3 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -120,8 +120,7 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> li os.rename(seg_dir, target) moved.append(target) logger.warning( - "Quarantined stale HNSW segment %s " - "(sqlite %.0fs newer than HNSW); renamed to %s", + "Quarantined stale HNSW segment %s (sqlite %.0fs newer than HNSW); renamed to %s", seg_dir, sqlite_mtime - hnsw_mtime, target, diff --git a/mempalace/convo_scanner.py b/mempalace/convo_scanner.py index bb8fbef..b592494 100644 --- a/mempalace/convo_scanner.py +++ b/mempalace/convo_scanner.py @@ -91,6 +91,14 @@ def _decode_slug_fallback(slug: str) -> str: return parts[-1] if parts else slug +def _safe_mtime(path: Path) -> float: + """Return file mtime, defaulting old on permission or filesystem errors.""" + try: + return path.stat().st_mtime + except OSError: + return 0.0 + + def _resolve_project_name(project_dir: Path) -> str: """Read one session's cwd to recover the original project name. @@ -98,7 +106,7 @@ def _resolve_project_name(project_dir: Path) -> str: """ sessions = sorted( (p for p in project_dir.iterdir() if p.is_file() and p.suffix == ".jsonl"), - key=lambda p: p.stat().st_mtime, + key=_safe_mtime, reverse=True, # newest first — most likely to be well-formed ) for session in sessions: diff --git a/mempalace/llm_client.py b/mempalace/llm_client.py index 442cf31..74982ce 100644 --- a/mempalace/llm_client.py +++ b/mempalace/llm_client.py @@ -124,7 +124,7 @@ class OllamaProvider(LLMProvider): if not names & wanted: return ( False, - f"Model '{self.model}' not loaded in Ollama. " f"Run: ollama pull {self.model}", + f"Model '{self.model}' not loaded in Ollama. Run: ollama pull {self.model}", ) return True, "ok" diff --git a/mempalace/llm_refine.py b/mempalace/llm_refine.py index 91a950c..faa737a 100644 --- a/mempalace/llm_refine.py +++ b/mempalace/llm_refine.py @@ -46,6 +46,10 @@ For each candidate, pick exactly ONE label: - COMMON_WORD: an English word, verb, or fragment that isn't a named entity at all (e.g. "Created", "Before", "Never") - AMBIGUOUS: context is insufficient to decide between two of the above +Frameworks, runtimes, APIs, cloud services, vendors, and third-party products +(e.g. Angular, OpenAPI, Terraform, Bun, Google) are TOPIC unless the context +clearly says this is the user's own named codebase, product, or active effort. + Use the provided context lines to disambiguate. A capitalized word that only appears in metadata ("Created: 2026-04-24") is COMMON_WORD. A name that appears with pronouns and dialogue is PERSON. Respond with JSON only. Schema: @@ -58,7 +62,7 @@ One entry per candidate, same order as the input.""" class RefineResult: merged: dict # updated detected dict reclassified: int # entries whose type changed - dropped: int # entries moved out (COMMON_WORD, or AMBIGUOUS sent to uncertain) + dropped: int # entries removed from the merged result (COMMON_WORD only) errors: list[str] # per-batch error messages (transport/parse failures) batches_completed: int batches_total: int @@ -70,14 +74,14 @@ def _collect_contexts( ) -> list[str]: """Return up to `max_lines` distinct lines from the corpus that mention `name`. - Case-insensitive substring match. Lines are truncated to + Case-insensitive token-boundary match. Lines are truncated to CONTEXT_WINDOW_CHARS chars to keep token usage bounded. """ - needle = name.lower() + needle = re.compile(rf"(? list[str]: + """Return plausible JSON payloads extracted from an LLM response.""" + text = text.strip() + if not text: + return [] + + candidates: list[str] = [text] + + for match in re.finditer(r"```(?:json)?\s*([\s\S]*?)\s*```", text, re.IGNORECASE): + candidate = match.group(1).strip() + if candidate and candidate not in candidates: + candidates.append(candidate) + + for start, opener in ((i, ch) for i, ch in enumerate(text) if ch in "{["): + closer = "}" if opener == "{" else "]" + depth = 0 + in_string = False + escaped = False + for i in range(start, len(text)): + ch = text[i] + if in_string: + if escaped: + escaped = False + elif ch == "\\": + escaped = True + elif ch == '"': + in_string = False + continue + + if ch == '"': + in_string = True + elif ch == opener: + depth += 1 + elif ch == closer: + depth -= 1 + if depth == 0: + candidate = text[start : i + 1].strip() + if candidate and candidate not in candidates: + candidates.append(candidate) + break + + return candidates + + def _parse_response(text: str, expected_names: list[str]) -> dict[str, tuple[str, str]]: """Parse the LLM's JSON response into {name: (label, reason)}. Robust to the model occasionally wrapping JSON in text or returning slight schema variations. Falls back to matching by candidate name. """ - # Strip any surrounding fences or prose - text = text.strip() - if text.startswith("```"): - text = re.sub(r"^```(?:json)?\s*", "", text) - text = re.sub(r"\s*```\s*$", "", text) - try: - data = json.loads(text) - except json.JSONDecodeError: + data = None + for candidate in _extract_json_candidates(text): + try: + data = json.loads(candidate) + break + except json.JSONDecodeError: + continue + if data is None: return {} entries = data.get("classifications") if isinstance(data, dict) else data @@ -142,7 +190,9 @@ def _parse_response(text: str, expected_names: list[str]) -> dict[str, tuple[str def _apply_classifications( - detected: dict, decisions: dict[str, tuple[str, str]] + detected: dict, + decisions: dict[str, tuple[str, str]], + allow_project_promotions: bool = True, ) -> tuple[dict, int, int]: """Merge LLM decisions back into the detected dict. @@ -182,6 +232,12 @@ def _apply_classifications( continue target_bucket = label_to_bucket[label] + if ( + label == "PROJECT" + and not allow_project_promotions + and not _is_authoritative_project(entry) + ): + target_bucket = "uncertain" updated = dict(entry) # Append the LLM's reason as a new signal so the user sees why it moved signals = list(updated.get("signals", [])) @@ -201,6 +257,19 @@ def _apply_classifications( return new_detected, reclassified, dropped +def _is_authoritative_person(entry: dict) -> bool: + """Return True for git-author people that should not be second-guessed.""" + signals = " ".join(entry.get("signals", [])).lower() + return "commit" in signals and "repo" in signals + + +def _is_authoritative_project(entry: dict) -> bool: + """Return True for manifest/git-backed projects that are already source-backed.""" + signals = " ".join(entry.get("signals", [])).lower() + manifest_markers = ("package.json", "pyproject.toml", "cargo.toml", "go.mod") + return any(marker in signals for marker in manifest_markers) or "commit" in signals + + def _print_progress(batch_idx: int, total: int, current_name: str) -> None: """Overwrite-line progress indicator.""" width = 40 @@ -217,12 +286,13 @@ def refine_entities( provider: LLMProvider, batch_size: int = BATCH_SIZE, show_progress: bool = True, + allow_project_promotions: bool = True, ) -> RefineResult: """Reclassify detected entities using the LLM provider. - Only candidates in the ``uncertain`` and ``projects`` buckets are sent for - refinement — ``people`` entries from git authorship are already - high-confidence and don't benefit from LLM second-guessing. + Only regex-derived candidates are sent for refinement. Git authors and + manifest/git-backed projects are already source-backed and don't benefit + from LLM second-guessing. Ctrl-C during refinement: cancels the remaining batches, returns a RefineResult with ``cancelled=True`` and whatever was classified before @@ -231,16 +301,20 @@ def refine_entities( Transport or parse failures in individual batches are recorded in ``errors`` and do not abort the run. + + ``allow_project_promotions=False`` keeps LLM-only project guesses in the + uncertain bucket. This is useful when manifest/git signal already supplied + canonical projects and regex/LLM hits are likely tools, vendors, or topics. """ - # Only refine buckets that actually benefit — keep `people` as-is - # (git-authored people are already authoritative). candidates: list[tuple[str, str]] = [] - for bucket in ("projects", "uncertain"): + current_type = {"people": "person", "projects": "project", "uncertain": "uncertain"} + for bucket in ("people", "projects", "uncertain"): for e in detected.get(bucket, []): - # Skip already-high-confidence entries (manifest-backed projects etc.) - if e.get("confidence", 0) >= 0.95 and bucket == "projects": + if bucket == "people" and _is_authoritative_person(e): continue - candidates.append((e["name"], bucket.rstrip("s"))) # "projects" -> "project" + if bucket == "projects" and _is_authoritative_project(e): + continue + candidates.append((e["name"], current_type[bucket])) corpus_lines = corpus_text.splitlines() if corpus_text else [] @@ -300,7 +374,11 @@ def refine_entities( sys.stderr.write("\n") sys.stderr.flush() - merged, reclassified, dropped = _apply_classifications(detected, all_decisions) + merged, reclassified, dropped = _apply_classifications( + detected, + all_decisions, + allow_project_promotions=allow_project_promotions, + ) return RefineResult( merged=merged, diff --git a/mempalace/project_scanner.py b/mempalace/project_scanner.py index b5c408e..5b12d5e 100644 --- a/mempalace/project_scanner.py +++ b/mempalace/project_scanner.py @@ -632,18 +632,28 @@ def discover_entities( else {"people": [], "projects": [], "uncertain": []} ) - # If git/manifests gave us real projects, suppress the regex "uncertain" bucket. - # That bucket is mostly noise (common words, CamelCase tech terms, etc.) and - # adding it to the review flow just makes the user do triage we can skip. + # Without LLM refinement, suppress regex "uncertain" noise when real + # manifest/git signal exists. With LLM refinement enabled, keep those + # candidates so the model can promote real entities or drop common words. has_real_signal = bool(projects) or bool(people) - merged = _merge_detected(real_signal, prose_detected, drop_secondary_uncertain=has_real_signal) + merged = _merge_detected( + real_signal, + prose_detected, + drop_secondary_uncertain=has_real_signal and llm_provider is None, + ) # Optional phase 2: LLM refinement. if llm_provider is not None: from mempalace.llm_refine import collect_corpus_text, refine_entities corpus = collect_corpus_text(str(project_dir)) - result = refine_entities(merged, corpus, llm_provider, show_progress=show_progress) + result = refine_entities( + merged, + corpus, + llm_provider, + show_progress=show_progress, + allow_project_promotions=not has_real_signal, + ) if show_progress: status_bits = [] if result.cancelled: diff --git a/tests/test_convo_scanner.py b/tests/test_convo_scanner.py index 9fcd339..01e980b 100644 --- a/tests/test_convo_scanner.py +++ b/tests/test_convo_scanner.py @@ -1,11 +1,13 @@ """Tests for mempalace.convo_scanner.""" import json +from pathlib import Path from mempalace.convo_scanner import ( _decode_slug_fallback, _extract_cwd_from_session, _resolve_project_name, + _safe_mtime, is_claude_projects_root, scan_claude_projects, ) @@ -93,6 +95,23 @@ def test_decode_slug_fallback_only_dashes(): assert _decode_slug_fallback("---") == "---" +# ── safe metadata helpers ─────────────────────────────────────────────── + + +def test_safe_mtime_returns_zero_on_stat_error(tmp_path, monkeypatch): + f = tmp_path / "session.jsonl" + f.write_text("{}\n") + original_stat = Path.stat + + def fail_stat(self): + if self == f: + raise OSError("permission denied") + return original_stat(self) + + monkeypatch.setattr(Path, "stat", fail_stat) + assert _safe_mtime(f) == 0.0 + + # ── _resolve_project_name ─────────────────────────────────────────────── diff --git a/tests/test_llm_refine.py b/tests/test_llm_refine.py index 329f91a..b3e7d2d 100644 --- a/tests/test_llm_refine.py +++ b/tests/test_llm_refine.py @@ -11,6 +11,9 @@ from mempalace.llm_refine import ( _apply_classifications, _build_user_prompt, _collect_contexts, + _extract_json_candidates, + _is_authoritative_person, + _is_authoritative_project, _parse_response, collect_corpus_text, refine_entities, @@ -62,6 +65,16 @@ def test_collect_contexts_case_insensitive(): assert out == ["lowercase alice mention"] +def test_collect_contexts_uses_token_boundaries(): + lines = [ + "forgot should not match", + "Go is a language.", + "go-v1 shipped.", + ] + out = _collect_contexts(lines, "Go", max_lines=5) + assert out == ["Go is a language.", "go-v1 shipped."] + + def test_collect_contexts_dedupes_identical_lines(): lines = ["Alice", "Alice", "Alice was here"] out = _collect_contexts(lines, "Alice", max_lines=5) @@ -131,6 +144,30 @@ def test_parse_response_strips_code_fences(): assert out["X"][0] == "TOPIC" +def test_parse_response_extracts_json_after_prose(): + text = 'Sure, here is the JSON: {"classifications": [{"name": "X", "label": "TOPIC"}]}' + out = _parse_response(text, ["X"]) + assert out["X"][0] == "TOPIC" + + +def test_parse_response_extracts_fenced_json_after_prose(): + text = 'Sure:\n```json\n{"classifications": [{"name": "X", "label": "PROJECT"}]}\n```' + out = _parse_response(text, ["X"]) + assert out["X"][0] == "PROJECT" + + +def test_extract_json_candidates_handles_embedded_array(): + text = 'prefix [{"name": "Y", "label": "PERSON"}] suffix' + candidates = _extract_json_candidates(text) + assert '[{"name": "Y", "label": "PERSON"}]' in candidates + + +def test_parse_response_ignores_non_json_brackets_before_payload(): + text = 'See [note] first. JSON: {"classifications": [{"name": "X", "label": "TOPIC"}]}' + out = _parse_response(text, ["X"]) + assert out["X"][0] == "TOPIC" + + def test_parse_response_malformed_returns_empty(): out = _parse_response("not json at all", ["X"]) assert out == {} @@ -257,6 +294,67 @@ def test_apply_classifications_topic_goes_to_uncertain(): assert reclass == 1 +def test_apply_classifications_can_block_llm_only_project_promotion(): + detected = { + "people": [], + "projects": [], + "uncertain": [ + { + "name": "Terraform", + "type": "uncertain", + "confidence": 0.4, + "frequency": 5, + "signals": ["regex"], + } + ], + } + decisions = {"Terraform": ("PROJECT", "tool")} + new, reclass, _ = _apply_classifications( + detected, + decisions, + allow_project_promotions=False, + ) + assert new["projects"] == [] + assert new["uncertain"][0]["name"] == "Terraform" + assert new["uncertain"][0]["type"] == "uncertain" + assert reclass == 0 + + +def test_apply_classifications_allows_project_promotion_for_prose_only_mode(): + detected = { + "people": [], + "projects": [], + "uncertain": [ + { + "name": "Project Aurora", + "type": "uncertain", + "confidence": 0.4, + "frequency": 5, + "signals": ["regex"], + } + ], + } + decisions = {"Project Aurora": ("PROJECT", "user effort")} + new, reclass, _ = _apply_classifications(detected, decisions) + assert new["projects"][0]["name"] == "Project Aurora" + assert new["projects"][0]["type"] == "project" + assert reclass == 1 + + +# ── authoritative source filters ──────────────────────────────────────── + + +def test_is_authoritative_person_requires_git_signal(): + assert _is_authoritative_person({"signals": ["5 commits across 2 repos"]}) + assert not _is_authoritative_person({"signals": ["pronoun nearby (5x)"]}) + + +def test_is_authoritative_project_requires_manifest_or_git_signal(): + assert _is_authoritative_project({"signals": ["package.json, 12 of your commits"]}) + assert _is_authoritative_project({"signals": ["57 commits (none by you)"]}) + assert not _is_authoritative_project({"signals": ["code file reference (5x)"]}) + + # ── refine_entities ───────────────────────────────────────────────────── @@ -347,6 +445,93 @@ def test_refine_entities_skips_high_confidence_projects(): assert provider.call_count == 0 +def test_refine_entities_refines_high_confidence_regex_projects(): + """High-confidence regex projects still need LLM review without source signal.""" + detected = { + "people": [], + "projects": [ + { + "name": "OpenAPI", + "type": "project", + "confidence": 0.99, + "frequency": 5, + "signals": ["code file reference (5x)"], + } + ], + "uncertain": [], + } + provider = FakeProvider( + response_text=( + '{"classifications": [{"name": "OpenAPI", "label": "TOPIC", "reason": "technology"}]}' + ) + ) + result = refine_entities(detected, "OpenAPI schemas", provider, show_progress=False) + assert provider.call_count == 1 + assert result.reclassified == 1 + assert result.merged["projects"] == [] + assert result.merged["uncertain"][0]["name"] == "OpenAPI" + + +def test_refine_entities_refines_regex_people_but_skips_git_people(): + detected = { + "people": [ + { + "name": "Igor Lins e Silva", + "type": "person", + "confidence": 0.99, + "frequency": 100, + "signals": ["100 commits across 3 repos"], + }, + { + "name": "Tool", + "type": "person", + "confidence": 0.99, + "frequency": 5, + "signals": ["pronoun nearby (5x)"], + }, + ], + "projects": [], + "uncertain": [], + } + provider = FakeProvider( + response_text='{"classifications": [{"name": "Tool", "label": "COMMON_WORD"}]}' + ) + result = refine_entities(detected, "Tool is a common noun.", provider, show_progress=False) + assert provider.call_count == 1 + names = [e["name"] for e in result.merged["people"]] + assert names == ["Igor Lins e Silva"] + assert result.dropped == 1 + + +def test_refine_entities_can_keep_llm_only_project_in_uncertain(): + detected = { + "people": [], + "projects": [], + "uncertain": [ + { + "name": "Terraform", + "type": "uncertain", + "confidence": 0.4, + "frequency": 9, + "signals": ["regex"], + } + ], + } + provider = FakeProvider( + response_text='{"classifications": [{"name": "Terraform", "label": "PROJECT"}]}' + ) + result = refine_entities( + detected, + "Terraform config", + provider, + show_progress=False, + allow_project_promotions=False, + ) + assert result.merged["projects"] == [] + assert result.merged["uncertain"][0]["name"] == "Terraform" + assert any("LLM: project" in s for s in result.merged["uncertain"][0]["signals"]) + + def test_refine_entities_empty_candidates_returns_noop(): detected = {"people": [], "projects": [], "uncertain": []} provider = FakeProvider() diff --git a/tests/test_project_scanner.py b/tests/test_project_scanner.py index 3499796..d8c680b 100644 --- a/tests/test_project_scanner.py +++ b/tests/test_project_scanner.py @@ -3,6 +3,7 @@ import json import subprocess from pathlib import Path +from types import SimpleNamespace from mempalace.project_scanner import ( PersonInfo, @@ -390,6 +391,49 @@ def test_discover_entities_prefers_real_signal_over_prose(tmp_path): assert "realproj" in proj_names +def test_discover_entities_keeps_uncertain_for_llm_when_real_signal(tmp_path): + """With --llm, regex-uncertain prose candidates should reach refinement.""" + (tmp_path / "package.json").write_text(json.dumps({"name": "realproj"})) + _init_git_repo(tmp_path) + (tmp_path / "doc.md").write_text("Noise appeared. Noise repeated. Noise again.") + + class FakeProvider: + def __init__(self): + self.prompts = [] + + def classify(self, _system, user, json_mode=True): + self.prompts.append(user) + return SimpleNamespace( + text='{"classifications": [{"name": "Noise", "label": "COMMON_WORD"}]}' + ) + + provider = FakeProvider() + d = discover_entities(str(tmp_path), llm_provider=provider, show_progress=False) + + assert len(provider.prompts) == 1 + assert "Noise" in provider.prompts[0] + assert "Noise" not in [e["name"] for cat in d.values() for e in cat] + + +def test_discover_entities_keeps_llm_only_project_uncertain_when_real_signal(tmp_path): + """Repo roots should not auto-promote LLM-only tools/topics into projects.""" + (tmp_path / "package.json").write_text(json.dumps({"name": "realproj"})) + _init_git_repo(tmp_path) + (tmp_path / "doc.md").write_text("Terraform shipped. Terraform changed. Terraform runs.") + + class FakeProvider: + def classify(self, _system, _user, json_mode=True): + return SimpleNamespace( + text='{"classifications": [{"name": "Terraform", "label": "PROJECT"}]}' + ) + + d = discover_entities(str(tmp_path), llm_provider=FakeProvider(), show_progress=False) + + assert "realproj" in [e["name"] for e in d["projects"]] + assert "Terraform" not in [e["name"] for e in d["projects"]] + assert "Terraform" in [e["name"] for e in d["uncertain"]] + + # ── _UnionFind basics ──────────────────────────────────────────────────