fix(llm): tighter refinement — word boundaries, JSON extraction, authoritative sources
Addresses issues found while reviewing the initial phase-2 implementation against real data: **Bug: uncertain bucket starved from the LLM.** `discover_entities` was dropping the regex-uncertain bucket whenever real git/manifest signal existed — which is exactly when `--llm` is most useful for cleaning up prose noise. The uncertain candidates never reached the refinement step. Fixed: only drop when `llm_provider is None`. **Context collection: word boundaries, not substring.** `_collect_contexts` used substring matching on lower-cased lines, so the name "Go" matched "good", "going", "forgot". Switched to a `(?<!\w)…(?!\w)` regex so short names only match at token boundaries. **Authoritative-source detection replaces confidence threshold.** Previously the refinement step skipped entries with `confidence >= 0.95` to avoid second-guessing manifest-backed projects. That threshold was fragile — the regex detector produces 0.99 confidence for things like `code file reference (5x)` on framework names (OpenAPI, etc.), so those skipped the LLM despite being regex-only noise. New helpers `_is_authoritative_person` / `_is_authoritative_project` look at the actual signal strings (commits, package.json, etc.) to decide. **Now also refines regex-derived people.** After #1148's high-pronoun-signal fix, the regex detector can promote non-people to the `people` bucket (e.g. a capitalized common noun that happened to appear near pronouns). The LLM now gets a chance to clean those up, while git-authored people are still skipped. **Robust JSON extraction.** Small local models routinely wrap JSON output in prose ("Sure, here's the classification: {…}"). The previous code-fence stripper failed on that. `_extract_json_candidates` now does balanced-bracket extraction with string-aware quote handling, so it recovers JSON from: - raw responses - markdown fenced blocks - JSON embedded inside surrounding text - multiple candidate objects/arrays **Prompt guidance for frameworks vs user projects.** Added an explicit instruction: frameworks, runtimes, APIs, cloud services, and third-party vendors (Angular, OpenAPI, Terraform, Bun, Google, etc.) are TOPIC unless the context clearly says it's the user's own codebase. Directly addresses a false-positive pattern observed during dev runs. **Defensive mtime.** `convo_scanner._safe_mtime` catches OSError during `stat()` — permission changes, filesystem races, broken symlinks — and sorts the affected file to the end of the newest-first order rather than crashing the scan. **Cosmetic:** merged two adjacent f-strings on the same line in `backends/chroma.py` and `llm_client.py` (no behaviour change). 15 new tests cover the OSError fallback, word-boundary matching, JSON extraction variants, authoritative-source helpers, refining high- confidence regex projects, and end-to-end LLM refinement preserving the uncertain bucket.
This commit is contained in:
@@ -1,11 +1,13 @@
|
||||
"""Tests for mempalace.convo_scanner."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from mempalace.convo_scanner import (
|
||||
_decode_slug_fallback,
|
||||
_extract_cwd_from_session,
|
||||
_resolve_project_name,
|
||||
_safe_mtime,
|
||||
is_claude_projects_root,
|
||||
scan_claude_projects,
|
||||
)
|
||||
@@ -93,6 +95,23 @@ def test_decode_slug_fallback_only_dashes():
|
||||
assert _decode_slug_fallback("---") == "---"
|
||||
|
||||
|
||||
# ── safe metadata helpers ───────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_safe_mtime_returns_zero_on_stat_error(tmp_path, monkeypatch):
|
||||
f = tmp_path / "session.jsonl"
|
||||
f.write_text("{}\n")
|
||||
original_stat = Path.stat
|
||||
|
||||
def fail_stat(self):
|
||||
if self == f:
|
||||
raise OSError("permission denied")
|
||||
return original_stat(self)
|
||||
|
||||
monkeypatch.setattr(Path, "stat", fail_stat)
|
||||
assert _safe_mtime(f) == 0.0
|
||||
|
||||
|
||||
# ── _resolve_project_name ───────────────────────────────────────────────
|
||||
|
||||
|
||||
|
||||
@@ -11,6 +11,9 @@ from mempalace.llm_refine import (
|
||||
_apply_classifications,
|
||||
_build_user_prompt,
|
||||
_collect_contexts,
|
||||
_extract_json_candidates,
|
||||
_is_authoritative_person,
|
||||
_is_authoritative_project,
|
||||
_parse_response,
|
||||
collect_corpus_text,
|
||||
refine_entities,
|
||||
@@ -62,6 +65,16 @@ def test_collect_contexts_case_insensitive():
|
||||
assert out == ["lowercase alice mention"]
|
||||
|
||||
|
||||
def test_collect_contexts_uses_token_boundaries():
|
||||
lines = [
|
||||
"forgot should not match",
|
||||
"Go is a language.",
|
||||
"go-v1 shipped.",
|
||||
]
|
||||
out = _collect_contexts(lines, "Go", max_lines=5)
|
||||
assert out == ["Go is a language.", "go-v1 shipped."]
|
||||
|
||||
|
||||
def test_collect_contexts_dedupes_identical_lines():
|
||||
lines = ["Alice", "Alice", "Alice was here"]
|
||||
out = _collect_contexts(lines, "Alice", max_lines=5)
|
||||
@@ -131,6 +144,30 @@ def test_parse_response_strips_code_fences():
|
||||
assert out["X"][0] == "TOPIC"
|
||||
|
||||
|
||||
def test_parse_response_extracts_json_after_prose():
|
||||
text = 'Sure, here is the JSON: {"classifications": [{"name": "X", "label": "TOPIC"}]}'
|
||||
out = _parse_response(text, ["X"])
|
||||
assert out["X"][0] == "TOPIC"
|
||||
|
||||
|
||||
def test_parse_response_extracts_fenced_json_after_prose():
|
||||
text = 'Sure:\n```json\n{"classifications": [{"name": "X", "label": "PROJECT"}]}\n```'
|
||||
out = _parse_response(text, ["X"])
|
||||
assert out["X"][0] == "PROJECT"
|
||||
|
||||
|
||||
def test_extract_json_candidates_handles_embedded_array():
|
||||
text = 'prefix [{"name": "Y", "label": "PERSON"}] suffix'
|
||||
candidates = _extract_json_candidates(text)
|
||||
assert '[{"name": "Y", "label": "PERSON"}]' in candidates
|
||||
|
||||
|
||||
def test_parse_response_ignores_non_json_brackets_before_payload():
|
||||
text = 'See [note] first. JSON: {"classifications": [{"name": "X", "label": "TOPIC"}]}'
|
||||
out = _parse_response(text, ["X"])
|
||||
assert out["X"][0] == "TOPIC"
|
||||
|
||||
|
||||
def test_parse_response_malformed_returns_empty():
|
||||
out = _parse_response("not json at all", ["X"])
|
||||
assert out == {}
|
||||
@@ -257,6 +294,67 @@ def test_apply_classifications_topic_goes_to_uncertain():
|
||||
assert reclass == 1
|
||||
|
||||
|
||||
def test_apply_classifications_can_block_llm_only_project_promotion():
|
||||
detected = {
|
||||
"people": [],
|
||||
"projects": [],
|
||||
"uncertain": [
|
||||
{
|
||||
"name": "Terraform",
|
||||
"type": "uncertain",
|
||||
"confidence": 0.4,
|
||||
"frequency": 5,
|
||||
"signals": ["regex"],
|
||||
}
|
||||
],
|
||||
}
|
||||
decisions = {"Terraform": ("PROJECT", "tool")}
|
||||
new, reclass, _ = _apply_classifications(
|
||||
detected,
|
||||
decisions,
|
||||
allow_project_promotions=False,
|
||||
)
|
||||
assert new["projects"] == []
|
||||
assert new["uncertain"][0]["name"] == "Terraform"
|
||||
assert new["uncertain"][0]["type"] == "uncertain"
|
||||
assert reclass == 0
|
||||
|
||||
|
||||
def test_apply_classifications_allows_project_promotion_for_prose_only_mode():
|
||||
detected = {
|
||||
"people": [],
|
||||
"projects": [],
|
||||
"uncertain": [
|
||||
{
|
||||
"name": "Project Aurora",
|
||||
"type": "uncertain",
|
||||
"confidence": 0.4,
|
||||
"frequency": 5,
|
||||
"signals": ["regex"],
|
||||
}
|
||||
],
|
||||
}
|
||||
decisions = {"Project Aurora": ("PROJECT", "user effort")}
|
||||
new, reclass, _ = _apply_classifications(detected, decisions)
|
||||
assert new["projects"][0]["name"] == "Project Aurora"
|
||||
assert new["projects"][0]["type"] == "project"
|
||||
assert reclass == 1
|
||||
|
||||
|
||||
# ── authoritative source filters ────────────────────────────────────────
|
||||
|
||||
|
||||
def test_is_authoritative_person_requires_git_signal():
|
||||
assert _is_authoritative_person({"signals": ["5 commits across 2 repos"]})
|
||||
assert not _is_authoritative_person({"signals": ["pronoun nearby (5x)"]})
|
||||
|
||||
|
||||
def test_is_authoritative_project_requires_manifest_or_git_signal():
|
||||
assert _is_authoritative_project({"signals": ["package.json, 12 of your commits"]})
|
||||
assert _is_authoritative_project({"signals": ["57 commits (none by you)"]})
|
||||
assert not _is_authoritative_project({"signals": ["code file reference (5x)"]})
|
||||
|
||||
|
||||
# ── refine_entities ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@@ -347,6 +445,93 @@ def test_refine_entities_skips_high_confidence_projects():
|
||||
assert provider.call_count == 0
|
||||
|
||||
|
||||
def test_refine_entities_refines_high_confidence_regex_projects():
|
||||
"""High-confidence regex projects still need LLM review without source signal."""
|
||||
detected = {
|
||||
"people": [],
|
||||
"projects": [
|
||||
{
|
||||
"name": "OpenAPI",
|
||||
"type": "project",
|
||||
"confidence": 0.99,
|
||||
"frequency": 5,
|
||||
"signals": ["code file reference (5x)"],
|
||||
}
|
||||
],
|
||||
"uncertain": [],
|
||||
}
|
||||
provider = FakeProvider(
|
||||
response_text=(
|
||||
'{"classifications": [{"name": "OpenAPI", "label": "TOPIC", "reason": "technology"}]}'
|
||||
)
|
||||
)
|
||||
result = refine_entities(detected, "OpenAPI schemas", provider, show_progress=False)
|
||||
assert provider.call_count == 1
|
||||
assert result.reclassified == 1
|
||||
assert result.merged["projects"] == []
|
||||
assert result.merged["uncertain"][0]["name"] == "OpenAPI"
|
||||
|
||||
|
||||
def test_refine_entities_refines_regex_people_but_skips_git_people():
|
||||
detected = {
|
||||
"people": [
|
||||
{
|
||||
"name": "Igor Lins e Silva",
|
||||
"type": "person",
|
||||
"confidence": 0.99,
|
||||
"frequency": 100,
|
||||
"signals": ["100 commits across 3 repos"],
|
||||
},
|
||||
{
|
||||
"name": "Tool",
|
||||
"type": "person",
|
||||
"confidence": 0.99,
|
||||
"frequency": 5,
|
||||
"signals": ["pronoun nearby (5x)"],
|
||||
},
|
||||
],
|
||||
"projects": [],
|
||||
"uncertain": [],
|
||||
}
|
||||
provider = FakeProvider(
|
||||
response_text='{"classifications": [{"name": "Tool", "label": "COMMON_WORD"}]}'
|
||||
)
|
||||
result = refine_entities(detected, "Tool is a common noun.", provider, show_progress=False)
|
||||
assert provider.call_count == 1
|
||||
names = [e["name"] for e in result.merged["people"]]
|
||||
assert names == ["Igor Lins e Silva"]
|
||||
assert result.dropped == 1
|
||||
|
||||
|
||||
def test_refine_entities_can_keep_llm_only_project_in_uncertain():
|
||||
detected = {
|
||||
"people": [],
|
||||
"projects": [],
|
||||
"uncertain": [
|
||||
{
|
||||
"name": "Terraform",
|
||||
"type": "uncertain",
|
||||
"confidence": 0.4,
|
||||
"frequency": 9,
|
||||
"signals": ["regex"],
|
||||
}
|
||||
],
|
||||
}
|
||||
provider = FakeProvider(
|
||||
response_text='{"classifications": [{"name": "Terraform", "label": "PROJECT"}]}'
|
||||
)
|
||||
result = refine_entities(
|
||||
detected,
|
||||
"Terraform config",
|
||||
provider,
|
||||
show_progress=False,
|
||||
allow_project_promotions=False,
|
||||
)
|
||||
assert result.merged["projects"] == []
|
||||
assert result.merged["uncertain"][0]["name"] == "Terraform"
|
||||
assert any("LLM: project" in s for s in result.merged["uncertain"][0]["signals"])
|
||||
|
||||
|
||||
def test_refine_entities_empty_candidates_returns_noop():
|
||||
detected = {"people": [], "projects": [], "uncertain": []}
|
||||
provider = FakeProvider()
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
|
||||
from mempalace.project_scanner import (
|
||||
PersonInfo,
|
||||
@@ -390,6 +391,49 @@ def test_discover_entities_prefers_real_signal_over_prose(tmp_path):
|
||||
assert "realproj" in proj_names
|
||||
|
||||
|
||||
def test_discover_entities_keeps_uncertain_for_llm_when_real_signal(tmp_path):
|
||||
"""With --llm, regex-uncertain prose candidates should reach refinement."""
|
||||
(tmp_path / "package.json").write_text(json.dumps({"name": "realproj"}))
|
||||
_init_git_repo(tmp_path)
|
||||
(tmp_path / "doc.md").write_text("Noise appeared. Noise repeated. Noise again.")
|
||||
|
||||
class FakeProvider:
|
||||
def __init__(self):
|
||||
self.prompts = []
|
||||
|
||||
def classify(self, _system, user, json_mode=True):
|
||||
self.prompts.append(user)
|
||||
return SimpleNamespace(
|
||||
text='{"classifications": [{"name": "Noise", "label": "COMMON_WORD"}]}'
|
||||
)
|
||||
|
||||
provider = FakeProvider()
|
||||
d = discover_entities(str(tmp_path), llm_provider=provider, show_progress=False)
|
||||
|
||||
assert len(provider.prompts) == 1
|
||||
assert "Noise" in provider.prompts[0]
|
||||
assert "Noise" not in [e["name"] for cat in d.values() for e in cat]
|
||||
|
||||
|
||||
def test_discover_entities_keeps_llm_only_project_uncertain_when_real_signal(tmp_path):
|
||||
"""Repo roots should not auto-promote LLM-only tools/topics into projects."""
|
||||
(tmp_path / "package.json").write_text(json.dumps({"name": "realproj"}))
|
||||
_init_git_repo(tmp_path)
|
||||
(tmp_path / "doc.md").write_text("Terraform shipped. Terraform changed. Terraform runs.")
|
||||
|
||||
class FakeProvider:
|
||||
def classify(self, _system, _user, json_mode=True):
|
||||
return SimpleNamespace(
|
||||
text='{"classifications": [{"name": "Terraform", "label": "PROJECT"}]}'
|
||||
)
|
||||
|
||||
d = discover_entities(str(tmp_path), llm_provider=FakeProvider(), show_progress=False)
|
||||
|
||||
assert "realproj" in [e["name"] for e in d["projects"]]
|
||||
assert "Terraform" not in [e["name"] for e in d["projects"]]
|
||||
assert "Terraform" in [e["name"] for e in d["uncertain"]]
|
||||
|
||||
|
||||
# ── _UnionFind basics ──────────────────────────────────────────────────
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user