fix(llm): tighter refinement — word boundaries, JSON extraction, authoritative sources
Addresses issues found while reviewing the initial phase-2 implementation against real data: **Bug: uncertain bucket starved from the LLM.** `discover_entities` was dropping the regex-uncertain bucket whenever real git/manifest signal existed — which is exactly when `--llm` is most useful for cleaning up prose noise. The uncertain candidates never reached the refinement step. Fixed: only drop when `llm_provider is None`. **Context collection: word boundaries, not substring.** `_collect_contexts` used substring matching on lower-cased lines, so the name "Go" matched "good", "going", "forgot". Switched to a `(?<!\w)…(?!\w)` regex so short names only match at token boundaries. **Authoritative-source detection replaces confidence threshold.** Previously the refinement step skipped entries with `confidence >= 0.95` to avoid second-guessing manifest-backed projects. That threshold was fragile — the regex detector produces 0.99 confidence for things like `code file reference (5x)` on framework names (OpenAPI, etc.), so those skipped the LLM despite being regex-only noise. New helpers `_is_authoritative_person` / `_is_authoritative_project` look at the actual signal strings (commits, package.json, etc.) to decide. **Now also refines regex-derived people.** After #1148's high-pronoun-signal fix, the regex detector can promote non-people to the `people` bucket (e.g. a capitalized common noun that happened to appear near pronouns). The LLM now gets a chance to clean those up, while git-authored people are still skipped. **Robust JSON extraction.** Small local models routinely wrap JSON output in prose ("Sure, here's the classification: {…}"). The previous code-fence stripper failed on that. `_extract_json_candidates` now does balanced-bracket extraction with string-aware quote handling, so it recovers JSON from: - raw responses - markdown fenced blocks - JSON embedded inside surrounding text - multiple candidate objects/arrays **Prompt guidance for frameworks vs user projects.** Added an explicit instruction: frameworks, runtimes, APIs, cloud services, and third-party vendors (Angular, OpenAPI, Terraform, Bun, Google, etc.) are TOPIC unless the context clearly says it's the user's own codebase. Directly addresses a false-positive pattern observed during dev runs. **Defensive mtime.** `convo_scanner._safe_mtime` catches OSError during `stat()` — permission changes, filesystem races, broken symlinks — and sorts the affected file to the end of the newest-first order rather than crashing the scan. **Cosmetic:** merged two adjacent f-strings on the same line in `backends/chroma.py` and `llm_client.py` (no behaviour change). 15 new tests cover the OSError fallback, word-boundary matching, JSON extraction variants, authoritative-source helpers, refining high- confidence regex projects, and end-to-end LLM refinement preserving the uncertain bucket.
This commit is contained in:
@@ -120,8 +120,7 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> li
|
||||
os.rename(seg_dir, target)
|
||||
moved.append(target)
|
||||
logger.warning(
|
||||
"Quarantined stale HNSW segment %s "
|
||||
"(sqlite %.0fs newer than HNSW); renamed to %s",
|
||||
"Quarantined stale HNSW segment %s (sqlite %.0fs newer than HNSW); renamed to %s",
|
||||
seg_dir,
|
||||
sqlite_mtime - hnsw_mtime,
|
||||
target,
|
||||
|
||||
@@ -91,6 +91,14 @@ def _decode_slug_fallback(slug: str) -> str:
|
||||
return parts[-1] if parts else slug
|
||||
|
||||
|
||||
def _safe_mtime(path: Path) -> float:
|
||||
"""Return file mtime, defaulting old on permission or filesystem errors."""
|
||||
try:
|
||||
return path.stat().st_mtime
|
||||
except OSError:
|
||||
return 0.0
|
||||
|
||||
|
||||
def _resolve_project_name(project_dir: Path) -> str:
|
||||
"""Read one session's cwd to recover the original project name.
|
||||
|
||||
@@ -98,7 +106,7 @@ def _resolve_project_name(project_dir: Path) -> str:
|
||||
"""
|
||||
sessions = sorted(
|
||||
(p for p in project_dir.iterdir() if p.is_file() and p.suffix == ".jsonl"),
|
||||
key=lambda p: p.stat().st_mtime,
|
||||
key=_safe_mtime,
|
||||
reverse=True, # newest first — most likely to be well-formed
|
||||
)
|
||||
for session in sessions:
|
||||
|
||||
@@ -124,7 +124,7 @@ class OllamaProvider(LLMProvider):
|
||||
if not names & wanted:
|
||||
return (
|
||||
False,
|
||||
f"Model '{self.model}' not loaded in Ollama. " f"Run: ollama pull {self.model}",
|
||||
f"Model '{self.model}' not loaded in Ollama. Run: ollama pull {self.model}",
|
||||
)
|
||||
return True, "ok"
|
||||
|
||||
|
||||
+101
-23
@@ -46,6 +46,10 @@ For each candidate, pick exactly ONE label:
|
||||
- COMMON_WORD: an English word, verb, or fragment that isn't a named entity at all (e.g. "Created", "Before", "Never")
|
||||
- AMBIGUOUS: context is insufficient to decide between two of the above
|
||||
|
||||
Frameworks, runtimes, APIs, cloud services, vendors, and third-party products
|
||||
(e.g. Angular, OpenAPI, Terraform, Bun, Google) are TOPIC unless the context
|
||||
clearly says this is the user's own named codebase, product, or active effort.
|
||||
|
||||
Use the provided context lines to disambiguate. A capitalized word that only appears in metadata ("Created: 2026-04-24") is COMMON_WORD. A name that appears with pronouns and dialogue is PERSON.
|
||||
|
||||
Respond with JSON only. Schema:
|
||||
@@ -58,7 +62,7 @@ One entry per candidate, same order as the input."""
|
||||
class RefineResult:
|
||||
merged: dict # updated detected dict
|
||||
reclassified: int # entries whose type changed
|
||||
dropped: int # entries moved out (COMMON_WORD, or AMBIGUOUS sent to uncertain)
|
||||
dropped: int # entries removed from the merged result (COMMON_WORD only)
|
||||
errors: list[str] # per-batch error messages (transport/parse failures)
|
||||
batches_completed: int
|
||||
batches_total: int
|
||||
@@ -70,14 +74,14 @@ def _collect_contexts(
|
||||
) -> list[str]:
|
||||
"""Return up to `max_lines` distinct lines from the corpus that mention `name`.
|
||||
|
||||
Case-insensitive substring match. Lines are truncated to
|
||||
Case-insensitive token-boundary match. Lines are truncated to
|
||||
CONTEXT_WINDOW_CHARS chars to keep token usage bounded.
|
||||
"""
|
||||
needle = name.lower()
|
||||
needle = re.compile(rf"(?<!\w){re.escape(name)}(?!\w)", re.IGNORECASE)
|
||||
seen: set[str] = set()
|
||||
out: list[str] = []
|
||||
for line in corpus_lines:
|
||||
if needle not in line.lower():
|
||||
if not needle.search(line):
|
||||
continue
|
||||
trimmed = line.strip()[:CONTEXT_WINDOW_CHARS]
|
||||
if not trimmed or trimmed in seen:
|
||||
@@ -102,20 +106,64 @@ def _build_user_prompt(candidates_with_contexts: list[tuple[str, str, list[str]]
|
||||
return "\n".join(parts)
|
||||
|
||||
|
||||
def _extract_json_candidates(text: str) -> list[str]:
|
||||
"""Return plausible JSON payloads extracted from an LLM response."""
|
||||
text = text.strip()
|
||||
if not text:
|
||||
return []
|
||||
|
||||
candidates: list[str] = [text]
|
||||
|
||||
for match in re.finditer(r"```(?:json)?\s*([\s\S]*?)\s*```", text, re.IGNORECASE):
|
||||
candidate = match.group(1).strip()
|
||||
if candidate and candidate not in candidates:
|
||||
candidates.append(candidate)
|
||||
|
||||
for start, opener in ((i, ch) for i, ch in enumerate(text) if ch in "{["):
|
||||
closer = "}" if opener == "{" else "]"
|
||||
depth = 0
|
||||
in_string = False
|
||||
escaped = False
|
||||
for i in range(start, len(text)):
|
||||
ch = text[i]
|
||||
if in_string:
|
||||
if escaped:
|
||||
escaped = False
|
||||
elif ch == "\\":
|
||||
escaped = True
|
||||
elif ch == '"':
|
||||
in_string = False
|
||||
continue
|
||||
|
||||
if ch == '"':
|
||||
in_string = True
|
||||
elif ch == opener:
|
||||
depth += 1
|
||||
elif ch == closer:
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
candidate = text[start : i + 1].strip()
|
||||
if candidate and candidate not in candidates:
|
||||
candidates.append(candidate)
|
||||
break
|
||||
|
||||
return candidates
|
||||
|
||||
|
||||
def _parse_response(text: str, expected_names: list[str]) -> dict[str, tuple[str, str]]:
|
||||
"""Parse the LLM's JSON response into {name: (label, reason)}.
|
||||
|
||||
Robust to the model occasionally wrapping JSON in text or returning
|
||||
slight schema variations. Falls back to matching by candidate name.
|
||||
"""
|
||||
# Strip any surrounding fences or prose
|
||||
text = text.strip()
|
||||
if text.startswith("```"):
|
||||
text = re.sub(r"^```(?:json)?\s*", "", text)
|
||||
text = re.sub(r"\s*```\s*$", "", text)
|
||||
try:
|
||||
data = json.loads(text)
|
||||
except json.JSONDecodeError:
|
||||
data = None
|
||||
for candidate in _extract_json_candidates(text):
|
||||
try:
|
||||
data = json.loads(candidate)
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
if data is None:
|
||||
return {}
|
||||
|
||||
entries = data.get("classifications") if isinstance(data, dict) else data
|
||||
@@ -142,7 +190,9 @@ def _parse_response(text: str, expected_names: list[str]) -> dict[str, tuple[str
|
||||
|
||||
|
||||
def _apply_classifications(
|
||||
detected: dict, decisions: dict[str, tuple[str, str]]
|
||||
detected: dict,
|
||||
decisions: dict[str, tuple[str, str]],
|
||||
allow_project_promotions: bool = True,
|
||||
) -> tuple[dict, int, int]:
|
||||
"""Merge LLM decisions back into the detected dict.
|
||||
|
||||
@@ -182,6 +232,12 @@ def _apply_classifications(
|
||||
continue
|
||||
|
||||
target_bucket = label_to_bucket[label]
|
||||
if (
|
||||
label == "PROJECT"
|
||||
and not allow_project_promotions
|
||||
and not _is_authoritative_project(entry)
|
||||
):
|
||||
target_bucket = "uncertain"
|
||||
updated = dict(entry)
|
||||
# Append the LLM's reason as a new signal so the user sees why it moved
|
||||
signals = list(updated.get("signals", []))
|
||||
@@ -201,6 +257,19 @@ def _apply_classifications(
|
||||
return new_detected, reclassified, dropped
|
||||
|
||||
|
||||
def _is_authoritative_person(entry: dict) -> bool:
|
||||
"""Return True for git-author people that should not be second-guessed."""
|
||||
signals = " ".join(entry.get("signals", [])).lower()
|
||||
return "commit" in signals and "repo" in signals
|
||||
|
||||
|
||||
def _is_authoritative_project(entry: dict) -> bool:
|
||||
"""Return True for manifest/git-backed projects that are already source-backed."""
|
||||
signals = " ".join(entry.get("signals", [])).lower()
|
||||
manifest_markers = ("package.json", "pyproject.toml", "cargo.toml", "go.mod")
|
||||
return any(marker in signals for marker in manifest_markers) or "commit" in signals
|
||||
|
||||
|
||||
def _print_progress(batch_idx: int, total: int, current_name: str) -> None:
|
||||
"""Overwrite-line progress indicator."""
|
||||
width = 40
|
||||
@@ -217,12 +286,13 @@ def refine_entities(
|
||||
provider: LLMProvider,
|
||||
batch_size: int = BATCH_SIZE,
|
||||
show_progress: bool = True,
|
||||
allow_project_promotions: bool = True,
|
||||
) -> RefineResult:
|
||||
"""Reclassify detected entities using the LLM provider.
|
||||
|
||||
Only candidates in the ``uncertain`` and ``projects`` buckets are sent for
|
||||
refinement — ``people`` entries from git authorship are already
|
||||
high-confidence and don't benefit from LLM second-guessing.
|
||||
Only regex-derived candidates are sent for refinement. Git authors and
|
||||
manifest/git-backed projects are already source-backed and don't benefit
|
||||
from LLM second-guessing.
|
||||
|
||||
Ctrl-C during refinement: cancels the remaining batches, returns a
|
||||
RefineResult with ``cancelled=True`` and whatever was classified before
|
||||
@@ -231,16 +301,20 @@ def refine_entities(
|
||||
|
||||
Transport or parse failures in individual batches are recorded in
|
||||
``errors`` and do not abort the run.
|
||||
|
||||
``allow_project_promotions=False`` keeps LLM-only project guesses in the
|
||||
uncertain bucket. This is useful when manifest/git signal already supplied
|
||||
canonical projects and regex/LLM hits are likely tools, vendors, or topics.
|
||||
"""
|
||||
# Only refine buckets that actually benefit — keep `people` as-is
|
||||
# (git-authored people are already authoritative).
|
||||
candidates: list[tuple[str, str]] = []
|
||||
for bucket in ("projects", "uncertain"):
|
||||
current_type = {"people": "person", "projects": "project", "uncertain": "uncertain"}
|
||||
for bucket in ("people", "projects", "uncertain"):
|
||||
for e in detected.get(bucket, []):
|
||||
# Skip already-high-confidence entries (manifest-backed projects etc.)
|
||||
if e.get("confidence", 0) >= 0.95 and bucket == "projects":
|
||||
if bucket == "people" and _is_authoritative_person(e):
|
||||
continue
|
||||
candidates.append((e["name"], bucket.rstrip("s"))) # "projects" -> "project"
|
||||
if bucket == "projects" and _is_authoritative_project(e):
|
||||
continue
|
||||
candidates.append((e["name"], current_type[bucket]))
|
||||
|
||||
corpus_lines = corpus_text.splitlines() if corpus_text else []
|
||||
|
||||
@@ -300,7 +374,11 @@ def refine_entities(
|
||||
sys.stderr.write("\n")
|
||||
sys.stderr.flush()
|
||||
|
||||
merged, reclassified, dropped = _apply_classifications(detected, all_decisions)
|
||||
merged, reclassified, dropped = _apply_classifications(
|
||||
detected,
|
||||
all_decisions,
|
||||
allow_project_promotions=allow_project_promotions,
|
||||
)
|
||||
|
||||
return RefineResult(
|
||||
merged=merged,
|
||||
|
||||
@@ -632,18 +632,28 @@ def discover_entities(
|
||||
else {"people": [], "projects": [], "uncertain": []}
|
||||
)
|
||||
|
||||
# If git/manifests gave us real projects, suppress the regex "uncertain" bucket.
|
||||
# That bucket is mostly noise (common words, CamelCase tech terms, etc.) and
|
||||
# adding it to the review flow just makes the user do triage we can skip.
|
||||
# Without LLM refinement, suppress regex "uncertain" noise when real
|
||||
# manifest/git signal exists. With LLM refinement enabled, keep those
|
||||
# candidates so the model can promote real entities or drop common words.
|
||||
has_real_signal = bool(projects) or bool(people)
|
||||
merged = _merge_detected(real_signal, prose_detected, drop_secondary_uncertain=has_real_signal)
|
||||
merged = _merge_detected(
|
||||
real_signal,
|
||||
prose_detected,
|
||||
drop_secondary_uncertain=has_real_signal and llm_provider is None,
|
||||
)
|
||||
|
||||
# Optional phase 2: LLM refinement.
|
||||
if llm_provider is not None:
|
||||
from mempalace.llm_refine import collect_corpus_text, refine_entities
|
||||
|
||||
corpus = collect_corpus_text(str(project_dir))
|
||||
result = refine_entities(merged, corpus, llm_provider, show_progress=show_progress)
|
||||
result = refine_entities(
|
||||
merged,
|
||||
corpus,
|
||||
llm_provider,
|
||||
show_progress=show_progress,
|
||||
allow_project_promotions=not has_real_signal,
|
||||
)
|
||||
if show_progress:
|
||||
status_bits = []
|
||||
if result.cancelled:
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
"""Tests for mempalace.convo_scanner."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from mempalace.convo_scanner import (
|
||||
_decode_slug_fallback,
|
||||
_extract_cwd_from_session,
|
||||
_resolve_project_name,
|
||||
_safe_mtime,
|
||||
is_claude_projects_root,
|
||||
scan_claude_projects,
|
||||
)
|
||||
@@ -93,6 +95,23 @@ def test_decode_slug_fallback_only_dashes():
|
||||
assert _decode_slug_fallback("---") == "---"
|
||||
|
||||
|
||||
# ── safe metadata helpers ───────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_safe_mtime_returns_zero_on_stat_error(tmp_path, monkeypatch):
|
||||
f = tmp_path / "session.jsonl"
|
||||
f.write_text("{}\n")
|
||||
original_stat = Path.stat
|
||||
|
||||
def fail_stat(self):
|
||||
if self == f:
|
||||
raise OSError("permission denied")
|
||||
return original_stat(self)
|
||||
|
||||
monkeypatch.setattr(Path, "stat", fail_stat)
|
||||
assert _safe_mtime(f) == 0.0
|
||||
|
||||
|
||||
# ── _resolve_project_name ───────────────────────────────────────────────
|
||||
|
||||
|
||||
|
||||
@@ -11,6 +11,9 @@ from mempalace.llm_refine import (
|
||||
_apply_classifications,
|
||||
_build_user_prompt,
|
||||
_collect_contexts,
|
||||
_extract_json_candidates,
|
||||
_is_authoritative_person,
|
||||
_is_authoritative_project,
|
||||
_parse_response,
|
||||
collect_corpus_text,
|
||||
refine_entities,
|
||||
@@ -62,6 +65,16 @@ def test_collect_contexts_case_insensitive():
|
||||
assert out == ["lowercase alice mention"]
|
||||
|
||||
|
||||
def test_collect_contexts_uses_token_boundaries():
|
||||
lines = [
|
||||
"forgot should not match",
|
||||
"Go is a language.",
|
||||
"go-v1 shipped.",
|
||||
]
|
||||
out = _collect_contexts(lines, "Go", max_lines=5)
|
||||
assert out == ["Go is a language.", "go-v1 shipped."]
|
||||
|
||||
|
||||
def test_collect_contexts_dedupes_identical_lines():
|
||||
lines = ["Alice", "Alice", "Alice was here"]
|
||||
out = _collect_contexts(lines, "Alice", max_lines=5)
|
||||
@@ -131,6 +144,30 @@ def test_parse_response_strips_code_fences():
|
||||
assert out["X"][0] == "TOPIC"
|
||||
|
||||
|
||||
def test_parse_response_extracts_json_after_prose():
|
||||
text = 'Sure, here is the JSON: {"classifications": [{"name": "X", "label": "TOPIC"}]}'
|
||||
out = _parse_response(text, ["X"])
|
||||
assert out["X"][0] == "TOPIC"
|
||||
|
||||
|
||||
def test_parse_response_extracts_fenced_json_after_prose():
|
||||
text = 'Sure:\n```json\n{"classifications": [{"name": "X", "label": "PROJECT"}]}\n```'
|
||||
out = _parse_response(text, ["X"])
|
||||
assert out["X"][0] == "PROJECT"
|
||||
|
||||
|
||||
def test_extract_json_candidates_handles_embedded_array():
|
||||
text = 'prefix [{"name": "Y", "label": "PERSON"}] suffix'
|
||||
candidates = _extract_json_candidates(text)
|
||||
assert '[{"name": "Y", "label": "PERSON"}]' in candidates
|
||||
|
||||
|
||||
def test_parse_response_ignores_non_json_brackets_before_payload():
|
||||
text = 'See [note] first. JSON: {"classifications": [{"name": "X", "label": "TOPIC"}]}'
|
||||
out = _parse_response(text, ["X"])
|
||||
assert out["X"][0] == "TOPIC"
|
||||
|
||||
|
||||
def test_parse_response_malformed_returns_empty():
|
||||
out = _parse_response("not json at all", ["X"])
|
||||
assert out == {}
|
||||
@@ -257,6 +294,67 @@ def test_apply_classifications_topic_goes_to_uncertain():
|
||||
assert reclass == 1
|
||||
|
||||
|
||||
def test_apply_classifications_can_block_llm_only_project_promotion():
|
||||
detected = {
|
||||
"people": [],
|
||||
"projects": [],
|
||||
"uncertain": [
|
||||
{
|
||||
"name": "Terraform",
|
||||
"type": "uncertain",
|
||||
"confidence": 0.4,
|
||||
"frequency": 5,
|
||||
"signals": ["regex"],
|
||||
}
|
||||
],
|
||||
}
|
||||
decisions = {"Terraform": ("PROJECT", "tool")}
|
||||
new, reclass, _ = _apply_classifications(
|
||||
detected,
|
||||
decisions,
|
||||
allow_project_promotions=False,
|
||||
)
|
||||
assert new["projects"] == []
|
||||
assert new["uncertain"][0]["name"] == "Terraform"
|
||||
assert new["uncertain"][0]["type"] == "uncertain"
|
||||
assert reclass == 0
|
||||
|
||||
|
||||
def test_apply_classifications_allows_project_promotion_for_prose_only_mode():
|
||||
detected = {
|
||||
"people": [],
|
||||
"projects": [],
|
||||
"uncertain": [
|
||||
{
|
||||
"name": "Project Aurora",
|
||||
"type": "uncertain",
|
||||
"confidence": 0.4,
|
||||
"frequency": 5,
|
||||
"signals": ["regex"],
|
||||
}
|
||||
],
|
||||
}
|
||||
decisions = {"Project Aurora": ("PROJECT", "user effort")}
|
||||
new, reclass, _ = _apply_classifications(detected, decisions)
|
||||
assert new["projects"][0]["name"] == "Project Aurora"
|
||||
assert new["projects"][0]["type"] == "project"
|
||||
assert reclass == 1
|
||||
|
||||
|
||||
# ── authoritative source filters ────────────────────────────────────────
|
||||
|
||||
|
||||
def test_is_authoritative_person_requires_git_signal():
|
||||
assert _is_authoritative_person({"signals": ["5 commits across 2 repos"]})
|
||||
assert not _is_authoritative_person({"signals": ["pronoun nearby (5x)"]})
|
||||
|
||||
|
||||
def test_is_authoritative_project_requires_manifest_or_git_signal():
|
||||
assert _is_authoritative_project({"signals": ["package.json, 12 of your commits"]})
|
||||
assert _is_authoritative_project({"signals": ["57 commits (none by you)"]})
|
||||
assert not _is_authoritative_project({"signals": ["code file reference (5x)"]})
|
||||
|
||||
|
||||
# ── refine_entities ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@@ -347,6 +445,93 @@ def test_refine_entities_skips_high_confidence_projects():
|
||||
assert provider.call_count == 0
|
||||
|
||||
|
||||
def test_refine_entities_refines_high_confidence_regex_projects():
|
||||
"""High-confidence regex projects still need LLM review without source signal."""
|
||||
detected = {
|
||||
"people": [],
|
||||
"projects": [
|
||||
{
|
||||
"name": "OpenAPI",
|
||||
"type": "project",
|
||||
"confidence": 0.99,
|
||||
"frequency": 5,
|
||||
"signals": ["code file reference (5x)"],
|
||||
}
|
||||
],
|
||||
"uncertain": [],
|
||||
}
|
||||
provider = FakeProvider(
|
||||
response_text=(
|
||||
'{"classifications": [{"name": "OpenAPI", "label": "TOPIC", "reason": "technology"}]}'
|
||||
)
|
||||
)
|
||||
result = refine_entities(detected, "OpenAPI schemas", provider, show_progress=False)
|
||||
assert provider.call_count == 1
|
||||
assert result.reclassified == 1
|
||||
assert result.merged["projects"] == []
|
||||
assert result.merged["uncertain"][0]["name"] == "OpenAPI"
|
||||
|
||||
|
||||
def test_refine_entities_refines_regex_people_but_skips_git_people():
|
||||
detected = {
|
||||
"people": [
|
||||
{
|
||||
"name": "Igor Lins e Silva",
|
||||
"type": "person",
|
||||
"confidence": 0.99,
|
||||
"frequency": 100,
|
||||
"signals": ["100 commits across 3 repos"],
|
||||
},
|
||||
{
|
||||
"name": "Tool",
|
||||
"type": "person",
|
||||
"confidence": 0.99,
|
||||
"frequency": 5,
|
||||
"signals": ["pronoun nearby (5x)"],
|
||||
},
|
||||
],
|
||||
"projects": [],
|
||||
"uncertain": [],
|
||||
}
|
||||
provider = FakeProvider(
|
||||
response_text='{"classifications": [{"name": "Tool", "label": "COMMON_WORD"}]}'
|
||||
)
|
||||
result = refine_entities(detected, "Tool is a common noun.", provider, show_progress=False)
|
||||
assert provider.call_count == 1
|
||||
names = [e["name"] for e in result.merged["people"]]
|
||||
assert names == ["Igor Lins e Silva"]
|
||||
assert result.dropped == 1
|
||||
|
||||
|
||||
def test_refine_entities_can_keep_llm_only_project_in_uncertain():
|
||||
detected = {
|
||||
"people": [],
|
||||
"projects": [],
|
||||
"uncertain": [
|
||||
{
|
||||
"name": "Terraform",
|
||||
"type": "uncertain",
|
||||
"confidence": 0.4,
|
||||
"frequency": 9,
|
||||
"signals": ["regex"],
|
||||
}
|
||||
],
|
||||
}
|
||||
provider = FakeProvider(
|
||||
response_text='{"classifications": [{"name": "Terraform", "label": "PROJECT"}]}'
|
||||
)
|
||||
result = refine_entities(
|
||||
detected,
|
||||
"Terraform config",
|
||||
provider,
|
||||
show_progress=False,
|
||||
allow_project_promotions=False,
|
||||
)
|
||||
assert result.merged["projects"] == []
|
||||
assert result.merged["uncertain"][0]["name"] == "Terraform"
|
||||
assert any("LLM: project" in s for s in result.merged["uncertain"][0]["signals"])
|
||||
|
||||
|
||||
def test_refine_entities_empty_candidates_returns_noop():
|
||||
detected = {"people": [], "projects": [], "uncertain": []}
|
||||
provider = FakeProvider()
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
|
||||
from mempalace.project_scanner import (
|
||||
PersonInfo,
|
||||
@@ -390,6 +391,49 @@ def test_discover_entities_prefers_real_signal_over_prose(tmp_path):
|
||||
assert "realproj" in proj_names
|
||||
|
||||
|
||||
def test_discover_entities_keeps_uncertain_for_llm_when_real_signal(tmp_path):
|
||||
"""With --llm, regex-uncertain prose candidates should reach refinement."""
|
||||
(tmp_path / "package.json").write_text(json.dumps({"name": "realproj"}))
|
||||
_init_git_repo(tmp_path)
|
||||
(tmp_path / "doc.md").write_text("Noise appeared. Noise repeated. Noise again.")
|
||||
|
||||
class FakeProvider:
|
||||
def __init__(self):
|
||||
self.prompts = []
|
||||
|
||||
def classify(self, _system, user, json_mode=True):
|
||||
self.prompts.append(user)
|
||||
return SimpleNamespace(
|
||||
text='{"classifications": [{"name": "Noise", "label": "COMMON_WORD"}]}'
|
||||
)
|
||||
|
||||
provider = FakeProvider()
|
||||
d = discover_entities(str(tmp_path), llm_provider=provider, show_progress=False)
|
||||
|
||||
assert len(provider.prompts) == 1
|
||||
assert "Noise" in provider.prompts[0]
|
||||
assert "Noise" not in [e["name"] for cat in d.values() for e in cat]
|
||||
|
||||
|
||||
def test_discover_entities_keeps_llm_only_project_uncertain_when_real_signal(tmp_path):
|
||||
"""Repo roots should not auto-promote LLM-only tools/topics into projects."""
|
||||
(tmp_path / "package.json").write_text(json.dumps({"name": "realproj"}))
|
||||
_init_git_repo(tmp_path)
|
||||
(tmp_path / "doc.md").write_text("Terraform shipped. Terraform changed. Terraform runs.")
|
||||
|
||||
class FakeProvider:
|
||||
def classify(self, _system, _user, json_mode=True):
|
||||
return SimpleNamespace(
|
||||
text='{"classifications": [{"name": "Terraform", "label": "PROJECT"}]}'
|
||||
)
|
||||
|
||||
d = discover_entities(str(tmp_path), llm_provider=FakeProvider(), show_progress=False)
|
||||
|
||||
assert "realproj" in [e["name"] for e in d["projects"]]
|
||||
assert "Terraform" not in [e["name"] for e in d["projects"]]
|
||||
assert "Terraform" in [e["name"] for e in d["uncertain"]]
|
||||
|
||||
|
||||
# ── _UnionFind basics ──────────────────────────────────────────────────
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user