fix(llm): tighter refinement — word boundaries, JSON extraction, authoritative sources
Addresses issues found while reviewing the initial phase-2 implementation against real data:

**Bug: uncertain bucket starved from the LLM.** `discover_entities` was dropping the regex-uncertain bucket whenever real git/manifest signal existed — which is exactly when `--llm` is most useful for cleaning up prose noise. The uncertain candidates never reached the refinement step. Fixed: the bucket is now dropped only when `llm_provider is None`.

**Context collection: word boundaries, not substring.** `_collect_contexts` used substring matching on lower-cased lines, so the name "Go" matched "good", "going", "forgot". Switched to a `(?<!\w)…(?!\w)` regex so short names only match at token boundaries.

**Authoritative-source detection replaces confidence threshold.** Previously the refinement step skipped entries with `confidence >= 0.95` to avoid second-guessing manifest-backed projects. That threshold was fragile — the regex detector produces 0.99 confidence for things like `code file reference (5x)` on framework names (OpenAPI, etc.), so those skipped the LLM despite being regex-only noise. New helpers `_is_authoritative_person` / `_is_authoritative_project` look at the actual signal strings (commits, package.json, etc.) to decide.

**Now also refines regex-derived people.** After #1148's high-pronoun-signal fix, the regex detector can promote non-people to the `people` bucket (e.g. a capitalized common noun that happened to appear near pronouns). The LLM now gets a chance to clean those up, while git-authored people are still skipped.

**Robust JSON extraction.** Small local models routinely wrap JSON output in prose ("Sure, here's the classification: {…}"). The previous code-fence stripper failed on that. `_extract_json_candidates` now does balanced-bracket extraction with string-aware quote handling, so it recovers JSON from:

- raw responses
- markdown fenced blocks
- JSON embedded inside surrounding text
- multiple candidate objects/arrays

**Prompt guidance for frameworks vs user projects.** Added an explicit instruction: frameworks, runtimes, APIs, cloud services, and third-party vendors (Angular, OpenAPI, Terraform, Bun, Google, etc.) are TOPIC unless the context clearly says it's the user's own codebase. Directly addresses a false-positive pattern observed during dev runs.

**Defensive mtime.** `convo_scanner._safe_mtime` catches OSError during `stat()` — permission changes, filesystem races, broken symlinks — and sorts the affected file to the end of the newest-first order rather than crashing the scan.

**Cosmetic:** merged two adjacent f-strings on the same line in `backends/chroma.py` and `llm_client.py` (no behaviour change).

15 new tests cover the OSError fallback, word-boundary matching, JSON extraction variants, authoritative-source helpers, refining high-confidence regex projects, and end-to-end LLM refinement preserving the uncertain bucket.
This commit is contained in:
@@ -120,8 +120,7 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> li
|
|||||||
os.rename(seg_dir, target)
|
os.rename(seg_dir, target)
|
||||||
moved.append(target)
|
moved.append(target)
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Quarantined stale HNSW segment %s "
|
"Quarantined stale HNSW segment %s (sqlite %.0fs newer than HNSW); renamed to %s",
|
||||||
"(sqlite %.0fs newer than HNSW); renamed to %s",
|
|
||||||
seg_dir,
|
seg_dir,
|
||||||
sqlite_mtime - hnsw_mtime,
|
sqlite_mtime - hnsw_mtime,
|
||||||
target,
|
target,
|
||||||
|
|||||||
@@ -91,6 +91,14 @@ def _decode_slug_fallback(slug: str) -> str:
|
|||||||
return parts[-1] if parts else slug
|
return parts[-1] if parts else slug
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_mtime(path: Path) -> float:
    """Return the modification time of ``path``, or ``0.0`` if ``stat()`` fails.

    Any ``OSError`` raised by ``path.stat()`` is swallowed and replaced by
    ``0.0``, so the function can be used directly as a sort key without one
    unreadable file aborting the whole sort. With a newest-first ordering the
    fallback value places such files last.
    """
    try:
        stat_result = path.stat()
    except OSError:
        return 0.0
    return stat_result.st_mtime
|
||||||
|
|
||||||
|
|
||||||
def _resolve_project_name(project_dir: Path) -> str:
|
def _resolve_project_name(project_dir: Path) -> str:
|
||||||
"""Read one session's cwd to recover the original project name.
|
"""Read one session's cwd to recover the original project name.
|
||||||
|
|
||||||
@@ -98,7 +106,7 @@ def _resolve_project_name(project_dir: Path) -> str:
|
|||||||
"""
|
"""
|
||||||
sessions = sorted(
|
sessions = sorted(
|
||||||
(p for p in project_dir.iterdir() if p.is_file() and p.suffix == ".jsonl"),
|
(p for p in project_dir.iterdir() if p.is_file() and p.suffix == ".jsonl"),
|
||||||
key=lambda p: p.stat().st_mtime,
|
key=_safe_mtime,
|
||||||
reverse=True, # newest first — most likely to be well-formed
|
reverse=True, # newest first — most likely to be well-formed
|
||||||
)
|
)
|
||||||
for session in sessions:
|
for session in sessions:
|
||||||
|
|||||||
@@ -124,7 +124,7 @@ class OllamaProvider(LLMProvider):
|
|||||||
if not names & wanted:
|
if not names & wanted:
|
||||||
return (
|
return (
|
||||||
False,
|
False,
|
||||||
f"Model '{self.model}' not loaded in Ollama. " f"Run: ollama pull {self.model}",
|
f"Model '{self.model}' not loaded in Ollama. Run: ollama pull {self.model}",
|
||||||
)
|
)
|
||||||
return True, "ok"
|
return True, "ok"
|
||||||
|
|
||||||
|
|||||||
+99
-21
@@ -46,6 +46,10 @@ For each candidate, pick exactly ONE label:
|
|||||||
- COMMON_WORD: an English word, verb, or fragment that isn't a named entity at all (e.g. "Created", "Before", "Never")
|
- COMMON_WORD: an English word, verb, or fragment that isn't a named entity at all (e.g. "Created", "Before", "Never")
|
||||||
- AMBIGUOUS: context is insufficient to decide between two of the above
|
- AMBIGUOUS: context is insufficient to decide between two of the above
|
||||||
|
|
||||||
|
Frameworks, runtimes, APIs, cloud services, vendors, and third-party products
|
||||||
|
(e.g. Angular, OpenAPI, Terraform, Bun, Google) are TOPIC unless the context
|
||||||
|
clearly says this is the user's own named codebase, product, or active effort.
|
||||||
|
|
||||||
Use the provided context lines to disambiguate. A capitalized word that only appears in metadata ("Created: 2026-04-24") is COMMON_WORD. A name that appears with pronouns and dialogue is PERSON.
|
Use the provided context lines to disambiguate. A capitalized word that only appears in metadata ("Created: 2026-04-24") is COMMON_WORD. A name that appears with pronouns and dialogue is PERSON.
|
||||||
|
|
||||||
Respond with JSON only. Schema:
|
Respond with JSON only. Schema:
|
||||||
@@ -58,7 +62,7 @@ One entry per candidate, same order as the input."""
|
|||||||
class RefineResult:
|
class RefineResult:
|
||||||
merged: dict # updated detected dict
|
merged: dict # updated detected dict
|
||||||
reclassified: int # entries whose type changed
|
reclassified: int # entries whose type changed
|
||||||
dropped: int # entries moved out (COMMON_WORD, or AMBIGUOUS sent to uncertain)
|
dropped: int # entries removed from the merged result (COMMON_WORD only)
|
||||||
errors: list[str] # per-batch error messages (transport/parse failures)
|
errors: list[str] # per-batch error messages (transport/parse failures)
|
||||||
batches_completed: int
|
batches_completed: int
|
||||||
batches_total: int
|
batches_total: int
|
||||||
@@ -70,14 +74,14 @@ def _collect_contexts(
|
|||||||
) -> list[str]:
|
) -> list[str]:
|
||||||
"""Return up to `max_lines` distinct lines from the corpus that mention `name`.
|
"""Return up to `max_lines` distinct lines from the corpus that mention `name`.
|
||||||
|
|
||||||
Case-insensitive substring match. Lines are truncated to
|
Case-insensitive token-boundary match. Lines are truncated to
|
||||||
CONTEXT_WINDOW_CHARS chars to keep token usage bounded.
|
CONTEXT_WINDOW_CHARS chars to keep token usage bounded.
|
||||||
"""
|
"""
|
||||||
needle = name.lower()
|
needle = re.compile(rf"(?<!\w){re.escape(name)}(?!\w)", re.IGNORECASE)
|
||||||
seen: set[str] = set()
|
seen: set[str] = set()
|
||||||
out: list[str] = []
|
out: list[str] = []
|
||||||
for line in corpus_lines:
|
for line in corpus_lines:
|
||||||
if needle not in line.lower():
|
if not needle.search(line):
|
||||||
continue
|
continue
|
||||||
trimmed = line.strip()[:CONTEXT_WINDOW_CHARS]
|
trimmed = line.strip()[:CONTEXT_WINDOW_CHARS]
|
||||||
if not trimmed or trimmed in seen:
|
if not trimmed or trimmed in seen:
|
||||||
@@ -102,20 +106,64 @@ def _build_user_prompt(candidates_with_contexts: list[tuple[str, str, list[str]]
|
|||||||
return "\n".join(parts)
|
return "\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_json_candidates(text: str) -> list[str]:
    """Return plausible JSON payloads extracted from an LLM response.

    Candidates are returned in the order a caller should try them:

    1. the whole response, stripped of surrounding whitespace;
    2. the contents of every markdown code fence (``` or ```json);
    3. every JSON object or array embedded in the text, left to right.

    Embedded payloads are located with ``json.JSONDecoder.raw_decode``
    rather than a hand-rolled bracket counter, so string escapes, brackets
    inside string literals and nesting are handled by the real JSON grammar.
    Only spans that actually decode are returned from step 3, and a span
    nested inside an already-accepted payload is not reported separately.
    The first candidate that parses is therefore the same one a
    balanced-bracket scan would have produced.

    Duplicates are dropped (first occurrence wins). Empty or whitespace-only
    input yields an empty list.
    """
    text = text.strip()
    if not text:
        return []

    candidates: list[str] = [text]

    def add(candidate: str) -> None:
        # Preserve discovery order while skipping blanks and repeats.
        if candidate and candidate not in candidates:
            candidates.append(candidate)

    for match in re.finditer(r"```(?:json)?\s*([\s\S]*?)\s*```", text, re.IGNORECASE):
        add(match.group(1).strip())

    decoder = json.JSONDecoder()
    opener = re.compile(r"[{\[]")
    pos = 0
    while True:
        found = opener.search(text, pos)
        if found is None:
            break
        start = found.start()
        try:
            _, end = decoder.raw_decode(text, start)
        except ValueError:
            # Not JSON at this bracket (e.g. "[note]" in prose, or a
            # truncated payload): retry from the next opener, which may be
            # a well-formed fragment nested inside the broken one.
            pos = start + 1
            continue
        add(text[start:end])
        # Jump past the decoded document so its nested brackets are not
        # re-reported as separate candidates.
        pos = end

    return candidates
|
||||||
|
|
||||||
|
|
||||||
def _parse_response(text: str, expected_names: list[str]) -> dict[str, tuple[str, str]]:
|
def _parse_response(text: str, expected_names: list[str]) -> dict[str, tuple[str, str]]:
|
||||||
"""Parse the LLM's JSON response into {name: (label, reason)}.
|
"""Parse the LLM's JSON response into {name: (label, reason)}.
|
||||||
|
|
||||||
Robust to the model occasionally wrapping JSON in text or returning
|
Robust to the model occasionally wrapping JSON in text or returning
|
||||||
slight schema variations. Falls back to matching by candidate name.
|
slight schema variations. Falls back to matching by candidate name.
|
||||||
"""
|
"""
|
||||||
# Strip any surrounding fences or prose
|
data = None
|
||||||
text = text.strip()
|
for candidate in _extract_json_candidates(text):
|
||||||
if text.startswith("```"):
|
|
||||||
text = re.sub(r"^```(?:json)?\s*", "", text)
|
|
||||||
text = re.sub(r"\s*```\s*$", "", text)
|
|
||||||
try:
|
try:
|
||||||
data = json.loads(text)
|
data = json.loads(candidate)
|
||||||
|
break
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
|
continue
|
||||||
|
if data is None:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
entries = data.get("classifications") if isinstance(data, dict) else data
|
entries = data.get("classifications") if isinstance(data, dict) else data
|
||||||
@@ -142,7 +190,9 @@ def _parse_response(text: str, expected_names: list[str]) -> dict[str, tuple[str
|
|||||||
|
|
||||||
|
|
||||||
def _apply_classifications(
|
def _apply_classifications(
|
||||||
detected: dict, decisions: dict[str, tuple[str, str]]
|
detected: dict,
|
||||||
|
decisions: dict[str, tuple[str, str]],
|
||||||
|
allow_project_promotions: bool = True,
|
||||||
) -> tuple[dict, int, int]:
|
) -> tuple[dict, int, int]:
|
||||||
"""Merge LLM decisions back into the detected dict.
|
"""Merge LLM decisions back into the detected dict.
|
||||||
|
|
||||||
@@ -182,6 +232,12 @@ def _apply_classifications(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
target_bucket = label_to_bucket[label]
|
target_bucket = label_to_bucket[label]
|
||||||
|
if (
|
||||||
|
label == "PROJECT"
|
||||||
|
and not allow_project_promotions
|
||||||
|
and not _is_authoritative_project(entry)
|
||||||
|
):
|
||||||
|
target_bucket = "uncertain"
|
||||||
updated = dict(entry)
|
updated = dict(entry)
|
||||||
# Append the LLM's reason as a new signal so the user sees why it moved
|
# Append the LLM's reason as a new signal so the user sees why it moved
|
||||||
signals = list(updated.get("signals", []))
|
signals = list(updated.get("signals", []))
|
||||||
@@ -201,6 +257,19 @@ def _apply_classifications(
|
|||||||
return new_detected, reclassified, dropped
|
return new_detected, reclassified, dropped
|
||||||
|
|
||||||
|
|
||||||
|
def _is_authoritative_person(entry: dict) -> bool:
    """Return True for git-author people that should not be second-guessed.

    An entry counts as git-backed when its detector signals mention both
    "commit" and "repo" (e.g. ``"5 commits across 2 repos"``).

    Signals written by the LLM refinement step are ignored: they carry the
    model's free-text reason, so a sentence such as "mentioned in commit
    messages of the repo" must not turn a regex-only guess into an
    authoritative one. A missing or ``None`` ``signals`` value is treated as
    "no signals".
    """
    raw_signals = entry.get("signals") or []
    # NOTE(review): assumes LLM-added signals start with "LLM:" (the form the
    # refinement tests look for) — confirm against _apply_classifications.
    evidence = " ".join(
        text.lower()
        for text in map(str, raw_signals)
        if not text.lstrip().lower().startswith("llm:")
    )
    return "commit" in evidence and "repo" in evidence
|
||||||
|
|
||||||
|
|
||||||
|
def _is_authoritative_project(entry: dict) -> bool:
    """Return True for manifest/git-backed projects that are already source-backed.

    An entry qualifies when any detector signal names a known manifest file
    (``package.json``, ``pyproject.toml``, ``Cargo.toml``, ``go.mod``) or
    mentions commits. Matching is case-insensitive.

    Signals written by the LLM refinement step are ignored: they carry the
    model's free-text reason, which may mention "commit" or a manifest name
    without the project actually being backed by one. A missing or ``None``
    ``signals`` value is treated as "no signals".
    """
    raw_signals = entry.get("signals") or []
    # NOTE(review): assumes LLM-added signals start with "LLM:" (the form the
    # refinement tests look for) — confirm against _apply_classifications.
    evidence = " ".join(
        text.lower()
        for text in map(str, raw_signals)
        if not text.lstrip().lower().startswith("llm:")
    )
    manifest_markers = ("package.json", "pyproject.toml", "cargo.toml", "go.mod")
    if "commit" in evidence:
        return True
    return any(marker in evidence for marker in manifest_markers)
|
||||||
|
|
||||||
|
|
||||||
def _print_progress(batch_idx: int, total: int, current_name: str) -> None:
|
def _print_progress(batch_idx: int, total: int, current_name: str) -> None:
|
||||||
"""Overwrite-line progress indicator."""
|
"""Overwrite-line progress indicator."""
|
||||||
width = 40
|
width = 40
|
||||||
@@ -217,12 +286,13 @@ def refine_entities(
|
|||||||
provider: LLMProvider,
|
provider: LLMProvider,
|
||||||
batch_size: int = BATCH_SIZE,
|
batch_size: int = BATCH_SIZE,
|
||||||
show_progress: bool = True,
|
show_progress: bool = True,
|
||||||
|
allow_project_promotions: bool = True,
|
||||||
) -> RefineResult:
|
) -> RefineResult:
|
||||||
"""Reclassify detected entities using the LLM provider.
|
"""Reclassify detected entities using the LLM provider.
|
||||||
|
|
||||||
Only candidates in the ``uncertain`` and ``projects`` buckets are sent for
|
Only regex-derived candidates are sent for refinement. Git authors and
|
||||||
refinement — ``people`` entries from git authorship are already
|
manifest/git-backed projects are already source-backed and don't benefit
|
||||||
high-confidence and don't benefit from LLM second-guessing.
|
from LLM second-guessing.
|
||||||
|
|
||||||
Ctrl-C during refinement: cancels the remaining batches, returns a
|
Ctrl-C during refinement: cancels the remaining batches, returns a
|
||||||
RefineResult with ``cancelled=True`` and whatever was classified before
|
RefineResult with ``cancelled=True`` and whatever was classified before
|
||||||
@@ -231,16 +301,20 @@ def refine_entities(
|
|||||||
|
|
||||||
Transport or parse failures in individual batches are recorded in
|
Transport or parse failures in individual batches are recorded in
|
||||||
``errors`` and do not abort the run.
|
``errors`` and do not abort the run.
|
||||||
|
|
||||||
|
``allow_project_promotions=False`` keeps LLM-only project guesses in the
|
||||||
|
uncertain bucket. This is useful when manifest/git signal already supplied
|
||||||
|
canonical projects and regex/LLM hits are likely tools, vendors, or topics.
|
||||||
"""
|
"""
|
||||||
# Only refine buckets that actually benefit — keep `people` as-is
|
|
||||||
# (git-authored people are already authoritative).
|
|
||||||
candidates: list[tuple[str, str]] = []
|
candidates: list[tuple[str, str]] = []
|
||||||
for bucket in ("projects", "uncertain"):
|
current_type = {"people": "person", "projects": "project", "uncertain": "uncertain"}
|
||||||
|
for bucket in ("people", "projects", "uncertain"):
|
||||||
for e in detected.get(bucket, []):
|
for e in detected.get(bucket, []):
|
||||||
# Skip already-high-confidence entries (manifest-backed projects etc.)
|
if bucket == "people" and _is_authoritative_person(e):
|
||||||
if e.get("confidence", 0) >= 0.95 and bucket == "projects":
|
|
||||||
continue
|
continue
|
||||||
candidates.append((e["name"], bucket.rstrip("s"))) # "projects" -> "project"
|
if bucket == "projects" and _is_authoritative_project(e):
|
||||||
|
continue
|
||||||
|
candidates.append((e["name"], current_type[bucket]))
|
||||||
|
|
||||||
corpus_lines = corpus_text.splitlines() if corpus_text else []
|
corpus_lines = corpus_text.splitlines() if corpus_text else []
|
||||||
|
|
||||||
@@ -300,7 +374,11 @@ def refine_entities(
|
|||||||
sys.stderr.write("\n")
|
sys.stderr.write("\n")
|
||||||
sys.stderr.flush()
|
sys.stderr.flush()
|
||||||
|
|
||||||
merged, reclassified, dropped = _apply_classifications(detected, all_decisions)
|
merged, reclassified, dropped = _apply_classifications(
|
||||||
|
detected,
|
||||||
|
all_decisions,
|
||||||
|
allow_project_promotions=allow_project_promotions,
|
||||||
|
)
|
||||||
|
|
||||||
return RefineResult(
|
return RefineResult(
|
||||||
merged=merged,
|
merged=merged,
|
||||||
|
|||||||
@@ -632,18 +632,28 @@ def discover_entities(
|
|||||||
else {"people": [], "projects": [], "uncertain": []}
|
else {"people": [], "projects": [], "uncertain": []}
|
||||||
)
|
)
|
||||||
|
|
||||||
# If git/manifests gave us real projects, suppress the regex "uncertain" bucket.
|
# Without LLM refinement, suppress regex "uncertain" noise when real
|
||||||
# That bucket is mostly noise (common words, CamelCase tech terms, etc.) and
|
# manifest/git signal exists. With LLM refinement enabled, keep those
|
||||||
# adding it to the review flow just makes the user do triage we can skip.
|
# candidates so the model can promote real entities or drop common words.
|
||||||
has_real_signal = bool(projects) or bool(people)
|
has_real_signal = bool(projects) or bool(people)
|
||||||
merged = _merge_detected(real_signal, prose_detected, drop_secondary_uncertain=has_real_signal)
|
merged = _merge_detected(
|
||||||
|
real_signal,
|
||||||
|
prose_detected,
|
||||||
|
drop_secondary_uncertain=has_real_signal and llm_provider is None,
|
||||||
|
)
|
||||||
|
|
||||||
# Optional phase 2: LLM refinement.
|
# Optional phase 2: LLM refinement.
|
||||||
if llm_provider is not None:
|
if llm_provider is not None:
|
||||||
from mempalace.llm_refine import collect_corpus_text, refine_entities
|
from mempalace.llm_refine import collect_corpus_text, refine_entities
|
||||||
|
|
||||||
corpus = collect_corpus_text(str(project_dir))
|
corpus = collect_corpus_text(str(project_dir))
|
||||||
result = refine_entities(merged, corpus, llm_provider, show_progress=show_progress)
|
result = refine_entities(
|
||||||
|
merged,
|
||||||
|
corpus,
|
||||||
|
llm_provider,
|
||||||
|
show_progress=show_progress,
|
||||||
|
allow_project_promotions=not has_real_signal,
|
||||||
|
)
|
||||||
if show_progress:
|
if show_progress:
|
||||||
status_bits = []
|
status_bits = []
|
||||||
if result.cancelled:
|
if result.cancelled:
|
||||||
|
|||||||
@@ -1,11 +1,13 @@
|
|||||||
"""Tests for mempalace.convo_scanner."""
|
"""Tests for mempalace.convo_scanner."""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from mempalace.convo_scanner import (
|
from mempalace.convo_scanner import (
|
||||||
_decode_slug_fallback,
|
_decode_slug_fallback,
|
||||||
_extract_cwd_from_session,
|
_extract_cwd_from_session,
|
||||||
_resolve_project_name,
|
_resolve_project_name,
|
||||||
|
_safe_mtime,
|
||||||
is_claude_projects_root,
|
is_claude_projects_root,
|
||||||
scan_claude_projects,
|
scan_claude_projects,
|
||||||
)
|
)
|
||||||
@@ -93,6 +95,23 @@ def test_decode_slug_fallback_only_dashes():
|
|||||||
assert _decode_slug_fallback("---") == "---"
|
assert _decode_slug_fallback("---") == "---"
|
||||||
|
|
||||||
|
|
||||||
|
# ── safe metadata helpers ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_safe_mtime_returns_zero_on_stat_error(tmp_path, monkeypatch):
    """``_safe_mtime`` falls back to 0.0 when ``stat()`` raises ``OSError``."""
    f = tmp_path / "session.jsonl"
    f.write_text("{}\n")
    original_stat = Path.stat

    # Mirror Path.stat's real signature (it accepts ``follow_symlinks`` on
    # Python 3.10+) so any other pathlib call made while the patch is active
    # is forwarded intact instead of failing with an unrelated TypeError.
    def fail_stat(self, *args, **kwargs):
        if self == f:
            raise OSError("permission denied")
        return original_stat(self, *args, **kwargs)

    monkeypatch.setattr(Path, "stat", fail_stat)
    assert _safe_mtime(f) == 0.0
|
||||||
|
|
||||||
|
|
||||||
# ── _resolve_project_name ───────────────────────────────────────────────
|
# ── _resolve_project_name ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -11,6 +11,9 @@ from mempalace.llm_refine import (
|
|||||||
_apply_classifications,
|
_apply_classifications,
|
||||||
_build_user_prompt,
|
_build_user_prompt,
|
||||||
_collect_contexts,
|
_collect_contexts,
|
||||||
|
_extract_json_candidates,
|
||||||
|
_is_authoritative_person,
|
||||||
|
_is_authoritative_project,
|
||||||
_parse_response,
|
_parse_response,
|
||||||
collect_corpus_text,
|
collect_corpus_text,
|
||||||
refine_entities,
|
refine_entities,
|
||||||
@@ -62,6 +65,16 @@ def test_collect_contexts_case_insensitive():
|
|||||||
assert out == ["lowercase alice mention"]
|
assert out == ["lowercase alice mention"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_collect_contexts_uses_token_boundaries():
|
||||||
|
lines = [
|
||||||
|
"forgot should not match",
|
||||||
|
"Go is a language.",
|
||||||
|
"go-v1 shipped.",
|
||||||
|
]
|
||||||
|
out = _collect_contexts(lines, "Go", max_lines=5)
|
||||||
|
assert out == ["Go is a language.", "go-v1 shipped."]
|
||||||
|
|
||||||
|
|
||||||
def test_collect_contexts_dedupes_identical_lines():
|
def test_collect_contexts_dedupes_identical_lines():
|
||||||
lines = ["Alice", "Alice", "Alice was here"]
|
lines = ["Alice", "Alice", "Alice was here"]
|
||||||
out = _collect_contexts(lines, "Alice", max_lines=5)
|
out = _collect_contexts(lines, "Alice", max_lines=5)
|
||||||
@@ -131,6 +144,30 @@ def test_parse_response_strips_code_fences():
|
|||||||
assert out["X"][0] == "TOPIC"
|
assert out["X"][0] == "TOPIC"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_response_extracts_json_after_prose():
|
||||||
|
text = 'Sure, here is the JSON: {"classifications": [{"name": "X", "label": "TOPIC"}]}'
|
||||||
|
out = _parse_response(text, ["X"])
|
||||||
|
assert out["X"][0] == "TOPIC"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_response_extracts_fenced_json_after_prose():
|
||||||
|
text = 'Sure:\n```json\n{"classifications": [{"name": "X", "label": "PROJECT"}]}\n```'
|
||||||
|
out = _parse_response(text, ["X"])
|
||||||
|
assert out["X"][0] == "PROJECT"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_json_candidates_handles_embedded_array():
|
||||||
|
text = 'prefix [{"name": "Y", "label": "PERSON"}] suffix'
|
||||||
|
candidates = _extract_json_candidates(text)
|
||||||
|
assert '[{"name": "Y", "label": "PERSON"}]' in candidates
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_response_ignores_non_json_brackets_before_payload():
|
||||||
|
text = 'See [note] first. JSON: {"classifications": [{"name": "X", "label": "TOPIC"}]}'
|
||||||
|
out = _parse_response(text, ["X"])
|
||||||
|
assert out["X"][0] == "TOPIC"
|
||||||
|
|
||||||
|
|
||||||
def test_parse_response_malformed_returns_empty():
|
def test_parse_response_malformed_returns_empty():
|
||||||
out = _parse_response("not json at all", ["X"])
|
out = _parse_response("not json at all", ["X"])
|
||||||
assert out == {}
|
assert out == {}
|
||||||
@@ -257,6 +294,67 @@ def test_apply_classifications_topic_goes_to_uncertain():
|
|||||||
assert reclass == 1
|
assert reclass == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_apply_classifications_can_block_llm_only_project_promotion():
|
||||||
|
detected = {
|
||||||
|
"people": [],
|
||||||
|
"projects": [],
|
||||||
|
"uncertain": [
|
||||||
|
{
|
||||||
|
"name": "Terraform",
|
||||||
|
"type": "uncertain",
|
||||||
|
"confidence": 0.4,
|
||||||
|
"frequency": 5,
|
||||||
|
"signals": ["regex"],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
decisions = {"Terraform": ("PROJECT", "tool")}
|
||||||
|
new, reclass, _ = _apply_classifications(
|
||||||
|
detected,
|
||||||
|
decisions,
|
||||||
|
allow_project_promotions=False,
|
||||||
|
)
|
||||||
|
assert new["projects"] == []
|
||||||
|
assert new["uncertain"][0]["name"] == "Terraform"
|
||||||
|
assert new["uncertain"][0]["type"] == "uncertain"
|
||||||
|
assert reclass == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_apply_classifications_allows_project_promotion_for_prose_only_mode():
|
||||||
|
detected = {
|
||||||
|
"people": [],
|
||||||
|
"projects": [],
|
||||||
|
"uncertain": [
|
||||||
|
{
|
||||||
|
"name": "Project Aurora",
|
||||||
|
"type": "uncertain",
|
||||||
|
"confidence": 0.4,
|
||||||
|
"frequency": 5,
|
||||||
|
"signals": ["regex"],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
decisions = {"Project Aurora": ("PROJECT", "user effort")}
|
||||||
|
new, reclass, _ = _apply_classifications(detected, decisions)
|
||||||
|
assert new["projects"][0]["name"] == "Project Aurora"
|
||||||
|
assert new["projects"][0]["type"] == "project"
|
||||||
|
assert reclass == 1
|
||||||
|
|
||||||
|
|
||||||
|
# ── authoritative source filters ────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_authoritative_person_requires_git_signal():
|
||||||
|
assert _is_authoritative_person({"signals": ["5 commits across 2 repos"]})
|
||||||
|
assert not _is_authoritative_person({"signals": ["pronoun nearby (5x)"]})
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_authoritative_project_requires_manifest_or_git_signal():
|
||||||
|
assert _is_authoritative_project({"signals": ["package.json, 12 of your commits"]})
|
||||||
|
assert _is_authoritative_project({"signals": ["57 commits (none by you)"]})
|
||||||
|
assert not _is_authoritative_project({"signals": ["code file reference (5x)"]})
|
||||||
|
|
||||||
|
|
||||||
# ── refine_entities ─────────────────────────────────────────────────────
|
# ── refine_entities ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
@@ -347,6 +445,93 @@ def test_refine_entities_skips_high_confidence_projects():
|
|||||||
assert provider.call_count == 0
|
assert provider.call_count == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_refine_entities_refines_high_confidence_regex_projects():
|
||||||
|
"""High-confidence regex projects still need LLM review without source signal."""
|
||||||
|
detected = {
|
||||||
|
"people": [],
|
||||||
|
"projects": [
|
||||||
|
{
|
||||||
|
"name": "OpenAPI",
|
||||||
|
"type": "project",
|
||||||
|
"confidence": 0.99,
|
||||||
|
"frequency": 5,
|
||||||
|
"signals": ["code file reference (5x)"],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"uncertain": [],
|
||||||
|
}
|
||||||
|
provider = FakeProvider(
|
||||||
|
response_text=(
|
||||||
|
'{"classifications": [{"name": "OpenAPI", "label": "TOPIC", "reason": "technology"}]}'
|
||||||
|
)
|
||||||
|
)
|
||||||
|
result = refine_entities(detected, "OpenAPI schemas", provider, show_progress=False)
|
||||||
|
assert provider.call_count == 1
|
||||||
|
assert result.reclassified == 1
|
||||||
|
assert result.merged["projects"] == []
|
||||||
|
assert result.merged["uncertain"][0]["name"] == "OpenAPI"
|
||||||
|
|
||||||
|
|
||||||
|
def test_refine_entities_refines_regex_people_but_skips_git_people():
|
||||||
|
detected = {
|
||||||
|
"people": [
|
||||||
|
{
|
||||||
|
"name": "Igor Lins e Silva",
|
||||||
|
"type": "person",
|
||||||
|
"confidence": 0.99,
|
||||||
|
"frequency": 100,
|
||||||
|
"signals": ["100 commits across 3 repos"],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Tool",
|
||||||
|
"type": "person",
|
||||||
|
"confidence": 0.99,
|
||||||
|
"frequency": 5,
|
||||||
|
"signals": ["pronoun nearby (5x)"],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
"projects": [],
|
||||||
|
"uncertain": [],
|
||||||
|
}
|
||||||
|
provider = FakeProvider(
|
||||||
|
response_text='{"classifications": [{"name": "Tool", "label": "COMMON_WORD"}]}'
|
||||||
|
)
|
||||||
|
result = refine_entities(detected, "Tool is a common noun.", provider, show_progress=False)
|
||||||
|
assert provider.call_count == 1
|
||||||
|
names = [e["name"] for e in result.merged["people"]]
|
||||||
|
assert names == ["Igor Lins e Silva"]
|
||||||
|
assert result.dropped == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_refine_entities_can_keep_llm_only_project_in_uncertain():
|
||||||
|
detected = {
|
||||||
|
"people": [],
|
||||||
|
"projects": [],
|
||||||
|
"uncertain": [
|
||||||
|
{
|
||||||
|
"name": "Terraform",
|
||||||
|
"type": "uncertain",
|
||||||
|
"confidence": 0.4,
|
||||||
|
"frequency": 9,
|
||||||
|
"signals": ["regex"],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
provider = FakeProvider(
|
||||||
|
response_text='{"classifications": [{"name": "Terraform", "label": "PROJECT"}]}'
|
||||||
|
)
|
||||||
|
result = refine_entities(
|
||||||
|
detected,
|
||||||
|
"Terraform config",
|
||||||
|
provider,
|
||||||
|
show_progress=False,
|
||||||
|
allow_project_promotions=False,
|
||||||
|
)
|
||||||
|
assert result.merged["projects"] == []
|
||||||
|
assert result.merged["uncertain"][0]["name"] == "Terraform"
|
||||||
|
assert any("LLM: project" in s for s in result.merged["uncertain"][0]["signals"])
|
||||||
|
|
||||||
|
|
||||||
def test_refine_entities_empty_candidates_returns_noop():
|
def test_refine_entities_empty_candidates_returns_noop():
|
||||||
detected = {"people": [], "projects": [], "uncertain": []}
|
detected = {"people": [], "projects": [], "uncertain": []}
|
||||||
provider = FakeProvider()
|
provider = FakeProvider()
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
import json
|
import json
|
||||||
import subprocess
|
import subprocess
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
from mempalace.project_scanner import (
|
from mempalace.project_scanner import (
|
||||||
PersonInfo,
|
PersonInfo,
|
||||||
@@ -390,6 +391,49 @@ def test_discover_entities_prefers_real_signal_over_prose(tmp_path):
|
|||||||
assert "realproj" in proj_names
|
assert "realproj" in proj_names
|
||||||
|
|
||||||
|
|
||||||
|
def test_discover_entities_keeps_uncertain_for_llm_when_real_signal(tmp_path):
|
||||||
|
"""With --llm, regex-uncertain prose candidates should reach refinement."""
|
||||||
|
(tmp_path / "package.json").write_text(json.dumps({"name": "realproj"}))
|
||||||
|
_init_git_repo(tmp_path)
|
||||||
|
(tmp_path / "doc.md").write_text("Noise appeared. Noise repeated. Noise again.")
|
||||||
|
|
||||||
|
class FakeProvider:
|
||||||
|
def __init__(self):
|
||||||
|
self.prompts = []
|
||||||
|
|
||||||
|
def classify(self, _system, user, json_mode=True):
|
||||||
|
self.prompts.append(user)
|
||||||
|
return SimpleNamespace(
|
||||||
|
text='{"classifications": [{"name": "Noise", "label": "COMMON_WORD"}]}'
|
||||||
|
)
|
||||||
|
|
||||||
|
provider = FakeProvider()
|
||||||
|
d = discover_entities(str(tmp_path), llm_provider=provider, show_progress=False)
|
||||||
|
|
||||||
|
assert len(provider.prompts) == 1
|
||||||
|
assert "Noise" in provider.prompts[0]
|
||||||
|
assert "Noise" not in [e["name"] for cat in d.values() for e in cat]
|
||||||
|
|
||||||
|
|
||||||
|
def test_discover_entities_keeps_llm_only_project_uncertain_when_real_signal(tmp_path):
|
||||||
|
"""Repo roots should not auto-promote LLM-only tools/topics into projects."""
|
||||||
|
(tmp_path / "package.json").write_text(json.dumps({"name": "realproj"}))
|
||||||
|
_init_git_repo(tmp_path)
|
||||||
|
(tmp_path / "doc.md").write_text("Terraform shipped. Terraform changed. Terraform runs.")
|
||||||
|
|
||||||
|
class FakeProvider:
|
||||||
|
def classify(self, _system, _user, json_mode=True):
|
||||||
|
return SimpleNamespace(
|
||||||
|
text='{"classifications": [{"name": "Terraform", "label": "PROJECT"}]}'
|
||||||
|
)
|
||||||
|
|
||||||
|
d = discover_entities(str(tmp_path), llm_provider=FakeProvider(), show_progress=False)
|
||||||
|
|
||||||
|
assert "realproj" in [e["name"] for e in d["projects"]]
|
||||||
|
assert "Terraform" not in [e["name"] for e in d["projects"]]
|
||||||
|
assert "Terraform" in [e["name"] for e in d["uncertain"]]
|
||||||
|
|
||||||
|
|
||||||
# ── _UnionFind basics ──────────────────────────────────────────────────
|
# ── _UnionFind basics ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user