#1148, #1150, and #1157 were reviewed and merged on GitHub, but the two stacked children landed on their parent feature branches (now stale) rather than on develop. Only #1148's commits reached develop via the direct merge. Release PR #1159 (develop → main for v3.3.3) is therefore missing the LLM refinement, Claude-conversation scanner, and miner- registry wire-up that were ostensibly part of the release. This merge brings the stale `feat/llm-entity-refine` branch (which contains the rolled-up merge commit for #1157 → #1150 → everything below) into develop so the release tag includes it. No code changes here — only history recovery.
This commit is contained in:
@@ -120,8 +120,7 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> li
|
||||
os.rename(seg_dir, target)
|
||||
moved.append(target)
|
||||
logger.warning(
|
||||
"Quarantined stale HNSW segment %s "
|
||||
"(sqlite %.0fs newer than HNSW); renamed to %s",
|
||||
"Quarantined stale HNSW segment %s (sqlite %.0fs newer than HNSW); renamed to %s",
|
||||
seg_dir,
|
||||
sqlite_mtime - hnsw_mtime,
|
||||
target,
|
||||
|
||||
+74
-5
@@ -86,21 +86,53 @@ def cmd_init(args):
|
||||
languages = cfg.entity_languages
|
||||
languages_tuple = tuple(languages)
|
||||
|
||||
# Optional phase-2 LLM provider (opt-in via --llm).
|
||||
llm_provider = None
|
||||
if getattr(args, "llm", False):
|
||||
from .llm_client import LLMError, get_provider
|
||||
|
||||
try:
|
||||
llm_provider = get_provider(
|
||||
name=args.llm_provider,
|
||||
model=args.llm_model,
|
||||
endpoint=args.llm_endpoint,
|
||||
api_key=args.llm_api_key,
|
||||
)
|
||||
except LLMError as e:
|
||||
print(f" ERROR: {e}", file=sys.stderr)
|
||||
sys.exit(2)
|
||||
ok, msg = llm_provider.check_available()
|
||||
if not ok:
|
||||
print(
|
||||
f" ERROR: LLM provider '{args.llm_provider}' unavailable: {msg}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit(2)
|
||||
print(f" LLM refinement enabled: {args.llm_provider}/{args.llm_model}")
|
||||
|
||||
# Pass 1: discover entities — manifests + git authors first, prose detection
|
||||
# as supplement for names mentioned only in docs/notes.
|
||||
# as supplement for names mentioned only in docs/notes. Optional phase-2
|
||||
# LLM refinement runs inside discover_entities when llm_provider is given.
|
||||
print(f"\n Scanning for entities in: {args.dir}")
|
||||
if languages_tuple != ("en",):
|
||||
print(f" Languages: {', '.join(languages_tuple)}")
|
||||
detected = discover_entities(args.dir, languages=languages_tuple)
|
||||
detected = discover_entities(args.dir, languages=languages_tuple, llm_provider=llm_provider)
|
||||
total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
|
||||
if total > 0:
|
||||
confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
|
||||
# Save confirmed entities to <project>/entities.json for the miner
|
||||
# Save confirmed entities to <project>/entities.json (per-project
|
||||
# audit trail — user can inspect or hand-edit) AND merge into the
|
||||
# global registry the miner reads at mine time.
|
||||
if confirmed["people"] or confirmed["projects"]:
|
||||
entities_path = Path(args.dir).expanduser().resolve() / "entities.json"
|
||||
with open(entities_path, "w") as f:
|
||||
json.dump(confirmed, f, indent=2)
|
||||
with open(entities_path, "w", encoding="utf-8") as f:
|
||||
json.dump(confirmed, f, indent=2, ensure_ascii=False)
|
||||
print(f" Entities saved: {entities_path}")
|
||||
|
||||
from .miner import add_to_known_entities
|
||||
|
||||
registry_path = add_to_known_entities(confirmed)
|
||||
print(f" Registry updated: {registry_path}")
|
||||
else:
|
||||
print(" No entities detected — proceeding with directory-based rooms.")
|
||||
|
||||
@@ -550,6 +582,43 @@ def main():
|
||||
"When given, the value is also persisted to config.json."
|
||||
),
|
||||
)
|
||||
p_init.add_argument(
|
||||
"--llm",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Enable LLM-assisted entity refinement (opt-in, local-first). "
|
||||
"Runs after manifest/git/regex detection, asking the configured "
|
||||
"provider to reclassify ambiguous candidates. "
|
||||
"Ctrl-C during refinement returns partial results."
|
||||
),
|
||||
)
|
||||
p_init.add_argument(
|
||||
"--llm-provider",
|
||||
default="ollama",
|
||||
choices=["ollama", "openai-compat", "anthropic"],
|
||||
help="LLM provider (default: ollama). Use --llm to enable.",
|
||||
)
|
||||
p_init.add_argument(
|
||||
"--llm-model",
|
||||
default="gemma4:e4b",
|
||||
help="Model name for the chosen provider (default: gemma4:e4b for Ollama).",
|
||||
)
|
||||
p_init.add_argument(
|
||||
"--llm-endpoint",
|
||||
default=None,
|
||||
help=(
|
||||
"Provider endpoint URL. Default for Ollama: http://localhost:11434. "
|
||||
"Required for openai-compat."
|
||||
),
|
||||
)
|
||||
p_init.add_argument(
|
||||
"--llm-api-key",
|
||||
default=None,
|
||||
help=(
|
||||
"API key for the provider. For anthropic, defaults to $ANTHROPIC_API_KEY; "
|
||||
"for openai-compat, defaults to $OPENAI_API_KEY."
|
||||
),
|
||||
)
|
||||
|
||||
# mine
|
||||
p_mine = sub.add_parser("mine", help="Mine files into the palace")
|
||||
|
||||
@@ -0,0 +1,160 @@
|
||||
"""
|
||||
convo_scanner.py — Parse Claude Code conversation directories into ProjectInfo.
|
||||
|
||||
Claude Code stores sessions under ``~/.claude/projects/<slug>/<id>.jsonl``,
|
||||
where the ``<slug>`` is the original CWD with ``/`` replaced by ``-``. That
|
||||
encoding is lossy: we can't tell whether ``foo-bar`` in a slug is the
|
||||
literal project name ``foo-bar`` or two path segments ``foo/bar``.
|
||||
|
||||
Fortunately, every message record in the JSONL carries a ``cwd`` field with
|
||||
the true path. This scanner reads one record per session to recover the
|
||||
accurate project name, falling back to slug-decoding only if the JSONL
|
||||
is malformed or empty.
|
||||
|
||||
Output is the same ``ProjectInfo`` shape used by ``project_scanner``, so the
|
||||
``discover_entities`` orchestrator can mix-and-match sources.
|
||||
|
||||
Public:
|
||||
is_claude_projects_root(path) -> bool
|
||||
scan_claude_projects(path) -> list[ProjectInfo]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from mempalace.project_scanner import ProjectInfo
|
||||
|
||||
|
||||
MAX_HEADER_LINES = 20 # lines to read per session looking for `cwd`
|
||||
|
||||
|
||||
def is_claude_projects_root(path: Path) -> bool:
|
||||
"""Return True if path looks like `.claude/projects/`.
|
||||
|
||||
Heuristic: at least one child dir whose name starts with ``-`` and which
|
||||
contains at least one ``.jsonl`` file.
|
||||
"""
|
||||
if not path.is_dir():
|
||||
return False
|
||||
try:
|
||||
children = list(path.iterdir())
|
||||
except OSError:
|
||||
return False
|
||||
for child in children:
|
||||
if not (child.is_dir() and child.name.startswith("-")):
|
||||
continue
|
||||
try:
|
||||
if any(p.suffix == ".jsonl" for p in child.iterdir() if p.is_file()):
|
||||
return True
|
||||
except OSError:
|
||||
continue
|
||||
return False
|
||||
|
||||
|
||||
def _extract_cwd_from_session(session_file: Path) -> Optional[str]:
|
||||
"""Return the ``cwd`` from the first message record that carries one.
|
||||
|
||||
Returns None if the file can't be read, has no JSON, or no record has cwd.
|
||||
"""
|
||||
try:
|
||||
with open(session_file, encoding="utf-8", errors="replace") as f:
|
||||
for i, line in enumerate(f):
|
||||
if i >= MAX_HEADER_LINES:
|
||||
break
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
obj = json.loads(line)
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
cwd = obj.get("cwd")
|
||||
if isinstance(cwd, str) and cwd:
|
||||
return cwd
|
||||
except OSError:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def _decode_slug_fallback(slug: str) -> str:
|
||||
"""Best-effort project name from slug when cwd is unavailable.
|
||||
|
||||
The slug is lossy (`/` and `-` both become `-`). Last non-empty segment
|
||||
is the closest guess at the project name, preserving kebab-case is
|
||||
impossible without cwd.
|
||||
"""
|
||||
stripped = slug.lstrip("-")
|
||||
parts = [p for p in stripped.split("-") if p]
|
||||
return parts[-1] if parts else slug
|
||||
|
||||
|
||||
def _safe_mtime(path: Path) -> float:
|
||||
"""Return file mtime, defaulting old on permission or filesystem errors."""
|
||||
try:
|
||||
return path.stat().st_mtime
|
||||
except OSError:
|
||||
return 0.0
|
||||
|
||||
|
||||
def _resolve_project_name(project_dir: Path) -> str:
|
||||
"""Read one session's cwd to recover the original project name.
|
||||
|
||||
Falls back to slug-decoding if no session has a readable cwd.
|
||||
"""
|
||||
sessions = sorted(
|
||||
(p for p in project_dir.iterdir() if p.is_file() and p.suffix == ".jsonl"),
|
||||
key=_safe_mtime,
|
||||
reverse=True, # newest first — most likely to be well-formed
|
||||
)
|
||||
for session in sessions:
|
||||
cwd = _extract_cwd_from_session(session)
|
||||
if cwd:
|
||||
return Path(cwd).name or cwd
|
||||
return _decode_slug_fallback(project_dir.name)
|
||||
|
||||
|
||||
def scan_claude_projects(path: str | Path) -> list[ProjectInfo]:
|
||||
"""Scan a ``.claude/projects/`` directory for Claude Code conversations.
|
||||
|
||||
One ProjectInfo per subdir. ``has_git`` is False (the directory isn't a
|
||||
repo itself) but ``total_commits`` is repurposed here as session count so
|
||||
the UX surfaces a density signal for ranking.
|
||||
"""
|
||||
root = Path(path).expanduser().resolve()
|
||||
if not is_claude_projects_root(root):
|
||||
return []
|
||||
|
||||
projects: dict[str, ProjectInfo] = {}
|
||||
for sub in sorted(root.iterdir()):
|
||||
if not (sub.is_dir() and sub.name.startswith("-")):
|
||||
continue
|
||||
try:
|
||||
sessions = [p for p in sub.iterdir() if p.is_file() and p.suffix == ".jsonl"]
|
||||
except OSError:
|
||||
continue
|
||||
if not sessions:
|
||||
continue
|
||||
|
||||
name = _resolve_project_name(sub)
|
||||
session_count = len(sessions)
|
||||
|
||||
proj = ProjectInfo(
|
||||
name=name,
|
||||
repo_root=sub,
|
||||
manifest=None,
|
||||
has_git=False,
|
||||
total_commits=session_count,
|
||||
user_commits=session_count,
|
||||
is_mine=True, # Claude Code sessions are authored by the user
|
||||
)
|
||||
existing = projects.get(name)
|
||||
if existing is None or session_count > existing.user_commits:
|
||||
projects[name] = proj
|
||||
|
||||
return sorted(
|
||||
projects.values(),
|
||||
key=lambda p: (-p.user_commits, p.name),
|
||||
)
|
||||
@@ -0,0 +1,305 @@
|
||||
"""
|
||||
llm_client.py — Minimal provider abstraction for LLM-assisted entity refinement.
|
||||
|
||||
Three providers cover the useful space:
|
||||
|
||||
- ``ollama`` (default): local models via http://localhost:11434. Works fully
|
||||
offline. Honors MemPalace's "zero-API required" principle.
|
||||
- ``openai-compat``: any OpenAI-compatible ``/v1/chat/completions`` endpoint.
|
||||
Covers OpenRouter, LM Studio, llama.cpp server, vLLM, Groq, Fireworks,
|
||||
Together, and most self-hosted setups.
|
||||
- ``anthropic``: the official Messages API. Opt-in for users who want Haiku
|
||||
quality without setting up a local model.
|
||||
|
||||
All providers expose the same ``classify(system, user, json_mode)`` method and
|
||||
the same ``check_available()`` probe. No external SDK dependencies — stdlib
|
||||
``urllib`` only.
|
||||
|
||||
JSON mode matters here: we always ask for structured output. Providers
|
||||
differ on how to request it (Ollama: ``format: json``; OpenAI-compat:
|
||||
``response_format``; Anthropic: prompt-level instruction) and this module
|
||||
normalizes that away from the caller.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
from urllib.error import HTTPError, URLError
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
|
||||
class LLMError(RuntimeError):
|
||||
"""Raised for any provider failure — transport, parse, auth, missing model."""
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLMResponse:
|
||||
text: str
|
||||
model: str
|
||||
provider: str
|
||||
raw: dict
|
||||
|
||||
|
||||
# ==================== BASE ====================
|
||||
|
||||
|
||||
class LLMProvider:
|
||||
name: str = "base"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model: str,
|
||||
endpoint: Optional[str] = None,
|
||||
api_key: Optional[str] = None,
|
||||
timeout: int = 120,
|
||||
):
|
||||
self.model = model
|
||||
self.endpoint = endpoint
|
||||
self.api_key = api_key
|
||||
self.timeout = timeout
|
||||
|
||||
def classify(self, system: str, user: str, json_mode: bool = True) -> LLMResponse:
|
||||
raise NotImplementedError
|
||||
|
||||
def check_available(self) -> tuple[bool, str]:
|
||||
"""Return ``(ok, message)``. Fast probe that the provider is reachable."""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def _http_post_json(url: str, body: dict, headers: dict, timeout: int) -> dict:
|
||||
"""POST JSON and return the parsed response. Raises LLMError on any failure."""
|
||||
req = Request(
|
||||
url,
|
||||
data=json.dumps(body).encode("utf-8"),
|
||||
headers={"Content-Type": "application/json", **headers},
|
||||
)
|
||||
try:
|
||||
with urlopen(req, timeout=timeout) as resp:
|
||||
return json.loads(resp.read())
|
||||
except HTTPError as e:
|
||||
detail = ""
|
||||
try:
|
||||
detail = e.read().decode("utf-8", errors="replace")[:500]
|
||||
except Exception:
|
||||
pass
|
||||
raise LLMError(f"HTTP {e.code} from {url}: {detail or e.reason}") from e
|
||||
except (URLError, OSError) as e:
|
||||
raise LLMError(f"Cannot reach {url}: {e}") from e
|
||||
except json.JSONDecodeError as e:
|
||||
raise LLMError(f"Malformed response from {url}: {e}") from e
|
||||
|
||||
|
||||
# ==================== OLLAMA ====================
|
||||
|
||||
|
||||
class OllamaProvider(LLMProvider):
|
||||
name = "ollama"
|
||||
DEFAULT_ENDPOINT = "http://localhost:11434"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model: str,
|
||||
endpoint: Optional[str] = None,
|
||||
timeout: int = 180,
|
||||
**_: object,
|
||||
):
|
||||
super().__init__(
|
||||
model=model,
|
||||
endpoint=endpoint or self.DEFAULT_ENDPOINT,
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
def check_available(self) -> tuple[bool, str]:
|
||||
try:
|
||||
with urlopen(f"{self.endpoint}/api/tags", timeout=5) as resp:
|
||||
data = json.loads(resp.read())
|
||||
except (URLError, HTTPError, OSError, json.JSONDecodeError) as e:
|
||||
return False, f"Cannot reach Ollama at {self.endpoint}: {e}"
|
||||
names = {m.get("name", "") for m in data.get("models", []) or []}
|
||||
# Ollama tags may or may not include ':latest' — accept either form
|
||||
wanted = {self.model, f"{self.model}:latest"}
|
||||
if not names & wanted:
|
||||
return (
|
||||
False,
|
||||
f"Model '{self.model}' not loaded in Ollama. Run: ollama pull {self.model}",
|
||||
)
|
||||
return True, "ok"
|
||||
|
||||
def classify(self, system: str, user: str, json_mode: bool = True) -> LLMResponse:
|
||||
body: dict = {
|
||||
"model": self.model,
|
||||
"messages": [
|
||||
{"role": "system", "content": system},
|
||||
{"role": "user", "content": user},
|
||||
],
|
||||
"stream": False,
|
||||
"options": {"temperature": 0.1},
|
||||
}
|
||||
if json_mode:
|
||||
body["format"] = "json"
|
||||
data = _http_post_json(f"{self.endpoint}/api/chat", body, headers={}, timeout=self.timeout)
|
||||
text = (data.get("message") or {}).get("content", "")
|
||||
if not text:
|
||||
raise LLMError(f"Empty response from Ollama (model={self.model})")
|
||||
return LLMResponse(text=text, model=self.model, provider=self.name, raw=data)
|
||||
|
||||
|
||||
# ==================== OPENAI-COMPAT ====================
|
||||
|
||||
|
||||
class OpenAICompatProvider(LLMProvider):
|
||||
"""Any OpenAI-compatible ``/v1/chat/completions`` endpoint.
|
||||
|
||||
Supply ``--llm-endpoint http://host:port`` (with or without ``/v1``).
|
||||
API key via ``--llm-api-key`` or the ``OPENAI_API_KEY`` env var.
|
||||
"""
|
||||
|
||||
name = "openai-compat"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model: str,
|
||||
endpoint: Optional[str] = None,
|
||||
api_key: Optional[str] = None,
|
||||
timeout: int = 120,
|
||||
**_: object,
|
||||
):
|
||||
resolved_key = api_key or os.environ.get("OPENAI_API_KEY")
|
||||
super().__init__(model=model, endpoint=endpoint, api_key=resolved_key, timeout=timeout)
|
||||
|
||||
def _resolve_url(self) -> str:
|
||||
if not self.endpoint:
|
||||
raise LLMError("openai-compat provider requires --llm-endpoint")
|
||||
url = self.endpoint.rstrip("/")
|
||||
if url.endswith("/chat/completions"):
|
||||
return url
|
||||
if not url.endswith("/v1"):
|
||||
url = f"{url}/v1"
|
||||
return f"{url}/chat/completions"
|
||||
|
||||
def check_available(self) -> tuple[bool, str]:
|
||||
if not self.endpoint:
|
||||
return False, "no --llm-endpoint configured"
|
||||
base = self.endpoint.rstrip("/")
|
||||
base = base.removesuffix("/chat/completions").removesuffix("/v1")
|
||||
try:
|
||||
req = Request(f"{base}/v1/models")
|
||||
if self.api_key:
|
||||
req.add_header("Authorization", f"Bearer {self.api_key}")
|
||||
with urlopen(req, timeout=5):
|
||||
pass
|
||||
except (URLError, HTTPError, OSError) as e:
|
||||
return False, f"Cannot reach {self.endpoint}: {e}"
|
||||
return True, "ok"
|
||||
|
||||
def classify(self, system: str, user: str, json_mode: bool = True) -> LLMResponse:
|
||||
body: dict = {
|
||||
"model": self.model,
|
||||
"messages": [
|
||||
{"role": "system", "content": system},
|
||||
{"role": "user", "content": user},
|
||||
],
|
||||
"temperature": 0.1,
|
||||
}
|
||||
if json_mode:
|
||||
body["response_format"] = {"type": "json_object"}
|
||||
headers = {}
|
||||
if self.api_key:
|
||||
headers["Authorization"] = f"Bearer {self.api_key}"
|
||||
data = _http_post_json(self._resolve_url(), body, headers=headers, timeout=self.timeout)
|
||||
try:
|
||||
text = data["choices"][0]["message"]["content"]
|
||||
except (KeyError, IndexError, TypeError) as e:
|
||||
raise LLMError(f"Unexpected response shape: {e}") from e
|
||||
if not text:
|
||||
raise LLMError(f"Empty response from {self.name} (model={self.model})")
|
||||
return LLMResponse(text=text, model=self.model, provider=self.name, raw=data)
|
||||
|
||||
|
||||
# ==================== ANTHROPIC ====================
|
||||
|
||||
|
||||
class AnthropicProvider(LLMProvider):
|
||||
name = "anthropic"
|
||||
DEFAULT_ENDPOINT = "https://api.anthropic.com"
|
||||
API_VERSION = "2023-06-01"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model: str,
|
||||
api_key: Optional[str] = None,
|
||||
endpoint: Optional[str] = None,
|
||||
timeout: int = 120,
|
||||
**_: object,
|
||||
):
|
||||
key = api_key or os.environ.get("ANTHROPIC_API_KEY")
|
||||
super().__init__(
|
||||
model=model,
|
||||
endpoint=endpoint or self.DEFAULT_ENDPOINT,
|
||||
api_key=key,
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
def check_available(self) -> tuple[bool, str]:
|
||||
if not self.api_key:
|
||||
return False, "ANTHROPIC_API_KEY not set (use --llm-api-key or env)"
|
||||
# Don't probe — a live request would cost money. First real call will
|
||||
# surface auth errors if the key is invalid.
|
||||
return True, "ok"
|
||||
|
||||
def classify(self, system: str, user: str, json_mode: bool = True) -> LLMResponse:
|
||||
if not self.api_key:
|
||||
raise LLMError("Anthropic provider requires ANTHROPIC_API_KEY env or --llm-api-key")
|
||||
sys_prompt = system
|
||||
if json_mode:
|
||||
sys_prompt += "\n\nRespond with valid JSON only, no prose."
|
||||
body = {
|
||||
"model": self.model,
|
||||
"max_tokens": 2048,
|
||||
"temperature": 0.1,
|
||||
"system": sys_prompt,
|
||||
"messages": [{"role": "user", "content": user}],
|
||||
}
|
||||
headers = {
|
||||
"X-API-Key": self.api_key,
|
||||
"anthropic-version": self.API_VERSION,
|
||||
}
|
||||
data = _http_post_json(
|
||||
f"{self.endpoint}/v1/messages", body, headers=headers, timeout=self.timeout
|
||||
)
|
||||
try:
|
||||
text = "".join(
|
||||
b.get("text", "") for b in data.get("content", []) or [] if b.get("type") == "text"
|
||||
)
|
||||
except (AttributeError, TypeError) as e:
|
||||
raise LLMError(f"Unexpected response shape: {e}") from e
|
||||
if not text:
|
||||
raise LLMError(f"Empty response from Anthropic (model={self.model})")
|
||||
return LLMResponse(text=text, model=self.model, provider=self.name, raw=data)
|
||||
|
||||
|
||||
# ==================== FACTORY ====================
|
||||
|
||||
|
||||
PROVIDERS: dict[str, type[LLMProvider]] = {
|
||||
"ollama": OllamaProvider,
|
||||
"openai-compat": OpenAICompatProvider,
|
||||
"anthropic": AnthropicProvider,
|
||||
}
|
||||
|
||||
|
||||
def get_provider(
|
||||
name: str,
|
||||
model: str,
|
||||
endpoint: Optional[str] = None,
|
||||
api_key: Optional[str] = None,
|
||||
timeout: int = 120,
|
||||
) -> LLMProvider:
|
||||
"""Build a provider by name. Raises LLMError on unknown provider."""
|
||||
cls = PROVIDERS.get(name)
|
||||
if cls is None:
|
||||
raise LLMError(f"Unknown provider '{name}'. Choices: {sorted(PROVIDERS.keys())}")
|
||||
return cls(model=model, endpoint=endpoint, api_key=api_key, timeout=timeout)
|
||||
@@ -0,0 +1,446 @@
|
||||
"""
|
||||
llm_refine.py — Optional LLM refinement of regex-detected entities.
|
||||
|
||||
Takes the candidate set produced by phase-1 detection (manifests, git
|
||||
authors, regex on prose) and asks an LLM to reclassify each candidate as
|
||||
PERSON / PROJECT / TOPIC / COMMON_WORD / AMBIGUOUS.
|
||||
|
||||
Design constraints:
|
||||
- Opt-in. Default init path never imports this module.
|
||||
- Local-first by default (Ollama).
|
||||
- Interactive UX: visible progress, clean cancellation (Ctrl-C returns
|
||||
whatever was classified before the interrupt).
|
||||
- Don't feed the raw corpus to the LLM — feed candidates + a few sampled
|
||||
context lines each. Keeps total input to ~50-100K tokens even for huge
|
||||
prose corpora.
|
||||
|
||||
Public:
|
||||
refine_entities(detected, corpus_text, provider, ...) -> dict
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
|
||||
from mempalace.llm_client import LLMError, LLMProvider
|
||||
|
||||
|
||||
BATCH_SIZE = 25 # candidates per LLM call; tuned for 4B local models
|
||||
CONTEXT_LINES_PER_CANDIDATE = 3
|
||||
CONTEXT_WINDOW_CHARS = 240 # max chars per context line to keep tokens bounded
|
||||
|
||||
# Valid labels the LLM is allowed to return. Anything else is treated as
|
||||
# AMBIGUOUS so the user reviews it.
|
||||
VALID_LABELS = {"PERSON", "PROJECT", "TOPIC", "COMMON_WORD", "AMBIGUOUS"}
|
||||
|
||||
|
||||
SYSTEM_PROMPT = """You are helping organize a user's memory palace by classifying capitalized tokens found in their files.
|
||||
|
||||
For each candidate, pick exactly ONE label:
|
||||
- PERSON: a specific real person the user knows (colleague, family, character they write about)
|
||||
- PROJECT: a named product, codebase, or effort the user works on
|
||||
- TOPIC: a recurring theme or subject (not a person, not a project) — cities, technologies, concepts
|
||||
- COMMON_WORD: an English word, verb, or fragment that isn't a named entity at all (e.g. "Created", "Before", "Never")
|
||||
- AMBIGUOUS: context is insufficient to decide between two of the above
|
||||
|
||||
Frameworks, runtimes, APIs, cloud services, vendors, and third-party products
|
||||
(e.g. Angular, OpenAPI, Terraform, Bun, Google) are TOPIC unless the context
|
||||
clearly says this is the user's own named codebase, product, or active effort.
|
||||
|
||||
Use the provided context lines to disambiguate. A capitalized word that only appears in metadata ("Created: 2026-04-24") is COMMON_WORD. A name that appears with pronouns and dialogue is PERSON.
|
||||
|
||||
Respond with JSON only. Schema:
|
||||
{"classifications": [{"name": "<exact candidate name>", "label": "<LABEL>", "reason": "<one short sentence>"}]}
|
||||
|
||||
One entry per candidate, same order as the input."""
|
||||
|
||||
|
||||
@dataclass
|
||||
class RefineResult:
|
||||
merged: dict # updated detected dict
|
||||
reclassified: int # entries whose type changed
|
||||
dropped: int # entries removed from the merged result (COMMON_WORD only)
|
||||
errors: list[str] # per-batch error messages (transport/parse failures)
|
||||
batches_completed: int
|
||||
batches_total: int
|
||||
cancelled: bool
|
||||
|
||||
|
||||
def _collect_contexts(
|
||||
corpus_lines: list[str], name: str, max_lines: int = CONTEXT_LINES_PER_CANDIDATE
|
||||
) -> list[str]:
|
||||
"""Return up to `max_lines` distinct lines from the corpus that mention `name`.
|
||||
|
||||
Case-insensitive token-boundary match. Lines are truncated to
|
||||
CONTEXT_WINDOW_CHARS chars to keep token usage bounded.
|
||||
"""
|
||||
needle = re.compile(rf"(?<!\w){re.escape(name)}(?!\w)", re.IGNORECASE)
|
||||
seen: set[str] = set()
|
||||
out: list[str] = []
|
||||
for line in corpus_lines:
|
||||
if not needle.search(line):
|
||||
continue
|
||||
trimmed = line.strip()[:CONTEXT_WINDOW_CHARS]
|
||||
if not trimmed or trimmed in seen:
|
||||
continue
|
||||
seen.add(trimmed)
|
||||
out.append(trimmed)
|
||||
if len(out) >= max_lines:
|
||||
break
|
||||
return out
|
||||
|
||||
|
||||
def _build_user_prompt(candidates_with_contexts: list[tuple[str, str, list[str]]]) -> str:
|
||||
"""Shape: for each candidate, list its current type guess + sampled contexts."""
|
||||
parts: list[str] = ["CANDIDATES:"]
|
||||
for i, (name, current_type, contexts) in enumerate(candidates_with_contexts, 1):
|
||||
parts.append(f"\n{i}. {name} (currently: {current_type})")
|
||||
if contexts:
|
||||
for c in contexts:
|
||||
parts.append(f" > {c}")
|
||||
else:
|
||||
parts.append(" > (no context available)")
|
||||
return "\n".join(parts)
|
||||
|
||||
|
||||
def _extract_json_candidates(text: str) -> list[str]:
|
||||
"""Return plausible JSON payloads extracted from an LLM response."""
|
||||
text = text.strip()
|
||||
if not text:
|
||||
return []
|
||||
|
||||
candidates: list[str] = [text]
|
||||
|
||||
for match in re.finditer(r"```(?:json)?\s*([\s\S]*?)\s*```", text, re.IGNORECASE):
|
||||
candidate = match.group(1).strip()
|
||||
if candidate and candidate not in candidates:
|
||||
candidates.append(candidate)
|
||||
|
||||
for start, opener in ((i, ch) for i, ch in enumerate(text) if ch in "{["):
|
||||
closer = "}" if opener == "{" else "]"
|
||||
depth = 0
|
||||
in_string = False
|
||||
escaped = False
|
||||
for i in range(start, len(text)):
|
||||
ch = text[i]
|
||||
if in_string:
|
||||
if escaped:
|
||||
escaped = False
|
||||
elif ch == "\\":
|
||||
escaped = True
|
||||
elif ch == '"':
|
||||
in_string = False
|
||||
continue
|
||||
|
||||
if ch == '"':
|
||||
in_string = True
|
||||
elif ch == opener:
|
||||
depth += 1
|
||||
elif ch == closer:
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
candidate = text[start : i + 1].strip()
|
||||
if candidate and candidate not in candidates:
|
||||
candidates.append(candidate)
|
||||
break
|
||||
|
||||
return candidates
|
||||
|
||||
|
||||
def _parse_response(text: str, expected_names: list[str]) -> dict[str, tuple[str, str]]:
|
||||
"""Parse the LLM's JSON response into {name: (label, reason)}.
|
||||
|
||||
Robust to the model occasionally wrapping JSON in text or returning
|
||||
slight schema variations. Falls back to matching by candidate name.
|
||||
"""
|
||||
data = None
|
||||
for candidate in _extract_json_candidates(text):
|
||||
try:
|
||||
data = json.loads(candidate)
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
if data is None:
|
||||
return {}
|
||||
|
||||
entries = data.get("classifications") if isinstance(data, dict) else data
|
||||
if not isinstance(entries, list):
|
||||
return {}
|
||||
|
||||
name_to_label: dict[str, tuple[str, str]] = {}
|
||||
expected_set = {n.lower(): n for n in expected_names}
|
||||
for entry in entries:
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
name = entry.get("name") or entry.get("candidate")
|
||||
label = entry.get("label") or entry.get("type") or entry.get("classification")
|
||||
reason = entry.get("reason") or ""
|
||||
if not isinstance(name, str) or not isinstance(label, str):
|
||||
continue
|
||||
# Restore canonical casing from expected_names
|
||||
canonical = expected_set.get(name.lower(), name)
|
||||
lbl = label.strip().upper()
|
||||
if lbl not in VALID_LABELS:
|
||||
lbl = "AMBIGUOUS"
|
||||
name_to_label[canonical] = (lbl, reason.strip()[:120])
|
||||
return name_to_label
|
||||
|
||||
|
||||
def _apply_classifications(
|
||||
detected: dict,
|
||||
decisions: dict[str, tuple[str, str]],
|
||||
allow_project_promotions: bool = True,
|
||||
) -> tuple[dict, int, int]:
|
||||
"""Merge LLM decisions back into the detected dict.
|
||||
|
||||
Returns (new_detected, reclassified_count, dropped_count).
|
||||
"""
|
||||
label_to_bucket = {
|
||||
"PERSON": "people",
|
||||
"PROJECT": "projects",
|
||||
"TOPIC": "uncertain",
|
||||
"AMBIGUOUS": "uncertain",
|
||||
}
|
||||
|
||||
# Index every entity by name for in-place update
|
||||
all_entries: list[tuple[str, dict]] = []
|
||||
for bucket, items in detected.items():
|
||||
for e in items:
|
||||
all_entries.append((bucket, e))
|
||||
|
||||
reclassified = 0
|
||||
dropped = 0
|
||||
new_detected: dict[str, list[dict]] = {
|
||||
"people": [],
|
||||
"projects": [],
|
||||
"uncertain": [],
|
||||
}
|
||||
|
||||
for old_bucket, entry in all_entries:
|
||||
decision = decisions.get(entry["name"])
|
||||
if decision is None:
|
||||
# No LLM opinion — keep as-is
|
||||
new_detected[old_bucket].append(entry)
|
||||
continue
|
||||
|
||||
label, reason = decision
|
||||
if label == "COMMON_WORD":
|
||||
dropped += 1
|
||||
continue
|
||||
|
||||
target_bucket = label_to_bucket[label]
|
||||
if (
|
||||
label == "PROJECT"
|
||||
and not allow_project_promotions
|
||||
and not _is_authoritative_project(entry)
|
||||
):
|
||||
target_bucket = "uncertain"
|
||||
updated = dict(entry)
|
||||
# Append the LLM's reason as a new signal so the user sees why it moved
|
||||
signals = list(updated.get("signals", []))
|
||||
signals.append(f"LLM: {label.lower()} — {reason}" if reason else f"LLM: {label.lower()}")
|
||||
updated["signals"] = signals
|
||||
if target_bucket != old_bucket:
|
||||
reclassified += 1
|
||||
updated["type"] = (
|
||||
"person"
|
||||
if target_bucket == "people"
|
||||
else "project"
|
||||
if target_bucket == "projects"
|
||||
else "uncertain"
|
||||
)
|
||||
new_detected[target_bucket].append(updated)
|
||||
|
||||
return new_detected, reclassified, dropped
|
||||
|
||||
|
||||
def _is_authoritative_person(entry: dict) -> bool:
|
||||
"""Return True for git-author people that should not be second-guessed."""
|
||||
signals = " ".join(entry.get("signals", [])).lower()
|
||||
return "commit" in signals and "repo" in signals
|
||||
|
||||
|
||||
def _is_authoritative_project(entry: dict) -> bool:
|
||||
"""Return True for manifest/git-backed projects that are already source-backed."""
|
||||
signals = " ".join(entry.get("signals", [])).lower()
|
||||
manifest_markers = ("package.json", "pyproject.toml", "cargo.toml", "go.mod")
|
||||
return any(marker in signals for marker in manifest_markers) or "commit" in signals
|
||||
|
||||
|
||||
def _print_progress(batch_idx: int, total: int, current_name: str) -> None:
|
||||
"""Overwrite-line progress indicator."""
|
||||
width = 40
|
||||
filled = int(width * batch_idx / total) if total else 0
|
||||
bar = "█" * filled + "░" * (width - filled)
|
||||
msg = f"\r LLM refine: [{bar}] batch {batch_idx}/{total} current: {current_name[:30]:<30}"
|
||||
sys.stderr.write(msg)
|
||||
sys.stderr.flush()
|
||||
|
||||
|
||||
def refine_entities(
|
||||
detected: dict,
|
||||
corpus_text: str,
|
||||
provider: LLMProvider,
|
||||
batch_size: int = BATCH_SIZE,
|
||||
show_progress: bool = True,
|
||||
allow_project_promotions: bool = True,
|
||||
) -> RefineResult:
|
||||
"""Reclassify detected entities using the LLM provider.
|
||||
|
||||
Only regex-derived candidates are sent for refinement. Git authors and
|
||||
manifest/git-backed projects are already source-backed and don't benefit
|
||||
from LLM second-guessing.
|
||||
|
||||
Ctrl-C during refinement: cancels the remaining batches, returns a
|
||||
RefineResult with ``cancelled=True`` and whatever was classified before
|
||||
the interrupt. The partial result is safe to pass straight to
|
||||
``confirm_entities``.
|
||||
|
||||
Transport or parse failures in individual batches are recorded in
|
||||
``errors`` and do not abort the run.
|
||||
|
||||
``allow_project_promotions=False`` keeps LLM-only project guesses in the
|
||||
uncertain bucket. This is useful when manifest/git signal already supplied
|
||||
canonical projects and regex/LLM hits are likely tools, vendors, or topics.
|
||||
"""
|
||||
candidates: list[tuple[str, str]] = []
|
||||
current_type = {"people": "person", "projects": "project", "uncertain": "uncertain"}
|
||||
for bucket in ("people", "projects", "uncertain"):
|
||||
for e in detected.get(bucket, []):
|
||||
if bucket == "people" and _is_authoritative_person(e):
|
||||
continue
|
||||
if bucket == "projects" and _is_authoritative_project(e):
|
||||
continue
|
||||
candidates.append((e["name"], current_type[bucket]))
|
||||
|
||||
corpus_lines = corpus_text.splitlines() if corpus_text else []
|
||||
|
||||
# Deduplicate candidate names while preserving order
|
||||
seen: set[str] = set()
|
||||
unique: list[tuple[str, str]] = []
|
||||
for name, kind in candidates:
|
||||
if name not in seen:
|
||||
seen.add(name)
|
||||
unique.append((name, kind))
|
||||
|
||||
if not unique:
|
||||
return RefineResult(
|
||||
merged=detected,
|
||||
reclassified=0,
|
||||
dropped=0,
|
||||
errors=[],
|
||||
batches_completed=0,
|
||||
batches_total=0,
|
||||
cancelled=False,
|
||||
)
|
||||
|
||||
# Build batches
|
||||
batches: list[list[tuple[str, str, list[str]]]] = []
|
||||
for i in range(0, len(unique), batch_size):
|
||||
chunk = unique[i : i + batch_size]
|
||||
enriched = [(name, kind, _collect_contexts(corpus_lines, name)) for name, kind in chunk]
|
||||
batches.append(enriched)
|
||||
|
||||
all_decisions: dict[str, tuple[str, str]] = {}
|
||||
errors: list[str] = []
|
||||
completed = 0
|
||||
cancelled = False
|
||||
|
||||
for idx, batch in enumerate(batches, 1):
|
||||
if show_progress and batch:
|
||||
_print_progress(idx - 1, len(batches), batch[0][0])
|
||||
user_prompt = _build_user_prompt(batch)
|
||||
try:
|
||||
resp = provider.classify(SYSTEM_PROMPT, user_prompt, json_mode=True)
|
||||
except KeyboardInterrupt:
|
||||
cancelled = True
|
||||
break
|
||||
except LLMError as e:
|
||||
errors.append(f"batch {idx}: {e}")
|
||||
continue
|
||||
names_in_batch = [name for name, _, _ in batch]
|
||||
decisions = _parse_response(resp.text, names_in_batch)
|
||||
if not decisions:
|
||||
errors.append(f"batch {idx}: could not parse response")
|
||||
all_decisions.update(decisions)
|
||||
completed += 1
|
||||
if show_progress:
|
||||
_print_progress(idx, len(batches), batch[-1][0])
|
||||
|
||||
if show_progress:
|
||||
sys.stderr.write("\n")
|
||||
sys.stderr.flush()
|
||||
|
||||
merged, reclassified, dropped = _apply_classifications(
|
||||
detected,
|
||||
all_decisions,
|
||||
allow_project_promotions=allow_project_promotions,
|
||||
)
|
||||
|
||||
return RefineResult(
|
||||
merged=merged,
|
||||
reclassified=reclassified,
|
||||
dropped=dropped,
|
||||
errors=errors,
|
||||
batches_completed=completed,
|
||||
batches_total=len(batches),
|
||||
cancelled=cancelled,
|
||||
)
|
||||
|
||||
|
||||
def collect_corpus_text(
|
||||
project_dir: str,
|
||||
max_files: int = 30,
|
||||
max_bytes_per_file: int = 20_000,
|
||||
) -> str:
|
||||
"""Gather prose text from ``project_dir`` for use as LLM context source.
|
||||
|
||||
Stratified: reads up to ``max_files`` prose files (``.md``, ``.txt``,
|
||||
``.rst``), preferring recently-modified. Each file capped at
|
||||
``max_bytes_per_file`` to bound total input.
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
from mempalace.entity_detector import PROSE_EXTENSIONS, SKIP_DIRS
|
||||
|
||||
root = Path(project_dir).expanduser().resolve()
|
||||
if not root.is_dir():
|
||||
return ""
|
||||
candidates: list[tuple[float, Path]] = []
|
||||
for dirpath, dirs, files in _walk_prose(root, SKIP_DIRS):
|
||||
for fname in files:
|
||||
p = dirpath / fname
|
||||
if p.suffix.lower() not in PROSE_EXTENSIONS:
|
||||
continue
|
||||
try:
|
||||
mtime = p.stat().st_mtime
|
||||
except OSError:
|
||||
continue
|
||||
candidates.append((mtime, p))
|
||||
candidates.sort(reverse=True)
|
||||
selected = [p for _, p in candidates[:max_files]]
|
||||
chunks: list[str] = []
|
||||
for p in selected:
|
||||
try:
|
||||
with open(p, encoding="utf-8", errors="replace") as f:
|
||||
chunks.append(f.read(max_bytes_per_file))
|
||||
except OSError:
|
||||
continue
|
||||
return "\n".join(chunks)
|
||||
|
||||
|
||||
def _walk_prose(root, skip_dirs):
|
||||
"""Walk a directory yielding (Path, dirs, files), pruning skip_dirs.
|
||||
|
||||
Inlined from ``project_scanner._walk`` to avoid a private-name import
|
||||
coupling. Functionality is intentionally narrow: prose collection only.
|
||||
"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
for dirpath, dirs, files in os.walk(root):
|
||||
dirs[:] = [d for d in dirs if d not in skip_dirs and not d.startswith(".")]
|
||||
yield Path(dirpath), dirs, files
|
||||
@@ -52,6 +52,7 @@ READABLE_EXTENSIONS = {
|
||||
}
|
||||
|
||||
SKIP_FILENAMES = {
|
||||
"entities.json",
|
||||
"mempalace.yaml",
|
||||
"mempalace.yml",
|
||||
"mempal.yaml",
|
||||
@@ -471,6 +472,97 @@ def _load_known_entities_raw() -> dict:
|
||||
return dict(_ENTITY_REGISTRY_CACHE["raw"])
|
||||
|
||||
|
||||
def add_to_known_entities(entities_by_category: dict) -> str:
|
||||
"""Union ``entities_by_category`` into ``~/.mempalace/known_entities.json``.
|
||||
|
||||
Accepts ``{category: [names]}`` shape as produced by ``mempalace init``
|
||||
and merges into the registry the miner reads at mine time. Existing
|
||||
categories are preserved untouched unless also present in the input;
|
||||
for categories present in both, entries are unioned case-insensitively
|
||||
without changing the on-disk ordering of pre-existing names.
|
||||
|
||||
If a category is stored on-disk as ``{name: code}`` (the alternate
|
||||
miner-supported shape, used by dialect-style configs), new names are
|
||||
added as keys with ``None`` values so existing code mappings aren't
|
||||
overwritten. A later compress pass can assign codes.
|
||||
|
||||
The in-process cache is invalidated on write so same-process callers
|
||||
(notably ``cmd_init`` → ``cmd_mine`` in sequence) see the update
|
||||
immediately instead of waiting for a mtime re-check.
|
||||
|
||||
Returns the registry path as a string for logging.
|
||||
"""
|
||||
import json as _json
|
||||
from pathlib import Path as _Path
|
||||
|
||||
registry_path = _Path(_ENTITY_REGISTRY_PATH)
|
||||
registry_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
existing: dict = {}
|
||||
if registry_path.exists():
|
||||
try:
|
||||
loaded = _json.loads(registry_path.read_text(encoding="utf-8"))
|
||||
if isinstance(loaded, dict):
|
||||
existing = loaded
|
||||
except (_json.JSONDecodeError, OSError):
|
||||
existing = {}
|
||||
|
||||
def _coerce_name(value):
|
||||
if not value:
|
||||
return None
|
||||
name = str(value)
|
||||
return name if name else None
|
||||
|
||||
for category, names in entities_by_category.items():
|
||||
if not isinstance(names, list) or not names:
|
||||
continue
|
||||
current = existing.get(category)
|
||||
if isinstance(current, list):
|
||||
seen_lower = {str(n).lower() for n in current}
|
||||
for n in names:
|
||||
name = _coerce_name(n)
|
||||
if not name:
|
||||
continue
|
||||
if name.lower() not in seen_lower:
|
||||
current.append(name)
|
||||
seen_lower.add(name.lower())
|
||||
elif isinstance(current, dict):
|
||||
seen_lower = {str(name).lower() for name in current}
|
||||
for n in names:
|
||||
name = _coerce_name(n)
|
||||
if not name or name.lower() in seen_lower:
|
||||
continue
|
||||
current[name] = None
|
||||
seen_lower.add(name.lower())
|
||||
else:
|
||||
# Missing or unrecognized shape — seed as a fresh list, deduped
|
||||
seen: set = set()
|
||||
ordered: list = []
|
||||
for n in names:
|
||||
name = _coerce_name(n)
|
||||
if not name:
|
||||
continue
|
||||
key = name.lower()
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
ordered.append(name)
|
||||
existing[category] = ordered
|
||||
|
||||
registry_path.write_text(_json.dumps(existing, indent=2, ensure_ascii=False), encoding="utf-8")
|
||||
try:
|
||||
registry_path.chmod(0o600)
|
||||
except (OSError, NotImplementedError):
|
||||
pass
|
||||
|
||||
# Invalidate in-process cache so later calls in the same run see the write.
|
||||
_ENTITY_REGISTRY_CACHE["mtime"] = None
|
||||
_ENTITY_REGISTRY_CACHE["names"] = frozenset()
|
||||
_ENTITY_REGISTRY_CACHE["raw"] = {}
|
||||
|
||||
return str(registry_path)
|
||||
|
||||
|
||||
_HALL_KEYWORDS_CACHE = None
|
||||
|
||||
|
||||
|
||||
@@ -594,6 +594,8 @@ def discover_entities(
|
||||
prose_file_cap: int = 10,
|
||||
project_cap: int = 15,
|
||||
people_cap: int = 15,
|
||||
llm_provider: object = None,
|
||||
show_progress: bool = True,
|
||||
) -> dict:
|
||||
"""Top-level entity discovery: real signals first, prose detection second.
|
||||
|
||||
@@ -604,10 +606,39 @@ def discover_entities(
|
||||
1. Package manifests (package.json, pyproject.toml, Cargo.toml, go.mod)
|
||||
→ canonical project names
|
||||
2. Git commit authors → real people with real commit counts
|
||||
3. Regex entity detection on prose files → supplementary names only
|
||||
3. Claude Code conversation dirs (~/.claude/projects/) → per-session
|
||||
project names (pulled from each session's ``cwd`` metadata)
|
||||
4. Regex entity detection on prose files → supplementary names only
|
||||
mentioned in docs/notes (not code)
|
||||
5. Optional LLM refinement pass — reclassifies ambiguous candidates
|
||||
using the caller-supplied provider
|
||||
|
||||
Passing ``llm_provider`` enables phase-2 refinement. The caller is
|
||||
responsible for constructing the provider (``llm_client.get_provider``)
|
||||
and confirming availability. Refinement is blocking-interactive:
|
||||
progress prints to stderr; Ctrl-C returns partial results.
|
||||
"""
|
||||
projects, people = scan(project_dir)
|
||||
|
||||
# If the target is a Claude Code conversations root, extract per-project
|
||||
# entries from there too. Same ProjectInfo shape, so dedup logic works.
|
||||
from mempalace.convo_scanner import is_claude_projects_root, scan_claude_projects
|
||||
|
||||
root_path = Path(project_dir).expanduser().resolve()
|
||||
if is_claude_projects_root(root_path):
|
||||
convo_projects = scan_claude_projects(root_path)
|
||||
# Dedup by name against the git-manifest list, preferring entries with
|
||||
# more user_commits as signal strength.
|
||||
by_name: dict[str, ProjectInfo] = {p.name: p for p in projects}
|
||||
for cp in convo_projects:
|
||||
existing = by_name.get(cp.name)
|
||||
if existing is None or cp.user_commits > existing.user_commits:
|
||||
by_name[cp.name] = cp
|
||||
projects = sorted(
|
||||
by_name.values(),
|
||||
key=lambda p: (not p.is_mine, -p.user_commits, -p.total_commits, p.name),
|
||||
)
|
||||
|
||||
real_signal = to_detected_dict(projects, people, project_cap=project_cap, people_cap=people_cap)
|
||||
|
||||
# Secondary pass: prose-only extraction catches names mentioned in docs
|
||||
@@ -621,11 +652,45 @@ def discover_entities(
|
||||
else {"people": [], "projects": [], "uncertain": []}
|
||||
)
|
||||
|
||||
# If git/manifests gave us real projects, suppress the regex "uncertain" bucket.
|
||||
# That bucket is mostly noise (common words, CamelCase tech terms, etc.) and
|
||||
# adding it to the review flow just makes the user do triage we can skip.
|
||||
# Without LLM refinement, suppress regex "uncertain" noise when real
|
||||
# manifest/git signal exists. With LLM refinement enabled, keep those
|
||||
# candidates so the model can promote real entities or drop common words.
|
||||
has_real_signal = bool(projects) or bool(people)
|
||||
return _merge_detected(real_signal, prose_detected, drop_secondary_uncertain=has_real_signal)
|
||||
merged = _merge_detected(
|
||||
real_signal,
|
||||
prose_detected,
|
||||
drop_secondary_uncertain=has_real_signal and llm_provider is None,
|
||||
)
|
||||
|
||||
# Optional phase 2: LLM refinement.
|
||||
if llm_provider is not None:
|
||||
from mempalace.llm_refine import collect_corpus_text, refine_entities
|
||||
|
||||
corpus = collect_corpus_text(str(project_dir))
|
||||
result = refine_entities(
|
||||
merged,
|
||||
corpus,
|
||||
llm_provider,
|
||||
show_progress=show_progress,
|
||||
allow_project_promotions=not has_real_signal,
|
||||
)
|
||||
if show_progress:
|
||||
status_bits = []
|
||||
if result.cancelled:
|
||||
status_bits.append("cancelled")
|
||||
if result.reclassified:
|
||||
status_bits.append(f"reclassified {result.reclassified}")
|
||||
if result.dropped:
|
||||
status_bits.append(f"dropped {result.dropped}")
|
||||
if result.errors:
|
||||
status_bits.append(f"{len(result.errors)} batch error(s)")
|
||||
if status_bits:
|
||||
import sys as _sys
|
||||
|
||||
print(f" LLM refine: {', '.join(status_bits)}", file=_sys.stderr)
|
||||
merged = result.merged
|
||||
|
||||
return merged
|
||||
|
||||
|
||||
# ==================== CLI ====================
|
||||
|
||||
Reference in New Issue
Block a user