chore: rescue merged stacked PRs #1150 and #1157 into develop

#1148, #1150, and #1157 were reviewed and merged on GitHub, but the two stacked children landed on their parent feature branches (now stale) rather than on develop. Only #1148's commits reached develop via the direct merge. Release PR #1159 (develop → main for v3.3.3) is therefore missing the LLM refinement, Claude-conversation scanner, and miner-registry wire-up that were ostensibly part of the release. This merge brings the stale `feat/llm-entity-refine` branch (which contains the rolled-up merge commit for #1157 → #1150 → everything below) into develop so the release tag includes it. No code changes here — only history recovery.
2026-04-24 13:49:12 -03:00
parent a851c7a7df 61d6c3cc3c
commit 19ce58c143
14 changed files with 2588 additions and 12 deletions
@@ -120,8 +120,7 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> li
            os.rename(seg_dir, target)
            moved.append(target)
            logger.warning(
-                "Quarantined stale HNSW segment %s "
-                "(sqlite %.0fs newer than HNSW); renamed to %s",
+                "Quarantined stale HNSW segment %s (sqlite %.0fs newer than HNSW); renamed to %s",
                seg_dir,
                sqlite_mtime - hnsw_mtime,
                target,
@@ -86,21 +86,53 @@ def cmd_init(args):
        languages = cfg.entity_languages
    languages_tuple = tuple(languages)

+    # Optional phase-2 LLM provider (opt-in via --llm).
+    llm_provider = None
+    if getattr(args, "llm", False):
+        from .llm_client import LLMError, get_provider
+
+        try:
+            llm_provider = get_provider(
+                name=args.llm_provider,
+                model=args.llm_model,
+                endpoint=args.llm_endpoint,
+                api_key=args.llm_api_key,
+            )
+        except LLMError as e:
+            print(f"  ERROR: {e}", file=sys.stderr)
+            sys.exit(2)
+        ok, msg = llm_provider.check_available()
+        if not ok:
+            print(
+                f"  ERROR: LLM provider '{args.llm_provider}' unavailable: {msg}",
+                file=sys.stderr,
+            )
+            sys.exit(2)
+        print(f"  LLM refinement enabled: {args.llm_provider}/{args.llm_model}")
+
    # Pass 1: discover entities — manifests + git authors first, prose detection
-    # as supplement for names mentioned only in docs/notes.
+    # as supplement for names mentioned only in docs/notes. Optional phase-2
+    # LLM refinement runs inside discover_entities when llm_provider is given.
    print(f"\n  Scanning for entities in: {args.dir}")
    if languages_tuple != ("en",):
        print(f"  Languages: {', '.join(languages_tuple)}")
-    detected = discover_entities(args.dir, languages=languages_tuple)
+    detected = discover_entities(args.dir, languages=languages_tuple, llm_provider=llm_provider)
    total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
    if total > 0:
        confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
-        # Save confirmed entities to <project>/entities.json for the miner
+        # Save confirmed entities to <project>/entities.json (per-project
+        # audit trail — user can inspect or hand-edit) AND merge into the
+        # global registry the miner reads at mine time.
        if confirmed["people"] or confirmed["projects"]:
            entities_path = Path(args.dir).expanduser().resolve() / "entities.json"
-            with open(entities_path, "w") as f:
-                json.dump(confirmed, f, indent=2)
+            with open(entities_path, "w", encoding="utf-8") as f:
+                json.dump(confirmed, f, indent=2, ensure_ascii=False)
            print(f"  Entities saved: {entities_path}")
+
+            from .miner import add_to_known_entities
+
+            registry_path = add_to_known_entities(confirmed)
+            print(f"  Registry updated: {registry_path}")
    else:
        print("  No entities detected — proceeding with directory-based rooms.")

@@ -550,6 +582,43 @@ def main():
            "When given, the value is also persisted to config.json."
        ),
    )
+    p_init.add_argument(
+        "--llm",
+        action="store_true",
+        help=(
+            "Enable LLM-assisted entity refinement (opt-in, local-first). "
+            "Runs after manifest/git/regex detection, asking the configured "
+            "provider to reclassify ambiguous candidates. "
+            "Ctrl-C during refinement returns partial results."
+        ),
+    )
+    p_init.add_argument(
+        "--llm-provider",
+        default="ollama",
+        choices=["ollama", "openai-compat", "anthropic"],
+        help="LLM provider (default: ollama). Use --llm to enable.",
+    )
+    p_init.add_argument(
+        "--llm-model",
+        default="gemma4:e4b",
+        help="Model name for the chosen provider (default: gemma4:e4b for Ollama).",
+    )
+    p_init.add_argument(
+        "--llm-endpoint",
+        default=None,
+        help=(
+            "Provider endpoint URL. Default for Ollama: http://localhost:11434. "
+            "Required for openai-compat."
+        ),
+    )
+    p_init.add_argument(
+        "--llm-api-key",
+        default=None,
+        help=(
+            "API key for the provider. For anthropic, defaults to $ANTHROPIC_API_KEY; "
+            "for openai-compat, defaults to $OPENAI_API_KEY."
+        ),
+    )

    # mine
    p_mine = sub.add_parser("mine", help="Mine files into the palace")
@@ -0,0 +1,160 @@
+"""
+convo_scanner.py — Parse Claude Code conversation directories into ProjectInfo.
+
+Claude Code stores sessions under ``~/.claude/projects/<slug>/<id>.jsonl``,
+where the ``<slug>`` is the original CWD with ``/`` replaced by ``-``. That
+encoding is lossy: we can't tell whether ``foo-bar`` in a slug is the
+literal project name ``foo-bar`` or two path segments ``foo/bar``.
+
+Fortunately, every message record in the JSONL carries a ``cwd`` field with
+the true path. This scanner reads one record per session to recover the
+accurate project name, falling back to slug-decoding only if the JSONL
+is malformed or empty.
+
+Output is the same ``ProjectInfo`` shape used by ``project_scanner``, so the
+``discover_entities`` orchestrator can mix-and-match sources.
+
+Public:
+    is_claude_projects_root(path) -> bool
+    scan_claude_projects(path) -> list[ProjectInfo]
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Optional
+
+from mempalace.project_scanner import ProjectInfo
+
+
+# Cap on how many physical lines of a session file are inspected while
+# searching for a record with a ``cwd`` field. Blank and unparseable lines
+# count towards the cap as well.
+MAX_HEADER_LINES = 20  # lines to read per session looking for `cwd`
+
+
+def is_claude_projects_root(path: Path) -> bool:
+    """Return True if path looks like `.claude/projects/`.
+
+    Heuristic: at least one child dir whose name starts with ``-`` and which
+    contains at least one ``.jsonl`` file.
+    """
+    if not path.is_dir():
+        return False
+    try:
+        children = list(path.iterdir())
+    except OSError:
+        return False
+    for child in children:
+        if not (child.is_dir() and child.name.startswith("-")):
+            continue
+        try:
+            if any(p.suffix == ".jsonl" for p in child.iterdir() if p.is_file()):
+                return True
+        except OSError:
+            continue
+    return False
+
+
+def _extract_cwd_from_session(session_file: Path) -> Optional[str]:
+    """Return the ``cwd`` from the first message record that carries one.
+
+    Returns None if the file can't be read, has no JSON, or no record has cwd.
+    """
+    try:
+        with open(session_file, encoding="utf-8", errors="replace") as f:
+            for i, line in enumerate(f):
+                if i >= MAX_HEADER_LINES:
+                    break
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    obj = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+                cwd = obj.get("cwd")
+                if isinstance(cwd, str) and cwd:
+                    return cwd
+    except OSError:
+        return None
+    return None
+
+
+def _decode_slug_fallback(slug: str) -> str:
+    """Best-effort project name from slug when cwd is unavailable.
+
+    The slug is lossy (`/` and `-` both become `-`). Last non-empty segment
+    is the closest guess at the project name, preserving kebab-case is
+    impossible without cwd.
+    """
+    stripped = slug.lstrip("-")
+    parts = [p for p in stripped.split("-") if p]
+    return parts[-1] if parts else slug
+
+
+def _safe_mtime(path: Path) -> float:
+    """Return file mtime, defaulting old on permission or filesystem errors."""
+    try:
+        return path.stat().st_mtime
+    except OSError:
+        return 0.0
+
+
+def _resolve_project_name(project_dir: Path) -> str:
+    """Read one session's cwd to recover the original project name.
+
+    Falls back to slug-decoding if no session has a readable cwd.
+    """
+    sessions = sorted(
+        (p for p in project_dir.iterdir() if p.is_file() and p.suffix == ".jsonl"),
+        key=_safe_mtime,
+        reverse=True,  # newest first — most likely to be well-formed
+    )
+    for session in sessions:
+        cwd = _extract_cwd_from_session(session)
+        if cwd:
+            return Path(cwd).name or cwd
+    return _decode_slug_fallback(project_dir.name)
+
+
+def scan_claude_projects(path: str | Path) -> list[ProjectInfo]:
+    """Scan a ``.claude/projects/`` directory for Claude Code conversations.
+
+    One ProjectInfo per subdir. ``has_git`` is False (the directory isn't a
+    repo itself) but ``total_commits`` is repurposed here as session count so
+    the UX surfaces a density signal for ranking.
+    """
+    root = Path(path).expanduser().resolve()
+    if not is_claude_projects_root(root):
+        return []
+
+    projects: dict[str, ProjectInfo] = {}
+    for sub in sorted(root.iterdir()):
+        if not (sub.is_dir() and sub.name.startswith("-")):
+            continue
+        try:
+            sessions = [p for p in sub.iterdir() if p.is_file() and p.suffix == ".jsonl"]
+        except OSError:
+            continue
+        if not sessions:
+            continue
+
+        name = _resolve_project_name(sub)
+        session_count = len(sessions)
+
+        proj = ProjectInfo(
+            name=name,
+            repo_root=sub,
+            manifest=None,
+            has_git=False,
+            total_commits=session_count,
+            user_commits=session_count,
+            is_mine=True,  # Claude Code sessions are authored by the user
+        )
+        existing = projects.get(name)
+        if existing is None or session_count > existing.user_commits:
+            projects[name] = proj
+
+    return sorted(
+        projects.values(),
+        key=lambda p: (-p.user_commits, p.name),
+    )
@@ -0,0 +1,305 @@
+"""
+llm_client.py — Minimal provider abstraction for LLM-assisted entity refinement.
+
+Three providers cover the useful space:
+
+- ``ollama`` (default): local models via http://localhost:11434. Works fully
+  offline. Honors MemPalace's "zero-API required" principle.
+- ``openai-compat``: any OpenAI-compatible ``/v1/chat/completions`` endpoint.
+  Covers OpenRouter, LM Studio, llama.cpp server, vLLM, Groq, Fireworks,
+  Together, and most self-hosted setups.
+- ``anthropic``: the official Messages API. Opt-in for users who want Haiku
+  quality without setting up a local model.
+
+All providers expose the same ``classify(system, user, json_mode)`` method and
+the same ``check_available()`` probe. No external SDK dependencies — stdlib
+``urllib`` only.
+
+JSON mode matters here: we always ask for structured output. Providers
+differ on how to request it (Ollama: ``format: json``; OpenAI-compat:
+``response_format``; Anthropic: prompt-level instruction) and this module
+normalizes that away from the caller.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from dataclasses import dataclass
+from typing import Optional
+from urllib.error import HTTPError, URLError
+from urllib.request import Request, urlopen
+
+
+class LLMError(RuntimeError):
+    """Raised for any provider failure — transport, parse, auth, missing model."""
+
+
+@dataclass
+class LLMResponse:
+    text: str
+    model: str
+    provider: str
+    raw: dict
+
+
+# ==================== BASE ====================
+
+
+class LLMProvider:
+    name: str = "base"
+
+    def __init__(
+        self,
+        model: str,
+        endpoint: Optional[str] = None,
+        api_key: Optional[str] = None,
+        timeout: int = 120,
+    ):
+        self.model = model
+        self.endpoint = endpoint
+        self.api_key = api_key
+        self.timeout = timeout
+
+    def classify(self, system: str, user: str, json_mode: bool = True) -> LLMResponse:
+        raise NotImplementedError
+
+    def check_available(self) -> tuple[bool, str]:
+        """Return ``(ok, message)``. Fast probe that the provider is reachable."""
+        raise NotImplementedError
+
+
+def _http_post_json(url: str, body: dict, headers: dict, timeout: int) -> dict:
+    """POST JSON and return the parsed response. Raises LLMError on any failure."""
+    req = Request(
+        url,
+        data=json.dumps(body).encode("utf-8"),
+        headers={"Content-Type": "application/json", **headers},
+    )
+    try:
+        with urlopen(req, timeout=timeout) as resp:
+            return json.loads(resp.read())
+    except HTTPError as e:
+        detail = ""
+        try:
+            detail = e.read().decode("utf-8", errors="replace")[:500]
+        except Exception:
+            pass
+        raise LLMError(f"HTTP {e.code} from {url}: {detail or e.reason}") from e
+    except (URLError, OSError) as e:
+        raise LLMError(f"Cannot reach {url}: {e}") from e
+    except json.JSONDecodeError as e:
+        raise LLMError(f"Malformed response from {url}: {e}") from e
+
+
+# ==================== OLLAMA ====================
+
+
+class OllamaProvider(LLMProvider):
+    name = "ollama"
+    DEFAULT_ENDPOINT = "http://localhost:11434"
+
+    def __init__(
+        self,
+        model: str,
+        endpoint: Optional[str] = None,
+        timeout: int = 180,
+        **_: object,
+    ):
+        super().__init__(
+            model=model,
+            endpoint=endpoint or self.DEFAULT_ENDPOINT,
+            timeout=timeout,
+        )
+
+    def check_available(self) -> tuple[bool, str]:
+        try:
+            with urlopen(f"{self.endpoint}/api/tags", timeout=5) as resp:
+                data = json.loads(resp.read())
+        except (URLError, HTTPError, OSError, json.JSONDecodeError) as e:
+            return False, f"Cannot reach Ollama at {self.endpoint}: {e}"
+        names = {m.get("name", "") for m in data.get("models", []) or []}
+        # Ollama tags may or may not include ':latest' — accept either form
+        wanted = {self.model, f"{self.model}:latest"}
+        if not names & wanted:
+            return (
+                False,
+                f"Model '{self.model}' not loaded in Ollama. Run: ollama pull {self.model}",
+            )
+        return True, "ok"
+
+    def classify(self, system: str, user: str, json_mode: bool = True) -> LLMResponse:
+        body: dict = {
+            "model": self.model,
+            "messages": [
+                {"role": "system", "content": system},
+                {"role": "user", "content": user},
+            ],
+            "stream": False,
+            "options": {"temperature": 0.1},
+        }
+        if json_mode:
+            body["format"] = "json"
+        data = _http_post_json(f"{self.endpoint}/api/chat", body, headers={}, timeout=self.timeout)
+        text = (data.get("message") or {}).get("content", "")
+        if not text:
+            raise LLMError(f"Empty response from Ollama (model={self.model})")
+        return LLMResponse(text=text, model=self.model, provider=self.name, raw=data)
+
+
+# ==================== OPENAI-COMPAT ====================
+
+
+class OpenAICompatProvider(LLMProvider):
+    """Any OpenAI-compatible ``/v1/chat/completions`` endpoint.
+
+    Supply ``--llm-endpoint http://host:port`` (with or without ``/v1``).
+    API key via ``--llm-api-key`` or the ``OPENAI_API_KEY`` env var.
+    """
+
+    name = "openai-compat"
+
+    def __init__(
+        self,
+        model: str,
+        endpoint: Optional[str] = None,
+        api_key: Optional[str] = None,
+        timeout: int = 120,
+        **_: object,
+    ):
+        resolved_key = api_key or os.environ.get("OPENAI_API_KEY")
+        super().__init__(model=model, endpoint=endpoint, api_key=resolved_key, timeout=timeout)
+
+    def _resolve_url(self) -> str:
+        if not self.endpoint:
+            raise LLMError("openai-compat provider requires --llm-endpoint")
+        url = self.endpoint.rstrip("/")
+        if url.endswith("/chat/completions"):
+            return url
+        if not url.endswith("/v1"):
+            url = f"{url}/v1"
+        return f"{url}/chat/completions"
+
+    def check_available(self) -> tuple[bool, str]:
+        if not self.endpoint:
+            return False, "no --llm-endpoint configured"
+        base = self.endpoint.rstrip("/")
+        base = base.removesuffix("/chat/completions").removesuffix("/v1")
+        try:
+            req = Request(f"{base}/v1/models")
+            if self.api_key:
+                req.add_header("Authorization", f"Bearer {self.api_key}")
+            with urlopen(req, timeout=5):
+                pass
+        except (URLError, HTTPError, OSError) as e:
+            return False, f"Cannot reach {self.endpoint}: {e}"
+        return True, "ok"
+
+    def classify(self, system: str, user: str, json_mode: bool = True) -> LLMResponse:
+        body: dict = {
+            "model": self.model,
+            "messages": [
+                {"role": "system", "content": system},
+                {"role": "user", "content": user},
+            ],
+            "temperature": 0.1,
+        }
+        if json_mode:
+            body["response_format"] = {"type": "json_object"}
+        headers = {}
+        if self.api_key:
+            headers["Authorization"] = f"Bearer {self.api_key}"
+        data = _http_post_json(self._resolve_url(), body, headers=headers, timeout=self.timeout)
+        try:
+            text = data["choices"][0]["message"]["content"]
+        except (KeyError, IndexError, TypeError) as e:
+            raise LLMError(f"Unexpected response shape: {e}") from e
+        if not text:
+            raise LLMError(f"Empty response from {self.name} (model={self.model})")
+        return LLMResponse(text=text, model=self.model, provider=self.name, raw=data)
+
+
+# ==================== ANTHROPIC ====================
+
+
+class AnthropicProvider(LLMProvider):
+    name = "anthropic"
+    DEFAULT_ENDPOINT = "https://api.anthropic.com"
+    API_VERSION = "2023-06-01"
+
+    def __init__(
+        self,
+        model: str,
+        api_key: Optional[str] = None,
+        endpoint: Optional[str] = None,
+        timeout: int = 120,
+        **_: object,
+    ):
+        key = api_key or os.environ.get("ANTHROPIC_API_KEY")
+        super().__init__(
+            model=model,
+            endpoint=endpoint or self.DEFAULT_ENDPOINT,
+            api_key=key,
+            timeout=timeout,
+        )
+
+    def check_available(self) -> tuple[bool, str]:
+        if not self.api_key:
+            return False, "ANTHROPIC_API_KEY not set (use --llm-api-key or env)"
+        # Don't probe — a live request would cost money. First real call will
+        # surface auth errors if the key is invalid.
+        return True, "ok"
+
+    def classify(self, system: str, user: str, json_mode: bool = True) -> LLMResponse:
+        if not self.api_key:
+            raise LLMError("Anthropic provider requires ANTHROPIC_API_KEY env or --llm-api-key")
+        sys_prompt = system
+        if json_mode:
+            sys_prompt += "\n\nRespond with valid JSON only, no prose."
+        body = {
+            "model": self.model,
+            "max_tokens": 2048,
+            "temperature": 0.1,
+            "system": sys_prompt,
+            "messages": [{"role": "user", "content": user}],
+        }
+        headers = {
+            "X-API-Key": self.api_key,
+            "anthropic-version": self.API_VERSION,
+        }
+        data = _http_post_json(
+            f"{self.endpoint}/v1/messages", body, headers=headers, timeout=self.timeout
+        )
+        try:
+            text = "".join(
+                b.get("text", "") for b in data.get("content", []) or [] if b.get("type") == "text"
+            )
+        except (AttributeError, TypeError) as e:
+            raise LLMError(f"Unexpected response shape: {e}") from e
+        if not text:
+            raise LLMError(f"Empty response from Anthropic (model={self.model})")
+        return LLMResponse(text=text, model=self.model, provider=self.name, raw=data)
+
+
+# ==================== FACTORY ====================
+
+
+# Registry mapping each provider name to the class implementing it.
+# ``get_provider`` looks names up here and lists the keys in its error message.
+PROVIDERS: dict[str, type[LLMProvider]] = {
+    "ollama": OllamaProvider,
+    "openai-compat": OpenAICompatProvider,
+    "anthropic": AnthropicProvider,
+}
+
+
+def get_provider(
+    name: str,
+    model: str,
+    endpoint: Optional[str] = None,
+    api_key: Optional[str] = None,
+    timeout: int = 120,
+) -> LLMProvider:
+    """Build a provider by name. Raises LLMError on unknown provider."""
+    cls = PROVIDERS.get(name)
+    if cls is None:
+        raise LLMError(f"Unknown provider '{name}'. Choices: {sorted(PROVIDERS.keys())}")
+    return cls(model=model, endpoint=endpoint, api_key=api_key, timeout=timeout)
@@ -0,0 +1,446 @@
+"""
+llm_refine.py — Optional LLM refinement of regex-detected entities.
+
+Takes the candidate set produced by phase-1 detection (manifests, git
+authors, regex on prose) and asks an LLM to reclassify each candidate as
+PERSON / PROJECT / TOPIC / COMMON_WORD / AMBIGUOUS.
+
+Design constraints:
+- Opt-in. Default init path never imports this module.
+- Local-first by default (Ollama).
+- Interactive UX: visible progress, clean cancellation (Ctrl-C returns
+  whatever was classified before the interrupt).
+- Don't feed the raw corpus to the LLM — feed candidates + a few sampled
+  context lines each. Keeps total input to ~50-100K tokens even for huge
+  prose corpora.
+
+Public:
+    refine_entities(detected, corpus_text, provider, ...) -> dict
+"""
+
+from __future__ import annotations
+
+import json
+import re
+import sys
+from dataclasses import dataclass
+
+from mempalace.llm_client import LLMError, LLMProvider
+
+
+# Default ``batch_size`` of ``refine_entities``.
+BATCH_SIZE = 25  # candidates per LLM call; tuned for 4B local models
+# Default ``max_lines`` of ``_collect_contexts``.
+CONTEXT_LINES_PER_CANDIDATE = 3
+# ``_collect_contexts`` truncates every stripped corpus line to this length.
+CONTEXT_WINDOW_CHARS = 240  # max chars per context line to keep tokens bounded
+
+# Valid labels the LLM is allowed to return. Anything else is treated as
+# AMBIGUOUS so the user reviews it.
+VALID_LABELS = {"PERSON", "PROJECT", "TOPIC", "COMMON_WORD", "AMBIGUOUS"}
+
+
+# Instruction text describing the labelling task to the model. The five
+# labels it lists are the members of VALID_LABELS, and the response schema
+# uses the "classifications" / "name" / "label" / "reason" keys that
+# _parse_response reads back.
+SYSTEM_PROMPT = """You are helping organize a user's memory palace by classifying capitalized tokens found in their files.
+
+For each candidate, pick exactly ONE label:
+- PERSON: a specific real person the user knows (colleague, family, character they write about)
+- PROJECT: a named product, codebase, or effort the user works on
+- TOPIC: a recurring theme or subject (not a person, not a project) — cities, technologies, concepts
+- COMMON_WORD: an English word, verb, or fragment that isn't a named entity at all (e.g. "Created", "Before", "Never")
+- AMBIGUOUS: context is insufficient to decide between two of the above
+
+Frameworks, runtimes, APIs, cloud services, vendors, and third-party products
+(e.g. Angular, OpenAPI, Terraform, Bun, Google) are TOPIC unless the context
+clearly says this is the user's own named codebase, product, or active effort.
+
+Use the provided context lines to disambiguate. A capitalized word that only appears in metadata ("Created: 2026-04-24") is COMMON_WORD. A name that appears with pronouns and dialogue is PERSON.
+
+Respond with JSON only. Schema:
+{"classifications": [{"name": "<exact candidate name>", "label": "<LABEL>", "reason": "<one short sentence>"}]}
+
+One entry per candidate, same order as the input."""
+
+
+@dataclass
+class RefineResult:
+    merged: dict  # updated detected dict
+    reclassified: int  # entries whose type changed
+    dropped: int  # entries removed from the merged result (COMMON_WORD only)
+    errors: list[str]  # per-batch error messages (transport/parse failures)
+    batches_completed: int
+    batches_total: int
+    cancelled: bool
+
+
+def _collect_contexts(
+    corpus_lines: list[str], name: str, max_lines: int = CONTEXT_LINES_PER_CANDIDATE
+) -> list[str]:
+    """Return up to `max_lines` distinct lines from the corpus that mention `name`.
+
+    Case-insensitive token-boundary match. Lines are truncated to
+    CONTEXT_WINDOW_CHARS chars to keep token usage bounded.
+    """
+    needle = re.compile(rf"(?<!\w){re.escape(name)}(?!\w)", re.IGNORECASE)
+    seen: set[str] = set()
+    out: list[str] = []
+    for line in corpus_lines:
+        if not needle.search(line):
+            continue
+        trimmed = line.strip()[:CONTEXT_WINDOW_CHARS]
+        if not trimmed or trimmed in seen:
+            continue
+        seen.add(trimmed)
+        out.append(trimmed)
+        if len(out) >= max_lines:
+            break
+    return out
+
+
+def _build_user_prompt(candidates_with_contexts: list[tuple[str, str, list[str]]]) -> str:
+    """Shape: for each candidate, list its current type guess + sampled contexts."""
+    parts: list[str] = ["CANDIDATES:"]
+    for i, (name, current_type, contexts) in enumerate(candidates_with_contexts, 1):
+        parts.append(f"\n{i}. {name}  (currently: {current_type})")
+        if contexts:
+            for c in contexts:
+                parts.append(f"   > {c}")
+        else:
+            parts.append("   > (no context available)")
+    return "\n".join(parts)
+
+
+def _extract_json_candidates(text: str) -> list[str]:
+    """Return plausible JSON payloads extracted from an LLM response.
+
+    Candidates are returned without duplicates, in this order:
+    1. the whole stripped text,
+    2. the contents of every triple-backtick fence (optionally tagged
+       ``json``),
+    3. every balanced ``{...}`` / ``[...]`` span found by a bracket scan.
+
+    Nothing is parsed here; callers are expected to try each candidate in
+    turn. Returns an empty list for blank input.
+    """
+    text = text.strip()
+    if not text:
+        return []
+
+    candidates: list[str] = [text]
+
+    # Fenced code blocks: capture what sits between the fences.
+    for match in re.finditer(r"```(?:json)?\s*([\s\S]*?)\s*```", text, re.IGNORECASE):
+        candidate = match.group(1).strip()
+        if candidate and candidate not in candidates:
+            candidates.append(candidate)
+
+    # Bracket scan: start at every '{' or '[' in the text (nested ones and
+    # ones inside string literals included) and walk forward to the closer
+    # that balances it.
+    for start, opener in ((i, ch) for i, ch in enumerate(text) if ch in "{["):
+        closer = "}" if opener == "{" else "]"
+        depth = 0
+        in_string = False
+        escaped = False
+        for i in range(start, len(text)):
+            ch = text[i]
+            if in_string:
+                # Inside a double-quoted string brackets are ignored; only
+                # track backslash escapes and the closing quote.
+                if escaped:
+                    escaped = False
+                elif ch == "\\":
+                    escaped = True
+                elif ch == '"':
+                    in_string = False
+                continue
+
+            if ch == '"':
+                in_string = True
+            elif ch == opener:
+                # Only the bracket kind that opened this span is counted.
+                depth += 1
+            elif ch == closer:
+                depth -= 1
+                if depth == 0:
+                    candidate = text[start : i + 1].strip()
+                    if candidate and candidate not in candidates:
+                        candidates.append(candidate)
+                    break
+        # If the text ends before depth returns to zero, this start
+        # position contributes no candidate.
+
+    return candidates
+
+
+def _parse_response(text: str, expected_names: list[str]) -> dict[str, tuple[str, str]]:
+    """Parse the LLM's JSON response into {name: (label, reason)}.
+
+    Robust to the model occasionally wrapping JSON in text or returning
+    slight schema variations. Falls back to matching by candidate name.
+    """
+    data = None
+    for candidate in _extract_json_candidates(text):
+        try:
+            data = json.loads(candidate)
+            break
+        except json.JSONDecodeError:
+            continue
+    if data is None:
+        return {}
+
+    entries = data.get("classifications") if isinstance(data, dict) else data
+    if not isinstance(entries, list):
+        return {}
+
+    name_to_label: dict[str, tuple[str, str]] = {}
+    expected_set = {n.lower(): n for n in expected_names}
+    for entry in entries:
+        if not isinstance(entry, dict):
+            continue
+        name = entry.get("name") or entry.get("candidate")
+        label = entry.get("label") or entry.get("type") or entry.get("classification")
+        reason = entry.get("reason") or ""
+        if not isinstance(name, str) or not isinstance(label, str):
+            continue
+        # Restore canonical casing from expected_names
+        canonical = expected_set.get(name.lower(), name)
+        lbl = label.strip().upper()
+        if lbl not in VALID_LABELS:
+            lbl = "AMBIGUOUS"
+        name_to_label[canonical] = (lbl, reason.strip()[:120])
+    return name_to_label
+
+
+def _apply_classifications(
+    detected: dict,
+    decisions: dict[str, tuple[str, str]],
+    allow_project_promotions: bool = True,
+) -> tuple[dict, int, int]:
+    """Merge LLM decisions back into the detected dict.
+
+    Returns (new_detected, reclassified_count, dropped_count).
+    """
+    label_to_bucket = {
+        "PERSON": "people",
+        "PROJECT": "projects",
+        "TOPIC": "uncertain",
+        "AMBIGUOUS": "uncertain",
+    }
+
+    # Index every entity by name for in-place update
+    all_entries: list[tuple[str, dict]] = []
+    for bucket, items in detected.items():
+        for e in items:
+            all_entries.append((bucket, e))
+
+    reclassified = 0
+    dropped = 0
+    new_detected: dict[str, list[dict]] = {
+        "people": [],
+        "projects": [],
+        "uncertain": [],
+    }
+
+    for old_bucket, entry in all_entries:
+        decision = decisions.get(entry["name"])
+        if decision is None:
+            # No LLM opinion — keep as-is
+            new_detected[old_bucket].append(entry)
+            continue
+
+        label, reason = decision
+        if label == "COMMON_WORD":
+            dropped += 1
+            continue
+
+        target_bucket = label_to_bucket[label]
+        if (
+            label == "PROJECT"
+            and not allow_project_promotions
+            and not _is_authoritative_project(entry)
+        ):
+            target_bucket = "uncertain"
+        updated = dict(entry)
+        # Append the LLM's reason as a new signal so the user sees why it moved
+        signals = list(updated.get("signals", []))
+        signals.append(f"LLM: {label.lower()} — {reason}" if reason else f"LLM: {label.lower()}")
+        updated["signals"] = signals
+        if target_bucket != old_bucket:
+            reclassified += 1
+            updated["type"] = (
+                "person"
+                if target_bucket == "people"
+                else "project"
+                if target_bucket == "projects"
+                else "uncertain"
+            )
+        new_detected[target_bucket].append(updated)
+
+    return new_detected, reclassified, dropped
+
+
+def _is_authoritative_person(entry: dict) -> bool:
+    """Return True for git-author people that should not be second-guessed."""
+    signals = " ".join(entry.get("signals", [])).lower()
+    return "commit" in signals and "repo" in signals
+
+
+def _is_authoritative_project(entry: dict) -> bool:
+    """Return True for manifest/git-backed projects that are already source-backed."""
+    signals = " ".join(entry.get("signals", [])).lower()
+    manifest_markers = ("package.json", "pyproject.toml", "cargo.toml", "go.mod")
+    return any(marker in signals for marker in manifest_markers) or "commit" in signals
+
+
+def _print_progress(batch_idx: int, total: int, current_name: str) -> None:
+    """Redraw the single-line refinement progress bar on stderr.
+
+    The line starts with a carriage return so each call overwrites the
+    previous one. The bar is 40 cells wide and filled in proportion to
+    ``batch_idx / total`` (empty when ``total`` is 0); ``current_name`` is
+    cut to 30 characters and padded to a fixed 30-column field.
+    """
+    bar_width = 40
+    if total:
+        done = int(bar_width * batch_idx / total)
+    else:
+        done = 0
+    bar = "".join(("█" * done, "░" * (bar_width - done)))
+    label = current_name[:30]
+    sys.stderr.write(f"\r  LLM refine: [{bar}] batch {batch_idx}/{total}  current: {label:<30}")
+    sys.stderr.flush()
+
+
+def refine_entities(
+    detected: dict,
+    corpus_text: str,
+    provider: LLMProvider,
+    batch_size: int = BATCH_SIZE,
+    show_progress: bool = True,
+    allow_project_promotions: bool = True,
+) -> RefineResult:
+    """Reclassify detected entities using the LLM provider.
+
+    Only candidates that are not already source-backed are sent for
+    refinement: people flagged by ``_is_authoritative_person`` and projects
+    flagged by ``_is_authoritative_project`` are skipped. Candidate names are
+    de-duplicated (first occurrence wins) and sent in batches of
+    ``batch_size``; a ``batch_size`` below 1 is treated as 1.
+
+    Ctrl-C during refinement: cancels the remaining batches, returns a
+    RefineResult with ``cancelled=True`` and whatever was classified before
+    the interrupt. Context lookup and prompt construction happen per batch
+    inside the same guarded region as the provider call, so an interrupt
+    there is also reported as a cancellation instead of a traceback.
+
+    Transport failures (``LLMError``) and unparseable responses in individual
+    batches are recorded in ``errors`` and do not abort the run.
+
+    ``allow_project_promotions=False`` keeps LLM-only project guesses in the
+    uncertain bucket. This is useful when manifest/git signal already supplied
+    canonical projects and regex/LLM hits are likely tools, vendors, or topics.
+
+    Args:
+        detected: Mapping with ``people`` / ``projects`` / ``uncertain`` lists
+            of entry dicts; each entry must have a ``name`` key.
+        corpus_text: Prose used to look up per-candidate context lines.
+        provider: Object exposing ``classify(system, user, json_mode=True)``
+            whose result has a ``text`` attribute.
+        batch_size: Maximum number of candidates per provider request.
+        show_progress: When true, draw a progress bar on stderr.
+        allow_project_promotions: Forwarded to ``_apply_classifications``.
+
+    Returns:
+        A ``RefineResult`` with the merged buckets, counters, per-batch error
+        strings, and the cancellation flag.
+    """
+    # range() raises on a zero step and yields nothing for a negative one;
+    # clamp so a bad value degrades to one-candidate batches instead.
+    batch_size = max(1, batch_size)
+
+    # Gather candidates in bucket order, skipping source-backed entries and
+    # keeping only the first occurrence of each name.
+    kind_by_bucket = {"people": "person", "projects": "project", "uncertain": "uncertain"}
+    seen: set[str] = set()
+    unique: list[tuple[str, str]] = []
+    for bucket, kind in kind_by_bucket.items():
+        for e in detected.get(bucket, []):
+            if bucket == "people" and _is_authoritative_person(e):
+                continue
+            if bucket == "projects" and _is_authoritative_project(e):
+                continue
+            name = e["name"]
+            if name in seen:
+                continue
+            seen.add(name)
+            unique.append((name, kind))
+
+    if not unique:
+        return RefineResult(
+            merged=detected,
+            reclassified=0,
+            dropped=0,
+            errors=[],
+            batches_completed=0,
+            batches_total=0,
+            cancelled=False,
+        )
+
+    corpus_lines = corpus_text.splitlines() if corpus_text else []
+    chunks = [unique[i : i + batch_size] for i in range(0, len(unique), batch_size)]
+    total = len(chunks)
+
+    all_decisions: dict[str, tuple[str, str]] = {}
+    errors: list[str] = []
+    completed = 0
+    cancelled = False
+
+    for idx, chunk in enumerate(chunks, 1):
+        if show_progress:
+            _print_progress(idx - 1, total, chunk[0][0])
+        try:
+            # Contexts are collected lazily so no work is done for batches
+            # that end up cancelled, and so Ctrl-C here is handled below.
+            batch = [(name, kind, _collect_contexts(corpus_lines, name)) for name, kind in chunk]
+            user_prompt = _build_user_prompt(batch)
+            resp = provider.classify(SYSTEM_PROMPT, user_prompt, json_mode=True)
+        except KeyboardInterrupt:
+            cancelled = True
+            break
+        except LLMError as e:
+            errors.append(f"batch {idx}: {e}")
+            continue
+        names_in_batch = [name for name, _ in chunk]
+        decisions = _parse_response(resp.text, names_in_batch)
+        if not decisions:
+            errors.append(f"batch {idx}: could not parse response")
+        all_decisions.update(decisions)
+        completed += 1
+        if show_progress:
+            _print_progress(idx, total, chunk[-1][0])
+
+    if show_progress:
+        # Terminate the carriage-return progress line.
+        sys.stderr.write("\n")
+        sys.stderr.flush()
+
+    merged, reclassified, dropped = _apply_classifications(
+        detected,
+        all_decisions,
+        allow_project_promotions=allow_project_promotions,
+    )
+
+    return RefineResult(
+        merged=merged,
+        reclassified=reclassified,
+        dropped=dropped,
+        errors=errors,
+        batches_completed=completed,
+        batches_total=total,
+        cancelled=cancelled,
+    )
+
+
+def collect_corpus_text(
+    project_dir: str,
+    max_files: int = 30,
+    max_bytes_per_file: int = 20_000,
+) -> str:
+    """Gather prose text from ``project_dir`` for use as LLM context source.
+
+    Walks the tree with ``_walk_prose`` (pruning ``SKIP_DIRS`` and dot
+    directories), keeps files whose lowercased suffix is in
+    ``PROSE_EXTENSIONS``, and reads the ``max_files`` most recently modified
+    ones. At most ``max_bytes_per_file`` bytes are read from each file; they
+    are decoded as UTF-8 with undecodable sequences replaced, and line
+    endings are normalised to ``\\n``.
+
+    Files that cannot be stat'ed or opened are skipped silently.
+
+    Returns:
+        The selected files' text joined with newlines, or ``""`` when
+        ``project_dir`` is not a directory or ``max_files`` is not positive.
+    """
+    from pathlib import Path
+
+    from mempalace.entity_detector import PROSE_EXTENSIONS, SKIP_DIRS
+
+    root = Path(project_dir).expanduser().resolve()
+    # A negative max_files would otherwise slice "all but the oldest N".
+    if not root.is_dir() or max_files <= 0:
+        return ""
+    candidates: list[tuple[float, Path]] = []
+    for dirpath, _dirs, files in _walk_prose(root, SKIP_DIRS):
+        for fname in files:
+            p = dirpath / fname
+            if p.suffix.lower() not in PROSE_EXTENSIONS:
+                continue
+            try:
+                mtime = p.stat().st_mtime
+            except OSError:
+                continue
+            candidates.append((mtime, p))
+    # Newest first; ties fall back to path ordering so the result is stable.
+    candidates.sort(reverse=True)
+    chunks: list[str] = []
+    for _, p in candidates[:max_files]:
+        try:
+            # Binary mode so the cap is a true byte cap: a text-mode read(n)
+            # counts characters and can pull in several times more bytes.
+            with open(p, "rb") as f:
+                raw = f.read(max_bytes_per_file)
+        except OSError:
+            continue
+        text = raw.decode("utf-8", errors="replace")
+        # Mirror text mode's universal-newline translation.
+        chunks.append(text.replace("\r\n", "\n").replace("\r", "\n"))
+    return "\n".join(chunks)
+
+
+def _walk_prose(root, skip_dirs):
+    """Yield ``(Path, dirs, files)`` for each directory under ``root``.
+
+    Subdirectories whose name is in ``skip_dirs`` or begins with "." are
+    removed from the walk in place, so ``os.walk`` never descends into them.
+
+    A narrow copy of ``project_scanner._walk`` kept here so this module does
+    not import a private name; it exists for prose collection only.
+    """
+    import os
+    from pathlib import Path
+
+    def _descend(dirname):
+        return not (dirname in skip_dirs or dirname.startswith("."))
+
+    for current, subdirs, filenames in os.walk(root):
+        # Slice-assign so os.walk sees the pruned list.
+        subdirs[:] = list(filter(_descend, subdirs))
+        yield Path(current), subdirs, filenames
@@ -52,6 +52,7 @@ READABLE_EXTENSIONS = {
 }

 SKIP_FILENAMES = {
+    "entities.json",
    "mempalace.yaml",
    "mempalace.yml",
    "mempal.yaml",
@@ -471,6 +472,97 @@ def _load_known_entities_raw() -> dict:
    return dict(_ENTITY_REGISTRY_CACHE["raw"])


+def add_to_known_entities(entities_by_category: dict) -> str:
+    """Union ``entities_by_category`` into ``~/.mempalace/known_entities.json``.
+
+    Accepts ``{category: [names]}`` shape as produced by ``mempalace init``
+    and merges into the registry file at ``_ENTITY_REGISTRY_PATH``. Existing
+    categories are preserved untouched unless also present in the input;
+    for categories present in both, entries are unioned case-insensitively
+    without changing the on-disk ordering of pre-existing names. Input values
+    that are not non-empty lists are ignored, as are falsy names.
+
+    If a category is stored on-disk as ``{name: code}``, new names are added
+    as keys with ``None`` values so existing code mappings aren't
+    overwritten. A category that is missing, or stored in any other shape,
+    is replaced by a fresh de-duplicated list.
+
+    A registry that cannot be read, is not valid UTF-8, is not valid JSON, or
+    is not a JSON object is treated as empty. The update is written to a
+    temporary file in the same directory and moved into place with
+    ``os.replace``, so an interrupted write cannot leave a truncated registry
+    behind. Permissions are then tightened to ``0o600`` on a best-effort
+    basis.
+
+    The in-process cache is invalidated on write so same-process callers
+    (notably ``cmd_init`` → ``cmd_mine`` in sequence) see the update
+    immediately instead of waiting for a mtime re-check.
+
+    Returns the registry path as a string for logging.
+
+    Raises:
+        OSError: If the registry directory or file cannot be written.
+    """
+    import json as _json
+    import os as _os
+    import tempfile as _tempfile
+    from pathlib import Path as _Path
+
+    registry_path = _Path(_ENTITY_REGISTRY_PATH)
+    registry_path.parent.mkdir(parents=True, exist_ok=True)
+
+    existing: dict = {}
+    if registry_path.exists():
+        try:
+            loaded = _json.loads(registry_path.read_text(encoding="utf-8"))
+        except (ValueError, OSError):
+            # ValueError covers json.JSONDecodeError and the UnicodeDecodeError
+            # raised when the file is not valid UTF-8.
+            loaded = None
+        if isinstance(loaded, dict):
+            existing = loaded
+
+    def _coerce_name(value):
+        """Return ``value`` as a non-empty string, or None for falsy input."""
+        if not value:
+            return None
+        name = str(value)
+        return name if name else None
+
+    for category, names in entities_by_category.items():
+        if not isinstance(names, list) or not names:
+            continue
+        current = existing.get(category)
+        if not isinstance(current, (list, dict)):
+            # Missing or unrecognized shape — seed as a fresh list.
+            current = existing[category] = []
+        # Iterating a dict yields its keys, so one set covers both shapes.
+        seen_lower = {str(n).lower() for n in current}
+        for n in names:
+            name = _coerce_name(n)
+            if not name or name.lower() in seen_lower:
+                continue
+            if isinstance(current, list):
+                current.append(name)
+            else:
+                # Keep existing code mappings; new names get no code yet.
+                current[name] = None
+            seen_lower.add(name.lower())
+
+    payload = _json.dumps(existing, indent=2, ensure_ascii=False)
+    # Resolve symlinks so a linked registry is updated at its real location
+    # rather than having the link itself replaced by a regular file.
+    write_target = _Path(_os.path.realpath(registry_path))
+    fd, tmp_name = _tempfile.mkstemp(
+        dir=str(write_target.parent),
+        prefix=f".{write_target.name}.",
+        suffix=".tmp",
+    )
+    try:
+        with _os.fdopen(fd, "w", encoding="utf-8") as tmp:
+            tmp.write(payload)
+        _os.replace(tmp_name, write_target)
+    except BaseException:
+        # Don't leave the temp file behind on failure or Ctrl-C.
+        try:
+            _os.unlink(tmp_name)
+        except OSError:
+            pass
+        raise
+    try:
+        registry_path.chmod(0o600)
+    except (OSError, NotImplementedError):
+        pass
+
+    # Invalidate in-process cache so later calls in the same run see the write.
+    _ENTITY_REGISTRY_CACHE["mtime"] = None
+    _ENTITY_REGISTRY_CACHE["names"] = frozenset()
+    _ENTITY_REGISTRY_CACHE["raw"] = {}
+
+    return str(registry_path)
+
+
 _HALL_KEYWORDS_CACHE = None


@@ -594,6 +594,8 @@ def discover_entities(
    prose_file_cap: int = 10,
    project_cap: int = 15,
    people_cap: int = 15,
+    llm_provider: object = None,
+    show_progress: bool = True,
 ) -> dict:
    """Top-level entity discovery: real signals first, prose detection second.

@@ -604,10 +606,39 @@ def discover_entities(
      1. Package manifests (package.json, pyproject.toml, Cargo.toml, go.mod)
         → canonical project names
      2. Git commit authors → real people with real commit counts
-      3. Regex entity detection on prose files → supplementary names only
+      3. Claude Code conversation dirs (~/.claude/projects/) → per-session
+         project names (pulled from each session's ``cwd`` metadata)
+      4. Regex entity detection on prose files → supplementary names only
         mentioned in docs/notes (not code)
+      5. Optional LLM refinement pass — reclassifies ambiguous candidates
+         using the caller-supplied provider
+
+    Passing ``llm_provider`` enables phase-2 refinement. The caller is
+    responsible for constructing the provider (``llm_client.get_provider``)
+    and confirming availability. Refinement is blocking-interactive:
+    progress prints to stderr; Ctrl-C returns partial results.
    """
    projects, people = scan(project_dir)
+
+    # If the target is a Claude Code conversations root, extract per-project
+    # entries from there too. Same ProjectInfo shape, so dedup logic works.
+    from mempalace.convo_scanner import is_claude_projects_root, scan_claude_projects
+
+    root_path = Path(project_dir).expanduser().resolve()
+    if is_claude_projects_root(root_path):
+        convo_projects = scan_claude_projects(root_path)
+        # Dedup by name against the git-manifest list, preferring entries with
+        # more user_commits as signal strength.
+        by_name: dict[str, ProjectInfo] = {p.name: p for p in projects}
+        for cp in convo_projects:
+            existing = by_name.get(cp.name)
+            if existing is None or cp.user_commits > existing.user_commits:
+                by_name[cp.name] = cp
+        projects = sorted(
+            by_name.values(),
+            key=lambda p: (not p.is_mine, -p.user_commits, -p.total_commits, p.name),
+        )
+
    real_signal = to_detected_dict(projects, people, project_cap=project_cap, people_cap=people_cap)

    # Secondary pass: prose-only extraction catches names mentioned in docs
@@ -621,11 +652,45 @@ def discover_entities(
        else {"people": [], "projects": [], "uncertain": []}
    )

-    # If git/manifests gave us real projects, suppress the regex "uncertain" bucket.
-    # That bucket is mostly noise (common words, CamelCase tech terms, etc.) and
-    # adding it to the review flow just makes the user do triage we can skip.
+    # Without LLM refinement, suppress regex "uncertain" noise when real
+    # manifest/git signal exists. With LLM refinement enabled, keep those
+    # candidates so the model can promote real entities or drop common words.
    has_real_signal = bool(projects) or bool(people)
-    return _merge_detected(real_signal, prose_detected, drop_secondary_uncertain=has_real_signal)
+    merged = _merge_detected(
+        real_signal,
+        prose_detected,
+        drop_secondary_uncertain=has_real_signal and llm_provider is None,
+    )
+
+    # Optional phase 2: LLM refinement.
+    if llm_provider is not None:
+        from mempalace.llm_refine import collect_corpus_text, refine_entities
+
+        corpus = collect_corpus_text(str(project_dir))
+        result = refine_entities(
+            merged,
+            corpus,
+            llm_provider,
+            show_progress=show_progress,
+            allow_project_promotions=not has_real_signal,
+        )
+        if show_progress:
+            status_bits = []
+            if result.cancelled:
+                status_bits.append("cancelled")
+            if result.reclassified:
+                status_bits.append(f"reclassified {result.reclassified}")
+            if result.dropped:
+                status_bits.append(f"dropped {result.dropped}")
+            if result.errors:
+                status_bits.append(f"{len(result.errors)} batch error(s)")
+            if status_bits:
+                import sys as _sys
+
+                print(f"  LLM refine: {', '.join(status_bits)}", file=_sys.stderr)
+        merged = result.merged
+
+    return merged


 # ==================== CLI ====================