From 36a8f219c251f39e77637a81717281323fd1cd5c Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Fri, 24 Apr 2026 00:47:14 -0300
Subject: [PATCH] feat(init): wire --llm flag and convo_scanner into
 discover_entities

Extends the init orchestrator to consume two new signal sources:

1. Claude Code conversation dirs: when the target is a
   `~/.claude/projects/` root, convo_scanner contributes ProjectInfo
   entries alongside the git/manifest projects. Dedup is by name,
   preferring the entry with more user-authored activity
   (user_commits); on a tie the git/manifest entry is kept.
2. Optional LLM refinement: when --llm is passed, cmd_init constructs
   the provider and validates availability, then passes it to
   discover_entities, which runs llm_refine.refine_entities on the
   merged candidates. Status summary (reclassified / dropped /
   cancelled / batch errors) prints to stderr.

New init flags (opt-in, default remains zero-API):
- --llm: enable refinement
- --llm-provider: ollama (default) | openai-compat | anthropic
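- --llm-model: default gemma4:e4b (an Ollama model tag; the default
  is the same for every provider, so set it explicitly for
  openai-compat or anthropic)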
- --llm-endpoint: URL (required for openai-compat)
- --llm-api-key: falls back to env ($ANTHROPIC_API_KEY or
  $OPENAI_API_KEY depending on provider)

Provider check_available runs before the scan, so the user sees an
immediate error ("Run: ollama pull <model>" or "ANTHROPIC_API_KEY not
set") rather than a mid-scan failure.
---
 mempalace/cli.py             | 66 ++++++++++++++++++++++++++++++++++--
 mempalace/project_scanner.py | 59 ++++++++++++++++++++++++++++++--
 uv.lock                      |  2 ++
 3 files changed, 123 insertions(+), 4 deletions(-)
diff --git a/mempalace/cli.py b/mempalace/cli.py
index de40090..1181120 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -86,12 +86,37 @@ def cmd_init(args):
         languages = cfg.entity_languages
     languages_tuple = tuple(languages)
 
+    # Optional phase-2 LLM provider (opt-in via --llm).
+    llm_provider = None
+    if getattr(args, "llm", False):
+        from .llm_client import LLMError, get_provider
+
+        try:
+            llm_provider = get_provider(
+                name=args.llm_provider,
+                model=args.llm_model,
+                endpoint=args.llm_endpoint,
+                api_key=args.llm_api_key,
+            )
+        except LLMError as e:
+            print(f"  ERROR: {e}", file=sys.stderr)
+            sys.exit(2)
+        ok, msg = llm_provider.check_available()
+        if not ok:
+            print(
+                f"  ERROR: LLM provider '{args.llm_provider}' unavailable: {msg}",
+                file=sys.stderr,
+            )
+            sys.exit(2)
+        print(f"  LLM refinement enabled: {args.llm_provider}/{args.llm_model}")
+
     # Pass 1: discover entities — manifests + git authors first, prose detection
-    # as supplement for names mentioned only in docs/notes.
+    # as supplement for names mentioned only in docs/notes. Optional phase-2
+    # LLM refinement runs inside discover_entities when llm_provider is given.
     print(f"\n  Scanning for entities in: {args.dir}")
     if languages_tuple != ("en",):
         print(f"  Languages: {', '.join(languages_tuple)}")
-    detected = discover_entities(args.dir, languages=languages_tuple)
+    detected = discover_entities(args.dir, languages=languages_tuple, llm_provider=llm_provider)
     total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
     if total > 0:
         confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
@@ -550,6 +575,43 @@ def main():
             "When given, the value is also persisted to config.json."
         ),
     )
+    p_init.add_argument(
+        "--llm",
+        action="store_true",
+        help=(
+            "Enable LLM-assisted entity refinement (opt-in, local-first). "
+            "Runs after manifest/git/regex detection, asking the configured "
+            "provider to reclassify ambiguous candidates. "
+            "Ctrl-C during refinement returns partial results."
+        ),
+    )
+    p_init.add_argument(
+        "--llm-provider",
+        default="ollama",
+        choices=["ollama", "openai-compat", "anthropic"],
+        help="LLM provider (default: ollama). Use --llm to enable.",
+    )
+    p_init.add_argument(
+        "--llm-model",
+        default="gemma4:e4b",
+        help="Model name for the chosen provider (default: gemma4:e4b for Ollama).",
+    )
+    p_init.add_argument(
+        "--llm-endpoint",
+        default=None,
+        help=(
+            "Provider endpoint URL. Default for Ollama: http://localhost:11434. "
+            "Required for openai-compat."
+        ),
+    )
+    p_init.add_argument(
+        "--llm-api-key",
+        default=None,
+        help=(
+            "API key for the provider. For anthropic, defaults to $ANTHROPIC_API_KEY; "
+            "for openai-compat, defaults to $OPENAI_API_KEY."
+        ),
+    )
 
     # mine
     p_mine = sub.add_parser("mine", help="Mine files into the palace")
diff --git a/mempalace/project_scanner.py b/mempalace/project_scanner.py
index c03b883..b5c408e 100644
--- a/mempalace/project_scanner.py
+++ b/mempalace/project_scanner.py
@@ -574,6 +574,8 @@ def discover_entities(
     prose_file_cap: int = 10,
     project_cap: int = 15,
     people_cap: int = 15,
+    llm_provider: object = None,
+    show_progress: bool = True,
 ) -> dict:
     """Top-level entity discovery: real signals first, prose detection second.
 
@@ -584,10 +586,39 @@ def discover_entities(
       1. Package manifests (package.json, pyproject.toml, Cargo.toml, go.mod)
          → canonical project names
       2. Git commit authors → real people with real commit counts
-      3. Regex entity detection on prose files → supplementary names only
+      3. Claude Code conversation dirs (~/.claude/projects/) → per-session
+         project names (pulled from each session's ``cwd`` metadata)
+      4. Regex entity detection on prose files → supplementary names only
          mentioned in docs/notes (not code)
+      5. Optional LLM refinement pass — reclassifies ambiguous candidates
+         using the caller-supplied provider
+
+    Passing ``llm_provider`` enables phase-2 refinement. The caller is
+    responsible for constructing the provider (``llm_client.get_provider``)
+    and confirming availability. Refinement is blocking-interactive:
+    progress prints to stderr; Ctrl-C returns partial results.
     """
     projects, people = scan(project_dir)
+
+    # If the target is a Claude Code conversations root, extract per-project
+    # entries from there too. Same ProjectInfo shape, so dedup logic works.
+    from mempalace.convo_scanner import is_claude_projects_root, scan_claude_projects
+
+    root_path = Path(project_dir).expanduser().resolve()
+    if is_claude_projects_root(root_path):
+        convo_projects = scan_claude_projects(root_path)
+        # Dedup by name against the git-manifest list, preferring entries with
+        # more user_commits as signal strength.
+        by_name: dict[str, ProjectInfo] = {p.name: p for p in projects}
+        for cp in convo_projects:
+            existing = by_name.get(cp.name)
+            if existing is None or cp.user_commits > existing.user_commits:
+                by_name[cp.name] = cp
+        projects = sorted(
+            by_name.values(),
+            key=lambda p: (not p.is_mine, -p.user_commits, -p.total_commits, p.name),
+        )
+
     real_signal = to_detected_dict(projects, people, project_cap=project_cap, people_cap=people_cap)
 
     # Secondary pass: prose-only extraction catches names mentioned in docs
@@ -605,7 +636,31 @@ def discover_entities(
     # That bucket is mostly noise (common words, CamelCase tech terms, etc.) and
     # adding it to the review flow just makes the user do triage we can skip.
     has_real_signal = bool(projects) or bool(people)
-    return _merge_detected(real_signal, prose_detected, drop_secondary_uncertain=has_real_signal)
+    merged = _merge_detected(real_signal, prose_detected, drop_secondary_uncertain=has_real_signal)
+
+    # Optional phase 2: LLM refinement.
+    if llm_provider is not None:
+        from mempalace.llm_refine import collect_corpus_text, refine_entities
+
+        corpus = collect_corpus_text(str(project_dir))
+        result = refine_entities(merged, corpus, llm_provider, show_progress=show_progress)
+        if show_progress:
+            status_bits = []
+            if result.cancelled:
+                status_bits.append("cancelled")
+            if result.reclassified:
+                status_bits.append(f"reclassified {result.reclassified}")
+            if result.dropped:
+                status_bits.append(f"dropped {result.dropped}")
+            if result.errors:
+                status_bits.append(f"{len(result.errors)} batch error(s)")
+            if status_bits:
+                import sys as _sys
+
+                print(f"  LLM refine: {', '.join(status_bits)}", file=_sys.stderr)
+        merged = result.merged
+
+    return merged
 
 
 # ==================== CLI ====================
diff --git a/uv.lock b/uv.lock
index 5af54f1..f102d43 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1174,6 +1174,7 @@ source = { editable = "." }
 dependencies = [
     { name = "chromadb" },
     { name = "pyyaml" },
+    { name = "tomli", marker = "python_full_version < '3.11'" },
 ]
 
 [package.optional-dependencies]
@@ -1206,6 +1207,7 @@ requires-dist = [
     { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0" },
     { name = "pyyaml", specifier = ">=6.0,<7" },
     { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.4.0" },
+    { name = "tomli", marker = "python_full_version < '3.11'", specifier = ">=2.0.0" },
 ]
 provides-extras = ["dev", "spellcheck"]