From 36a8f219c251f39e77637a81717281323fd1cd5c Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Fri, 24 Apr 2026 00:47:14 -0300 Subject: [PATCH] feat(init): wire --llm flag and convo_scanner into discover_entities Extends the init orchestrator to consume two new signal sources: 1. Claude Code conversation dirs: when the target is a `~/.claude/projects/` root, convo_scanner contributes ProjectInfo entries alongside the git/manifest projects. Dedup is by name, preferring the entry with more user-authored activity. 2. Optional LLM refinement: when --llm is passed, cmd_init constructs the provider and validates availability, then discover_entities runs llm_refine.refine_entities on the merged candidates. Status summary (reclassified / dropped / cancelled / batch errors) prints to stderr. New init flags (opt-in, default remains zero-API): - --llm: enable refinement - --llm-provider: ollama (default) | openai-compat | anthropic - --llm-model: default gemma4:e4b for Ollama - --llm-endpoint: URL (required for openai-compat) - --llm-api-key: falls back to env ($ANTHROPIC_API_KEY or $OPENAI_API_KEY depending on provider) Provider check_available runs before the scan, so the user sees an immediate error ("Run: ollama pull <model>" or "ANTHROPIC_API_KEY not set") rather than a mid-scan failure. --- mempalace/cli.py | 66 ++++++++++++++++++++++++++++++++++-- mempalace/project_scanner.py | 59 ++++++++++++++++++++++++++++++-- uv.lock | 2 ++ 3 files changed, 123 insertions(+), 4 deletions(-) diff --git a/mempalace/cli.py b/mempalace/cli.py index de40090..1181120 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -86,12 +86,37 @@ def cmd_init(args): languages = cfg.entity_languages languages_tuple = tuple(languages) + # Optional phase-2 LLM provider (opt-in via --llm). 
+ llm_provider = None + if getattr(args, "llm", False): + from .llm_client import LLMError, get_provider + + try: + llm_provider = get_provider( + name=args.llm_provider, + model=args.llm_model, + endpoint=args.llm_endpoint, + api_key=args.llm_api_key, + ) + except LLMError as e: + print(f" ERROR: {e}", file=sys.stderr) + sys.exit(2) + ok, msg = llm_provider.check_available() + if not ok: + print( + f" ERROR: LLM provider '{args.llm_provider}' unavailable: {msg}", + file=sys.stderr, + ) + sys.exit(2) + print(f" LLM refinement enabled: {args.llm_provider}/{args.llm_model}") + # Pass 1: discover entities — manifests + git authors first, prose detection - # as supplement for names mentioned only in docs/notes. + # as supplement for names mentioned only in docs/notes. Optional phase-2 + # LLM refinement runs inside discover_entities when llm_provider is given. print(f"\n Scanning for entities in: {args.dir}") if languages_tuple != ("en",): print(f" Languages: {', '.join(languages_tuple)}") - detected = discover_entities(args.dir, languages=languages_tuple) + detected = discover_entities(args.dir, languages=languages_tuple, llm_provider=llm_provider) total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"]) if total > 0: confirmed = confirm_entities(detected, yes=getattr(args, "yes", False)) @@ -550,6 +575,43 @@ def main(): "When given, the value is also persisted to config.json." ), ) + p_init.add_argument( + "--llm", + action="store_true", + help=( + "Enable LLM-assisted entity refinement (opt-in, local-first). " + "Runs after manifest/git/regex detection, asking the configured " + "provider to reclassify ambiguous candidates. " + "Ctrl-C during refinement returns partial results." + ), + ) + p_init.add_argument( + "--llm-provider", + default="ollama", + choices=["ollama", "openai-compat", "anthropic"], + help="LLM provider (default: ollama). 
Use --llm to enable.", + ) + p_init.add_argument( + "--llm-model", + default="gemma4:e4b", + help="Model name for the chosen provider (default: gemma4:e4b for Ollama).", + ) + p_init.add_argument( + "--llm-endpoint", + default=None, + help=( + "Provider endpoint URL. Default for Ollama: http://localhost:11434. " + "Required for openai-compat." + ), + ) + p_init.add_argument( + "--llm-api-key", + default=None, + help=( + "API key for the provider. For anthropic, defaults to $ANTHROPIC_API_KEY; " + "for openai-compat, defaults to $OPENAI_API_KEY." + ), + ) # mine p_mine = sub.add_parser("mine", help="Mine files into the palace") diff --git a/mempalace/project_scanner.py b/mempalace/project_scanner.py index c03b883..b5c408e 100644 --- a/mempalace/project_scanner.py +++ b/mempalace/project_scanner.py @@ -574,6 +574,8 @@ def discover_entities( prose_file_cap: int = 10, project_cap: int = 15, people_cap: int = 15, + llm_provider: object = None, + show_progress: bool = True, ) -> dict: """Top-level entity discovery: real signals first, prose detection second. @@ -584,10 +586,39 @@ def discover_entities( 1. Package manifests (package.json, pyproject.toml, Cargo.toml, go.mod) → canonical project names 2. Git commit authors → real people with real commit counts - 3. Regex entity detection on prose files → supplementary names only + 3. Claude Code conversation dirs (~/.claude/projects/) → per-session + project names (pulled from each session's ``cwd`` metadata) + 4. Regex entity detection on prose files → supplementary names only mentioned in docs/notes (not code) + 5. Optional LLM refinement pass — reclassifies ambiguous candidates + using the caller-supplied provider + + Passing ``llm_provider`` enables phase-2 refinement. The caller is + responsible for constructing the provider (``llm_client.get_provider``) + and confirming availability. Refinement is blocking-interactive: + progress prints to stderr; Ctrl-C returns partial results. 
""" projects, people = scan(project_dir) + + # If the target is a Claude Code conversations root, extract per-project + # entries from there too. Same ProjectInfo shape, so dedup logic works. + from mempalace.convo_scanner import is_claude_projects_root, scan_claude_projects + + root_path = Path(project_dir).expanduser().resolve() + if is_claude_projects_root(root_path): + convo_projects = scan_claude_projects(root_path) + # Dedup by name against the git-manifest list, preferring entries with + # more user_commits as signal strength. + by_name: dict[str, ProjectInfo] = {p.name: p for p in projects} + for cp in convo_projects: + existing = by_name.get(cp.name) + if existing is None or cp.user_commits > existing.user_commits: + by_name[cp.name] = cp + projects = sorted( + by_name.values(), + key=lambda p: (not p.is_mine, -p.user_commits, -p.total_commits, p.name), + ) + real_signal = to_detected_dict(projects, people, project_cap=project_cap, people_cap=people_cap) # Secondary pass: prose-only extraction catches names mentioned in docs @@ -605,7 +636,31 @@ def discover_entities( # That bucket is mostly noise (common words, CamelCase tech terms, etc.) and # adding it to the review flow just makes the user do triage we can skip. has_real_signal = bool(projects) or bool(people) - return _merge_detected(real_signal, prose_detected, drop_secondary_uncertain=has_real_signal) + merged = _merge_detected(real_signal, prose_detected, drop_secondary_uncertain=has_real_signal) + + # Optional phase 2: LLM refinement. 
+ if llm_provider is not None: + from mempalace.llm_refine import collect_corpus_text, refine_entities + + corpus = collect_corpus_text(str(project_dir)) + result = refine_entities(merged, corpus, llm_provider, show_progress=show_progress) + if show_progress: + status_bits = [] + if result.cancelled: + status_bits.append("cancelled") + if result.reclassified: + status_bits.append(f"reclassified {result.reclassified}") + if result.dropped: + status_bits.append(f"dropped {result.dropped}") + if result.errors: + status_bits.append(f"{len(result.errors)} batch error(s)") + if status_bits: + import sys as _sys + + print(f" LLM refine: {', '.join(status_bits)}", file=_sys.stderr) + merged = result.merged + + return merged # ==================== CLI ==================== diff --git a/uv.lock b/uv.lock index 5af54f1..f102d43 100644 --- a/uv.lock +++ b/uv.lock @@ -1174,6 +1174,7 @@ source = { editable = "." } dependencies = [ { name = "chromadb" }, { name = "pyyaml" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, ] [package.optional-dependencies] @@ -1206,6 +1207,7 @@ requires-dist = [ { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0" }, { name = "pyyaml", specifier = ">=6.0,<7" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.4.0" }, + { name = "tomli", marker = "python_full_version < '3.11'", specifier = ">=2.0.0" }, ] provides-extras = ["dev", "spellcheck"]