diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index 1a171c1..3a0d2c3 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -120,8 +120,7 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> li os.rename(seg_dir, target) moved.append(target) logger.warning( - "Quarantined stale HNSW segment %s " - "(sqlite %.0fs newer than HNSW); renamed to %s", + "Quarantined stale HNSW segment %s (sqlite %.0fs newer than HNSW); renamed to %s", seg_dir, sqlite_mtime - hnsw_mtime, target, diff --git a/mempalace/cli.py b/mempalace/cli.py index de40090..714c64c 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -86,21 +86,53 @@ def cmd_init(args): languages = cfg.entity_languages languages_tuple = tuple(languages) + # Optional phase-2 LLM provider (opt-in via --llm). + llm_provider = None + if getattr(args, "llm", False): + from .llm_client import LLMError, get_provider + + try: + llm_provider = get_provider( + name=args.llm_provider, + model=args.llm_model, + endpoint=args.llm_endpoint, + api_key=args.llm_api_key, + ) + except LLMError as e: + print(f" ERROR: {e}", file=sys.stderr) + sys.exit(2) + ok, msg = llm_provider.check_available() + if not ok: + print( + f" ERROR: LLM provider '{args.llm_provider}' unavailable: {msg}", + file=sys.stderr, + ) + sys.exit(2) + print(f" LLM refinement enabled: {args.llm_provider}/{args.llm_model}") + # Pass 1: discover entities — manifests + git authors first, prose detection - # as supplement for names mentioned only in docs/notes. + # as supplement for names mentioned only in docs/notes. Optional phase-2 + # LLM refinement runs inside discover_entities when llm_provider is given. 
print(f"\n Scanning for entities in: {args.dir}") if languages_tuple != ("en",): print(f" Languages: {', '.join(languages_tuple)}") - detected = discover_entities(args.dir, languages=languages_tuple) + detected = discover_entities(args.dir, languages=languages_tuple, llm_provider=llm_provider) total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"]) if total > 0: confirmed = confirm_entities(detected, yes=getattr(args, "yes", False)) - # Save confirmed entities to /entities.json for the miner + # Save confirmed entities to /entities.json (per-project + # audit trail — user can inspect or hand-edit) AND merge into the + # global registry the miner reads at mine time. if confirmed["people"] or confirmed["projects"]: entities_path = Path(args.dir).expanduser().resolve() / "entities.json" - with open(entities_path, "w") as f: - json.dump(confirmed, f, indent=2) + with open(entities_path, "w", encoding="utf-8") as f: + json.dump(confirmed, f, indent=2, ensure_ascii=False) print(f" Entities saved: {entities_path}") + + from .miner import add_to_known_entities + + registry_path = add_to_known_entities(confirmed) + print(f" Registry updated: {registry_path}") else: print(" No entities detected — proceeding with directory-based rooms.") @@ -550,6 +582,43 @@ def main(): "When given, the value is also persisted to config.json." ), ) + p_init.add_argument( + "--llm", + action="store_true", + help=( + "Enable LLM-assisted entity refinement (opt-in, local-first). " + "Runs after manifest/git/regex detection, asking the configured " + "provider to reclassify ambiguous candidates. " + "Ctrl-C during refinement returns partial results." + ), + ) + p_init.add_argument( + "--llm-provider", + default="ollama", + choices=["ollama", "openai-compat", "anthropic"], + help="LLM provider (default: ollama). 
Use --llm to enable.", + ) + p_init.add_argument( + "--llm-model", + default="gemma4:e4b", + help="Model name for the chosen provider (default: gemma4:e4b for Ollama).", + ) + p_init.add_argument( + "--llm-endpoint", + default=None, + help=( + "Provider endpoint URL. Default for Ollama: http://localhost:11434. " + "Required for openai-compat." + ), + ) + p_init.add_argument( + "--llm-api-key", + default=None, + help=( + "API key for the provider. For anthropic, defaults to $ANTHROPIC_API_KEY; " + "for openai-compat, defaults to $OPENAI_API_KEY." + ), + ) # mine p_mine = sub.add_parser("mine", help="Mine files into the palace") diff --git a/mempalace/convo_scanner.py b/mempalace/convo_scanner.py new file mode 100644 index 0000000..b592494 --- /dev/null +++ b/mempalace/convo_scanner.py @@ -0,0 +1,160 @@ +""" +convo_scanner.py — Parse Claude Code conversation directories into ProjectInfo. + +Claude Code stores sessions under ``~/.claude/projects/<slug>/<session>.jsonl``, +where the ``<slug>`` is the original CWD with ``/`` replaced by ``-``. That +encoding is lossy: we can't tell whether ``foo-bar`` in a slug is the +literal project name ``foo-bar`` or two path segments ``foo/bar``. + +Fortunately, every message record in the JSONL carries a ``cwd`` field with +the true path. This scanner reads one record per session to recover the +accurate project name, falling back to slug-decoding only if the JSONL +is malformed or empty. + +Output is the same ``ProjectInfo`` shape used by ``project_scanner``, so the +``discover_entities`` orchestrator can mix-and-match sources. 
MAX_HEADER_LINES = 20  # lines to read per session looking for `cwd`


def is_claude_projects_root(path: Path) -> bool:
    """Return True if ``path`` looks like a ``.claude/projects/`` directory.

    Heuristic: at least one child directory whose name starts with ``-`` and
    which contains at least one ``.jsonl`` file. Unreadable directories are
    treated as "not a match" rather than raising.
    """
    if not path.is_dir():
        return False
    try:
        children = list(path.iterdir())
    except OSError:
        return False
    for child in children:
        if not (child.is_dir() and child.name.startswith("-")):
            continue
        try:
            if any(p.suffix == ".jsonl" for p in child.iterdir() if p.is_file()):
                return True
        except OSError:
            # One unreadable child must not hide a readable sibling.
            continue
    return False


def _extract_cwd_from_session(session_file: Path) -> Optional[str]:
    """Return the ``cwd`` from the first record in the file that carries one.

    Only the first ``MAX_HEADER_LINES`` physical lines are inspected. Blank
    lines, lines that are not valid JSON, and JSON values that are not
    objects (``[]``, ``null``, bare strings/numbers) are skipped.

    Returns None if the file can't be read or no inspected record has a
    non-empty string ``cwd``.
    """
    try:
        with open(session_file, encoding="utf-8", errors="replace") as f:
            for i, line in enumerate(f):
                if i >= MAX_HEADER_LINES:
                    break
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # json.loads accepts any JSON value; only objects have keys.
                # Without this guard a line such as `null` or `[]` would
                # raise AttributeError on `.get` and abort the whole scan.
                if not isinstance(obj, dict):
                    continue
                cwd = obj.get("cwd")
                if isinstance(cwd, str) and cwd:
                    return cwd
    except OSError:
        return None
    return None


def _decode_slug_fallback(slug: str) -> str:
    """Best-effort project name from a slug when ``cwd`` is unavailable.

    The slug is lossy (``/`` and ``-`` both become ``-``), so the last
    non-empty ``-``-separated segment is used as the closest guess;
    kebab-case names cannot be preserved without ``cwd``. If the slug has no
    non-empty segment, the slug itself is returned unchanged.
    """
    stripped = slug.lstrip("-")
    parts = [p for p in stripped.split("-") if p]
    return parts[-1] if parts else slug


def _safe_mtime(path: Path) -> float:
    """Return the file's mtime, or ``0.0`` (sorts as oldest) on OSError."""
    try:
        return path.stat().st_mtime
    except OSError:
        return 0.0


def _resolve_project_name(project_dir: Path) -> str:
    """Recover the original project name for one slug directory.

    Sessions (``*.jsonl`` files) are tried newest-first; the first one that
    yields a ``cwd`` decides the name (the final path component of that
    ``cwd``, or the ``cwd`` itself if it has no final component). Falls back
    to slug-decoding if the directory can't be listed or no session has a
    readable ``cwd``.
    """
    try:
        sessions = sorted(
            (p for p in project_dir.iterdir() if p.is_file() and p.suffix == ".jsonl"),
            key=_safe_mtime,
            reverse=True,  # newest first — most likely to be well-formed
        )
    except OSError:
        # Same best-effort policy as the other directory walks in this module.
        sessions = []
    for session in sessions:
        cwd = _extract_cwd_from_session(session)
        if cwd:
            return Path(cwd).name or cwd
    return _decode_slug_fallback(project_dir.name)


def scan_claude_projects(path: str | Path) -> list[ProjectInfo]:
    """Scan a ``.claude/projects/`` directory for Claude Code conversations.

    Produces one ``ProjectInfo`` per resolved project name. Each entry is
    built with ``has_git=False``, ``manifest=None``, ``is_mine=True`` and the
    session count stored in both ``total_commits`` and ``user_commits`` (the
    commit fields are repurposed as a density signal for ranking). When two
    slug directories resolve to the same name, the one with more sessions
    wins.

    Returns an empty list if ``path`` does not look like a Claude projects
    root (see ``is_claude_projects_root``) or cannot be listed. Results are
    ordered by descending session count, then by name.
    """
    root = Path(path).expanduser().resolve()
    if not is_claude_projects_root(root):
        return []

    try:
        subdirs = sorted(root.iterdir())
    except OSError:
        # Directory vanished or became unreadable after the root check.
        return []

    projects: dict[str, ProjectInfo] = {}
    for sub in subdirs:
        if not (sub.is_dir() and sub.name.startswith("-")):
            continue
        try:
            sessions = [p for p in sub.iterdir() if p.is_file() and p.suffix == ".jsonl"]
        except OSError:
            continue
        if not sessions:
            continue

        name = _resolve_project_name(sub)
        session_count = len(sessions)

        proj = ProjectInfo(
            name=name,
            repo_root=sub,
            manifest=None,
            has_git=False,
            total_commits=session_count,
            user_commits=session_count,
            is_mine=True,  # Claude Code sessions are authored by the user
        )
        existing = projects.get(name)
        if existing is None or session_count > existing.user_commits:
            projects[name] = proj

    return sorted(
        projects.values(),
        key=lambda p: (-p.user_commits, p.name),
    )
class LLMError(RuntimeError):
    """Raised for any provider failure — transport, parse, auth, missing model."""


@dataclass
class LLMResponse:
    """Normalized result of a single ``classify`` call."""

    text: str  # text content extracted from the provider response
    model: str  # model name the request was sent with
    provider: str  # ``name`` of the provider class that produced it
    raw: dict  # the full decoded provider response


# ==================== BASE ====================


class LLMProvider:
    """Common interface shared by all providers.

    Subclasses override ``classify`` and ``check_available``; the base class
    only stores the connection settings.
    """

    name: str = "base"

    def __init__(
        self,
        model: str,
        endpoint: Optional[str] = None,
        api_key: Optional[str] = None,
        timeout: int = 120,
    ):
        self.model = model
        self.endpoint = endpoint
        self.api_key = api_key
        self.timeout = timeout

    def classify(self, system: str, user: str, json_mode: bool = True) -> LLMResponse:
        """Send one system+user prompt pair and return the model's reply."""
        raise NotImplementedError

    def check_available(self) -> tuple[bool, str]:
        """Return ``(ok, message)``. Fast probe that the provider is reachable."""
        raise NotImplementedError


def _http_post_json(url: str, body: dict, headers: dict, timeout: int) -> dict:
    """POST ``body`` as JSON and return the decoded JSON object.

    Raises:
        LLMError: on any failure — malformed URL, HTTP error status,
            transport/protocol error, undecodable or non-JSON payload, or a
            JSON payload that is not an object.
    """
    # Function-scope import so this block stays self-contained.
    from http.client import HTTPException

    try:
        req = Request(
            url,
            data=json.dumps(body).encode("utf-8"),
            headers={"Content-Type": "application/json", **headers},
        )
        with urlopen(req, timeout=timeout) as resp:
            payload = resp.read()
    except HTTPError as e:
        detail = ""
        try:
            detail = e.read().decode("utf-8", errors="replace")[:500]
        except Exception:
            # Best effort only: the error body is a nicety, e.reason suffices.
            pass
        raise LLMError(f"HTTP {e.code} from {url}: {detail or e.reason}") from e
    except (URLError, OSError, HTTPException, ValueError) as e:
        # ValueError: Request() rejects URLs without a usable scheme.
        # HTTPException: e.g. IncompleteRead / BadStatusLine, which urllib
        # does not wrap in URLError.
        raise LLMError(f"Cannot reach {url}: {e}") from e

    try:
        data = json.loads(payload)
    except ValueError as e:
        # Covers json.JSONDecodeError and UnicodeDecodeError (both ValueError).
        raise LLMError(f"Malformed response from {url}: {e}") from e
    if not isinstance(data, dict):
        # Callers index the result as a mapping; fail here with a clear error.
        raise LLMError(
            f"Malformed response from {url}: expected a JSON object, "
            f"got {type(data).__name__}"
        )
    return data


# ==================== OLLAMA ====================


class OllamaProvider(LLMProvider):
    """Local Ollama server, using ``/api/tags`` and ``/api/chat``."""

    name = "ollama"
    DEFAULT_ENDPOINT = "http://localhost:11434"

    def __init__(
        self,
        model: str,
        endpoint: Optional[str] = None,
        timeout: int = 180,
        **_: object,
    ):
        # Extra keyword arguments (e.g. api_key from the factory) are ignored.
        # A trailing "/" is stripped so URL joins don't produce "//api/...".
        super().__init__(
            model=model,
            endpoint=(endpoint or self.DEFAULT_ENDPOINT).rstrip("/"),
            timeout=timeout,
        )

    def check_available(self) -> tuple[bool, str]:
        """Probe ``/api/tags`` and confirm the configured model is listed."""
        try:
            with urlopen(f"{self.endpoint}/api/tags", timeout=5) as resp:
                data = json.loads(resp.read())
        except (URLError, HTTPError, OSError, ValueError) as e:
            # ValueError covers JSONDecodeError, undecodable bytes and a
            # malformed endpoint URL.
            return False, f"Cannot reach Ollama at {self.endpoint}: {e}"
        # Tolerate unexpected shapes instead of raising AttributeError.
        models = data.get("models") if isinstance(data, dict) else None
        names = {m.get("name", "") for m in models or [] if isinstance(m, dict)}
        # Ollama tags may or may not include ':latest' — accept either form
        wanted = {self.model, f"{self.model}:latest"}
        if not names & wanted:
            return (
                False,
                f"Model '{self.model}' not loaded in Ollama. Run: ollama pull {self.model}",
            )
        return True, "ok"

    def classify(self, system: str, user: str, json_mode: bool = True) -> LLMResponse:
        """Call ``/api/chat`` (non-streaming); ``json_mode`` sets ``format: json``.

        Raises:
            LLMError: on transport failure or an empty reply.
        """
        body: dict = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "stream": False,
            "options": {"temperature": 0.1},
        }
        if json_mode:
            body["format"] = "json"
        data = _http_post_json(f"{self.endpoint}/api/chat", body, headers={}, timeout=self.timeout)
        text = (data.get("message") or {}).get("content", "")
        if not text:
            raise LLMError(f"Empty response from Ollama (model={self.model})")
        return LLMResponse(text=text, model=self.model, provider=self.name, raw=data)


# ==================== OPENAI-COMPAT ====================


class OpenAICompatProvider(LLMProvider):
    """Any OpenAI-compatible ``/v1/chat/completions`` endpoint.

    Supply ``--llm-endpoint http://host:port`` (with or without ``/v1``).
    API key via ``--llm-api-key`` or the ``OPENAI_API_KEY`` env var.
    """

    name = "openai-compat"

    def __init__(
        self,
        model: str,
        endpoint: Optional[str] = None,
        api_key: Optional[str] = None,
        timeout: int = 120,
        **_: object,
    ):
        resolved_key = api_key or os.environ.get("OPENAI_API_KEY")
        super().__init__(model=model, endpoint=endpoint, api_key=resolved_key, timeout=timeout)

    def _resolve_url(self) -> str:
        """Return the full chat-completions URL for the configured endpoint.

        Accepts a bare host, a base ending in ``/v1``, or a full
        ``.../chat/completions`` URL. Raises LLMError if no endpoint is set.
        """
        if not self.endpoint:
            raise LLMError("openai-compat provider requires --llm-endpoint")
        url = self.endpoint.rstrip("/")
        if url.endswith("/chat/completions"):
            return url
        if not url.endswith("/v1"):
            url = f"{url}/v1"
        return f"{url}/chat/completions"

    def check_available(self) -> tuple[bool, str]:
        """Probe ``<base>/v1/models``; any successful response counts as ok."""
        if not self.endpoint:
            return False, "no --llm-endpoint configured"
        base = self.endpoint.rstrip("/")
        base = base.removesuffix("/chat/completions").removesuffix("/v1")
        try:
            req = Request(f"{base}/v1/models")
            if self.api_key:
                req.add_header("Authorization", f"Bearer {self.api_key}")
            with urlopen(req, timeout=5):
                pass
        except (URLError, HTTPError, OSError, ValueError) as e:
            # ValueError: Request() rejects a malformed endpoint URL.
            return False, f"Cannot reach {self.endpoint}: {e}"
        return True, "ok"

    def classify(self, system: str, user: str, json_mode: bool = True) -> LLMResponse:
        """Call chat completions; ``json_mode`` sets ``response_format``.

        Raises:
            LLMError: on transport failure, an unexpected response shape, or
                an empty reply.
        """
        body: dict = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "temperature": 0.1,
        }
        if json_mode:
            body["response_format"] = {"type": "json_object"}
        headers = {}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        data = _http_post_json(self._resolve_url(), body, headers=headers, timeout=self.timeout)
        try:
            text = data["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError) as e:
            raise LLMError(f"Unexpected response shape: {e}") from e
        if not text:
            raise LLMError(f"Empty response from {self.name} (model={self.model})")
        return LLMResponse(text=text, model=self.model, provider=self.name, raw=data)


# ==================== ANTHROPIC ====================


class AnthropicProvider(LLMProvider):
    """Anthropic Messages API (``/v1/messages``)."""

    name = "anthropic"
    DEFAULT_ENDPOINT = "https://api.anthropic.com"
    API_VERSION = "2023-06-01"

    def __init__(
        self,
        model: str,
        api_key: Optional[str] = None,
        endpoint: Optional[str] = None,
        timeout: int = 120,
        **_: object,
    ):
        key = api_key or os.environ.get("ANTHROPIC_API_KEY")
        # A trailing "/" is stripped so the URL join doesn't yield "//v1/...".
        super().__init__(
            model=model,
            endpoint=(endpoint or self.DEFAULT_ENDPOINT).rstrip("/"),
            api_key=key,
            timeout=timeout,
        )

    def check_available(self) -> tuple[bool, str]:
        """Check only that an API key is configured; no request is sent."""
        if not self.api_key:
            return False, "ANTHROPIC_API_KEY not set (use --llm-api-key or env)"
        # Don't probe — a live request would cost money. First real call will
        # surface auth errors if the key is invalid.
        return True, "ok"

    def classify(self, system: str, user: str, json_mode: bool = True) -> LLMResponse:
        """Call ``/v1/messages``; ``json_mode`` appends a JSON-only instruction.

        Raises:
            LLMError: if no API key is configured, on transport failure, on
                an unexpected response shape, or on an empty reply.
        """
        if not self.api_key:
            raise LLMError("Anthropic provider requires ANTHROPIC_API_KEY env or --llm-api-key")
        sys_prompt = system
        if json_mode:
            sys_prompt += "\n\nRespond with valid JSON only, no prose."
        body = {
            "model": self.model,
            "max_tokens": 2048,
            "temperature": 0.1,
            "system": sys_prompt,
            "messages": [{"role": "user", "content": user}],
        }
        headers = {
            "X-API-Key": self.api_key,
            "anthropic-version": self.API_VERSION,
        }
        data = _http_post_json(
            f"{self.endpoint}/v1/messages", body, headers=headers, timeout=self.timeout
        )
        try:
            # Concatenate every text block; other block types are ignored.
            text = "".join(
                b.get("text", "") for b in data.get("content", []) or [] if b.get("type") == "text"
            )
        except (AttributeError, TypeError) as e:
            raise LLMError(f"Unexpected response shape: {e}") from e
        if not text:
            raise LLMError(f"Empty response from Anthropic (model={self.model})")
        return LLMResponse(text=text, model=self.model, provider=self.name, raw=data)


# ==================== FACTORY ====================


PROVIDERS: dict[str, type[LLMProvider]] = {
    "ollama": OllamaProvider,
    "openai-compat": OpenAICompatProvider,
    "anthropic": AnthropicProvider,
}


def get_provider(
    name: str,
    model: str,
    endpoint: Optional[str] = None,
    api_key: Optional[str] = None,
    timeout: int = 120,
) -> LLMProvider:
    """Build a provider by name. Raises LLMError on unknown provider."""
    cls = PROVIDERS.get(name)
    if cls is None:
        raise LLMError(f"Unknown provider '{name}'. Choices: {sorted(PROVIDERS.keys())}")
    # NOTE(review): timeout is always forwarded, so OllamaProvider's own 180s
    # default is never used via this factory — confirm whether that is intended.
    return cls(model=model, endpoint=endpoint, api_key=api_key, timeout=timeout)
+- Interactive UX: visible progress, clean cancellation (Ctrl-C returns + whatever was classified before the interrupt). +- Don't feed the raw corpus to the LLM — feed candidates + a few sampled + context lines each. Keeps total input to ~50-100K tokens even for huge + prose corpora. + +Public: + refine_entities(detected, corpus_text, provider, ...) -> dict +""" + +from __future__ import annotations + +import json +import re +import sys +from dataclasses import dataclass + +from mempalace.llm_client import LLMError, LLMProvider + + +BATCH_SIZE = 25 # candidates per LLM call; tuned for 4B local models +CONTEXT_LINES_PER_CANDIDATE = 3 +CONTEXT_WINDOW_CHARS = 240 # max chars per context line to keep tokens bounded + +# Valid labels the LLM is allowed to return. Anything else is treated as +# AMBIGUOUS so the user reviews it. +VALID_LABELS = {"PERSON", "PROJECT", "TOPIC", "COMMON_WORD", "AMBIGUOUS"} + + +SYSTEM_PROMPT = """You are helping organize a user's memory palace by classifying capitalized tokens found in their files. + +For each candidate, pick exactly ONE label: +- PERSON: a specific real person the user knows (colleague, family, character they write about) +- PROJECT: a named product, codebase, or effort the user works on +- TOPIC: a recurring theme or subject (not a person, not a project) — cities, technologies, concepts +- COMMON_WORD: an English word, verb, or fragment that isn't a named entity at all (e.g. "Created", "Before", "Never") +- AMBIGUOUS: context is insufficient to decide between two of the above + +Frameworks, runtimes, APIs, cloud services, vendors, and third-party products +(e.g. Angular, OpenAPI, Terraform, Bun, Google) are TOPIC unless the context +clearly says this is the user's own named codebase, product, or active effort. + +Use the provided context lines to disambiguate. A capitalized word that only appears in metadata ("Created: 2026-04-24") is COMMON_WORD. A name that appears with pronouns and dialogue is PERSON. 
+ +Respond with JSON only. Schema: +{"classifications": [{"name": "", "label": "