ef8d83cc8a
When a `mempalace mine` collided with another writer (live mcp_server, another mine, anything taking mine_palace_lock), the operator saw a generic "another `mempalace mine` is already running" message and the CLI exited 0 — making the contention invisible to nohup or scripts checking $?. The reporter ran a `nohup mempalace mine ... & disown` and got a 200-byte log with only the auto-defaults warning, no clue that an MCP server was holding the store. palace.py: the lock file now records the holder's PID + first three argv tokens on acquire. A failed acquire reads the file and surfaces "palace <path> is held by PID N (mempalace mcp_server); wait for it to finish or stop the holder before retrying" in the MineAlreadyRunning message. Open mode changes from "w" to "a+" so the prior holder's identity survives long enough to be read. miner.mine() now lets MineAlreadyRunning propagate. cmd_mine catches it, prints the holder-aware message to stderr, and exits non-zero so shell wrappers detect the contention. Note: this is a behavior change for in-process callers that depended on miner.mine() silently swallowing MineAlreadyRunning. The silent swallow was the bug. Closes #1264
1296 lines
45 KiB
Python
1296 lines
45 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
miner.py — Files everything into the palace.
|
|
|
|
Reads mempalace.yaml from the project directory to know the wing + rooms.
|
|
Routes each file to the right room based on content.
|
|
Stores verbatim chunks as drawers. No summaries. Ever.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import shlex
|
|
import hashlib
|
|
import fnmatch
|
|
import logging
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from collections import defaultdict
|
|
from typing import Optional
|
|
|
|
from .palace import (
|
|
NORMALIZE_VERSION,
|
|
SKIP_DIRS,
|
|
build_closet_lines,
|
|
file_already_mined,
|
|
get_closets_collection,
|
|
get_collection,
|
|
mine_lock,
|
|
mine_palace_lock,
|
|
purge_file_closets,
|
|
upsert_closet_lines,
|
|
)
|
|
|
|
# Module logger, registered under the "mempalace_mcp" name.
logger = logging.getLogger("mempalace_mcp")

# File suffixes (lower-case, with the leading dot) that the scanner accepts.
READABLE_EXTENSIONS = {
    # prose and documentation
    ".md",
    ".txt",
    # structured data and configuration
    ".csv",
    ".json",
    ".jsonl",
    ".toml",
    ".yaml",
    ".yml",
    # web
    ".css",
    ".html",
    ".js",
    ".jsx",
    ".ts",
    ".tsx",
    # other source code
    ".go",
    ".java",
    ".py",
    ".rb",
    ".rs",
    ".sh",
    ".sql",
}

# Exact file names that are skipped regardless of their extension.
SKIP_FILENAMES = {
    # palace / project configuration
    "entities.json",
    "mempal.yaml",
    "mempal.yml",
    "mempalace.yaml",
    "mempalace.yml",
    # VCS metadata
    ".gitignore",
    # dependency lockfiles
    "package-lock.json",
    "pnpm-lock.yaml",
    "yarn.lock",
}
|
|
|
|
CHUNK_SIZE = 800  # chars per drawer
CHUNK_OVERLAP = 100  # overlap between chunks
MIN_CHUNK_SIZE = 50  # skip tiny chunks
DRAWER_UPSERT_BATCH_SIZE = 1000  # max chunks passed to one collection.upsert call

# Long Claude Code sessions and large transcript exports routinely exceed
# 10 MB. The cap exists as a defensive rail against pathological binary
# files, not as a limit on legitimate text. Per-drawer size is bounded
# by CHUNK_SIZE, but larger sources still produce proportionally more
# drawers and therefore more storage, embedding, and processing work —
# and file reads are not streamed (the whole content is loaded into
# memory before chunking), so memory use scales with source size too.
MAX_FILE_SIZE = 500 * 1024 * 1024  # 500 MB — skip files larger than this.

# A single file producing more chunks than this is almost always a generated
# artifact (CSV/JSON dump, lockfile not in SKIP_FILENAMES, etc.). Embedding
# thousands of chunks from one file in one batch has triggered ONNX runtime
# `bad allocation` errors on Windows (#1296). The cap is conservative: a
# 500-chunk file at CHUNK_SIZE=800 is ~400 KB of source, which covers most
# legitimate hand-written content while bounding the worst-case batch.
MAX_CHUNKS_PER_FILE = 500
|
|
|
|
|
|
# =============================================================================
|
|
# IGNORE MATCHING
|
|
# =============================================================================
|
|
|
|
|
|
class GitignoreMatcher:
    """Lightweight matcher for one directory's .gitignore patterns.

    ``rules`` is a list of dicts with the keys ``pattern`` (str) and
    ``anchored`` / ``dir_only`` / ``negated`` (bools). Rules are evaluated in
    file order and the last matching rule decides the outcome.
    """

    def __init__(self, base_dir: Path, rules: list):
        self.base_dir = base_dir
        self.rules = rules

    @classmethod
    def from_dir(cls, dir_path: Path):
        """Parse ``dir_path/.gitignore`` into a matcher.

        Returns ``None`` when the file is missing, unreadable, or contains no
        usable rules.
        """
        gitignore_path = dir_path / ".gitignore"
        if not gitignore_path.is_file():
            return None

        try:
            lines = gitignore_path.read_text(encoding="utf-8", errors="replace").splitlines()
        except Exception:
            # Best-effort: an unreadable .gitignore is treated as absent.
            return None

        rules = []
        for raw_line in lines:
            line = raw_line.strip()
            if not line:
                continue

            # A leading backslash escapes "#" (not a comment) and "!" (not a
            # negation). Remember the escape so the literal "!" that remains
            # after dropping the backslash is not re-read as a negation marker.
            escaped = line.startswith(("\\#", "\\!"))
            if escaped:
                line = line[1:]
            elif line.startswith("#"):
                continue

            negated = not escaped and line.startswith("!")
            if negated:
                line = line[1:]

            anchored = line.startswith("/")
            if anchored:
                line = line.lstrip("/")

            dir_only = line.endswith("/")
            if dir_only:
                line = line.rstrip("/")

            if not line:
                continue

            rules.append(
                {
                    "pattern": line,
                    "anchored": anchored,
                    "dir_only": dir_only,
                    "negated": negated,
                }
            )

        if not rules:
            return None

        return cls(dir_path, rules)

    def matches(self, path: Path, is_dir: Optional[bool] = None):
        """Decide whether ``path`` is ignored by this matcher's rules.

        Returns ``True`` when the last matching rule ignores the path,
        ``False`` when the last matching rule is a negation, and ``None`` when
        no rule matches or ``path`` is not located under ``base_dir``.
        When ``is_dir`` is ``None`` the filesystem is consulted.
        """
        try:
            relative = path.relative_to(self.base_dir).as_posix().strip("/")
        except ValueError:
            return None

        if not relative:
            return None

        if is_dir is None:
            is_dir = path.is_dir()

        ignored = None
        for rule in self.rules:
            if self._rule_matches(rule, relative, is_dir):
                # Later rules override earlier ones.
                ignored = not rule["negated"]
        return ignored

    def _rule_matches(self, rule: dict, relative: str, is_dir: bool) -> bool:
        """Return True when one parsed rule matches the POSIX-style relative path."""
        pattern = rule["pattern"]
        parts = relative.split("/")
        pattern_parts = pattern.split("/")
        # Patterns that are anchored or contain a slash match from base_dir;
        # all others may match any single path component.
        rooted = rule["anchored"] or len(pattern_parts) > 1

        if rule["dir_only"]:
            # A directory-only rule can only match directory components, so
            # the final component of a file path is excluded.
            target_parts = parts if is_dir else parts[:-1]
            if not target_parts:
                return False
            if rooted:
                return self._match_from_root(target_parts, pattern_parts)
            return any(fnmatch.fnmatch(part, pattern) for part in target_parts)

        if rooted:
            return self._match_from_root(parts, pattern_parts)

        return any(fnmatch.fnmatch(part, pattern) for part in parts)

    def _match_from_root(self, target_parts: list, pattern_parts: list) -> bool:
        """Match pattern components against path components starting at the root.

        ``**`` consumes zero or more components. A pattern that is exhausted
        before the path still counts as a match, so a rule naming a directory
        also covers everything beneath it.
        """

        def walk(path_index: int, pattern_index: int) -> bool:
            if pattern_index == len(pattern_parts):
                return True

            if path_index == len(target_parts):
                # Only trailing "**" components may match an exhausted path.
                return all(part == "**" for part in pattern_parts[pattern_index:])

            pattern_part = pattern_parts[pattern_index]
            if pattern_part == "**":
                return walk(path_index, pattern_index + 1) or walk(
                    path_index + 1, pattern_index
                )

            if not fnmatch.fnmatch(target_parts[path_index], pattern_part):
                return False

            return walk(path_index + 1, pattern_index + 1)

        return walk(0, 0)
|
|
|
|
|
|
def load_gitignore_matcher(dir_path: Path, cache: dict):
    """Return the .gitignore matcher for ``dir_path``, memoised in ``cache``.

    A directory without a usable .gitignore is cached as ``None`` so it is
    parsed at most once per cache.
    """
    if dir_path in cache:
        return cache[dir_path]

    matcher = GitignoreMatcher.from_dir(dir_path)
    cache[dir_path] = matcher
    return matcher
|
|
|
|
|
|
def is_gitignored(path: Path, matchers: list, is_dir: bool = False) -> bool:
    """Fold the verdicts of ``matchers`` (given in ancestor order) into one answer.

    Each matcher may return True, False or None; None means "no opinion" and
    leaves the running verdict untouched, so the last definite answer wins.
    """
    verdict = False
    for active in matchers:
        outcome = active.matches(path, is_dir=is_dir)
        if outcome is None:
            continue
        verdict = outcome
    return verdict
|
|
|
|
|
|
def should_skip_dir(dirname: str) -> bool:
    """Tell whether a directory name is skipped before any gitignore matching.

    True for names listed in ``SKIP_DIRS`` and for any ``*.egg-info`` directory.
    """
    if dirname.endswith(".egg-info"):
        return True
    return dirname in SKIP_DIRS
|
|
|
|
|
|
def normalize_include_paths(include_ignored: list) -> set:
    """Turn raw include entries into a set of project-relative POSIX strings.

    Each entry is stringified, stripped of surrounding whitespace and of
    leading/trailing slashes; entries that end up empty are dropped.
    ``None`` or an empty list yields an empty set.
    """
    trimmed = (str(entry).strip().strip("/") for entry in (include_ignored or []))
    return {Path(item).as_posix() for item in trimmed if item}
|
|
|
|
|
|
def is_exact_force_include(path: Path, project_path: Path, include_paths: set) -> bool:
    """Check whether ``path`` itself (not a parent or child) is an include override.

    ``include_paths`` holds project-relative POSIX strings. Paths outside
    ``project_path`` never match.
    """
    if include_paths:
        try:
            inside = path.relative_to(project_path)
        except ValueError:
            return False
        return inside.as_posix().strip("/") in include_paths
    return False
|
|
|
|
|
|
def is_force_included(path: Path, project_path: Path, include_paths: set) -> bool:
    """Check whether ``path`` is covered by an explicit include override.

    A path is covered when it equals an include entry, lies beneath one, or is
    an ancestor directory of one (so traversal can reach the included entry).
    Paths outside ``project_path`` are never covered.
    """
    if not include_paths:
        return False

    try:
        rel = path.relative_to(project_path).as_posix().strip("/")
    except ValueError:
        return False

    if not rel:
        return False

    return any(
        rel == wanted or rel.startswith(f"{wanted}/") or wanted.startswith(f"{rel}/")
        for wanted in include_paths
    )
|
|
|
|
|
|
# =============================================================================
|
|
# CONFIG
|
|
# =============================================================================
|
|
|
|
|
|
def load_config(project_dir: str) -> dict:
    """Load mempalace.yaml from project directory (falls back to mempal.yaml).

    When neither file exists, a notice is printed to stderr and a default
    config is returned: the wing is the normalized project directory name and
    there is a single ``general`` room.

    Returns whatever ``yaml.safe_load`` produces for the config file (normally
    a dict), or the default config described above.
    """
    import yaml

    resolved_project_dir = Path(project_dir).expanduser().resolve()
    config_path = resolved_project_dir / "mempalace.yaml"
    if not config_path.exists():
        # Fallback to legacy name
        legacy_path = resolved_project_dir / "mempal.yaml"
        if legacy_path.exists():
            config_path = legacy_path
        else:
            from .config import normalize_wing_name

            # Normalize the dirname-derived fallback wing the same way
            # ``cmd_init`` and ``room_detector_local`` do — otherwise a
            # hyphenated project mined without a yaml file lands under a
            # raw-name wing while ``topics_by_wing`` was keyed under the
            # normalized slug, silently dropping every topic tunnel
            # (the no-yaml branch of issue #1194).
            wing_name = normalize_wing_name(resolved_project_dir.name)
            print(
                f" No mempalace.yaml found in {resolved_project_dir} "
                f"— using auto-detected defaults (wing='{wing_name}'). "
                "Directories with the same basename will share a wing; "
                "add mempalace.yaml to disambiguate.",
                file=sys.stderr,
            )
            return {
                "wing": wing_name,
                "rooms": [
                    {
                        "name": "general",
                        "description": "All project files",
                        "keywords": ["general"],
                    }
                ],
            }

    # Decode explicitly as UTF-8: the platform default encoding (e.g. cp1252
    # on Windows) would corrupt or reject non-ASCII wing/room names.
    with open(config_path, encoding="utf-8") as f:
        return yaml.safe_load(f)
|
|
|
|
|
|
# =============================================================================
|
|
# FILE ROUTING — which room does this file belong to?
|
|
# =============================================================================
|
|
|
|
|
|
def detect_room(filepath: Path, content: str, rooms: list, project_path: Path) -> str:
    """Pick the room a file belongs to.

    Checks, in order, and returns on the first hit:
      1. a folder in the file's project-relative path equals, contains, or is
         contained in a room name or one of its keywords;
      2. the file stem contains, or is contained in, a room name;
      3. the room whose name + keywords occur most often in the first 2000
         characters of ``content`` (only if the count is positive);
      4. the literal fallback ``"general"``.
    All comparisons are case-insensitive.
    """
    rel_lower = str(filepath.relative_to(project_path)).lower()
    stem = filepath.stem.lower()
    sample = content[:2000].lower()

    # Step 1 — folder components only; the last component is the file name.
    folders = rel_lower.replace("\\", "/").split("/")[:-1]
    for folder in folders:
        for room in rooms:
            labels = [room["name"].lower()]
            labels.extend(keyword.lower() for keyword in room.get("keywords", []))
            for label in labels:
                if folder == label or label in folder or folder in label:
                    return room["name"]

    # Step 2 — file stem against room names.
    for room in rooms:
        room_lower = room["name"].lower()
        if room_lower in stem or stem in room_lower:
            return room["name"]

    # Step 3 — occurrence counts of keywords and the room name in the sample.
    tally: dict = {}
    for room in rooms:
        terms = room.get("keywords", []) + [room["name"]]
        hits = sum(sample.count(term.lower()) for term in terms)
        tally[room["name"]] = tally.get(room["name"], 0) + hits

    if tally:
        winner = max(tally, key=tally.get)
        if tally[winner] > 0:
            return winner

    return "general"
|
|
|
|
|
|
# =============================================================================
|
|
# CHUNKING
|
|
# =============================================================================
|
|
|
|
|
|
def chunk_text(content: str, source_file: str) -> list:
    """Cut ``content`` into overlapping drawer-sized pieces.

    Each piece spans at most ``CHUNK_SIZE`` characters. When a piece would end
    mid-text, the cut is moved back to the last blank line — or failing that,
    the last newline — provided that break lies in the second half of the
    window. Consecutive pieces overlap by ``CHUNK_OVERLAP`` characters, and
    pieces shorter than ``MIN_CHUNK_SIZE`` after stripping are discarded.

    ``source_file`` is accepted for interface compatibility and is not used.

    Returns a list of ``{"content": str, "chunk_index": int}`` dicts, indexed
    consecutively from 0 over the pieces that were kept.
    """
    text = content.strip()
    if not text:
        return []

    total = len(text)
    half_window = CHUNK_SIZE // 2
    pieces = []
    cursor = 0

    while cursor < total:
        stop = min(cursor + CHUNK_SIZE, total)

        if stop < total:
            # Prefer a paragraph break, then a line break.
            for separator in ("\n\n", "\n"):
                cut = text.rfind(separator, cursor, stop)
                if cut > cursor + half_window:
                    stop = cut
                    break

        piece = text[cursor:stop].strip()
        if len(piece) >= MIN_CHUNK_SIZE:
            pieces.append({"content": piece, "chunk_index": len(pieces)})

        # Step back by the overlap unless the end of the text was reached.
        cursor = stop if stop >= total else stop - CHUNK_OVERLAP

    return pieces
|
|
|
|
|
|
# =============================================================================
|
|
# PALACE — ChromaDB operations
|
|
# =============================================================================
|
|
|
|
|
|
# Location of the known-entities registry inside the user's home directory.
_ENTITY_REGISTRY_PATH = os.path.join(
    os.path.expanduser("~"),
    ".mempalace",
    "known_entities.json",
)

# In-process snapshot of the registry: "mtime" is the file mtime the snapshot
# was taken at, "names" the flat set of entity names, "raw" the parsed dict.
_ENTITY_REGISTRY_CACHE: dict = {
    "mtime": None,
    "names": frozenset(),
    "raw": {},
}

# chars of content scanned for capitalized words
_ENTITY_EXTRACT_WINDOW = 5000

# max entities packed into the metadata field
_ENTITY_METADATA_LIMIT = 25
|
|
|
|
|
|
def _refresh_known_entities_cache() -> None:
    """Bring ``_ENTITY_REGISTRY_CACHE`` in line with the registry file on disk.

    The file is re-read only when its mtime differs from the cached one. A
    missing file resets the cache to empty; an unreadable or malformed file
    caches empty data under the current mtime. Entity names are gathered from
    list categories, from the keys of dict categories, and — for the special
    ``topics_by_wing`` map — from the topic lists only, never the wing keys.
    """
    cache = _ENTITY_REGISTRY_CACHE

    try:
        current_mtime = os.path.getmtime(_ENTITY_REGISTRY_PATH)
    except OSError:
        # Registry is gone (or unreadable): drop any previously cached data.
        if cache["mtime"] is not None:
            cache["mtime"] = None
            cache["names"] = frozenset()
            cache["raw"] = {}
        return

    if cache["mtime"] == current_mtime:
        return

    collected: set = set()
    registry: dict = {}
    try:
        import json

        with open(_ENTITY_REGISTRY_PATH, "r", encoding="utf-8") as handle:
            parsed = json.load(handle)
        if isinstance(parsed, dict):
            registry = parsed
            for section, entries in parsed.items():
                if section == "topics_by_wing" and isinstance(entries, dict):
                    # Outer keys are wing names, not entities — take only the
                    # topic names stored in the inner lists.
                    for topics in entries.values():
                        if isinstance(topics, list):
                            collected.update(str(topic) for topic in topics if topic)
                elif isinstance(entries, list):
                    collected.update(str(item) for item in entries if item)
                elif isinstance(entries, dict):
                    collected.update(str(key) for key in entries if key)
    except Exception:
        # Best-effort read: any failure yields an empty registry view.
        collected = set()
        registry = {}

    cache["mtime"] = current_mtime
    cache["names"] = frozenset(collected)
    cache["raw"] = registry
|
|
|
|
|
|
def _load_known_entities() -> frozenset:
    """Return every known entity name, across all categories, as one frozenset.

    The registry cache is refreshed first, so the result reflects the file's
    current mtime.
    """
    _refresh_known_entities_cache()
    names = _ENTITY_REGISTRY_CACHE["names"]
    return names
|
|
|
|
|
|
def _load_known_entities_raw() -> dict:
    """Return the registry in its category-keyed form.

    The shape is ``{"category": ["Name1", ...], ...}`` as stored on disk. The
    cache is refreshed first, and the caller receives a shallow copy, so
    rebinding top-level keys on the result does not touch the cache.
    """
    _refresh_known_entities_cache()
    snapshot = _ENTITY_REGISTRY_CACHE["raw"]
    return dict(snapshot)
|
|
|
|
|
|
def _set_wing_topics(existing: dict, wing_key: str, topics_for_wing: list, coerce) -> None:
|
|
"""Update ``existing['topics_by_wing'][wing_key]`` to the deduped list.
|
|
|
|
Replaces (does not union) the wing's topic list — re-running ``init``
|
|
should reflect the user's latest confirmation rather than accumulate
|
|
stale labels. Empty input drops the wing entry; an empty map drops
|
|
the ``topics_by_wing`` key entirely.
|
|
"""
|
|
topics_map = existing.get("topics_by_wing")
|
|
if not isinstance(topics_map, dict):
|
|
topics_map = {}
|
|
seen_lower: set = set()
|
|
ordered: list = []
|
|
for n in topics_for_wing:
|
|
name = coerce(n)
|
|
if not name:
|
|
continue
|
|
key = name.lower()
|
|
if key in seen_lower:
|
|
continue
|
|
seen_lower.add(key)
|
|
ordered.append(name)
|
|
if ordered:
|
|
topics_map[wing_key] = ordered
|
|
else:
|
|
topics_map.pop(wing_key, None)
|
|
if topics_map:
|
|
existing["topics_by_wing"] = topics_map
|
|
else:
|
|
existing.pop("topics_by_wing", None)
|
|
|
|
|
|
def add_to_known_entities(entities_by_category: dict, wing: Optional[str] = None) -> str:
    """Union ``entities_by_category`` into ``~/.mempalace/known_entities.json``.

    Accepts ``{category: [names]}`` shape as produced by ``mempalace init``
    and merges into the registry the miner reads at mine time. Existing
    categories are preserved untouched unless also present in the input;
    for categories present in both, entries are unioned case-insensitively
    without changing the on-disk ordering of pre-existing names.

    If a category is stored on-disk as ``{name: code}`` (the alternate
    miner-supported shape, used by dialect-style configs), new names are
    added as keys with ``None`` values so existing code mappings aren't
    overwritten. A later compress pass can assign codes.

    When ``wing`` is provided AND ``entities_by_category`` contains a
    ``topics`` list, those topics are also recorded under
    ``topics_by_wing[wing]`` (case-insensitive dedup, preserving the
    casing of the first observed name). This is the signal source for
    ``palace_graph.compute_topic_tunnels`` at mine time. Topics for a
    wing are *replaced*, not unioned, so a re-run of ``init`` reflects
    the user's latest confirmation rather than accumulating stale labels
    indefinitely.

    An existing registry that cannot be read, is not valid UTF-8, is not
    valid JSON, or is not a JSON object is treated as empty and overwritten.

    The in-process cache is invalidated on write so same-process callers
    (notably ``cmd_init`` → ``cmd_mine`` in sequence) see the update
    immediately instead of waiting for a mtime re-check.

    Returns the registry path as a string for logging.
    """
    import json as _json
    from pathlib import Path as _Path

    registry_path = _Path(_ENTITY_REGISTRY_PATH)
    registry_path.parent.mkdir(parents=True, exist_ok=True)

    existing: dict = {}
    if registry_path.exists():
        try:
            loaded = _json.loads(registry_path.read_text(encoding="utf-8"))
        except (ValueError, OSError):
            # ValueError covers json.JSONDecodeError *and* UnicodeDecodeError,
            # so a registry with invalid bytes is handled like malformed JSON
            # instead of crashing the caller.
            loaded = None
        if isinstance(loaded, dict):
            existing = loaded

    def _coerce_name(value):
        if not value:
            return None
        name = str(value)
        return name if name else None

    # Separate the topics_by_wing key from regular categories so we don't
    # treat it as a flat name-list elsewhere in this function.
    topics_for_wing = None
    if wing and isinstance(wing, str) and wing.strip():
        topics_for_wing = entities_by_category.get("topics") or []

    for category, names in entities_by_category.items():
        if category == "topics_by_wing":
            # Reserved key — managed separately below.
            continue
        if not isinstance(names, list) or not names:
            continue
        current = existing.get(category)
        if isinstance(current, list):
            seen_lower = {str(n).lower() for n in current}
            for n in names:
                name = _coerce_name(n)
                if not name:
                    continue
                if name.lower() not in seen_lower:
                    current.append(name)
                    seen_lower.add(name.lower())
        elif isinstance(current, dict):
            seen_lower = {str(name).lower() for name in current}
            for n in names:
                name = _coerce_name(n)
                if not name or name.lower() in seen_lower:
                    continue
                # New names get no code yet; existing mappings stay untouched.
                current[name] = None
                seen_lower.add(name.lower())
        else:
            # Missing or unrecognized shape — seed as a fresh list, deduped
            seen: set = set()
            ordered: list = []
            for n in names:
                name = _coerce_name(n)
                if not name:
                    continue
                key = name.lower()
                if key in seen:
                    continue
                seen.add(key)
                ordered.append(name)
            existing[category] = ordered

    if topics_for_wing is not None:
        _set_wing_topics(existing, wing.strip(), topics_for_wing, _coerce_name)

    registry_path.write_text(_json.dumps(existing, indent=2, ensure_ascii=False), encoding="utf-8")
    try:
        # Best-effort: restrict the registry to the owning user where supported.
        registry_path.chmod(0o600)
    except (OSError, NotImplementedError):
        pass

    # Invalidate in-process cache so later calls in the same run see the write.
    _ENTITY_REGISTRY_CACHE["mtime"] = None
    _ENTITY_REGISTRY_CACHE["names"] = frozenset()
    _ENTITY_REGISTRY_CACHE["raw"] = {}

    return str(registry_path)
|
|
|
|
|
|
def get_topics_by_wing() -> dict:
    """Read the ``topics_by_wing`` map out of the global registry.

    Yields ``{}`` when the key is absent or not a dict. Wing keys that are not
    non-blank strings are skipped, as are non-list values; within a list only
    non-blank string topics are kept, and wings left with no topics are
    omitted. Wing keys are whitespace-stripped; topic casing is as on disk.
    """
    stored = _load_known_entities_raw().get("topics_by_wing")
    if not isinstance(stored, dict):
        return {}

    result: dict = {}
    for wing_name, topic_list in stored.items():
        if not (isinstance(wing_name, str) and wing_name.strip()):
            continue
        if not isinstance(topic_list, list):
            continue
        kept = [str(topic) for topic in topic_list if isinstance(topic, str) and topic.strip()]
        if kept:
            result[wing_name.strip()] = kept
    return result
|
|
|
|
|
|
# Hall -> keywords mapping, loaded from MempalaceConfig on first use.
_HALL_KEYWORDS_CACHE = None


def detect_hall(content: str) -> str:
    """Choose the hall for a piece of content by keyword hits.

    For every hall, counts how many of its keywords occur in the lower-cased
    first 3000 characters of ``content``. The hall with the most hits wins
    (the earliest hall in the mapping on a tie); with no hits at all the
    result is ``"general"``.
    """
    global _HALL_KEYWORDS_CACHE
    if _HALL_KEYWORDS_CACHE is None:
        from .config import MempalaceConfig

        _HALL_KEYWORDS_CACHE = MempalaceConfig().hall_keywords

    sample = content[:3000].lower()

    best_hall = "general"
    best_hits = 0
    for hall, keywords in _HALL_KEYWORDS_CACHE.items():
        hits = sum(1 for keyword in keywords if keyword in sample)
        # Strict comparison keeps the first hall that reached the top count.
        if hits > best_hits:
            best_hall, best_hits = hall, hits
    return best_hall
|
|
|
|
|
|
def _extract_entities_for_metadata(content: str) -> str:
    """Build the ``entities`` metadata string for a piece of content.

    Two sources are merged:
      * names from the known-entity registry that occur in ``content`` as
        whole words (no word character directly before or after);
      * candidate words from the first ``_ENTITY_EXTRACT_WINDOW`` characters
        that are not in the stoplist, are longer than two characters, and
        occur at least twice.

    The merged names are sorted, cut to ``_ENTITY_METADATA_LIMIT`` entries
    (the list is cut, never an individual name) and joined with ``;``.
    Returns ``""`` when nothing was found.
    """
    import re

    from .palace import _ENTITY_STOPLIST, _candidate_entity_words

    found: set = {
        name
        for name in _load_known_entities()
        if re.search(r"(?<!\w)" + re.escape(name) + r"(?!\w)", content)
    }

    counts = defaultdict(int)
    for word in _candidate_entity_words(content[:_ENTITY_EXTRACT_WINDOW]):
        if word not in _ENTITY_STOPLIST:
            counts[word] += 1
    found.update(word for word, seen in counts.items() if seen >= 2 and len(word) > 2)

    if not found:
        return ""
    return ";".join(sorted(found)[:_ENTITY_METADATA_LIMIT])
|
|
|
|
|
|
def _build_drawer_metadata(
    wing: str,
    room: str,
    source_file: str,
    chunk_index: int,
    agent: str,
    content: str,
    source_mtime: Optional[float],
) -> dict:
    """Assemble the metadata dict for a single drawer; nothing is written.

    Always present: ``wing``, ``room``, ``source_file``, ``chunk_index``,
    ``added_by``, ``filed_at`` (current local time, ISO format),
    ``normalize_version`` and ``hall``. ``source_mtime`` is added only when it
    is not ``None`` and ``entities`` only when extraction found something.
    """
    meta = dict(
        wing=wing,
        room=room,
        source_file=source_file,
        chunk_index=chunk_index,
        added_by=agent,
        filed_at=datetime.now().isoformat(),
        normalize_version=NORMALIZE_VERSION,
    )
    if source_mtime is not None:
        meta["source_mtime"] = source_mtime

    meta["hall"] = detect_hall(content)

    tagged = _extract_entities_for_metadata(content)
    if tagged:
        meta["entities"] = tagged
    return meta
|
|
|
|
|
|
def add_drawer(
    collection, wing: str, room: str, content: str, source_file: str, chunk_index: int, agent: str
):
    """Upsert a single drawer into ``collection`` and return ``True``.

    The drawer id is ``drawer_<wing>_<room>_<digest>``, where the digest is
    the first 24 hex characters of the SHA-256 of ``source_file`` followed by
    ``chunk_index``. The source file's mtime is recorded when it can be read.
    """
    digest = hashlib.sha256((source_file + str(chunk_index)).encode()).hexdigest()[:24]
    drawer_id = f"drawer_{wing}_{room}_{digest}"

    try:
        mtime = os.path.getmtime(source_file)
    except OSError:
        mtime = None

    collection.upsert(
        documents=[content],
        ids=[drawer_id],
        metadatas=[
            _build_drawer_metadata(wing, room, source_file, chunk_index, agent, content, mtime)
        ],
    )
    return True
|
|
|
|
|
|
# =============================================================================
|
|
# PROCESS ONE FILE
|
|
# =============================================================================
|
|
|
|
|
|
def process_file(
    filepath: Path,
    project_path: Path,
    collection,
    wing: str,
    rooms: list,
    agent: str,
    dry_run: bool,
    closets_col=None,
) -> tuple:
    """Mine one file into the palace and report ``(drawer_count, room_name)``.

    Returns ``(0, "general")`` when the file is already mined, cannot be read,
    or is shorter than ``MIN_CHUNK_SIZE`` after stripping. A file producing
    more than ``MAX_CHUNKS_PER_FILE`` chunks is reported and skipped. In
    ``dry_run`` mode the planned drawer count is printed and returned without
    touching the collection. Otherwise, under the per-file lock, existing
    drawers for the file are deleted, the chunks are upserted in batches of
    ``DRAWER_UPSERT_BATCH_SIZE``, and — when ``closets_col`` is given and
    drawers were added — the file's closets are purged and rebuilt.
    """
    source_file = str(filepath)

    # Cheap pre-check outside the lock; repeated under the lock below.
    if not dry_run and file_already_mined(collection, source_file, check_mtime=True):
        return 0, "general"

    try:
        content = filepath.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return 0, "general"

    content = content.strip()
    if len(content) < MIN_CHUNK_SIZE:
        return 0, "general"

    room = detect_room(filepath, content, rooms, project_path)
    chunks = chunk_text(content, source_file)

    if len(chunks) > MAX_CHUNKS_PER_FILE:
        print(
            f" ! [skip] {filepath.name[:50]:50} produced {len(chunks)} chunks "
            f"(> {MAX_CHUNKS_PER_FILE}); add to SKIP_FILENAMES or .gitignore"
        )
        return 0, room

    if dry_run:
        print(f" [DRY RUN] {filepath.name} -> room:{room} ({len(chunks)} drawers)")
        return len(chunks), room

    def drawer_id_for(index) -> str:
        # One drawer id per (source file, chunk index) pair.
        digest = hashlib.sha256((source_file + str(index)).encode()).hexdigest()[:24]
        return f"drawer_{wing}_{room}_{digest}"

    # Serialise per-file work so concurrent agents cannot interleave the
    # delete + insert sequence and create duplicates or lose data.
    with mine_lock(source_file):
        # Another agent may have finished this file while we waited.
        if file_already_mined(collection, source_file, check_mtime=True):
            return 0, room

        # Delete-then-insert instead of upserting over existing ids: this
        # avoids hnswlib's thread-unsafe updatePoint path, which can segfault
        # on macOS ARM with chromadb 0.6.3.
        try:
            collection.delete(where={"source_file": source_file})
        except Exception:
            logger.debug("Stale-drawer purge failed for %s", source_file, exc_info=True)

        try:
            source_mtime = os.path.getmtime(source_file)
        except OSError:
            source_mtime = None

        # Bounded batches: many chunks per embedding pass without building one
        # huge request. A bad chunk fails its whole sub-batch — the accepted
        # trade-off for amortising the embedding overhead.
        drawers_added = 0
        for offset in range(0, len(chunks), DRAWER_UPSERT_BATCH_SIZE):
            window = chunks[offset : offset + DRAWER_UPSERT_BATCH_SIZE]
            collection.upsert(
                documents=[piece["content"] for piece in window],
                ids=[drawer_id_for(piece["chunk_index"]) for piece in window],
                metadatas=[
                    _build_drawer_metadata(
                        wing,
                        room,
                        source_file,
                        piece["chunk_index"],
                        agent,
                        piece["content"],
                        source_mtime,
                    )
                    for piece in window
                ],
            )
            drawers_added += len(window)

        # Closets are the searchable index pointing at the drawers. Purge
        # before writing so a re-mine replaces them rather than appending.
        if closets_col and drawers_added > 0:
            drawer_ids = [drawer_id_for(piece["chunk_index"]) for piece in chunks]
            closet_lines = build_closet_lines(source_file, drawer_ids, content, wing, room)
            source_digest = hashlib.sha256(source_file.encode()).hexdigest()[:24]
            closet_id_base = f"closet_{wing}_{room}_{source_digest}"
            entities = _extract_entities_for_metadata(content)
            closet_meta = {
                "wing": wing,
                "room": room,
                "source_file": source_file,
                "drawer_count": drawers_added,
                "filed_at": datetime.now().isoformat(),
                "normalize_version": NORMALIZE_VERSION,
            }
            if entities:
                closet_meta["entities"] = entities
            purge_file_closets(closets_col, source_file)
            upsert_closet_lines(closets_col, closet_id_base, closet_lines, closet_meta)

    return drawers_added, room
|
|
|
|
|
|
# =============================================================================
|
|
# SCAN PROJECT
|
|
# =============================================================================
|
|
|
|
|
|
def scan_project(
    project_dir: str,
    respect_gitignore: bool = True,
    include_ignored: list = None,
) -> list:
    """Walk ``project_dir`` and collect the files eligible for mining.

    Args:
        project_dir: Root of the tree to walk; ``~`` is expanded and the
            path is resolved before walking.
        respect_gitignore: When true, ``.gitignore`` matchers found while
            walking are applied to directories and files.
        include_ignored: Paths passed through ``normalize_include_paths``;
            entries matching them bypass the skip-dir, skip-filename and
            gitignore filters. Only an exact force-include bypasses the
            extension filter.

    Returns:
        A list of ``Path`` objects in ``os.walk`` order. Symlinks, files
        larger than ``MAX_FILE_SIZE`` and files that cannot be stat'ed are
        never returned.
    """
    base = Path(project_dir).expanduser().resolve()
    forced_paths = normalize_include_paths(include_ignored)
    gitignore_cache = {}
    matchers = []
    found = []

    for current, subdirs, names in os.walk(base):
        here = Path(current)

        if respect_gitignore:
            # Keep only matchers rooted at this directory or one of its
            # ancestors, then add this directory's own matcher if it has one.
            matchers = [
                m for m in matchers if here == m.base_dir or m.base_dir in here.parents
            ]
            local_matcher = load_gitignore_matcher(here, gitignore_cache)
            if local_matcher is not None:
                matchers.append(local_matcher)

        use_gitignore = bool(respect_gitignore and matchers)

        # Prune in place so os.walk does not descend into rejected directories.
        kept_dirs = []
        for dirname in subdirs:
            child = here / dirname
            if is_force_included(child, base, forced_paths):
                kept_dirs.append(dirname)
                continue
            if should_skip_dir(dirname):
                continue
            if use_gitignore and is_gitignored(child, matchers, is_dir=True):
                continue
            kept_dirs.append(dirname)
        subdirs[:] = kept_dirs

        for name in names:
            candidate = here / name
            forced = is_force_included(candidate, base, forced_paths)
            forced_exactly = is_exact_force_include(candidate, base, forced_paths)

            if name in SKIP_FILENAMES and not forced:
                continue
            if not forced_exactly and candidate.suffix.lower() not in READABLE_EXTENSIONS:
                continue
            if use_gitignore and not forced:
                if is_gitignored(candidate, matchers, is_dir=False):
                    continue
            # Skip symlinks — prevents following links to /dev/urandom, etc.
            if candidate.is_symlink():
                continue
            # Skip files exceeding the size limit (or that cannot be stat'ed).
            try:
                oversized = candidate.stat().st_size > MAX_FILE_SIZE
            except OSError:
                continue
            if oversized:
                continue
            found.append(candidate)

    return found
|
|
|
|
|
|
# =============================================================================
|
|
# MAIN: MINE
|
|
# =============================================================================
|
|
|
|
|
|
def mine(
    project_dir: str,
    palace_path: str,
    wing_override: str = None,
    agent: str = "mempalace",
    limit: int = 0,
    dry_run: bool = False,
    respect_gitignore: bool = True,
    include_ignored: list = None,
    files: list = None,
):
    """Mine a project directory into the palace.

    A dry run goes straight to ``_mine_impl`` without taking the palace
    lock. A real run holds ``mine_palace_lock(palace_path)`` for the whole
    mine.

    ``files`` may optionally be a pre-scanned list of file paths from
    :func:`scan_project`. When provided, the corpus walk is skipped, so a
    caller that already walked the tree (e.g. ``init`` showing a file-count
    estimate before the mine prompt) does not walk it twice. When ``None``
    (the default), the tree is walked here.

    Returns whatever ``_mine_impl`` returns.
    """
    impl_options = dict(
        wing_override=wing_override,
        agent=agent,
        limit=limit,
        dry_run=dry_run,
        respect_gitignore=respect_gitignore,
        include_ignored=include_ignored,
        files=files,
    )

    if not dry_run:
        # MineAlreadyRunning is deliberately not caught here: it propagates
        # so the CLI can render a clear holder-aware message and exit
        # non-zero. In-process callers (tests, library users) that expect
        # to coexist with another writer should handle the exception.
        with mine_palace_lock(palace_path):
            return _mine_impl(project_dir, palace_path, **impl_options)

    return _mine_impl(project_dir, palace_path, **impl_options)
|
|
|
|
|
|
def _print_mine_partial_progress(
    headline: str,
    files_processed: int,
    total_files: int,
    total_drawers: int,
    last_file: Optional[str],
) -> None:
    """Print the progress lines shared by the interrupted and aborted banners."""
    print(headline)
    print(f" files_processed: {files_processed}/{total_files}")
    print(f" drawers_filed: {total_drawers}")
    print(f" last_file: {last_file or '<none>'}")


def _mine_impl(
    project_dir: str,
    palace_path: str,
    wing_override: Optional[str] = None,
    agent: str = "mempalace",
    limit: int = 0,
    dry_run: bool = False,
    respect_gitignore: bool = True,
    include_ignored: Optional[list] = None,
    files: Optional[list] = None,
):
    """Run one mine pass over a project and print a progress report.

    Args:
        project_dir: Project root; its config is read via ``load_config``.
        palace_path: Location of the palace whose collections are written.
        wing_override: Wing name used instead of ``config["wing"]`` when truthy.
        agent: Agent label forwarded to ``process_file``.
        limit: When greater than zero, only the first ``limit`` files are mined.
        dry_run: When true, no collections are opened and nothing is filed.
        respect_gitignore: Forwarded to ``scan_project``.
        include_ignored: Forwarded to ``scan_project``; also echoed in the header.
        files: Pre-scanned file list; when ``None`` the tree is scanned here.

    Returns:
        ``None``. Results are reported on stdout.

    Raises:
        SystemExit: With code 130 when the file loop is interrupted by
            ``KeyboardInterrupt`` (after printing partial progress).
        Exception: Any other exception from the file loop is re-raised after
            the partial-progress banner. Exceptions raised during setup
            propagate without a banner.

    The hooks-side PID file is cleaned up on every exit path, including a
    failure during setup (config load, scan, opening the collections).
    """
    try:
        project_path = Path(project_dir).expanduser().resolve()
        config = load_config(project_dir)

        wing = wing_override or config["wing"]
        rooms = config.get("rooms", [{"name": "general", "description": "All project files"}])

        if files is None:
            files = scan_project(
                project_dir,
                respect_gitignore=respect_gitignore,
                include_ignored=include_ignored,
            )
        if limit > 0:
            files = files[:limit]

        from .embedding import describe_device

        print(f"\n{'=' * 55}")
        print(" MemPalace Mine")
        print(f"{'=' * 55}")
        print(f" Wing: {wing}")
        print(f" Rooms: {', '.join(r['name'] for r in rooms)}")
        print(f" Files: {len(files)}")
        print(f" Palace: {palace_path}")
        print(f" Device: {describe_device()}")
        if dry_run:
            print(" DRY RUN — nothing will be filed")
        if not respect_gitignore:
            print(" .gitignore: DISABLED")
        if include_ignored:
            print(f" Include: {', '.join(sorted(normalize_include_paths(include_ignored)))}")
        print(f"{'-' * 55}\n")

        if not dry_run:
            collection = get_collection(palace_path)
            closets_col = get_closets_collection(palace_path)
        else:
            collection = None
            closets_col = None

        total_drawers = 0
        files_skipped = 0
        files_processed = 0
        last_file = None
        room_counts = defaultdict(int)

        try:
            for i, filepath in enumerate(files, 1):
                try:
                    drawers, room = process_file(
                        filepath=filepath,
                        project_path=project_path,
                        collection=collection,
                        wing=wing,
                        rooms=rooms,
                        agent=agent,
                        dry_run=dry_run,
                        closets_col=closets_col,
                    )
                except KeyboardInterrupt:
                    # Record the file that was being attempted, then re-raise
                    # so the outer handler prints the summary.
                    last_file = filepath.name
                    raise
                files_processed = i
                last_file = filepath.name
                if drawers == 0 and not dry_run:
                    files_skipped += 1
                else:
                    total_drawers += drawers
                    room_counts[room] += 1
                    if not dry_run:
                        print(f" + [{i:4}/{len(files)}] {filepath.name[:50]:50} +{drawers}")

            if not dry_run:
                # Cross-wing topic tunnels: after every file in this wing has been
                # processed, link this wing to any other wing that shares a
                # confirmed TOPIC label. Out of scope for v1: manifest-dependency
                # overlap, per-topic allow/deny lists, search-result surfacing.
                try:
                    tunnels_added = _compute_topic_tunnels_for_wing(wing)
                    if tunnels_added:
                        print(f"\n Topic tunnels: +{tunnels_added} cross-wing link(s)")
                except Exception as e:
                    # Tunnel computation must never fail a mine — degrade quietly.
                    print(
                        f"\n WARNING: topic tunnel computation skipped — {e}",
                        file=sys.stderr,
                    )

            print(f"\n{'=' * 55}")
            print(" Done.")
            print(f" Files processed: {len(files) - files_skipped}")
            print(f" Files skipped (already filed): {files_skipped}")
            print(f" Drawers filed: {total_drawers}")
            print("\n By room:")
            for room, count in sorted(room_counts.items(), key=lambda x: x[1], reverse=True):
                print(f" {room:20} {count} files")
            print('\n Next: mempalace search "what you\'re looking for"')
            print(f"{'=' * 55}\n")
        except KeyboardInterrupt:
            # Idempotent re-mine: deterministic drawer IDs mean already-filed
            # drawers upsert to the same row on next run, so partial progress
            # is safe to leave in place. A second Ctrl-C during this print
            # propagates to the default handler — we don't try to catch
            # everything.
            _print_mine_partial_progress(
                "\n\n Mine interrupted.",
                files_processed,
                len(files),
                total_drawers,
                last_file,
            )
            print(
                f"\n Re-run `mempalace mine {shlex.quote(project_dir)}` to resume — "
                "already-filed drawers are\n upserted idempotently and will not duplicate.\n"
            )
            sys.exit(130)
        except Exception as exc:
            # Without this, an arbitrary exception (ONNX bad_alloc, chromadb HNSW
            # error, OS fault) propagates and the process exits with no completion
            # banner — the operator sees only the final progress line and assumes
            # the mine succeeded (#1296). Print the partial-progress summary the
            # way we do for KeyboardInterrupt, then re-raise so the original
            # traceback still surfaces and the exit code is non-zero.
            _print_mine_partial_progress(
                "\n\n Mine aborted by exception.",
                files_processed,
                len(files),
                total_drawers,
                last_file,
            )
            print(f" error: {type(exc).__name__}: {exc}")
            print(
                f"\n Re-run `mempalace mine {shlex.quote(project_dir)}` after addressing "
                "the cause — already-filed\n drawers are upserted idempotently and will "
                "not duplicate.\n"
            )
            raise
    finally:
        # Clean up the hooks-side PID lock if it points at us. This wraps
        # setup as well as the file loop, so a failure while loading config,
        # scanning or opening the collections also releases the entry. Only
        # a file that claims our own PID is removed — never another
        # process's.
        _cleanup_mine_pid_file()
|
|
|
|
|
|
def _cleanup_mine_pid_file() -> None:
|
|
"""Remove the global mine PID file if it currently points at us.
|
|
|
|
The PID file (``~/.mempalace/hook_state/mine.pid``, written by the
|
|
hook in :func:`mempalace.hooks_cli._spawn_mine`) tracks the PID of
|
|
the most recently spawned mine subprocess so the hook can dedup
|
|
concurrent auto-ingest fires. When that subprocess exits — cleanly,
|
|
on error, or via Ctrl-C — it should remove its own entry so the
|
|
next hook fire isn't briefly fooled by a stale PID before
|
|
``_pid_alive`` returns False.
|
|
|
|
We only delete the file if it claims our own PID; any other PID is
|
|
left alone (could be an unrelated mine running concurrently from
|
|
a different worktree / session).
|
|
"""
|
|
try:
|
|
from .hooks_cli import _MINE_PID_FILE
|
|
except Exception:
|
|
return
|
|
try:
|
|
if not _MINE_PID_FILE.exists():
|
|
return
|
|
recorded = _MINE_PID_FILE.read_text().strip()
|
|
if recorded and recorded.isdigit() and int(recorded) == os.getpid():
|
|
_MINE_PID_FILE.unlink()
|
|
except OSError:
|
|
# Best-effort cleanup; never fail the mine over PID bookkeeping.
|
|
pass
|
|
|
|
|
|
def _compute_topic_tunnels_for_wing(wing: str) -> int:
    """Link ``wing`` to other wings through shared topics and count the links.

    The topics-by-wing map comes from ``get_topics_by_wing()``. When that
    map is empty or has no entry for ``wing``, nothing is computed and ``0``
    is returned. Otherwise ``topic_tunnels_for_wing`` is called with the
    configured ``topic_tunnel_min_count`` and the length of its result is
    returned (per the helper's contract, the tunnels created or refreshed).
    """
    from .config import MempalaceConfig
    from .palace_graph import topic_tunnels_for_wing

    wing_topics = get_topics_by_wing()
    wing_is_known = bool(wing_topics) and wing in wing_topics
    if not wing_is_known:
        return 0

    threshold = MempalaceConfig().topic_tunnel_min_count
    return len(topic_tunnels_for_wing(wing, wing_topics, min_count=threshold))
|
|
|
|
|
|
# =============================================================================
|
|
# STATUS
|
|
# =============================================================================
|
|
|
|
|
|
def status(palace_path: str):
    """Print a per-wing, per-room drawer tally for the palace.

    If opening the collection raises (it is opened with ``create=False``),
    a "no palace found" hint is printed and the function returns without
    raising. Otherwise drawer metadata is read page by page and the counts
    are printed, wings in sorted order and rooms by descending count.
    """
    try:
        drawers_col = get_collection(palace_path, create=False)
    except Exception:
        print(f"\n No palace found at {palace_path}")
        print(" Run: mempalace init <dir> then mempalace mine <dir>")
        return

    # Count by wing and room — paginate to avoid SQLite "too many SQL
    # variables" error on large palaces (see #802, #850).
    total = drawers_col.count()
    tally: dict = {}
    page_size = 5000
    fetched = 0
    while fetched < total:
        response = drawers_col.get(limit=page_size, offset=fetched, include=["metadatas"])
        page = response["metadatas"]
        if not page:
            # Nothing came back for this offset: stop instead of spinning.
            break
        for meta in page:
            meta = meta or {}
            wing_key = meta.get("wing", "?")
            room_key = meta.get("room", "?")
            per_wing = tally.setdefault(wing_key, {})
            per_wing[room_key] = per_wing.get(room_key, 0) + 1
        fetched += len(page)

    print(f"\n{'=' * 55}")
    print(f" MemPalace Status — {total} drawers")
    print(f"{'=' * 55}\n")
    for wing, rooms in sorted(tally.items()):
        print(f" WING: {wing}")
        by_count = sorted(rooms.items(), key=lambda pair: pair[1], reverse=True)
        for room, count in by_count:
            print(f" ROOM: {room:20} {count:5} drawers")
        print()
    print(f"{'=' * 55}\n")
|