From 6aebf458ff67793fa75cc803697f37ccbd4c9a14 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Fri, 24 Apr 2026 00:20:32 -0300 Subject: [PATCH 1/8] fix(entity): reduce noise in regex-based detection The pattern-matching detector had several systematic false positives that crowded the init review with nonsense. Concrete fixes: - CamelCase extraction: add `[A-Z][a-z]+(?:[A-Z][a-z]+|[A-Z]{2,})+` to candidate patterns so `MemPalace`, `ChromaDB`, `OpenAI`, `ChatGPT` are visible. Previously `MemPalace` fragmented into `Mem` + `Palace`. - Dialogue `^NAME:\s` requires >=2 matches to count. A single metadata line like `Created: 2026-04-21` was scoring as dialogue and classifying `Created` as a person. - Versioned/hyphenated pattern tightened to `\b{name}[-_]v?\d+(?:\.\d+)*\b` (version-only). The previous `\b{name}[-v]\w+` matched `context-manager`, `multi-word`, etc. - every hyphenated compound. - Skip LICENSE/COPYING/NOTICE/AUTHORS/PATENTS files during scan. They produce pure-English-prose noise (`Contributor`, `Software`, `Covered`, `Before`). - Extra SKIP_DIRS: `.terraform`, `vendor`, `target`. - Expand stopword list with capitalized participles/descriptors that commonly appear at sentence start: `created`, `updated`, `extracted`, `processed`, `total`, `summary`, `auto`, `multi`, `hybrid`, `context`, `bridge`, `batch`, `local`, `native`, `never`, `before`, `after`, etc. - classify_entity: high-pronoun single-category signal now classifies as person. A diary's main character gets referenced with pronouns, not dialogue markers - requiring two signal categories demoted `Lu` (16 pronoun hits across 30 mentions) to uncertain. Gate on `pronoun_hits >= 5 AND pronoun_hits / frequency >= 0.2` so common sentence-start words (`Never`, `Before`) with incidental proximity stay uncertain. --- mempalace/entity_detector.py | 57 +++++++++++++++++++++++++++++------ mempalace/i18n/en.json | 14 +++++++-- tests/test_entity_detector.py | 27 +++++++++++++++++ 3 files changed, 86 insertions(+), 12 deletions(-) diff --git a/mempalace/entity_detector.py b/mempalace/entity_detector.py index 754c65d..2f2aae4 100644 --- a/mempalace/entity_detector.py +++ b/mempalace/entity_detector.py @@ -113,6 +113,23 @@ SKIP_DIRS = { ".next", "coverage", ".mempalace", + ".terraform", + "vendor", + "target", +} + +# Files whose content is boilerplate prose — poisons entity detection. +# Matched by stem (case-insensitive), with or without an extension. +SKIP_FILENAMES = { + "license", + "licence", + "copying", + "copyright", + "notice", + "authors", + "patents", + "third_party_notices", + "third-party-notices", } @@ -193,7 +210,7 @@ def _build_patterns(name: str, languages: tuple = ("en",)) -> dict: "person_verbs": _compile_each(sources["person_verb_patterns"]), "project_verbs": _compile_each(sources["project_verb_patterns"]), "direct": direct_compiled, - "versioned": re.compile(rf"\b{n}[-v]\w+", re.IGNORECASE), + "versioned": re.compile(rf"\b{n}[-_]v?\d+(?:\.\d+)*\b", re.IGNORECASE), "code_ref": re.compile(rf"\b{n}\.(py|js|ts|yaml|yml|json|sh)\b", re.IGNORECASE), } @@ -227,12 +244,19 @@ def score_entity(name: str, text: str, lines: list, languages=("en",)) -> dict: # --- Person signals --- - # Dialogue markers (strong signal) + # Dialogue markers (strong signal). + # The bare `^NAME:\s` colon-prefix pattern matches metadata lines like + # `Created: 2026-04-21`, so we require >= 2 hits for it to count as dialogue + # (real speaker markers repeat; single-line metadata doesn't). for rx in patterns["dialogue"]: matches = len(rx.findall(text)) - if matches > 0: - person_score += matches * 3 - person_signals.append(f"dialogue marker ({matches}x)") + if matches == 0: + continue + is_bare_colon = rx.pattern.endswith(r":\s") and not rx.pattern.endswith(r"[:\s]") + if is_bare_colon and matches < 2: + continue + person_score += matches * 3 + person_signals.append(f"dialogue marker ({matches}x)") # Person verbs for rx in patterns["person_verbs"]: @@ -328,17 +352,28 @@ def classify_entity(name: str, frequency: int, scores: dict) -> dict: signal_categories.add("addressed") has_two_signal_types = len(signal_categories) >= 2 - _ = signal_categories - {"pronoun"} # reserved for future thresholds + # Single-category pronoun signal still classifies as person when the + # evidence is overwhelming — a diary's main character is referenced + # with pronouns, not dialogue markers. Require both: many pronoun hits + # AND a high pronoun-to-frequency ratio so common sentence-start words + # (Never, Before, etc.) with incidental pronoun proximity don't qualify. + pronoun_hits = 0 + for s in scores["person_signals"]: + m = re.search(r"pronoun nearby \((\d+)x\)", s) + if m: + pronoun_hits = int(m.group(1)) + break + strong_pronoun_signal = pronoun_hits >= 5 and frequency > 0 and pronoun_hits / frequency >= 0.2 - if person_ratio >= 0.7 and has_two_signal_types and ps >= 5: + if person_ratio >= 0.7 and (has_two_signal_types and ps >= 5 or strong_pronoun_signal): entity_type = "person" confidence = min(0.99, 0.5 + person_ratio * 0.5) signals = scores["person_signals"] or [f"appears {frequency}x"] - elif person_ratio >= 0.7 and (not has_two_signal_types or ps < 5): - # Pronoun-only match — downgrade to uncertain + elif person_ratio >= 0.7: + # Weak single-category person signal — downgrade to uncertain entity_type = "uncertain" confidence = 0.4 - signals = scores["person_signals"] + [f"appears {frequency}x — pronoun-only match"] + signals = scores["person_signals"] + [f"appears {frequency}x — weak person signal"] elif person_ratio <= 0.3: entity_type = "project" confidence = min(0.99, 0.5 + (1 - person_ratio) * 0.5) @@ -560,6 +595,8 @@ def scan_for_detection(project_dir: str, max_files: int = 10) -> list: dirs[:] = [d for d in dirs if d not in SKIP_DIRS] for filename in filenames: filepath = Path(root) / filename + if filepath.stem.lower() in SKIP_FILENAMES: + continue ext = filepath.suffix.lower() if ext in PROSE_EXTENSIONS: prose_files.append(filepath) diff --git a/mempalace/i18n/en.json b/mempalace/i18n/en.json index 6a9dff9..39d9ac1 100644 --- a/mempalace/i18n/en.json +++ b/mempalace/i18n/en.json @@ -42,7 +42,7 @@ "action_pattern": "(?:built|fixed|wrote|added|pushed|measured|tested|reviewed|created|deleted|updated|configured|deployed|migrated)\\s+[\\w\\s]{3,30}" }, "entity": { - "candidate_pattern": "[A-Z][a-z]{1,19}", + "candidate_pattern": "[A-Z][a-z]+(?:[A-Z][a-z]+|[A-Z]{2,})+|[A-Z][a-z]{1,19}", "multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+", "person_verb_patterns": [ "\\b{name}\\s+said\\b", @@ -140,7 +140,17 @@ "agents", "tools", "others", "guards", "ethics", "regulation", "learning", "thinking", "memory", "language", "intelligence", "technology", "society", "culture", "future", "history", "science", - "model", "models", "network", "networks", "training", "inference" + "model", "models", "network", "networks", "training", "inference", + "created", "updated", "deleted", "added", "removed", "modified", + "extracted", "processed", "generated", "compiled", "launched", "installed", + "deployed", "executed", "loaded", "parsed", "validated", "configured", + "total", "summary", "covered", "included", "pending", "failed", "success", + "ready", "active", "disabled", "enabled", "available", "completed", + "auto", "multi", "mini", "micro", "meta", "super", "hybrid", + "context", "bridge", "batch", "local", "global", "native", "cloud", + "before", "after", "during", "often", "always", "never", + "project", "contributor", "software", + "backend", "frontend", "server", "client", "service", "app", "api" ] } } diff --git a/tests/test_entity_detector.py b/tests/test_entity_detector.py index f006270..afad4d7 100644 --- a/tests/test_entity_detector.py +++ b/tests/test_entity_detector.py @@ -148,6 +148,33 @@ def test_classify_entity_pronoun_only_is_uncertain(): assert result["type"] == "uncertain" +def test_classify_entity_high_pronoun_signal_is_person(): + """A diary's main character hit by many pronouns should still classify + as a person even with only the pronoun signal category. Example from + real data: `Lu` has 16 pronoun hits out of 30 mentions.""" + scores = { + "person_score": 32, + "project_score": 0, + "person_signals": ["pronoun nearby (16x)"], + "project_signals": [], + } + result = classify_entity("Lu", 30, scores) + assert result["type"] == "person" + + +def test_classify_entity_low_pronoun_proximity_is_uncertain(): + """Common sentence-start words (Never, Before) get a few pronouns nearby + incidentally. The ratio stays low (<20%), so they stay uncertain.""" + scores = { + "person_score": 4, + "project_score": 0, + "person_signals": ["pronoun nearby (2x)"], + "project_signals": [], + } + result = classify_entity("Never", 21, scores) + assert result["type"] == "uncertain" + + def test_classify_entity_mixed_signals(): scores = { "person_score": 5, From 9e7fa1ceb59ba4eee306f05a8d1ac50a2eaa0b49 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Fri, 24 Apr 2026 00:20:53 -0300 Subject: [PATCH 2/8] feat(init): scan manifests and git authors for real entity signal `mempalace init` previously leaned entirely on regex-based entity extraction from prose. That path works for text-only folders but wastes signal in any codebase: the project's own name is already in `package.json` / `pyproject.toml` / `Cargo.toml` / `go.mod`, and the people who worked on it are in `git log`. This adds `project_scanner.py`, which becomes the primary signal source when real signal is available, with the regex detector preserved as the fallback for prose-only folders (diaries, research notes, writing). What it does: - Walks the target directory, parses manifests for canonical project names, and detects git repos by the presence of a `.git` directory. - For each repo, reads `git log` for authors and filters obvious bots (`[bot]`, `dependabot`, `renovate`, `github-actions`, names ending in `bot`, `-autoroll`). Importantly does NOT filter `@users.noreply.github.com` - that's GitHub's privacy-protected human email, used by real contributors. - Resolves author aliases with a union-find: commits that share a name OR an email collapse into one person. Picks the most-frequent real-name variant as display, ignoring handles and single-token usernames. - Flags "mine" projects: user is top-5 committer OR has >=10% of commits OR >=20 commits. Ordered by user_commits in the UX. - `discover_entities()` merges scanner results with the regex detector case-insensitively (so `mempalace` from pyproject absorbs `MemPalace` from docs), and suppresses the regex `uncertain` bucket when real signal is already found - the user doesn't need to adjudicate prose noise when the answer is already in git. Integration: `cmd_init` now calls `discover_entities` instead of running the regex detector directly. Same output shape, so `confirm_entities` works unchanged. Ships with 39 new tests covering manifest parsing, bot filtering, union-find dedup, git repo discovery, scan integration, and merge/fallback behavior. Existing 56 regex-detector tests all pass. --- mempalace/cli.py | 33 +- mempalace/project_scanner.py | 623 ++++++++++++++++++++++++++++++++++ tests/test_project_scanner.py | 411 ++++++++++++++++++++++ uv.lock | 2 +- 4 files changed, 1051 insertions(+), 18 deletions(-) create mode 100644 mempalace/project_scanner.py create mode 100644 tests/test_project_scanner.py diff --git a/mempalace/cli.py b/mempalace/cli.py index d0da6e7..de40090 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -71,7 +71,8 @@ def _ensure_mempalace_files_gitignored(project_dir) -> bool: def cmd_init(args): import json from pathlib import Path - from .entity_detector import scan_for_detection, detect_entities, confirm_entities + from .entity_detector import confirm_entities + from .project_scanner import discover_entities from .room_detector_local import detect_rooms_local cfg = MempalaceConfig() @@ -85,25 +86,23 @@ def cmd_init(args): languages = cfg.entity_languages languages_tuple = tuple(languages) - # Pass 1: auto-detect people and projects from file content + # Pass 1: discover entities — manifests + git authors first, prose detection + # as supplement for names mentioned only in docs/notes. print(f"\n Scanning for entities in: {args.dir}") if languages_tuple != ("en",): print(f" Languages: {', '.join(languages_tuple)}") - files = scan_for_detection(args.dir) - if files: - print(f" Reading {len(files)} files...") - detected = detect_entities(files, languages=languages_tuple) - total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"]) - if total > 0: - confirmed = confirm_entities(detected, yes=getattr(args, "yes", False)) - # Save confirmed entities to /entities.json for the miner - if confirmed["people"] or confirmed["projects"]: - entities_path = Path(args.dir).expanduser().resolve() / "entities.json" - with open(entities_path, "w") as f: - json.dump(confirmed, f, indent=2) - print(f" Entities saved: {entities_path}") - else: - print(" No entities detected — proceeding with directory-based rooms.") + detected = discover_entities(args.dir, languages=languages_tuple) + total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"]) + if total > 0: + confirmed = confirm_entities(detected, yes=getattr(args, "yes", False)) + # Save confirmed entities to /entities.json for the miner + if confirmed["people"] or confirmed["projects"]: + entities_path = Path(args.dir).expanduser().resolve() / "entities.json" + with open(entities_path, "w") as f: + json.dump(confirmed, f, indent=2) + print(f" Entities saved: {entities_path}") + else: + print(" No entities detected — proceeding with directory-based rooms.") # Pass 2: detect rooms from folder structure detect_rooms_local(project_dir=args.dir, yes=getattr(args, "yes", False)) diff --git a/mempalace/project_scanner.py b/mempalace/project_scanner.py new file mode 100644 index 0000000..e078b6e --- /dev/null +++ b/mempalace/project_scanner.py @@ -0,0 +1,623 @@ +""" +project_scanner.py — Detect projects and people from real signal. + +For a codebase with build manifests or git history, this beats regex-based +entity detection by a wide margin: the project's own name is already written +down in package.json / pyproject.toml / Cargo.toml / go.mod, and the people +who worked on it are in `git log`. + +This module is used as the primary signal in `mempalace init`. The regex +detector in entity_detector.py stays as a fallback for prose-only folders +(notes, research, writing). + +Public: + scan(root) -> (projects, people) + to_detected_dict(projects, people) -> {people: [...], projects: [...], uncertain: []} +""" + +from __future__ import annotations + +import json +import os +import re +import subprocess +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + +try: + import tomllib # Python 3.11+ +except ImportError: # pragma: no cover + tomllib = None # type: ignore + + +SKIP_DIRS = { + ".git", + "node_modules", + "__pycache__", + ".venv", + "venv", + "env", + "dist", + "build", + ".next", + "coverage", + ".terraform", + "vendor", + "target", + ".mempalace", + ".cache", + ".pytest_cache", + ".mypy_cache", + ".ruff_cache", +} + +MAX_DEPTH = 6 +MAX_COMMITS_PER_REPO = 1000 +GIT_TIMEOUT = 10 + + +# ==================== DATACLASSES ==================== + + +@dataclass +class ProjectInfo: + name: str + repo_root: Path + manifest: Optional[str] = None + has_git: bool = False + total_commits: int = 0 + user_commits: int = 0 + is_mine: bool = False + + @property + def confidence(self) -> float: + if self.is_mine: + return 0.99 + if self.has_git and self.total_commits > 0: + return 0.7 + return 0.85 # manifest-only, no git + + def to_signal(self) -> str: + parts: list[str] = [] + if self.manifest: + parts.append(self.manifest) + if self.has_git: + if self.is_mine and self.user_commits: + parts.append(f"{self.user_commits} of your commits") + elif self.user_commits: + parts.append(f"{self.user_commits}/{self.total_commits} yours") + else: + parts.append(f"{self.total_commits} commits (none by you)") + return ", ".join(parts) or "repo" + + +@dataclass +class PersonInfo: + name: str + total_commits: int = 0 + emails: set[str] = field(default_factory=set) + repos: set[str] = field(default_factory=set) + + @property + def confidence(self) -> float: + if self.total_commits >= 100 or len(self.repos) >= 3: + return 0.99 + if self.total_commits >= 20: + return 0.85 + return 0.65 + + def to_signal(self) -> str: + r = len(self.repos) + return f"{self.total_commits} commit{'s' if self.total_commits != 1 else ''} across {r} repo{'s' if r != 1 else ''}" + + +# ==================== MANIFEST PARSING ==================== + + +def _parse_package_json(path: Path) -> Optional[str]: + try: + data = json.loads(path.read_text(encoding="utf-8", errors="replace")) + except (json.JSONDecodeError, OSError): + return None + name = data.get("name") + return name if isinstance(name, str) and name else None + + +def _parse_toml(path: Path) -> dict: + if tomllib is None: + return {} + try: + with open(path, "rb") as f: + return tomllib.load(f) + except (OSError, Exception): + return {} + + +def _parse_pyproject(path: Path) -> Optional[str]: + data = _parse_toml(path) + name = data.get("project", {}).get("name") + if isinstance(name, str) and name: + return name + name = data.get("tool", {}).get("poetry", {}).get("name") + return name if isinstance(name, str) and name else None + + +def _parse_cargo(path: Path) -> Optional[str]: + data = _parse_toml(path) + name = data.get("package", {}).get("name") + return name if isinstance(name, str) and name else None + + +def _parse_gomod(path: Path) -> Optional[str]: + try: + for line in path.read_text(encoding="utf-8", errors="replace").splitlines(): + line = line.strip() + if line.startswith("module "): + mod = line.split(None, 1)[1].strip() + return mod.split("/")[-1] or None + except OSError: + return None + return None + + +MANIFEST_PARSERS = { + "package.json": _parse_package_json, + "pyproject.toml": _parse_pyproject, + "Cargo.toml": _parse_cargo, + "go.mod": _parse_gomod, +} + + +# ==================== GIT HELPERS ==================== + + +def _run_git(cwd: Path, *args: str, timeout: int = GIT_TIMEOUT) -> str: + try: + r = subprocess.run( + ["git", "-C", str(cwd), *args], + capture_output=True, + text=True, + timeout=timeout, + check=False, + ) + return r.stdout if r.returncode == 0 else "" + except (OSError, subprocess.SubprocessError): + return "" + + +def _git_user_identity(repo: Path) -> tuple[str, str]: + """Return (name, email) for this repo, falling back to global config.""" + name = _run_git(repo, "config", "user.name", timeout=2).strip() + email = _run_git(repo, "config", "user.email", timeout=2).strip() + return name, email + + +def _global_git_identity() -> tuple[str, str]: + try: + n = subprocess.run( + ["git", "config", "--global", "user.name"], + capture_output=True, + text=True, + timeout=2, + check=False, + ).stdout.strip() + e = subprocess.run( + ["git", "config", "--global", "user.email"], + capture_output=True, + text=True, + timeout=2, + check=False, + ).stdout.strip() + return n, e + except (OSError, subprocess.SubprocessError): + return "", "" + + +def _git_authors(repo: Path) -> list[tuple[str, str]]: + out = _run_git( + repo, + "log", + f"--max-count={MAX_COMMITS_PER_REPO}", + "--format=%aN|%aE", + ) + result = [] + for line in out.splitlines(): + if "|" in line: + name, email = line.split("|", 1) + result.append((name.strip(), email.strip())) + return result + + +# ==================== BOT / NAME FILTERING ==================== + + +_BOT_NAME_PATTERNS = [ + r"\[bot\]", + r"^dependabot", + r"^renovate", + r"^github-actions", + r"^actions-user", + r"-bot$", + r"\bbot$", # catches "PR Bot", "Release Bot", etc. Not "robot" (no \b) + r"^bot-", + r"^snyk", + r"^greenkeeper", + r"^semantic-release", + r"^allcontributors", + r"-autoroll$", + r"^auto-format", + r"^pre-commit-ci", +] +_BOT_EMAIL_PATTERNS = [ + # `@users.noreply.github.com` is GitHub's privacy-protected human email — + # do NOT filter it. Real bots identify themselves via the display name + # (usually containing "[bot]"), which is caught by _BOT_NAME_PATTERNS. + r"bot@", + r"-bot@", + r"\[bot\]@", +] + +_BOT_RE_NAMES = [re.compile(p) for p in _BOT_NAME_PATTERNS] +_BOT_RE_EMAILS = [re.compile(p) for p in _BOT_EMAIL_PATTERNS] + + +def _is_bot(name: str, email: str) -> bool: + ln, le = name.lower(), email.lower() + return any(rx.search(ln) for rx in _BOT_RE_NAMES) or any(rx.search(le) for rx in _BOT_RE_EMAILS) + + +def _looks_like_real_name(name: str) -> bool: + """Heuristic: a human's name has a space and at least two title-cased parts. + + Filters out handles (lowercase, digits, one-token usernames). + """ + if not name or " " not in name: + return False + parts = name.split() + if len(parts) < 2: + return False + # First and last parts must start with an uppercase letter + return parts[0][:1].isupper() and parts[-1][:1].isupper() + + +# ==================== DIRECTORY WALK ==================== + + +def _walk(root: Path, max_depth: int = MAX_DEPTH): + for dirpath, dirs, files in os.walk(root): + dirs[:] = [d for d in dirs if d not in SKIP_DIRS and not d.startswith(".")] + rel = Path(dirpath).relative_to(root) if dirpath != str(root) else Path(".") + try: + rel = Path(dirpath).relative_to(root) + except ValueError: + continue + depth = 0 if rel == Path(".") else len(rel.parts) + if depth > max_depth: + dirs.clear() + continue + yield Path(dirpath), dirs, files + + +def find_git_repos(root: Path, max_depth: int = MAX_DEPTH) -> list[Path]: + """Return git repo roots under `root` (including root itself if it's a repo).""" + root = root.resolve() + repos: list[Path] = [] + if (root / ".git").is_dir(): + # Root is a repo — still walk for nested repos (submodules, etc.) + repos.append(root) + for dirpath, dirs, _ in _walk(root, max_depth): + if dirpath == root: + continue + if (dirpath / ".git").is_dir(): + repos.append(dirpath) + dirs.clear() # don't descend into this repo's contents from here + return repos + + +def _collect_manifest_names(repo_root: Path) -> list[tuple[str, str, Path]]: + """Return (manifest_filename, project_name, dirpath) within a repo. + + Does not descend into nested git repos. + """ + found: list[tuple[str, str, Path]] = [] + for dirpath, dirs, files in _walk(repo_root): + if dirpath != repo_root and (dirpath / ".git").is_dir(): + dirs.clear() + continue + for fname in files: + parser = MANIFEST_PARSERS.get(fname) + if not parser: + continue + name = parser(dirpath / fname) + if name: + found.append((fname, name, dirpath)) + return found + + +# ==================== MAIN SCAN ==================== + + +class _UnionFind: + """Minimal union-find for (name, email) identity resolution.""" + + def __init__(self) -> None: + self.parent: dict = {} + + def find(self, x): + if x not in self.parent: + self.parent[x] = x + return x + root = x + while self.parent[root] != root: + root = self.parent[root] + while self.parent[x] != root: + self.parent[x], x = root, self.parent[x] + return root + + def union(self, a, b) -> None: + ra, rb = self.find(a), self.find(b) + if ra != rb: + self.parent[ra] = rb + + +def _dedupe_people( + all_commits: list[tuple[str, str, str]], +) -> dict[str, PersonInfo]: + """Group commits by identity. Two commits are the same person if they + share a name OR an email. Display name = most frequent non-bot variant. + + ``all_commits`` is a list of (name, email, repo_str) triples from every repo. + """ + uf = _UnionFind() + for name, email, _repo in all_commits: + uf.union(("name", name), ("email", email) if email else ("name", name)) + + # Aggregate by component root + component_commits: dict = {} + for name, email, repo in all_commits: + key = uf.find(("name", name)) + entry = component_commits.setdefault( + key, {"name_counts": {}, "emails": set(), "repos": set(), "total": 0} + ) + entry["name_counts"][name] = entry["name_counts"].get(name, 0) + 1 + if email: + entry["emails"].add(email) + entry["repos"].add(repo) + entry["total"] += 1 + + # Pick display name per component: the most-frequent variant that looks + # like a real name; fall back to most-frequent overall. + people: dict[str, PersonInfo] = {} + for _key, entry in component_commits.items(): + candidates = sorted(entry["name_counts"].items(), key=lambda x: -x[1]) + display = next( + (n for n, _ in candidates if _looks_like_real_name(n)), + candidates[0][0], + ) + if not _looks_like_real_name(display): + continue # Skip handles and single-token names + # If we already have this display (rare — distinct components with the + # same chosen display), merge into the existing entry. + existing = people.get(display) + if existing: + existing.total_commits += entry["total"] + existing.emails.update(entry["emails"]) + existing.repos.update(entry["repos"]) + else: + people[display] = PersonInfo( + name=display, + total_commits=entry["total"], + emails=set(entry["emails"]), + repos=set(entry["repos"]), + ) + return people + + +def scan(root: str | os.PathLike) -> tuple[list[ProjectInfo], list[PersonInfo]]: + """Scan `root` for projects and people. Returns (projects, people) sorted.""" + root_path = Path(root).expanduser().resolve() + if not root_path.is_dir(): + return [], [] + + repos = find_git_repos(root_path) + + # Identify current user from first repo's git config, fall back to global + me_name, me_email = "", "" + if repos: + me_name, me_email = _git_user_identity(repos[0]) + if not me_name and not me_email: + me_name, me_email = _global_git_identity() + + projects: dict[str, ProjectInfo] = {} + all_commits: list[tuple[str, str, str]] = [] + + for repo in repos: + manifests = _collect_manifest_names(repo) + root_level = [m for m in manifests if m[2] == repo] + if root_level: + manifest_file, proj_name, _ = root_level[0] + elif manifests: + manifest_file, proj_name, _ = manifests[0] + else: + manifest_file, proj_name = None, repo.name + + authors = _git_authors(repo) + total_commits = len(authors) + user_commits = 0 + author_counts: dict[str, int] = {} + for name, email in authors: + if _is_bot(name, email): + continue + author_counts[name] = author_counts.get(name, 0) + 1 + all_commits.append((name, email, str(repo))) + if (me_name and name == me_name) or (me_email and email == me_email): + user_commits += 1 + + is_mine = False + if user_commits > 0: + sorted_authors = sorted(author_counts.items(), key=lambda x: -x[1]) + top5 = {n for n, _ in sorted_authors[:5]} + if me_name and me_name in top5: + is_mine = True + elif total_commits and user_commits / total_commits >= 0.10: + is_mine = True + elif user_commits >= 20: + is_mine = True + + proj = ProjectInfo( + name=proj_name, + repo_root=repo, + manifest=manifest_file, + has_git=True, + total_commits=total_commits, + user_commits=user_commits, + is_mine=is_mine, + ) + existing = projects.get(proj_name) + if existing is None or proj.user_commits > existing.user_commits: + projects[proj_name] = proj + + people = _dedupe_people(all_commits) + + # Handle case: root has manifests but no git repo anywhere + if not repos: + manifests = _collect_manifest_names(root_path) + for manifest_file, proj_name, _dirpath in manifests: + if proj_name in projects: + continue + projects[proj_name] = ProjectInfo( + name=proj_name, + repo_root=root_path, + manifest=manifest_file, + has_git=False, + ) + + project_list = sorted( + projects.values(), + key=lambda p: (not p.is_mine, -p.user_commits, -p.total_commits, p.name), + ) + people_list = sorted(people.values(), key=lambda p: -p.total_commits) + + return project_list, people_list + + +# ==================== ADAPTER ==================== + + +def to_detected_dict( + projects: list[ProjectInfo], + people: list[PersonInfo], + project_cap: int = 15, + people_cap: int = 15, +) -> dict: + """Convert scan results into the dict shape produced by entity_detector.detect_entities.""" + proj_entries = [ + { + "name": p.name, + "type": "project", + "confidence": round(p.confidence, 2), + "frequency": p.user_commits or p.total_commits, + "signals": [p.to_signal()], + } + for p in projects[:project_cap] + ] + people_entries = [ + { + "name": p.name, + "type": "person", + "confidence": round(p.confidence, 2), + "frequency": p.total_commits, + "signals": [p.to_signal()], + } + for p in people[:people_cap] + ] + return { + "people": people_entries, + "projects": proj_entries, + "uncertain": [], + } + + +# ==================== MERGE WITH REGEX DETECTOR ==================== + + +def _merge_detected(primary: dict, secondary: dict, drop_secondary_uncertain: bool = False) -> dict: + """Merge two detected dicts. Primary entries win on name conflict. + + Dedup is case-insensitive so "mempalace" (manifest name) absorbs "MemPalace" + (docs/prose reference) instead of surfacing both. + + If ``drop_secondary_uncertain`` is True, the secondary's uncertain bucket is + dropped entirely — useful when the primary signal is strong (real repo + found) and we'd rather not ask the user to adjudicate prose-regex noise. + """ + seen = {e["name"].lower() for cat in primary.values() for e in cat} + merged = {k: list(v) for k, v in primary.items()} + for cat_key in ("people", "projects", "uncertain"): + if cat_key == "uncertain" and drop_secondary_uncertain: + continue + for e in secondary.get(cat_key, []): + if e["name"].lower() in seen: + continue + merged.setdefault(cat_key, []).append(e) + seen.add(e["name"].lower()) + return merged + + +def discover_entities( + project_dir: str | os.PathLike, + languages: tuple = ("en",), + prose_file_cap: int = 10, + project_cap: int = 15, + people_cap: int = 15, +) -> dict: + """Top-level entity discovery: real signals first, prose detection second. + + Returns the same dict shape as ``entity_detector.detect_entities`` so it + plugs into ``confirm_entities`` unchanged. + + Order of signal preference: + 1. Package manifests (package.json, pyproject.toml, Cargo.toml, go.mod) + → canonical project names + 2. Git commit authors → real people with real commit counts + 3. Regex entity detection on prose files → supplementary names only + mentioned in docs/notes (not code) + """ + projects, people = scan(project_dir) + real_signal = to_detected_dict(projects, people, project_cap=project_cap, people_cap=people_cap) + + # Secondary pass: prose-only extraction catches names mentioned in docs + # that never made a commit (e.g. a stakeholder or family member in notes). + from mempalace.entity_detector import detect_entities, scan_for_detection + + prose_files = scan_for_detection(str(project_dir), max_files=prose_file_cap) + prose_detected = ( + detect_entities(prose_files, languages=languages) + if prose_files + else {"people": [], "projects": [], "uncertain": []} + ) + + # If git/manifests gave us real projects, suppress the regex "uncertain" bucket. + # That bucket is mostly noise (common words, CamelCase tech terms, etc.) and + # adding it to the review flow just makes the user do triage we can skip. + has_real_signal = bool(projects) or bool(people) + return _merge_detected(real_signal, prose_detected, drop_secondary_uncertain=has_real_signal) + + +# ==================== CLI ==================== + + +if __name__ == "__main__": + import sys + + target = sys.argv[1] if len(sys.argv) > 1 else "." + projs, ppl = scan(target) + print(f"=== PROJECTS ({len(projs)}) ===") + for p in projs[:30]: + mark = "★" if p.is_mine else " " + print(f" {mark} {p.name:35} conf={p.confidence:.2f} {p.to_signal()}") + print() + print(f"=== PEOPLE ({len(ppl)}) ===") + for p in ppl[:30]: + print(f" {p.name:30} conf={p.confidence:.2f} {p.to_signal()}") diff --git a/tests/test_project_scanner.py b/tests/test_project_scanner.py new file mode 100644 index 0000000..3499796 --- /dev/null +++ b/tests/test_project_scanner.py @@ -0,0 +1,411 @@ +"""Tests for mempalace.project_scanner.""" + +import json +import subprocess +from pathlib import Path + +from mempalace.project_scanner import ( + PersonInfo, + ProjectInfo, + _dedupe_people, + _is_bot, + _looks_like_real_name, + _merge_detected, + _parse_cargo, + _parse_gomod, + _parse_package_json, + _parse_pyproject, + _UnionFind, + discover_entities, + find_git_repos, + scan, + to_detected_dict, +) + + +# ── manifest parsers ──────────────────────────────────────────────────── + + +def test_parse_package_json(tmp_path): + f = tmp_path / "package.json" + f.write_text(json.dumps({"name": "my-package", "version": "1.0.0"})) + assert _parse_package_json(f) == "my-package" + + +def test_parse_package_json_missing_name(tmp_path): + f = tmp_path / "package.json" + f.write_text(json.dumps({"version": "1.0.0"})) + assert _parse_package_json(f) is None + + +def test_parse_package_json_malformed(tmp_path): + f = tmp_path / "package.json" + f.write_text("{ not valid json") + assert _parse_package_json(f) is None + + +def test_parse_pyproject_pep621(tmp_path): + f = tmp_path / "pyproject.toml" + f.write_text('[project]\nname = "my-py-package"\n') + assert _parse_pyproject(f) == "my-py-package" + + +def test_parse_pyproject_poetry(tmp_path): + f = tmp_path / "pyproject.toml" + f.write_text('[tool.poetry]\nname = "poetry-pkg"\n') + assert _parse_pyproject(f) == "poetry-pkg" + + +def test_parse_cargo(tmp_path): + f = tmp_path / "Cargo.toml" + f.write_text('[package]\nname = "rust-crate"\nversion = "0.1.0"\n') + assert _parse_cargo(f) == "rust-crate" + + +def test_parse_gomod(tmp_path): + f = tmp_path / "go.mod" + f.write_text("module github.com/user/my-go-mod\n\ngo 1.21\n") + assert _parse_gomod(f) == "my-go-mod" + + +# ── bot filtering ─────────────────────────────────────────────────────── + + +def test_is_bot_catches_github_actions(): + assert _is_bot("github-actions[bot]", "41898282+github-actions[bot]@users.noreply.github.com") + + +def test_is_bot_catches_dependabot(): + assert _is_bot("dependabot[bot]", "dependabot@github.com") + + +def test_is_bot_catches_pr_bot(): + assert _is_bot("Comfy Org PR Bot", "prbot@example.com") + + +def test_is_bot_does_not_flag_github_privacy_email(): + # Real humans use ...@users.noreply.github.com when privacy is enabled. + # Must NOT be filtered. + assert not _is_bot("Igor Lins e Silva", "123456+igorls@users.noreply.github.com") + + +def test_is_bot_does_not_flag_robot_person_name(): + # "Robot" as a surname should not trigger the \bbot$ pattern + # since \b requires a boundary before 'bot'. + assert not _is_bot("Sarah Robot", "sarah@example.com") + + +def test_looks_like_real_name_accepts_human(): + assert _looks_like_real_name("Igor Lins e Silva") + assert _looks_like_real_name("Jane Doe") + + +def test_looks_like_real_name_rejects_handles(): + assert not _looks_like_real_name("666ghj") + assert not _looks_like_real_name("comfyanonymous") + assert not _looks_like_real_name("bensig") + assert not _looks_like_real_name("") + assert not _looks_like_real_name("no_spaces_handle") + + +# ── union-find dedup ──────────────────────────────────────────────────── + + +def test_unionfind_merges_shared_email(): + commits = [ + ("Milla J", "shared@example.com", "repo1"), + ("MSL", "shared@example.com", "repo1"), + ("Milla J", "other@example.com", "repo1"), + ] + people = _dedupe_people(commits) + # All three commits collapse into one "Milla J" person (MSL is filtered + # as display name because it lacks a space but its commits still count). + assert "Milla J" in people + assert people["Milla J"].total_commits == 3 + assert "MSL" not in people + + +def test_unionfind_keeps_distinct_people_separate(): + commits = [ + ("Alice Example", "alice@example.com", "r"), + ("Bob Sample", "bob@sample.org", "r"), + ] + people = _dedupe_people(commits) + assert "Alice Example" in people + assert "Bob Sample" in people + + +def test_unionfind_merges_shared_name(): + """Same display name, two different emails, same person.""" + commits = [ + ("Jane Doe", "jane@work.com", "r"), + ("Jane Doe", "jane@personal.com", "r"), + ] + people = _dedupe_people(commits) + assert people["Jane Doe"].total_commits == 2 + assert len(people["Jane Doe"].emails) == 2 + + +# ── project_info / person_info ───────────────────────────────────────── + + +def test_project_info_confidence_is_mine(): + p = ProjectInfo(name="x", repo_root=Path("."), is_mine=True) + assert p.confidence == 0.99 + + +def test_project_info_confidence_no_git(): + p = ProjectInfo(name="x", repo_root=Path("."), has_git=False, manifest="package.json") + assert p.confidence > 0.8 + + +def test_person_info_signal_pluralization(): + p = PersonInfo(name="x", total_commits=1, repos={"a"}) + assert "1 commit across 1 repo" == p.to_signal() + p2 = PersonInfo(name="y", total_commits=5, repos={"a", "b"}) + assert "5 commits across 2 repos" == p2.to_signal() + + +# ── find_git_repos ────────────────────────────────────────────────────── + + +def test_find_git_repos_detects_root_repo(tmp_path): + (tmp_path / ".git").mkdir() + repos = find_git_repos(tmp_path) + assert tmp_path in repos + + +def test_find_git_repos_detects_nested(tmp_path): + sub = tmp_path / "subproject" + sub.mkdir() + (sub / ".git").mkdir() + repos = find_git_repos(tmp_path) + assert sub in repos + + +def test_find_git_repos_skips_nested_inside_repo(tmp_path): + """If root is a repo and there's another repo inside it, the inner repo is + NOT walked into (we stop at the first repo boundary when descending).""" + (tmp_path / ".git").mkdir() + deep = tmp_path / "a" / "b" / "nested-repo" + deep.mkdir(parents=True) + (deep / ".git").mkdir() + repos = find_git_repos(tmp_path) + # Root IS found; nested still discovered on its own branch (not inside root's .git) + assert tmp_path in repos + + +def test_find_git_repos_empty_dir(tmp_path): + assert find_git_repos(tmp_path) == [] + + +# ── scan ──────────────────────────────────────────────────────────────── + + +def _init_git_repo(path: Path, name: str = "Jane Doe", email: str = "jane@example.com"): + """Helper: init a git repo with one commit.""" + subprocess.run(["git", "init", "-q"], cwd=path, check=True) + subprocess.run(["git", "config", "user.name", name], cwd=path, check=True) + subprocess.run(["git", "config", "user.email", email], cwd=path, check=True) + subprocess.run(["git", "config", "commit.gpgsign", "false"], cwd=path, check=True) + (path / "README.md").write_text("hello") + subprocess.run(["git", "add", "README.md"], cwd=path, check=True) + subprocess.run( + ["git", "commit", "-q", "-m", "initial"], + cwd=path, + check=True, + env={"GIT_COMMITTER_NAME": name, "GIT_COMMITTER_EMAIL": email, "PATH": "/usr/bin:/bin"}, + ) + + +def test_scan_project_from_package_json(tmp_path): + (tmp_path / "package.json").write_text(json.dumps({"name": "my-app"})) + _init_git_repo(tmp_path) + projects, people = scan(tmp_path) + assert len(projects) == 1 + assert projects[0].name == "my-app" + assert projects[0].is_mine is True + + +def test_scan_project_from_pyproject(tmp_path): + (tmp_path / "pyproject.toml").write_text('[project]\nname = "pyproj"\n') + _init_git_repo(tmp_path) + projects, _ = scan(tmp_path) + assert any(p.name == "pyproj" for p in projects) + + +def test_scan_fallback_to_dir_name_when_no_manifest(tmp_path): + repo = tmp_path / "my-repo-name" + repo.mkdir() + _init_git_repo(repo) + projects, _ = scan(tmp_path) + assert any(p.name == "my-repo-name" for p in projects) + + +def test_scan_manifest_only_no_git(tmp_path): + """A dir with a manifest but no git still produces a project.""" + (tmp_path / "package.json").write_text(json.dumps({"name": "manifest-only"})) + projects, people = scan(tmp_path) + assert len(projects) == 1 + assert projects[0].name == "manifest-only" + assert projects[0].has_git is False + assert people == [] + + +def test_scan_empty_dir(tmp_path): + projects, people = scan(tmp_path) + assert projects == [] + assert people == [] + + +def test_scan_returns_empty_for_nonexistent(tmp_path): + missing = tmp_path / "does-not-exist" + projects, people = scan(missing) + assert projects == [] + assert people == [] + + +# ── to_detected_dict ──────────────────────────────────────────────────── + + +def test_to_detected_dict_shape(): + projects = [ProjectInfo(name="p", repo_root=Path("."), is_mine=True, manifest="package.json")] + people = [PersonInfo(name="Jane Doe", total_commits=5, repos={"r"})] + d = to_detected_dict(projects, people) + assert set(d.keys()) == {"people", "projects", "uncertain"} + assert d["projects"][0]["name"] == "p" + assert d["projects"][0]["type"] == "project" + assert d["people"][0]["name"] == "Jane Doe" + assert d["people"][0]["type"] == "person" + assert d["uncertain"] == [] + + +# ── merge ─────────────────────────────────────────────────────────────── + + +def test_merge_primary_wins_case_insensitive(): + primary = { + "people": [], + "projects": [ + { + "name": "mempalace", + "type": "project", + "confidence": 0.99, + "frequency": 10, + "signals": ["pyproject.toml"], + } + ], + "uncertain": [], + } + secondary = { + "people": [], + "projects": [], + "uncertain": [ + { + "name": "MemPalace", + "type": "uncertain", + "confidence": 0.4, + "frequency": 6, + "signals": ["regex"], + } + ], + } + merged = _merge_detected(primary, secondary) + # `MemPalace` (uncertain) is deduped against `mempalace` (project) case-insensitively + assert len(merged["projects"]) == 1 + assert len(merged["uncertain"]) == 0 + + +def test_merge_drops_secondary_uncertain_when_requested(): + primary = {"people": [], "projects": [], "uncertain": []} + secondary = { + "people": [], + "projects": [], + "uncertain": [ + {"name": "Foo", "type": "uncertain", "confidence": 0.4, "frequency": 3, "signals": []} + ], + } + merged = _merge_detected(primary, secondary, drop_secondary_uncertain=True) + assert merged["uncertain"] == [] + + +def test_merge_keeps_distinct_names(): + primary = { + "people": [ + { + "name": "Alice Smith", + "type": "person", + "confidence": 0.9, + "frequency": 10, + "signals": [], + } + ], + "projects": [], + "uncertain": [], + } + secondary = { + "people": [ + { + "name": "Bob Jones", + "type": "person", + "confidence": 0.7, + "frequency": 3, + "signals": [], + } + ], + "projects": [], + "uncertain": [], + } + merged = _merge_detected(primary, secondary) + assert len(merged["people"]) == 2 + + +# ── discover_entities ────────────────────────────────────────────────── + + +def test_discover_entities_falls_back_to_prose_when_no_git(tmp_path): + """If no manifests or git, regex detector on prose is the only source.""" + notes = tmp_path / "notes.md" + notes.write_text( + "Riley said hello. Riley asked about it. Riley laughed. " + "Hey Riley, thanks for the help. Riley pushed the change. " + "Riley decided to go." + ) + d = discover_entities(str(tmp_path)) + # Prose-only fallback kicks in — Riley appears with person signals + all_names = [e["name"] for cat in d.values() for e in cat] + assert "Riley" in all_names + + +def test_discover_entities_prefers_real_signal_over_prose(tmp_path): + """When manifest exists, its name wins even if prose has noisy candidates.""" + (tmp_path / "package.json").write_text(json.dumps({"name": "realproj"})) + _init_git_repo(tmp_path) + (tmp_path / "doc.md").write_text( + "Something. Another. Whatever. Context. Context. Context. Context. " + "realproj. realproj. realproj. realproj." + ) + d = discover_entities(str(tmp_path)) + proj_names = [e["name"] for e in d["projects"]] + assert "realproj" in proj_names + + +# ── _UnionFind basics ────────────────────────────────────────────────── + + +def test_unionfind_find_creates_singleton(): + uf = _UnionFind() + assert uf.find("x") == "x" + + +def test_unionfind_union_merges(): + uf = _UnionFind() + uf.union("a", "b") + assert uf.find("a") == uf.find("b") + + +def test_unionfind_transitive(): + uf = _UnionFind() + uf.union("a", "b") + uf.union("b", "c") + assert uf.find("a") == uf.find("c") diff --git a/uv.lock b/uv.lock index 49c28ff..5af54f1 100644 --- a/uv.lock +++ b/uv.lock @@ -1169,7 +1169,7 @@ wheels = [ [[package]] name = "mempalace" -version = "3.3.2" +version = "3.3.3" source = { editable = "." } dependencies = [ { name = "chromadb" }, From 14d7444abe5b0247b79c29b80e6e04625b9ec15e Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Fri, 24 Apr 2026 00:27:09 -0300 Subject: [PATCH 3/8] fix(deps): add tomli fallback for Python <3.11 `tomllib` is stdlib only in Python 3.11+. On Python 3.9/3.10 (and the macOS runner) the scanner's toml parsing returned empty, so manifest lookups for `pyproject.toml` / `Cargo.toml` produced no name. CI surfaced this via 4 test_project_scanner.py failures on the 3.9 matrix. Add `tomli>=2.0.0` as a conditional dependency for `python_version < '3.11'` and fall back to it in `project_scanner.py`. The project still declares `requires-python = ">=3.9"` so the fallback is the correct shape. --- mempalace/project_scanner.py | 7 +++++-- pyproject.toml | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/mempalace/project_scanner.py b/mempalace/project_scanner.py index e078b6e..c03b883 100644 --- a/mempalace/project_scanner.py +++ b/mempalace/project_scanner.py @@ -28,7 +28,10 @@ from typing import Optional try: import tomllib # Python 3.11+ except ImportError: # pragma: no cover - tomllib = None # type: ignore + try: + import tomli as tomllib # Python 3.9/3.10 backport + except ImportError: + tomllib = None # type: ignore SKIP_DIRS = { @@ -130,7 +133,7 @@ def _parse_toml(path: Path) -> dict: try: with open(path, "rb") as f: return tomllib.load(f) - except (OSError, Exception): + except (OSError, tomllib.TOMLDecodeError): return {} diff --git a/pyproject.toml b/pyproject.toml index 8733ec3..617c067 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ classifiers = [ dependencies = [ "chromadb>=1.5.4,<2", "pyyaml>=6.0,<7", + "tomli>=2.0.0; python_version < '3.11'", ] [project.urls] From 70d4c5471e04210182b58e0dcf81e6723263bb9c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 24 Apr 2026 03:48:47 +0000 Subject: [PATCH 4/8] fix(project-scanner): address review feedback Agent-Logs-Url: https://github.com/MemPalace/mempalace/sessions/3c277c46-20b3-4a43-8eb7-8ee2eb3cb55a Co-authored-by: igorls <4753812+igorls@users.noreply.github.com> --- mempalace/project_scanner.py | 44 ++++++++++++----- tests/test_project_scanner.py | 90 ++++++++++++++++++++++++++++++----- 2 files changed, 110 insertions(+), 24 deletions(-) diff --git a/mempalace/project_scanner.py b/mempalace/project_scanner.py index c03b883..e67220b 100644 --- a/mempalace/project_scanner.py +++ b/mempalace/project_scanner.py @@ -170,6 +170,12 @@ MANIFEST_PARSERS = { "Cargo.toml": _parse_cargo, "go.mod": _parse_gomod, } +MANIFEST_PRIORITY = { + "pyproject.toml": 0, + "package.json": 1, + "Cargo.toml": 2, + "go.mod": 3, +} # ==================== GIT HELPERS ==================== @@ -290,7 +296,6 @@ def _looks_like_real_name(name: str) -> bool: def _walk(root: Path, max_depth: int = MAX_DEPTH): for dirpath, dirs, files in os.walk(root): dirs[:] = [d for d in dirs if d not in SKIP_DIRS and not d.startswith(".")] - rel = Path(dirpath).relative_to(root) if dirpath != str(root) else Path(".") try: rel = Path(dirpath).relative_to(root) except ValueError: @@ -302,17 +307,34 @@ def _walk(root: Path, max_depth: int = MAX_DEPTH): yield Path(dirpath), dirs, files +def _has_git_marker(path: Path) -> bool: + git_path = path / ".git" + return git_path.is_dir() or git_path.is_file() + + +def _manifest_sort_key(entry: tuple[str, str, Path], repo_root: Path) -> tuple[int, int, str]: + manifest_file, _project_name, manifest_dir = entry + try: + rel = manifest_dir.relative_to(repo_root) + depth = len(rel.parts) + rel_str = rel.as_posix() + except ValueError: + depth = MAX_DEPTH + 1 + rel_str = manifest_dir.as_posix() + return (depth, MANIFEST_PRIORITY.get(manifest_file, len(MANIFEST_PRIORITY)), rel_str) + + def find_git_repos(root: Path, max_depth: int = MAX_DEPTH) -> list[Path]: """Return git repo roots under `root` (including root itself if it's a repo).""" root = root.resolve() repos: list[Path] = [] - if (root / ".git").is_dir(): + if _has_git_marker(root): # Root is a repo — still walk for nested repos (submodules, etc.) repos.append(root) for dirpath, dirs, _ in _walk(root, max_depth): if dirpath == root: continue - if (dirpath / ".git").is_dir(): + if _has_git_marker(dirpath): repos.append(dirpath) dirs.clear() # don't descend into this repo's contents from here return repos @@ -325,7 +347,7 @@ def _collect_manifest_names(repo_root: Path) -> list[tuple[str, str, Path]]: """ found: list[tuple[str, str, Path]] = [] for dirpath, dirs, files in _walk(repo_root): - if dirpath != repo_root and (dirpath / ".git").is_dir(): + if dirpath != repo_root and _has_git_marker(dirpath): dirs.clear() continue for fname in files: @@ -335,7 +357,7 @@ def _collect_manifest_names(repo_root: Path) -> list[tuple[str, str, Path]]: name = parser(dirpath / fname) if name: found.append((fname, name, dirpath)) - return found + return sorted(found, key=lambda entry: _manifest_sort_key(entry, repo_root)) # ==================== MAIN SCAN ==================== @@ -437,21 +459,17 @@ def scan(root: str | os.PathLike) -> tuple[list[ProjectInfo], list[PersonInfo]]: for repo in repos: manifests = _collect_manifest_names(repo) - root_level = [m for m in manifests if m[2] == repo] - if root_level: - manifest_file, proj_name, _ = root_level[0] - elif manifests: + if manifests: manifest_file, proj_name, _ = manifests[0] else: manifest_file, proj_name = None, repo.name authors = _git_authors(repo) - total_commits = len(authors) + non_bot_authors = [(name, email) for name, email in authors if not _is_bot(name, email)] + total_commits = len(non_bot_authors) user_commits = 0 author_counts: dict[str, int] = {} - for name, email in authors: - if _is_bot(name, email): - continue + for name, email in non_bot_authors: author_counts[name] = author_counts.get(name, 0) + 1 all_commits.append((name, email, str(repo))) if (me_name and name == me_name) or (me_email and email == me_email): diff --git a/tests/test_project_scanner.py b/tests/test_project_scanner.py index 3499796..0483959 100644 --- a/tests/test_project_scanner.py +++ b/tests/test_project_scanner.py @@ -1,15 +1,20 @@ """Tests for mempalace.project_scanner.""" import json +import os +import shutil import subprocess from pathlib import Path +import pytest + from mempalace.project_scanner import ( PersonInfo, ProjectInfo, _dedupe_people, _is_bot, _looks_like_real_name, + _collect_manifest_names, _merge_detected, _parse_cargo, _parse_gomod, @@ -184,15 +189,24 @@ def test_find_git_repos_detects_nested(tmp_path): def test_find_git_repos_skips_nested_inside_repo(tmp_path): - """If root is a repo and there's another repo inside it, the inner repo is - NOT walked into (we stop at the first repo boundary when descending).""" + """If root is a repo, nested repos are still discovered as separate roots.""" (tmp_path / ".git").mkdir() deep = tmp_path / "a" / "b" / "nested-repo" deep.mkdir(parents=True) (deep / ".git").mkdir() repos = find_git_repos(tmp_path) - # Root IS found; nested still discovered on its own branch (not inside root's .git) assert tmp_path in repos + assert deep in repos + + +def test_find_git_repos_detects_git_file_markers(tmp_path): + (tmp_path / ".git").write_text("gitdir: /tmp/root.git\n") + sub = tmp_path / "subproject" + sub.mkdir() + (sub / ".git").write_text("gitdir: /tmp/sub.git\n") + repos = find_git_repos(tmp_path) + assert tmp_path in repos + assert sub in repos def test_find_git_repos_empty_dir(tmp_path): @@ -202,20 +216,35 @@ def test_find_git_repos_empty_dir(tmp_path): # ── scan ──────────────────────────────────────────────────────────────── +def _require_git() -> None: + if shutil.which("git") is None: + pytest.skip("git executable not available") + + +def _git_commit( + path: Path, filename: str, content: str, message: str, name: str, email: str +) -> None: + _require_git() + env = { + **os.environ, + "GIT_AUTHOR_NAME": name, + "GIT_AUTHOR_EMAIL": email, + "GIT_COMMITTER_NAME": name, + "GIT_COMMITTER_EMAIL": email, + } + (path / filename).write_text(content) + subprocess.run(["git", "add", filename], cwd=path, check=True, env=env) + subprocess.run(["git", "commit", "-q", "-m", message], cwd=path, check=True, env=env) + + def _init_git_repo(path: Path, name: str = "Jane Doe", email: str = "jane@example.com"): """Helper: init a git repo with one commit.""" + _require_git() subprocess.run(["git", "init", "-q"], cwd=path, check=True) subprocess.run(["git", "config", "user.name", name], cwd=path, check=True) subprocess.run(["git", "config", "user.email", email], cwd=path, check=True) subprocess.run(["git", "config", "commit.gpgsign", "false"], cwd=path, check=True) - (path / "README.md").write_text("hello") - subprocess.run(["git", "add", "README.md"], cwd=path, check=True) - subprocess.run( - ["git", "commit", "-q", "-m", "initial"], - cwd=path, - check=True, - env={"GIT_COMMITTER_NAME": name, "GIT_COMMITTER_EMAIL": email, "PATH": "/usr/bin:/bin"}, - ) + _git_commit(path, "README.md", "hello", "initial", name, email) def test_scan_project_from_package_json(tmp_path): @@ -234,6 +263,17 @@ def test_scan_project_from_pyproject(tmp_path): assert any(p.name == "pyproj" for p in projects) +def test_scan_prefers_root_manifest_with_explicit_priority(tmp_path): + (tmp_path / "package.json").write_text(json.dumps({"name": "package-name"})) + (tmp_path / "pyproject.toml").write_text('[project]\nname = "pyproject-name"\n') + nested = tmp_path / "nested" + nested.mkdir() + (nested / "package.json").write_text(json.dumps({"name": "nested-name"})) + _init_git_repo(tmp_path) + projects, _ = scan(tmp_path) + assert projects[0].name == "pyproject-name" + + def test_scan_fallback_to_dir_name_when_no_manifest(tmp_path): repo = tmp_path / "my-repo-name" repo.mkdir() @@ -252,6 +292,34 @@ def test_scan_manifest_only_no_git(tmp_path): assert people == [] +def test_collect_manifest_names_stops_at_git_file_boundary(tmp_path): + (tmp_path / ".git").write_text("gitdir: /tmp/root.git\n") + (tmp_path / "package.json").write_text(json.dumps({"name": "root-name"})) + nested = tmp_path / "nested" + nested.mkdir() + (nested / ".git").write_text("gitdir: /tmp/nested.git\n") + (nested / "package.json").write_text(json.dumps({"name": "nested-name"})) + manifests = _collect_manifest_names(tmp_path) + assert [name for _file, name, _dir in manifests] == ["root-name"] + + +def test_scan_excludes_bot_commits_from_totals(tmp_path): + (tmp_path / "package.json").write_text(json.dumps({"name": "my-app"})) + _init_git_repo(tmp_path, name="Jane Doe", email="jane@example.com") + _git_commit( + tmp_path, + "bot.txt", + "generated", + "bot update", + "github-actions[bot]", + "41898282+github-actions[bot]@users.noreply.github.com", + ) + projects, people = scan(tmp_path) + assert projects[0].total_commits == 1 + assert projects[0].user_commits == 1 + assert [person.name for person in people] == ["Jane Doe"] + + def test_scan_empty_dir(tmp_path): projects, people = scan(tmp_path) assert projects == [] From 851ebebc29eb6ac9d0ca18e0a8d41f3f1dc1327d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 24 Apr 2026 03:50:13 +0000 Subject: [PATCH 5/8] test(project-scanner): tighten git helper env handling Agent-Logs-Url: https://github.com/MemPalace/mempalace/sessions/3c277c46-20b3-4a43-8eb7-8ee2eb3cb55a Co-authored-by: igorls <4753812+igorls@users.noreply.github.com> --- mempalace/project_scanner.py | 5 +---- tests/test_project_scanner.py | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/mempalace/project_scanner.py b/mempalace/project_scanner.py index e67220b..f426e8f 100644 --- a/mempalace/project_scanner.py +++ b/mempalace/project_scanner.py @@ -296,10 +296,7 @@ def _looks_like_real_name(name: str) -> bool: def _walk(root: Path, max_depth: int = MAX_DEPTH): for dirpath, dirs, files in os.walk(root): dirs[:] = [d for d in dirs if d not in SKIP_DIRS and not d.startswith(".")] - try: - rel = Path(dirpath).relative_to(root) - except ValueError: - continue + rel = Path(dirpath).relative_to(root) depth = 0 if rel == Path(".") else len(rel.parts) if depth > max_depth: dirs.clear() diff --git a/tests/test_project_scanner.py b/tests/test_project_scanner.py index 0483959..821f527 100644 --- a/tests/test_project_scanner.py +++ b/tests/test_project_scanner.py @@ -221,17 +221,25 @@ def _require_git() -> None: pytest.skip("git executable not available") -def _git_commit( - path: Path, filename: str, content: str, message: str, name: str, email: str -) -> None: - _require_git() +def _git_test_env(name: str, email: str) -> dict[str, str]: env = { - **os.environ, "GIT_AUTHOR_NAME": name, "GIT_AUTHOR_EMAIL": email, "GIT_COMMITTER_NAME": name, "GIT_COMMITTER_EMAIL": email, } + for key in ("PATH", "HOME", "SystemRoot", "ComSpec", "TMPDIR", "TEMP", "TMP"): + value = os.environ.get(key) + if value: + env[key] = value + return env + + +def _git_commit( + path: Path, filename: str, content: str, message: str, name: str, email: str +) -> None: + _require_git() + env = _git_test_env(name, email) (path / filename).write_text(content) subprocess.run(["git", "add", filename], cwd=path, check=True, env=env) subprocess.run(["git", "commit", "-q", "-m", message], cwd=path, check=True, env=env) From ec9084f4d855ada4d45d7b14053e7149d6b50041 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 24 Apr 2026 03:51:21 +0000 Subject: [PATCH 6/8] refactor(project-scanner): tidy manifest priority helpers Agent-Logs-Url: https://github.com/MemPalace/mempalace/sessions/3c277c46-20b3-4a43-8eb7-8ee2eb3cb55a Co-authored-by: igorls <4753812+igorls@users.noreply.github.com> --- mempalace/project_scanner.py | 15 ++++++++------- tests/test_project_scanner.py | 4 +++- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/mempalace/project_scanner.py b/mempalace/project_scanner.py index f426e8f..eaab560 100644 --- a/mempalace/project_scanner.py +++ b/mempalace/project_scanner.py @@ -164,18 +164,19 @@ def _parse_gomod(path: Path) -> Optional[str]: return None -MANIFEST_PARSERS = { - "package.json": _parse_package_json, - "pyproject.toml": _parse_pyproject, - "Cargo.toml": _parse_cargo, - "go.mod": _parse_gomod, -} MANIFEST_PRIORITY = { "pyproject.toml": 0, "package.json": 1, "Cargo.toml": 2, "go.mod": 3, } +UNKNOWN_MANIFEST_PRIORITY = 999 +MANIFEST_PARSERS = { + "package.json": _parse_package_json, + "pyproject.toml": _parse_pyproject, + "Cargo.toml": _parse_cargo, + "go.mod": _parse_gomod, +} # ==================== GIT HELPERS ==================== @@ -318,7 +319,7 @@ def _manifest_sort_key(entry: tuple[str, str, Path], repo_root: Path) -> tuple[i except ValueError: depth = MAX_DEPTH + 1 rel_str = manifest_dir.as_posix() - return (depth, MANIFEST_PRIORITY.get(manifest_file, len(MANIFEST_PRIORITY)), rel_str) + return (depth, MANIFEST_PRIORITY.get(manifest_file, UNKNOWN_MANIFEST_PRIORITY), rel_str) def find_git_repos(root: Path, max_depth: int = MAX_DEPTH) -> list[Path]: diff --git a/tests/test_project_scanner.py b/tests/test_project_scanner.py index 821f527..4fcb6dd 100644 --- a/tests/test_project_scanner.py +++ b/tests/test_project_scanner.py @@ -27,6 +27,8 @@ from mempalace.project_scanner import ( to_detected_dict, ) +GIT_ENV_ALLOWLIST = ("PATH", "HOME", "SystemRoot", "ComSpec", "TMPDIR", "TEMP", "TMP") + # ── manifest parsers ──────────────────────────────────────────────────── @@ -228,7 +230,7 @@ def _git_test_env(name: str, email: str) -> dict[str, str]: "GIT_COMMITTER_NAME": name, "GIT_COMMITTER_EMAIL": email, } - for key in ("PATH", "HOME", "SystemRoot", "ComSpec", "TMPDIR", "TEMP", "TMP"): + for key in GIT_ENV_ALLOWLIST: value = os.environ.get(key) if value: env[key] = value From d4cc367261b4d2929732186a5161943658110d01 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 24 Apr 2026 03:52:37 +0000 Subject: [PATCH 7/8] test(project-scanner): harden git helper execution Agent-Logs-Url: https://github.com/MemPalace/mempalace/sessions/3c277c46-20b3-4a43-8eb7-8ee2eb3cb55a Co-authored-by: igorls <4753812+igorls@users.noreply.github.com> --- mempalace/project_scanner.py | 4 ++++ tests/test_project_scanner.py | 24 ++++++++++++++++-------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/mempalace/project_scanner.py b/mempalace/project_scanner.py index eaab560..fd6dffc 100644 --- a/mempalace/project_scanner.py +++ b/mempalace/project_scanner.py @@ -170,6 +170,7 @@ MANIFEST_PRIORITY = { "Cargo.toml": 2, "go.mod": 3, } +# Sentinel so unknown manifests always sort after the known manifest types above. UNKNOWN_MANIFEST_PRIORITY = 999 MANIFEST_PARSERS = { "package.json": _parse_package_json, @@ -311,6 +312,9 @@ def _has_git_marker(path: Path) -> bool: def _manifest_sort_key(entry: tuple[str, str, Path], repo_root: Path) -> tuple[int, int, str]: + """Sort manifests by shallowest path first, then known manifest priority, + then lexicographic path for deterministic tie-breaking. + """ manifest_file, _project_name, manifest_dir = entry try: rel = manifest_dir.relative_to(repo_root) diff --git a/tests/test_project_scanner.py b/tests/test_project_scanner.py index 4fcb6dd..ac9134b 100644 --- a/tests/test_project_scanner.py +++ b/tests/test_project_scanner.py @@ -27,7 +27,9 @@ from mempalace.project_scanner import ( to_detected_dict, ) -GIT_ENV_ALLOWLIST = ("PATH", "HOME", "SystemRoot", "ComSpec", "TMPDIR", "TEMP", "TMP") +# Keep only a small portability-focused allowlist for git subprocesses in tests. +GIT_ENV_ALLOWLIST = ("HOME", "SystemRoot", "ComSpec", "TMPDIR", "TEMP", "TMP") +GIT_EXECUTABLE = shutil.which("git") # ── manifest parsers ──────────────────────────────────────────────────── @@ -219,7 +221,7 @@ def test_find_git_repos_empty_dir(tmp_path): def _require_git() -> None: - if shutil.which("git") is None: + if GIT_EXECUTABLE is None: pytest.skip("git executable not available") @@ -237,23 +239,29 @@ def _git_test_env(name: str, email: str) -> dict[str, str]: return env +def _git(*args: str) -> list[str]: + _require_git() + assert GIT_EXECUTABLE is not None + return [GIT_EXECUTABLE, *args] + + def _git_commit( path: Path, filename: str, content: str, message: str, name: str, email: str ) -> None: _require_git() env = _git_test_env(name, email) (path / filename).write_text(content) - subprocess.run(["git", "add", filename], cwd=path, check=True, env=env) - subprocess.run(["git", "commit", "-q", "-m", message], cwd=path, check=True, env=env) + subprocess.run(_git("add", filename), cwd=path, check=True, env=env) + subprocess.run(_git("commit", "-q", "-m", message), cwd=path, check=True, env=env) def _init_git_repo(path: Path, name: str = "Jane Doe", email: str = "jane@example.com"): """Helper: init a git repo with one commit.""" _require_git() - subprocess.run(["git", "init", "-q"], cwd=path, check=True) - subprocess.run(["git", "config", "user.name", name], cwd=path, check=True) - subprocess.run(["git", "config", "user.email", email], cwd=path, check=True) - subprocess.run(["git", "config", "commit.gpgsign", "false"], cwd=path, check=True) + subprocess.run(_git("init", "-q"), cwd=path, check=True) + subprocess.run(_git("config", "user.name", name), cwd=path, check=True) + subprocess.run(_git("config", "user.email", email), cwd=path, check=True) + subprocess.run(_git("config", "commit.gpgsign", "false"), cwd=path, check=True) _git_commit(path, "README.md", "hello", "initial", name, email) From 9486d8b129ccf3cacfe2a9fbac3100c4df981a5e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 24 Apr 2026 03:53:43 +0000 Subject: [PATCH 8/8] test(project-scanner): make gitdir fixtures portable Agent-Logs-Url: https://github.com/MemPalace/mempalace/sessions/3c277c46-20b3-4a43-8eb7-8ee2eb3cb55a Co-authored-by: igorls <4753812+igorls@users.noreply.github.com> --- mempalace/project_scanner.py | 2 +- tests/test_project_scanner.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/mempalace/project_scanner.py b/mempalace/project_scanner.py index fd6dffc..3486db3 100644 --- a/mempalace/project_scanner.py +++ b/mempalace/project_scanner.py @@ -171,7 +171,7 @@ MANIFEST_PRIORITY = { "go.mod": 3, } # Sentinel so unknown manifests always sort after the known manifest types above. -UNKNOWN_MANIFEST_PRIORITY = 999 +UNKNOWN_MANIFEST_PRIORITY = max(MANIFEST_PRIORITY.values()) + 1 MANIFEST_PARSERS = { "package.json": _parse_package_json, "pyproject.toml": _parse_pyproject, diff --git a/tests/test_project_scanner.py b/tests/test_project_scanner.py index ac9134b..ec66501 100644 --- a/tests/test_project_scanner.py +++ b/tests/test_project_scanner.py @@ -32,6 +32,10 @@ GIT_ENV_ALLOWLIST = ("HOME", "SystemRoot", "ComSpec", "TMPDIR", "TEMP", "TMP") GIT_EXECUTABLE = shutil.which("git") +def _gitdir_marker(path: Path) -> str: + return f"gitdir: {path}\n" + + # ── manifest parsers ──────────────────────────────────────────────────── @@ -204,10 +208,10 @@ def test_find_git_repos_skips_nested_inside_repo(tmp_path): def test_find_git_repos_detects_git_file_markers(tmp_path): - (tmp_path / ".git").write_text("gitdir: /tmp/root.git\n") + (tmp_path / ".git").write_text(_gitdir_marker(tmp_path.parent / "root.git")) sub = tmp_path / "subproject" sub.mkdir() - (sub / ".git").write_text("gitdir: /tmp/sub.git\n") + (sub / ".git").write_text(_gitdir_marker(tmp_path.parent / "sub.git")) repos = find_git_repos(tmp_path) assert tmp_path in repos assert sub in repos @@ -311,11 +315,11 @@ def test_scan_manifest_only_no_git(tmp_path): def test_collect_manifest_names_stops_at_git_file_boundary(tmp_path): - (tmp_path / ".git").write_text("gitdir: /tmp/root.git\n") + (tmp_path / ".git").write_text(_gitdir_marker(tmp_path.parent / "root.git")) (tmp_path / "package.json").write_text(json.dumps({"name": "root-name"})) nested = tmp_path / "nested" nested.mkdir() - (nested / ".git").write_text("gitdir: /tmp/nested.git\n") + (nested / ".git").write_text(_gitdir_marker(tmp_path.parent / "nested.git")) (nested / "package.json").write_text(json.dumps({"name": "nested-name"})) manifests = _collect_manifest_names(tmp_path) assert [name for _file, name, _dir in manifests] == ["root-name"]