Merge pull request #78 from ac-opensource/feature/respect-gitignore-mining

Respect nested .gitignore rules when mining project files
This commit is contained in:
Ben Sigman
2026-04-07 12:15:23 -07:00
committed by GitHub
3 changed files with 482 additions and 40 deletions
+16
View File
@@ -65,6 +65,9 @@ def cmd_init(args):
def cmd_mine(args): def cmd_mine(args):
palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
include_ignored = []
for raw in args.include_ignored or []:
include_ignored.extend(part.strip() for part in raw.split(",") if part.strip())
if args.mode == "convos": if args.mode == "convos":
from .convo_miner import mine_convos from .convo_miner import mine_convos
@@ -88,6 +91,8 @@ def cmd_mine(args):
agent=args.agent, agent=args.agent,
limit=args.limit, limit=args.limit,
dry_run=args.dry_run, dry_run=args.dry_run,
respect_gitignore=not args.no_gitignore,
include_ignored=include_ignored,
) )
@@ -359,6 +364,17 @@ def main():
help="Ingest mode: 'projects' for code/docs (default), 'convos' for chat exports", help="Ingest mode: 'projects' for code/docs (default), 'convos' for chat exports",
) )
p_mine.add_argument("--wing", default=None, help="Wing name (default: directory name)") p_mine.add_argument("--wing", default=None, help="Wing name (default: directory name)")
p_mine.add_argument(
"--no-gitignore",
action="store_true",
help="Don't respect .gitignore files when scanning project files",
)
p_mine.add_argument(
"--include-ignored",
action="append",
default=[],
help="Always scan these project-relative paths even if ignored; repeat or pass comma-separated paths",
)
p_mine.add_argument( p_mine.add_argument(
"--agent", "--agent",
default="mempalace", default="mempalace",
+269 -15
View File
@@ -10,6 +10,7 @@ Stores verbatim chunks as drawers. No summaries. Ever.
import os import os
import sys import sys
import hashlib import hashlib
import fnmatch
from pathlib import Path from pathlib import Path
from datetime import datetime from datetime import datetime
from collections import defaultdict from collections import defaultdict
@@ -51,6 +52,27 @@ SKIP_DIRS = {
".next", ".next",
"coverage", "coverage",
".mempalace", ".mempalace",
".ruff_cache",
".mypy_cache",
".pytest_cache",
".cache",
".tox",
".nox",
".idea",
".vscode",
".ipynb_checkpoints",
".eggs",
"htmlcov",
"target",
}
SKIP_FILENAMES = {
"mempalace.yaml",
"mempalace.yml",
"mempal.yaml",
"mempal.yml",
".gitignore",
"package-lock.json",
} }
CHUNK_SIZE = 800 # chars per drawer CHUNK_SIZE = 800 # chars per drawer
@@ -58,6 +80,196 @@ CHUNK_OVERLAP = 100 # overlap between chunks
MIN_CHUNK_SIZE = 50 # skip tiny chunks MIN_CHUNK_SIZE = 50 # skip tiny chunks
# =============================================================================
# IGNORE MATCHING
# =============================================================================
class GitignoreMatcher:
    """Lightweight matcher for one directory's .gitignore patterns.

    Each rule is a dict with the keys ``pattern`` (the glob text with any
    leading ``!``, leading ``/`` and trailing ``/`` removed), ``anchored``,
    ``dir_only`` and ``negated``. Rules are evaluated in file order and the
    last rule that matches decides the outcome.
    """

    def __init__(self, base_dir: Path, rules: list):
        # Directory that owns the .gitignore; patterns are relative to it.
        self.base_dir = base_dir
        self.rules = rules

    @classmethod
    def from_dir(cls, dir_path: Path):
        """Build a matcher from ``dir_path/.gitignore``.

        Returns ``None`` when the file is missing, cannot be read, or
        contains no usable rules.
        """
        gitignore_path = dir_path / ".gitignore"
        if not gitignore_path.is_file():
            return None

        try:
            lines = gitignore_path.read_text(encoding="utf-8", errors="replace").splitlines()
        except OSError:
            # Best effort: an unreadable .gitignore is treated as absent.
            return None

        rules = []
        for raw_line in lines:
            line = raw_line.strip()
            if not line:
                continue

            # A leading backslash escapes "#" or "!" so the character is taken
            # literally; an escaped "!" must NOT be read as a negation marker.
            escaped = line.startswith(("\\#", "\\!"))
            if escaped:
                line = line[1:]
            elif line.startswith("#"):
                continue

            negated = not escaped and line.startswith("!")
            if negated:
                line = line[1:]

            anchored = line.startswith("/")
            if anchored:
                line = line.lstrip("/")

            dir_only = line.endswith("/")
            if dir_only:
                line = line.rstrip("/")

            if not line:
                continue

            rules.append(
                {
                    "pattern": line,
                    "anchored": anchored,
                    "dir_only": dir_only,
                    "negated": negated,
                }
            )

        if not rules:
            return None
        return cls(dir_path, rules)

    def matches(self, path: Path, is_dir: bool = None):
        """Return the ignore decision for ``path``.

        Returns ``True`` (ignored), ``False`` (explicitly re-included by a
        negated rule) or ``None`` when no rule applies, when ``path`` is not
        under ``base_dir``, or when ``path`` is ``base_dir`` itself. When
        ``is_dir`` is ``None`` the filesystem is consulted.
        """
        try:
            relative = path.relative_to(self.base_dir).as_posix().strip("/")
        except ValueError:
            return None

        # relative_to() yields "." for the base directory itself; it must not
        # be tested against the rules (a bare "*" would otherwise match it).
        if relative in ("", "."):
            return None

        if is_dir is None:
            is_dir = path.is_dir()

        ignored = None
        for rule in self.rules:
            if self._rule_matches(rule, relative, is_dir):
                # Later rules override earlier ones.
                ignored = not rule["negated"]
        return ignored

    def _rule_matches(self, rule: dict, relative: str, is_dir: bool) -> bool:
        """Return True when one parsed rule matches the relative POSIX path."""
        pattern = rule["pattern"]
        parts = relative.split("/")
        pattern_parts = pattern.split("/")
        # A pattern containing a slash is matched from the base directory,
        # exactly like an explicitly anchored one.
        rooted = rule["anchored"] or len(pattern_parts) > 1

        if rule["dir_only"]:
            # Directory-only rules can only match directory components, so the
            # final component of a file path is excluded.
            target_parts = parts if is_dir else parts[:-1]
            if not target_parts:
                return False
            if rooted:
                return self._match_from_root(target_parts, pattern_parts)
            return any(fnmatch.fnmatch(part, pattern) for part in target_parts)

        if rooted:
            return self._match_from_root(parts, pattern_parts)
        return any(fnmatch.fnmatch(part, pattern) for part in parts)

    def _match_from_root(self, target_parts: list, pattern_parts: list) -> bool:
        """Match pattern components against path components from the start.

        ``**`` consumes zero or more components. Exhausting the pattern before
        the path counts as a match, so a rule naming a directory also covers
        everything beneath it.
        """

        def walk(path_index: int, pattern_index: int) -> bool:
            if pattern_index == len(pattern_parts):
                return True
            if path_index == len(target_parts):
                # Only trailing "**" components may remain unmatched.
                return all(part == "**" for part in pattern_parts[pattern_index:])

            pattern_part = pattern_parts[pattern_index]
            if pattern_part == "**":
                return walk(path_index, pattern_index + 1) or walk(
                    path_index + 1, pattern_index
                )

            if not fnmatch.fnmatch(target_parts[path_index], pattern_part):
                return False
            return walk(path_index + 1, pattern_index + 1)

        return walk(0, 0)
def load_gitignore_matcher(dir_path: Path, cache: dict):
    """Return the matcher for ``dir_path``, building and caching it on first use.

    The cached value may be ``None`` (no usable .gitignore in that directory);
    that result is remembered too, so the directory is only inspected once.
    """
    try:
        return cache[dir_path]
    except KeyError:
        matcher = GitignoreMatcher.from_dir(dir_path)
        cache[dir_path] = matcher
        return matcher
def is_gitignored(path: Path, matchers: list, is_dir: bool = False) -> bool:
    """Combine the verdicts of all active matchers; the last non-None one wins.

    Matchers are consulted in the order given. A matcher that returns ``None``
    has no opinion and leaves the running decision unchanged. With no decisive
    matcher the path is not ignored.
    """
    verdicts = [matcher.matches(path, is_dir=is_dir) for matcher in matchers]
    decisive = [verdict for verdict in verdicts if verdict is not None]
    return decisive[-1] if decisive else False
def should_skip_dir(dirname: str) -> bool:
    """Tell whether a directory name is a known generated/cache directory.

    True for any name listed in SKIP_DIRS and for names ending in ".egg-info".
    """
    if dirname in SKIP_DIRS:
        return True
    return dirname.endswith(".egg-info")
def normalize_include_paths(include_ignored: list) -> set:
    """Turn raw include entries into a set of project-relative POSIX strings.

    Each entry is stringified, stripped of surrounding whitespace and of
    leading/trailing slashes; entries that end up empty are dropped. ``None``
    or an empty list yields an empty set.
    """
    trimmed = (str(entry).strip().strip("/") for entry in include_ignored or [])
    return {Path(item).as_posix() for item in trimmed if item}
def is_exact_force_include(path: Path, project_path: Path, include_paths: set) -> bool:
    """Report whether ``path`` is itself listed as an explicit include override.

    Only an exact match of the project-relative POSIX path counts; ancestors
    and descendants of an include entry do not. Paths outside ``project_path``
    never match.
    """
    if not include_paths:
        return False

    try:
        relative_path = path.relative_to(project_path)
    except ValueError:
        return False

    key = relative_path.as_posix().strip("/")
    return key in include_paths
def is_force_included(path: Path, project_path: Path, include_paths: set) -> bool:
    """Report whether ``path`` is covered by an explicit include override.

    A path is covered when it equals an include entry, lies beneath one, or is
    an ancestor directory of one. Paths outside ``project_path`` are never
    covered.
    """
    if not include_paths:
        return False

    try:
        rel = path.relative_to(project_path).as_posix().strip("/")
    except ValueError:
        return False

    if not rel:
        return False

    rel_as_prefix = f"{rel}/"
    return any(
        rel == wanted or rel.startswith(f"{wanted}/") or wanted.startswith(rel_as_prefix)
        for wanted in include_paths
    )
# ============================================================================= # =============================================================================
# CONFIG # CONFIG
# ============================================================================= # =============================================================================
@@ -284,26 +496,58 @@ def process_file(
# ============================================================================= # =============================================================================
def scan_project(project_dir: str) -> list: def scan_project(
project_dir: str,
respect_gitignore: bool = True,
include_ignored: list = None,
) -> list:
"""Return list of all readable file paths.""" """Return list of all readable file paths."""
project_path = Path(project_dir).expanduser().resolve() project_path = Path(project_dir).expanduser().resolve()
files = [] files = []
active_matchers = []
matcher_cache = {}
include_paths = normalize_include_paths(include_ignored)
for root, dirs, filenames in os.walk(project_path): for root, dirs, filenames in os.walk(project_path):
dirs[:] = [d for d in dirs if d not in SKIP_DIRS] root_path = Path(root)
if respect_gitignore:
active_matchers = [
matcher
for matcher in active_matchers
if root_path == matcher.base_dir or matcher.base_dir in root_path.parents
]
current_matcher = load_gitignore_matcher(root_path, matcher_cache)
if current_matcher is not None:
active_matchers.append(current_matcher)
dirs[:] = [
d
for d in dirs
if is_force_included(root_path / d, project_path, include_paths)
or not should_skip_dir(d)
]
if respect_gitignore and active_matchers:
dirs[:] = [
d
for d in dirs
if is_force_included(root_path / d, project_path, include_paths)
or not is_gitignored(root_path / d, active_matchers, is_dir=True)
]
for filename in filenames: for filename in filenames:
filepath = Path(root) / filename filepath = root_path / filename
if filepath.suffix.lower() in READABLE_EXTENSIONS: force_include = is_force_included(filepath, project_path, include_paths)
# Skip config files exact_force_include = is_exact_force_include(filepath, project_path, include_paths)
if filename in (
"mempalace.yaml", if not force_include and filename in SKIP_FILENAMES:
"mempalace.yml", continue
"mempal.yaml", if filepath.suffix.lower() not in READABLE_EXTENSIONS and not exact_force_include:
"mempal.yml", continue
".gitignore", if respect_gitignore and active_matchers and not force_include:
"package-lock.json", if is_gitignored(filepath, active_matchers, is_dir=False):
):
continue continue
files.append(filepath) files.append(filepath)
return files return files
@@ -319,6 +563,8 @@ def mine(
agent: str = "mempalace", agent: str = "mempalace",
limit: int = 0, limit: int = 0,
dry_run: bool = False, dry_run: bool = False,
respect_gitignore: bool = True,
include_ignored: list = None,
): ):
"""Mine a project directory into the palace.""" """Mine a project directory into the palace."""
@@ -328,7 +574,11 @@ def mine(
wing = wing_override or config["wing"] wing = wing_override or config["wing"]
rooms = config.get("rooms", [{"name": "general", "description": "All project files"}]) rooms = config.get("rooms", [{"name": "general", "description": "All project files"}])
files = scan_project(project_dir) files = scan_project(
project_dir,
respect_gitignore=respect_gitignore,
include_ignored=include_ignored,
)
if limit > 0: if limit > 0:
files = files[:limit] files = files[:limit]
@@ -341,6 +591,10 @@ def mine(
print(f" Palace: {palace_path}") print(f" Palace: {palace_path}")
if dry_run: if dry_run:
print(" DRY RUN — nothing will be filed") print(" DRY RUN — nothing will be filed")
if not respect_gitignore:
print(" .gitignore: DISABLED")
if include_ignored:
print(f" Include: {', '.join(sorted(normalize_include_paths(include_ignored)))}")
print(f"{'' * 55}\n") print(f"{'' * 55}\n")
if not dry_run: if not dry_run:
+197 -25
View File
@@ -1,36 +1,208 @@
import os import os
import tempfile
import shutil import shutil
import yaml import tempfile
from pathlib import Path
import chromadb import chromadb
from mempalace.miner import mine import yaml
from mempalace.miner import mine, scan_project
def write_file(path: Path, content: str):
    """Write ``content`` to ``path`` as UTF-8, creating missing parent directories."""
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(content)
def scanned_files(project_root: Path, **kwargs):
    """Run scan_project and return its results as sorted project-relative POSIX paths.

    Extra keyword arguments are forwarded to scan_project unchanged.
    """
    relative_paths = [
        found.relative_to(project_root).as_posix()
        for found in scan_project(str(project_root), **kwargs)
    ]
    relative_paths.sort()
    return relative_paths
def test_project_mining(): def test_project_mining():
tmpdir = tempfile.mkdtemp() tmpdir = tempfile.mkdtemp()
# Create a mini project try:
os.makedirs(os.path.join(tmpdir, "backend")) project_root = Path(tmpdir).resolve()
with open(os.path.join(tmpdir, "backend", "app.py"), "w") as f: os.makedirs(project_root / "backend")
f.write("def main():\n print('hello world')\n" * 20)
# Create config write_file(
with open(os.path.join(tmpdir, "mempalace.yaml"), "w") as f: project_root / "backend" / "app.py", "def main():\n print('hello world')\n" * 20
yaml.dump(
{
"wing": "test_project",
"rooms": [
{"name": "backend", "description": "Backend code"},
{"name": "general", "description": "General"},
],
},
f,
) )
with open(project_root / "mempalace.yaml", "w") as f:
yaml.dump(
{
"wing": "test_project",
"rooms": [
{"name": "backend", "description": "Backend code"},
{"name": "general", "description": "General"},
],
},
f,
)
palace_path = os.path.join(tmpdir, "palace") palace_path = project_root / "palace"
mine(tmpdir, palace_path) mine(str(project_root), str(palace_path))
# Verify client = chromadb.PersistentClient(path=str(palace_path))
client = chromadb.PersistentClient(path=palace_path) col = client.get_collection("mempalace_drawers")
col = client.get_collection("mempalace_drawers") assert col.count() > 0
assert col.count() > 0 finally:
shutil.rmtree(tmpdir)
shutil.rmtree(tmpdir)
def test_scan_project_respects_gitignore():
    """A root .gitignore hides both a listed file and a listed directory."""
    with tempfile.TemporaryDirectory() as tmpdir:
        project_root = Path(tmpdir).resolve()
        fixture = {
            ".gitignore": "ignored.py\ngenerated/\n",
            "src/app.py": "print('hello')\n" * 20,
            "ignored.py": "print('ignore me')\n" * 20,
            "generated/artifact.py": "print('artifact')\n" * 20,
        }
        for relative_path, content in fixture.items():
            write_file(project_root / relative_path, content)

        assert scanned_files(project_root) == ["src/app.py"]
def test_scan_project_respects_nested_gitignore():
    """Rules from a nested .gitignore apply together with the root rules."""
    with tempfile.TemporaryDirectory() as tmpdir:
        project_root = Path(tmpdir).resolve()
        fixture = {
            ".gitignore": "*.log\n",
            "subrepo/.gitignore": "tasks/\n",
            "subrepo/src/main.py": "print('main')\n" * 20,
            "subrepo/tasks/task.py": "print('task')\n" * 20,
            "subrepo/debug.log": "debug\n" * 20,
        }
        for relative_path, content in fixture.items():
            write_file(project_root / relative_path, content)

        assert scanned_files(project_root) == ["subrepo/src/main.py"]
def test_scan_project_allows_nested_gitignore_override():
    """A negated rule in a nested .gitignore re-includes a file ignored by the root."""
    with tempfile.TemporaryDirectory() as tmpdir:
        project_root = Path(tmpdir).resolve()
        fixture = {
            ".gitignore": "*.csv\n",
            "subrepo/.gitignore": "!keep.csv\n",
            "drop.csv": "a,b,c\n" * 20,
            "subrepo/keep.csv": "a,b,c\n" * 20,
        }
        for relative_path, content in fixture.items():
            write_file(project_root / relative_path, content)

        assert scanned_files(project_root) == ["subrepo/keep.csv"]
def test_scan_project_allows_gitignore_negation_when_parent_dir_is_visible():
    """Ignoring a directory's contents (not the directory) lets a negation keep one file."""
    with tempfile.TemporaryDirectory() as tmpdir:
        project_root = Path(tmpdir).resolve()
        fixture = {
            ".gitignore": "generated/*\n!generated/keep.py\n",
            "generated/drop.py": "print('drop')\n" * 20,
            "generated/keep.py": "print('keep')\n" * 20,
        }
        for relative_path, content in fixture.items():
            write_file(project_root / relative_path, content)

        assert scanned_files(project_root) == ["generated/keep.py"]
def test_scan_project_does_not_reinclude_file_from_ignored_directory():
    """Once the directory itself is ignored, a negation for a file inside it has no effect."""
    with tempfile.TemporaryDirectory() as tmpdir:
        project_root = Path(tmpdir).resolve()
        fixture = {
            ".gitignore": "generated/\n!generated/keep.py\n",
            "generated/drop.py": "print('drop')\n" * 20,
            "generated/keep.py": "print('keep')\n" * 20,
        }
        for relative_path, content in fixture.items():
            write_file(project_root / relative_path, content)

        assert scanned_files(project_root) == []
def test_scan_project_can_disable_gitignore():
    """With respect_gitignore=False an ignored directory is scanned anyway."""
    with tempfile.TemporaryDirectory() as tmpdir:
        project_root = Path(tmpdir).resolve()
        fixture = {
            ".gitignore": "data/\n",
            "data/stuff.csv": "a,b,c\n" * 20,
        }
        for relative_path, content in fixture.items():
            write_file(project_root / relative_path, content)

        found = scanned_files(project_root, respect_gitignore=False)
        assert found == ["data/stuff.csv"]
def test_scan_project_can_include_ignored_directory():
    """An include override naming an ignored directory brings its files back."""
    with tempfile.TemporaryDirectory() as tmpdir:
        project_root = Path(tmpdir).resolve()
        fixture = {
            ".gitignore": "docs/\n",
            "docs/guide.md": "# Guide\n" * 20,
        }
        for relative_path, content in fixture.items():
            write_file(project_root / relative_path, content)

        found = scanned_files(project_root, include_ignored=["docs"])
        assert found == ["docs/guide.md"]
def test_scan_project_can_include_specific_ignored_file():
    """An include override naming one file restores only that file, not its siblings."""
    with tempfile.TemporaryDirectory() as tmpdir:
        project_root = Path(tmpdir).resolve()
        fixture = {
            ".gitignore": "generated/\n",
            "generated/drop.py": "print('drop')\n" * 20,
            "generated/keep.py": "print('keep')\n" * 20,
        }
        for relative_path, content in fixture.items():
            write_file(project_root / relative_path, content)

        found = scanned_files(project_root, include_ignored=["generated/keep.py"])
        assert found == ["generated/keep.py"]
def test_scan_project_can_include_exact_file_without_known_extension():
    """An exact include override returns an ignored file that has no extension."""
    with tempfile.TemporaryDirectory() as tmpdir:
        project_root = Path(tmpdir).resolve()
        fixture = {
            ".gitignore": "README\n",
            "README": "hello\n" * 20,
        }
        for relative_path, content in fixture.items():
            write_file(project_root / relative_path, content)

        found = scanned_files(project_root, include_ignored=["README"])
        assert found == ["README"]
def test_scan_project_include_override_beats_skip_dirs():
    """An include override for ".pytest_cache" brings its file back into the scan."""
    with tempfile.TemporaryDirectory() as tmpdir:
        project_root = Path(tmpdir).resolve()
        write_file(project_root / ".pytest_cache/cache.py", "print('cache')\n" * 20)

        found = scanned_files(
            project_root,
            respect_gitignore=False,
            include_ignored=[".pytest_cache"],
        )
        assert found == [".pytest_cache/cache.py"]
def test_scan_project_skip_dirs_still_apply_without_override():
    """With gitignore handling off and no override, ".pytest_cache" is still left out."""
    with tempfile.TemporaryDirectory() as tmpdir:
        project_root = Path(tmpdir).resolve()
        fixture = {
            ".pytest_cache/cache.py": "print('cache')\n" * 20,
            "main.py": "print('main')\n" * 20,
        }
        for relative_path, content in fixture.items():
            write_file(project_root / relative_path, content)

        found = scanned_files(project_root, respect_gitignore=False)
        assert found == ["main.py"]