fix: respect .gitignore during project mining

This commit is contained in:
ac-opensource
2026-04-07 22:26:06 +08:00
parent 1782628b8a
commit 9b9daa9b4b
2 changed files with 175 additions and 2 deletions
+128 -1
View File
@@ -10,6 +10,7 @@ Stores verbatim chunks as drawers. No summaries. Ever.
import os import os
import sys import sys
import hashlib import hashlib
import fnmatch
from pathlib import Path from pathlib import Path
from datetime import datetime from datetime import datetime
from collections import defaultdict from collections import defaultdict
@@ -58,6 +59,122 @@ CHUNK_OVERLAP = 100 # overlap between chunks
MIN_CHUNK_SIZE = 50 # skip tiny chunks MIN_CHUNK_SIZE = 50 # skip tiny chunks
# =============================================================================
# IGNORE MATCHING
# =============================================================================
class GitignoreMatcher:
    """Lightweight matcher for a project's root .gitignore patterns.

    Only the ``.gitignore`` directly inside the project root is read. Every
    rule is a dict with the keys ``pattern``, ``anchored``, ``dir_only`` and
    ``negated``. Rules are evaluated in file order and the last rule that
    matches decides the outcome, so a later ``!pattern`` can re-include a
    path that an earlier rule ignored.
    """

    def __init__(self, rules: list):
        """Store parsed *rules* and note whether any of them is a negation."""
        self.rules = rules
        # True when at least one rule starts with "!" (re-includes paths).
        self.has_negations = any(rule["negated"] for rule in rules)

    @classmethod
    def from_project(cls, project_path: Path):
        """Build a matcher from ``<project_path>/.gitignore``.

        Blank lines and ``#`` comments are skipped. A leading ``!`` marks a
        negated rule, a leading ``/`` anchors the rule to the project root,
        and a trailing ``/`` restricts it to directories. ``\\#`` and ``\\!``
        at the start of a pattern stand for a literal ``#`` / ``!``.

        Returns:
            A matcher with no rules when the file is missing or unreadable.
        """
        gitignore_path = project_path / ".gitignore"
        try:
            # utf-8-sig drops a leading byte-order mark, which would otherwise
            # become part of the first pattern and stop it from ever matching.
            text = gitignore_path.read_text(encoding="utf-8-sig", errors="replace")
        except (OSError, ValueError):
            # Best effort: a missing or unreadable file means "nothing ignored".
            return cls([])

        rules = []
        for raw_line in text.splitlines():
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue

            negated = line.startswith("!")
            if negated:
                line = line[1:]

            # A backslash in front of a leading "#" or "!" makes that character
            # literal; fnmatch has no escape syntax, so drop the backslash here.
            if line.startswith(("\\#", "\\!")):
                line = line[1:]

            anchored = line.startswith("/")
            if anchored:
                line = line.lstrip("/")

            dir_only = line.endswith("/")
            if dir_only:
                line = line.rstrip("/")

            if not line:
                continue

            rules.append(
                {
                    "pattern": line,
                    "anchored": anchored,
                    "dir_only": dir_only,
                    "negated": negated,
                }
            )
        return cls(rules)

    def matches(self, path: Path, project_path: Path, is_dir: bool = None) -> bool:
        """Return True when *path* is ignored by the loaded rules.

        Args:
            path: Path to test; must be located under *project_path*.
            project_path: Root that the rules are relative to.
            is_dir: Whether *path* is a directory. When None, the filesystem
                is consulted via ``path.is_dir()``.

        Returns:
            False when there are no rules, when *path* is not under
            *project_path*, or when *path* is the project root itself;
            otherwise the verdict of the last matching rule.
        """
        if not self.rules:
            return False

        try:
            relative = path.relative_to(project_path).as_posix().strip("/")
        except ValueError:
            return False
        if not relative:
            return False

        if is_dir is None:
            is_dir = path.is_dir()

        ignored = False
        for rule in self.rules:
            if self._rule_matches(rule, relative, is_dir):
                # Last match wins: a negated rule flips the path back to "kept".
                ignored = not rule["negated"]
        return ignored

    def _rule_matches(self, rule: dict, relative: str, is_dir: bool) -> bool:
        """Return True when a single *rule* applies to the *relative* path."""
        pattern = rule["pattern"]
        parts = relative.split("/")

        if rule["dir_only"]:
            # A trailing-slash rule names directories only, so for a file just
            # its ancestor directories are candidates.
            parts = parts if is_dir else parts[:-1]
            if not parts:
                return False

        pattern_parts = pattern.split("/")
        if rule["anchored"] or len(pattern_parts) > 1:
            return self._match_from_root(parts, pattern_parts)
        # A bare name may match any component of the path.
        return any(fnmatch.fnmatch(part, pattern) for part in parts)

    def _match_from_root(self, target_parts: list, pattern_parts: list) -> bool:
        """Match *pattern_parts* against the leading components of *target_parts*.

        The match succeeds as soon as the whole pattern is consumed, even if
        path components remain (a rule matching a directory also covers what
        is below it). A ``**`` component matches zero or more path components.

        The set of reachable pattern positions is tracked while walking the
        path once, so the work is bounded by
        ``len(target_parts) * len(pattern_parts)`` component comparisons even
        for patterns containing many ``**`` segments.
        """
        end = len(pattern_parts)

        def expand(positions):
            # "**" may match nothing, so every "**" position also activates
            # the position that follows it.
            closed = set()
            for index in positions:
                while index not in closed:
                    closed.add(index)
                    if index < end and pattern_parts[index] == "**":
                        index += 1
                    else:
                        break
            return closed

        active = expand({0})
        for part in target_parts:
            if end in active:
                return True
            advanced = set()
            for index in active:
                token = pattern_parts[index]
                if token == "**":
                    # "**" swallows this component and stays available.
                    advanced.add(index)
                elif fnmatch.fnmatch(part, token):
                    advanced.add(index + 1)
            if not advanced:
                return False
            active = expand(advanced)
        return end in active
# ============================================================================= # =============================================================================
# CONFIG # CONFIG
# ============================================================================= # =============================================================================
@@ -287,11 +404,21 @@ def process_file(
def scan_project(project_dir: str) -> list: def scan_project(project_dir: str) -> list:
"""Return list of all readable file paths.""" """Return list of all readable file paths."""
project_path = Path(project_dir).expanduser().resolve() project_path = Path(project_dir).expanduser().resolve()
gitignore_matcher = GitignoreMatcher.from_project(project_path)
files = [] files = []
for root, dirs, filenames in os.walk(project_path): for root, dirs, filenames in os.walk(project_path):
root_path = Path(root)
dirs[:] = [d for d in dirs if d not in SKIP_DIRS] dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
if not gitignore_matcher.has_negations:
dirs[:] = [
d
for d in dirs
if not gitignore_matcher.matches(root_path / d, project_path, is_dir=True)
]
for filename in filenames: for filename in filenames:
filepath = Path(root) / filename filepath = root_path / filename
if gitignore_matcher.matches(filepath, project_path, is_dir=False):
continue
if filepath.suffix.lower() in READABLE_EXTENSIONS: if filepath.suffix.lower() in READABLE_EXTENSIONS:
# Skip config files # Skip config files
if filename in ( if filename in (
+47 -1
View File
@@ -3,7 +3,9 @@ import tempfile
import shutil import shutil
import yaml import yaml
import chromadb import chromadb
from mempalace.miner import mine from pathlib import Path
from mempalace.miner import mine, scan_project
def test_project_mining(): def test_project_mining():
@@ -34,3 +36,47 @@ def test_project_mining():
assert col.count() > 0 assert col.count() > 0
shutil.rmtree(tmpdir) shutil.rmtree(tmpdir)
def test_scan_project_respects_gitignore():
    """Files and directories named in .gitignore are left out of the scan."""
    scratch = tempfile.mkdtemp()
    try:
        project_root = Path(scratch).resolve()
        for dirname in ("src", "generated"):
            os.makedirs(project_root / dirname)

        # Relative path -> file content for every fixture file.
        fixture = {
            ".gitignore": "ignored.py\ngenerated/\n",
            "src/app.py": "print('hello')\n" * 20,
            "ignored.py": "print('ignore me')\n" * 20,
            "generated/artifact.py": "print('ignore this dir')\n" * 20,
        }
        for rel_name, content in fixture.items():
            (project_root / rel_name).write_text(content, encoding="utf-8")

        found = scan_project(str(project_root))
        relative_files = sorted(
            entry.relative_to(project_root).as_posix() for entry in found
        )

        assert relative_files == ["src/app.py"]
    finally:
        shutil.rmtree(scratch)
def test_scan_project_handles_gitignore_negation():
    """A ``!`` rule re-includes one file from an otherwise ignored directory."""
    scratch = tempfile.mkdtemp()
    try:
        project_root = Path(scratch).resolve()
        generated_dir = project_root / "generated"
        os.makedirs(generated_dir)

        gitignore_text = "generated/\n!generated/keep.py\n"
        (project_root / ".gitignore").write_text(gitignore_text, encoding="utf-8")

        # File name -> content for the two files inside the ignored directory.
        generated_files = {
            "drop.py": "print('drop')\n" * 20,
            "keep.py": "print('keep')\n" * 20,
        }
        for file_name, content in generated_files.items():
            (generated_dir / file_name).write_text(content, encoding="utf-8")

        found = scan_project(str(project_root))
        relative_files = sorted(
            entry.relative_to(project_root).as_posix() for entry in found
        )

        assert relative_files == ["generated/keep.py"]
    finally:
        shutil.rmtree(scratch)