fix: support nested .gitignore rules during mining

This commit is contained in:
ac-opensource
2026-04-08 00:02:21 +08:00
parent 9b9daa9b4b
commit c8c220d789
3 changed files with 355 additions and 86 deletions
+16
View File
@@ -65,6 +65,9 @@ def cmd_init(args):
def cmd_mine(args): def cmd_mine(args):
palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
include_ignored = []
for raw in args.include_ignored or []:
include_ignored.extend(part.strip() for part in raw.split(",") if part.strip())
if args.mode == "convos": if args.mode == "convos":
from .convo_miner import mine_convos from .convo_miner import mine_convos
@@ -88,6 +91,8 @@ def cmd_mine(args):
agent=args.agent, agent=args.agent,
limit=args.limit, limit=args.limit,
dry_run=args.dry_run, dry_run=args.dry_run,
respect_gitignore=not args.no_gitignore,
include_ignored=include_ignored,
) )
@@ -288,6 +293,17 @@ def main():
help="Ingest mode: 'projects' for code/docs (default), 'convos' for chat exports", help="Ingest mode: 'projects' for code/docs (default), 'convos' for chat exports",
) )
p_mine.add_argument("--wing", default=None, help="Wing name (default: directory name)") p_mine.add_argument("--wing", default=None, help="Wing name (default: directory name)")
p_mine.add_argument(
"--no-gitignore",
action="store_true",
help="Don't respect .gitignore files when scanning project files",
)
p_mine.add_argument(
"--include-ignored",
action="append",
default=[],
help="Always scan these project-relative paths even if ignored; repeat or pass comma-separated paths",
)
p_mine.add_argument( p_mine.add_argument(
"--agent", "--agent",
default="mempalace", default="mempalace",
+162 -35
View File
@@ -52,6 +52,27 @@ SKIP_DIRS = {
".next", ".next",
"coverage", "coverage",
".mempalace", ".mempalace",
".ruff_cache",
".mypy_cache",
".pytest_cache",
".cache",
".tox",
".nox",
".idea",
".vscode",
".ipynb_checkpoints",
".eggs",
"htmlcov",
"target",
}
SKIP_FILENAMES = {
"mempalace.yaml",
"mempalace.yml",
"mempal.yaml",
"mempal.yml",
".gitignore",
"package-lock.json",
} }
CHUNK_SIZE = 800 # chars per drawer CHUNK_SIZE = 800 # chars per drawer
@@ -65,27 +86,32 @@ MIN_CHUNK_SIZE = 50 # skip tiny chunks
class GitignoreMatcher: class GitignoreMatcher:
"""Lightweight matcher for a project's root .gitignore patterns.""" """Lightweight matcher for one directory's .gitignore patterns."""
def __init__(self, rules: list): def __init__(self, base_dir: Path, rules: list):
self.base_dir = base_dir
self.rules = rules self.rules = rules
self.has_negations = any(rule["negated"] for rule in rules)
@classmethod @classmethod
def from_project(cls, project_path: Path): def from_dir(cls, dir_path: Path):
gitignore_path = project_path / ".gitignore" gitignore_path = dir_path / ".gitignore"
if not gitignore_path.exists(): if not gitignore_path.is_file():
return cls([]) return None
try: try:
lines = gitignore_path.read_text(encoding="utf-8", errors="replace").splitlines() lines = gitignore_path.read_text(encoding="utf-8", errors="replace").splitlines()
except Exception: except Exception:
return cls([]) return None
rules = [] rules = []
for raw_line in lines: for raw_line in lines:
line = raw_line.strip() line = raw_line.strip()
if not line or line.startswith("#"): if not line:
continue
if line.startswith("\\#") or line.startswith("\\!"):
line = line[1:]
elif line.startswith("#"):
continue continue
negated = line.startswith("!") negated = line.startswith("!")
@@ -112,24 +138,24 @@ class GitignoreMatcher:
} }
) )
return cls(rules) if not rules:
return None
def matches(self, path: Path, project_path: Path, is_dir: bool = None) -> bool: return cls(dir_path, rules)
if not self.rules:
return False
def matches(self, path: Path, is_dir: bool = None):
try: try:
relative = path.relative_to(project_path).as_posix().strip("/") relative = path.relative_to(self.base_dir).as_posix().strip("/")
except ValueError: except ValueError:
return False return None
if not relative: if not relative:
return False return None
if is_dir is None: if is_dir is None:
is_dir = path.is_dir() is_dir = path.is_dir()
ignored = False ignored = None
for rule in self.rules: for rule in self.rules:
if self._rule_matches(rule, relative, is_dir): if self._rule_matches(rule, relative, is_dir):
ignored = not rule["negated"] ignored = not rule["negated"]
@@ -175,6 +201,75 @@ class GitignoreMatcher:
return matches(0, 0) return matches(0, 0)
def load_gitignore_matcher(dir_path: Path, cache: dict):
    """Return the .gitignore matcher for ``dir_path``, loading it at most once.

    Args:
        dir_path: Directory whose own ``.gitignore`` should be consulted.
        cache: Mapping of directory -> loaded matcher. It is owned by the
            caller and updated in place.

    Returns:
        Whatever ``GitignoreMatcher.from_dir(dir_path)`` produced. The result
        is stored as-is, so a ``None`` result is remembered too and the
        directory is not re-read on later calls.
    """
    if dir_path not in cache:
        cache[dir_path] = GitignoreMatcher.from_dir(dir_path)
    return cache[dir_path]
def is_gitignored(path: Path, matchers: list, is_dir: bool = False) -> bool:
    """Decide whether ``path`` is ignored by the active .gitignore matchers.

    Args:
        path: Path being tested.
        matchers: Matchers to consult, in the order they should be applied
            (the scanner passes them ancestor-first).
        is_dir: Whether ``path`` should be treated as a directory.

    Returns:
        The decision of the last matcher that had an opinion, or ``False``
        when no matcher matched at all.
    """
    ignored = False
    for matcher in matchers:
        decision = matcher.matches(path, is_dir=is_dir)
        # ``None`` means "no rule in this matcher applies"; keep the verdict
        # inherited from earlier matchers in that case.
        if decision is not None:
            ignored = decision
    return ignored
def should_skip_dir(dirname: str) -> bool:
    """Return True for directory names that are skipped before gitignore matching.

    A directory is skipped when its bare name is listed in ``SKIP_DIRS`` or
    when the name ends with ``.egg-info``.
    """
    return dirname in SKIP_DIRS or dirname.endswith(".egg-info")
def normalize_include_paths(include_ignored: list) -> set:
    """Normalize include overrides into a set of project-relative POSIX strings.

    Args:
        include_ignored: Raw path strings (or ``None``). Each entry is
            stringified, stripped of surrounding whitespace and of leading and
            trailing ``/``, then normalized through ``Path(...).as_posix()``.

    Returns:
        The set of normalized paths. Entries that are empty after stripping
        are dropped, and duplicates collapse.
    """
    normalized = set()
    for raw_path in include_ignored or []:
        candidate = str(raw_path).strip().strip("/")
        if candidate:
            normalized.add(Path(candidate).as_posix())
    return normalized
def is_exact_force_include(path: Path, project_path: Path, include_paths: set) -> bool:
    """Return True when ``path`` is itself one of the explicit include overrides.

    Args:
        path: Path to test.
        project_path: Project root that ``include_paths`` are relative to.
        include_paths: Normalized project-relative POSIX path strings.

    Returns:
        ``True`` only when the project-relative form of ``path`` is a member
        of ``include_paths``. Returns ``False`` when there are no overrides
        or when ``path`` does not live under ``project_path``.
    """
    if not include_paths:
        return False

    try:
        relative = path.relative_to(project_path).as_posix().strip("/")
    except ValueError:
        # ``path`` is outside the project, so no project-relative override applies.
        return False

    return relative in include_paths
def is_force_included(path: Path, project_path: Path, include_paths: set) -> bool:
    """Return True when ``path`` is touched by an explicit include override.

    A path counts as force-included when it is exactly an include path, lies
    underneath one, or is an ancestor directory of one. The ancestor case lets
    a directory walk descend towards an included file.

    Args:
        path: Path to test.
        project_path: Project root that ``include_paths`` are relative to.
        include_paths: Normalized project-relative POSIX path strings.

    Returns:
        ``False`` when there are no overrides, when ``path`` is outside
        ``project_path``, or when no override relates to it.
    """
    if not include_paths:
        return False

    try:
        relative = path.relative_to(project_path).as_posix().strip("/")
    except ValueError:
        return False

    if not relative:
        return False

    for include_path in include_paths:
        if relative == include_path:
            return True
        # ``path`` sits inside an included directory. The trailing slash stops
        # "docsx" from matching an include of "docs".
        if relative.startswith(f"{include_path}/"):
            return True
        # ``path`` is an ancestor directory of an included path.
        if include_path.startswith(f"{relative}/"):
            return True

    return False
# ============================================================================= # =============================================================================
# CONFIG # CONFIG
# ============================================================================= # =============================================================================
@@ -401,36 +496,58 @@ def process_file(
# ============================================================================= # =============================================================================
def scan_project(project_dir: str) -> list: def scan_project(
project_dir: str,
respect_gitignore: bool = True,
include_ignored: list = None,
) -> list:
"""Return list of all readable file paths.""" """Return list of all readable file paths."""
project_path = Path(project_dir).expanduser().resolve() project_path = Path(project_dir).expanduser().resolve()
gitignore_matcher = GitignoreMatcher.from_project(project_path)
files = [] files = []
active_matchers = []
matcher_cache = {}
include_paths = normalize_include_paths(include_ignored)
for root, dirs, filenames in os.walk(project_path): for root, dirs, filenames in os.walk(project_path):
root_path = Path(root) root_path = Path(root)
dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
if not gitignore_matcher.has_negations: if respect_gitignore:
active_matchers = [
matcher
for matcher in active_matchers
if root_path == matcher.base_dir or matcher.base_dir in root_path.parents
]
current_matcher = load_gitignore_matcher(root_path, matcher_cache)
if current_matcher is not None:
active_matchers.append(current_matcher)
dirs[:] = [
d
for d in dirs
if is_force_included(root_path / d, project_path, include_paths)
or not should_skip_dir(d)
]
if respect_gitignore and active_matchers:
dirs[:] = [ dirs[:] = [
d d
for d in dirs for d in dirs
if not gitignore_matcher.matches(root_path / d, project_path, is_dir=True) if is_force_included(root_path / d, project_path, include_paths)
or not is_gitignored(root_path / d, active_matchers, is_dir=True)
] ]
for filename in filenames: for filename in filenames:
filepath = root_path / filename filepath = root_path / filename
if gitignore_matcher.matches(filepath, project_path, is_dir=False): force_include = is_force_included(filepath, project_path, include_paths)
exact_force_include = is_exact_force_include(filepath, project_path, include_paths)
if not force_include and filename in SKIP_FILENAMES:
continue continue
if filepath.suffix.lower() in READABLE_EXTENSIONS: if filepath.suffix.lower() not in READABLE_EXTENSIONS and not exact_force_include:
# Skip config files continue
if filename in ( if respect_gitignore and active_matchers and not force_include:
"mempalace.yaml", if is_gitignored(filepath, active_matchers, is_dir=False):
"mempalace.yml",
"mempal.yaml",
"mempal.yml",
".gitignore",
"package-lock.json",
):
continue continue
files.append(filepath) files.append(filepath)
return files return files
@@ -446,6 +563,8 @@ def mine(
agent: str = "mempalace", agent: str = "mempalace",
limit: int = 0, limit: int = 0,
dry_run: bool = False, dry_run: bool = False,
respect_gitignore: bool = True,
include_ignored: list = None,
): ):
"""Mine a project directory into the palace.""" """Mine a project directory into the palace."""
@@ -455,7 +574,11 @@ def mine(
wing = wing_override or config["wing"] wing = wing_override or config["wing"]
rooms = config.get("rooms", [{"name": "general", "description": "All project files"}]) rooms = config.get("rooms", [{"name": "general", "description": "All project files"}])
files = scan_project(project_dir) files = scan_project(
project_dir,
respect_gitignore=respect_gitignore,
include_ignored=include_ignored,
)
if limit > 0: if limit > 0:
files = files[:limit] files = files[:limit]
@@ -468,6 +591,10 @@ def mine(
print(f" Palace: {palace_path}") print(f" Palace: {palace_path}")
if dry_run: if dry_run:
print(" DRY RUN — nothing will be filed") print(" DRY RUN — nothing will be filed")
if not respect_gitignore:
print(" .gitignore: DISABLED")
if include_ignored:
print(f" Include: {', '.join(sorted(normalize_include_paths(include_ignored)))}")
print(f"{'' * 55}\n") print(f"{'' * 55}\n")
if not dry_run: if not dry_run:
+177 -51
View File
@@ -1,82 +1,208 @@
import os import os
import tempfile
import shutil import shutil
import yaml import tempfile
import chromadb
from pathlib import Path from pathlib import Path
import chromadb
import yaml
from mempalace.miner import mine, scan_project from mempalace.miner import mine, scan_project
def write_file(path: Path, content: str):
    """Write ``content`` to ``path`` as UTF-8, creating parent directories as needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")
def scanned_files(project_root: Path, **kwargs):
    """Run ``scan_project`` and return its results as sorted project-relative POSIX strings.

    Extra keyword arguments are forwarded to ``scan_project`` unchanged.
    """
    files = scan_project(str(project_root), **kwargs)
    return sorted(path.relative_to(project_root).as_posix() for path in files)
def test_project_mining(): def test_project_mining():
tmpdir = tempfile.mkdtemp() tmpdir = tempfile.mkdtemp()
# Create a mini project try:
os.makedirs(os.path.join(tmpdir, "backend")) project_root = Path(tmpdir).resolve()
with open(os.path.join(tmpdir, "backend", "app.py"), "w") as f: os.makedirs(project_root / "backend")
f.write("def main():\n print('hello world')\n" * 20)
# Create config write_file(
with open(os.path.join(tmpdir, "mempalace.yaml"), "w") as f: project_root / "backend" / "app.py", "def main():\n print('hello world')\n" * 20
yaml.dump(
{
"wing": "test_project",
"rooms": [
{"name": "backend", "description": "Backend code"},
{"name": "general", "description": "General"},
],
},
f,
) )
with open(project_root / "mempalace.yaml", "w") as f:
yaml.dump(
{
"wing": "test_project",
"rooms": [
{"name": "backend", "description": "Backend code"},
{"name": "general", "description": "General"},
],
},
f,
)
palace_path = os.path.join(tmpdir, "palace") palace_path = project_root / "palace"
mine(tmpdir, palace_path) mine(str(project_root), str(palace_path))
# Verify client = chromadb.PersistentClient(path=str(palace_path))
client = chromadb.PersistentClient(path=palace_path) col = client.get_collection("mempalace_drawers")
col = client.get_collection("mempalace_drawers") assert col.count() > 0
assert col.count() > 0 finally:
shutil.rmtree(tmpdir)
shutil.rmtree(tmpdir)
def test_scan_project_respects_gitignore(): def test_scan_project_respects_gitignore():
tmpdir = tempfile.mkdtemp() tmpdir = tempfile.mkdtemp()
try: try:
project_root = Path(tmpdir).resolve() project_root = Path(tmpdir).resolve()
os.makedirs(project_root / "src")
os.makedirs(project_root / "generated")
(project_root / ".gitignore").write_text("ignored.py\ngenerated/\n", encoding="utf-8") write_file(project_root / ".gitignore", "ignored.py\ngenerated/\n")
(project_root / "src" / "app.py").write_text("print('hello')\n" * 20, encoding="utf-8") write_file(project_root / "src" / "app.py", "print('hello')\n" * 20)
(project_root / "ignored.py").write_text("print('ignore me')\n" * 20, encoding="utf-8") write_file(project_root / "ignored.py", "print('ignore me')\n" * 20)
(project_root / "generated" / "artifact.py").write_text( write_file(project_root / "generated" / "artifact.py", "print('artifact')\n" * 20)
"print('ignore this dir')\n" * 20,
encoding="utf-8",
)
files = scan_project(str(project_root)) assert scanned_files(project_root) == ["src/app.py"]
relative_files = sorted(path.relative_to(project_root).as_posix() for path in files)
assert relative_files == ["src/app.py"]
finally: finally:
shutil.rmtree(tmpdir) shutil.rmtree(tmpdir)
def test_scan_project_handles_gitignore_negation(): def test_scan_project_respects_nested_gitignore():
tmpdir = tempfile.mkdtemp() tmpdir = tempfile.mkdtemp()
try: try:
project_root = Path(tmpdir).resolve() project_root = Path(tmpdir).resolve()
os.makedirs(project_root / "generated")
(project_root / ".gitignore").write_text( write_file(project_root / ".gitignore", "*.log\n")
"generated/\n!generated/keep.py\n", write_file(project_root / "subrepo" / ".gitignore", "tasks/\n")
encoding="utf-8", write_file(project_root / "subrepo" / "src" / "main.py", "print('main')\n" * 20)
) write_file(project_root / "subrepo" / "tasks" / "task.py", "print('task')\n" * 20)
(project_root / "generated" / "drop.py").write_text("print('drop')\n" * 20, encoding="utf-8") write_file(project_root / "subrepo" / "debug.log", "debug\n" * 20)
(project_root / "generated" / "keep.py").write_text("print('keep')\n" * 20, encoding="utf-8")
files = scan_project(str(project_root)) assert scanned_files(project_root) == ["subrepo/src/main.py"]
relative_files = sorted(path.relative_to(project_root).as_posix() for path in files) finally:
shutil.rmtree(tmpdir)
assert relative_files == ["generated/keep.py"]
def test_scan_project_allows_nested_gitignore_override():
    """A nested ``!keep.csv`` re-includes a file that the root ``*.csv`` rule ignores."""
    tmpdir = tempfile.mkdtemp()
    try:
        project_root = Path(tmpdir).resolve()

        write_file(project_root / ".gitignore", "*.csv\n")
        write_file(project_root / "subrepo" / ".gitignore", "!keep.csv\n")
        write_file(project_root / "drop.csv", "a,b,c\n" * 20)
        write_file(project_root / "subrepo" / "keep.csv", "a,b,c\n" * 20)

        assert scanned_files(project_root) == ["subrepo/keep.csv"]
    finally:
        shutil.rmtree(tmpdir)
def test_scan_project_allows_gitignore_negation_when_parent_dir_is_visible():
    """``generated/*`` ignores the contents but not the directory, so ``!generated/keep.py`` applies."""
    tmpdir = tempfile.mkdtemp()
    try:
        project_root = Path(tmpdir).resolve()

        write_file(project_root / ".gitignore", "generated/*\n!generated/keep.py\n")
        write_file(project_root / "generated" / "drop.py", "print('drop')\n" * 20)
        write_file(project_root / "generated" / "keep.py", "print('keep')\n" * 20)

        assert scanned_files(project_root) == ["generated/keep.py"]
    finally:
        shutil.rmtree(tmpdir)
def test_scan_project_does_not_reinclude_file_from_ignored_directory():
    """When the directory itself is ignored (``generated/``), a negation inside it has no effect."""
    tmpdir = tempfile.mkdtemp()
    try:
        project_root = Path(tmpdir).resolve()

        write_file(project_root / ".gitignore", "generated/\n!generated/keep.py\n")
        write_file(project_root / "generated" / "drop.py", "print('drop')\n" * 20)
        write_file(project_root / "generated" / "keep.py", "print('keep')\n" * 20)

        assert scanned_files(project_root) == []
    finally:
        shutil.rmtree(tmpdir)
def test_scan_project_can_disable_gitignore():
    """With ``respect_gitignore=False`` a path matched by .gitignore is still returned."""
    tmpdir = tempfile.mkdtemp()
    try:
        project_root = Path(tmpdir).resolve()

        write_file(project_root / ".gitignore", "data/\n")
        write_file(project_root / "data" / "stuff.csv", "a,b,c\n" * 20)

        assert scanned_files(project_root, respect_gitignore=False) == ["data/stuff.csv"]
    finally:
        shutil.rmtree(tmpdir)
def test_scan_project_can_include_ignored_directory():
    """An ``include_ignored`` directory is scanned even though .gitignore excludes it."""
    tmpdir = tempfile.mkdtemp()
    try:
        project_root = Path(tmpdir).resolve()

        write_file(project_root / ".gitignore", "docs/\n")
        write_file(project_root / "docs" / "guide.md", "# Guide\n" * 20)

        assert scanned_files(project_root, include_ignored=["docs"]) == ["docs/guide.md"]
    finally:
        shutil.rmtree(tmpdir)
def test_scan_project_can_include_specific_ignored_file():
    """Including one file inside an ignored directory returns that file and not its siblings."""
    tmpdir = tempfile.mkdtemp()
    try:
        project_root = Path(tmpdir).resolve()

        write_file(project_root / ".gitignore", "generated/\n")
        write_file(project_root / "generated" / "drop.py", "print('drop')\n" * 20)
        write_file(project_root / "generated" / "keep.py", "print('keep')\n" * 20)

        assert scanned_files(project_root, include_ignored=["generated/keep.py"]) == [
            "generated/keep.py"
        ]
    finally:
        shutil.rmtree(tmpdir)
def test_scan_project_can_include_exact_file_without_known_extension():
    """An exactly-named include is returned even when the file has no extension (``README``)."""
    tmpdir = tempfile.mkdtemp()
    try:
        project_root = Path(tmpdir).resolve()

        write_file(project_root / ".gitignore", "README\n")
        write_file(project_root / "README", "hello\n" * 20)

        assert scanned_files(project_root, include_ignored=["README"]) == ["README"]
    finally:
        shutil.rmtree(tmpdir)
def test_scan_project_include_override_beats_skip_dirs():
    """An explicit include of ``.pytest_cache`` is scanned even though that name is normally skipped."""
    tmpdir = tempfile.mkdtemp()
    try:
        project_root = Path(tmpdir).resolve()

        write_file(project_root / ".pytest_cache" / "cache.py", "print('cache')\n" * 20)

        assert scanned_files(
            project_root,
            respect_gitignore=False,
            include_ignored=[".pytest_cache"],
        ) == [".pytest_cache/cache.py"]
    finally:
        shutil.rmtree(tmpdir)
def test_scan_project_skip_dirs_still_apply_without_override():
    """Without an include override, ``.pytest_cache`` is skipped even when gitignore handling is off."""
    tmpdir = tempfile.mkdtemp()
    try:
        project_root = Path(tmpdir).resolve()

        write_file(project_root / ".pytest_cache" / "cache.py", "print('cache')\n" * 20)
        write_file(project_root / "main.py", "print('main')\n" * 20)

        assert scanned_files(project_root, respect_gitignore=False) == ["main.py"]
    finally:
        shutil.rmtree(tmpdir)