From 55c83e9f3d2c3b1e1a8cfa25243fac9c9e20a5d9 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Fri, 24 Apr 2026 14:11:54 -0300 Subject: [PATCH] fix(init): case-insensitive project dedup across manifest and convo sources `discover_entities` was deduping the convo_scanner results against the manifest/git scan with a case-sensitive key, while every other dedup path in the pipeline (`_merge_detected`, `miner.add_to_known_entities`) uses case-insensitive matching. A project named `foo` in a manifest plus `Foo` as a Claude Code `cwd` variant would surface as two review entries instead of collapsing to one. The fix keys `by_name` by `name.lower()` and leaves each entry's own `name` casing untouched: the first-seen (manifest) casing survives unless a convo entry with more `user_commits` replaces it. This matches the rest of the pipeline. Flagged by Copilot on #1175. Regression test asserts a manifest project + a CamelCase-variant convo cwd for the same real project collapse to one entry. --- mempalace/project_scanner.py | 15 ++++++++++----- tests/test_project_scanner.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/mempalace/project_scanner.py b/mempalace/project_scanner.py index bbdb6f4..741a3e2 100644 --- a/mempalace/project_scanner.py +++ b/mempalace/project_scanner.py @@ -627,13 +627,18 @@ def discover_entities( root_path = Path(project_dir).expanduser().resolve() if is_claude_projects_root(root_path): convo_projects = scan_claude_projects(root_path) - # Dedup by name against the git-manifest list, preferring entries with - # more user_commits as signal strength. - by_name: dict[str, ProjectInfo] = {p.name: p for p in projects} + # Dedup by name against the git-manifest list, preferring entries + # with more user_commits as signal strength. Keyed case-insensitively + # so a `pyproject.toml` name like `mempalace` and a Claude Code + # `cwd` variant like `MemPalace` collapse into one entry — matches + # the case-insensitive dedup used in `_merge_detected` and + # `miner.add_to_known_entities`. 
+ by_name: dict[str, ProjectInfo] = {p.name.lower(): p for p in projects} for cp in convo_projects: - existing = by_name.get(cp.name) + key = cp.name.lower() + existing = by_name.get(key) if existing is None or cp.user_commits > existing.user_commits: - by_name[cp.name] = cp + by_name[key] = cp projects = sorted( by_name.values(), key=lambda p: (not p.is_mine, -p.user_commits, -p.total_commits, p.name), diff --git a/tests/test_project_scanner.py b/tests/test_project_scanner.py index 2dc939a..49126b4 100644 --- a/tests/test_project_scanner.py +++ b/tests/test_project_scanner.py @@ -524,6 +524,34 @@ def test_discover_entities_keeps_llm_only_project_uncertain_when_real_signal(tmp assert "Terraform" in [e["name"] for e in d["uncertain"]] +def test_discover_entities_collapses_case_variants_between_manifest_and_convo(tmp_path): + """A project named `myproj` in a manifest and `MyProj` as a Claude Code + cwd must collapse into one entry. Matches the case-insensitive dedup + used by `_merge_detected` and `miner.add_to_known_entities`.""" + root = tmp_path / "projects_root" + root.mkdir() + + # Entry 1: a git+manifest project named lowercase `myproj` + repo = root / "-home-u-src-myproj" + repo.mkdir() + (repo / "package.json").write_text(json.dumps({"name": "myproj"})) + _init_git_repo(repo) + + # Entry 2: same root ALSO looks like a Claude Code `.claude/projects/` dir; + # the convo_scanner inside will resolve `cwd` to `/home/u/src/MyProj` + # (CamelCase variant of the same project). + session = repo / "abc.jsonl" + session.write_text(json.dumps({"type": "user", "cwd": "/home/u/src/MyProj"}) + "\n") + + d = discover_entities(str(root)) + + project_names = [e["name"] for e in d["projects"]] + # One entry, not two. The manifest casing ("myproj") is seeded first, but a + # convo entry with more user_commits would replace it, so compare lowercased. + assert len(project_names) == 1 + assert project_names[0].lower() == "myproj" + + # ── _UnionFind basics ──────────────────────────────────────────────────