fix(init): case-insensitive project dedup across manifest and convo sources

`discover_entities` was deduping the convo_scanner results against the
manifest/git scan with a case-sensitive key, while every other dedup
path in the pipeline (`_merge_detected`, `miner.add_to_known_entities`)
uses case-insensitive matching. A project named `foo` in a manifest
plus `Foo` as a Claude Code `cwd` variant would surface as two review
entries instead of collapsing to one.

The fix keys `by_name` by `name.lower()` while each stored entry keeps its
own casing (the first-seen one, unless a convo entry with more
`user_commits` replaces it), matching the rest of the pipeline. Flagged by
Copilot on #1175.

Regression test asserts a manifest project + a CamelCase-variant convo
cwd for the same real project collapse to one entry.
This commit is contained in:
Igor Lins e Silva
2026-04-24 14:11:54 -03:00
parent 19ce58c143
commit 55c83e9f3d
2 changed files with 38 additions and 5 deletions
+10 -5
View File
@@ -627,13 +627,18 @@ def discover_entities(
root_path = Path(project_dir).expanduser().resolve()
if is_claude_projects_root(root_path):
convo_projects = scan_claude_projects(root_path)
# Dedup by name against the git-manifest list, preferring entries with
# more user_commits as signal strength.
by_name: dict[str, ProjectInfo] = {p.name: p for p in projects}
# Dedup by name against the git-manifest list, preferring entries
# with more user_commits as signal strength. Keyed case-insensitively
# so a `pyproject.toml` name like `mempalace` and a Claude Code
# `cwd` variant like `MemPalace` collapse into one entry — matches
# the case-insensitive dedup used in `_merge_detected` and
# `miner.add_to_known_entities`.
by_name: dict[str, ProjectInfo] = {p.name.lower(): p for p in projects}
for cp in convo_projects:
existing = by_name.get(cp.name)
key = cp.name.lower()
existing = by_name.get(key)
if existing is None or cp.user_commits > existing.user_commits:
by_name[cp.name] = cp
by_name[key] = cp
projects = sorted(
by_name.values(),
key=lambda p: (not p.is_mine, -p.user_commits, -p.total_commits, p.name),
+28
View File
@@ -524,6 +524,34 @@ def test_discover_entities_keeps_llm_only_project_uncertain_when_real_signal(tmp
assert "Terraform" in [e["name"] for e in d["uncertain"]]
def test_discover_entities_collapses_case_variants_between_manifest_and_convo(tmp_path):
    """Case-only name variants from the two project sources yield one entry.

    Builds a single directory that carries both a git+manifest project named
    ``myproj`` and a Claude Code session record whose ``cwd`` ends in
    ``MyProj``, then checks that ``discover_entities`` reports exactly one
    project for the pair.
    """
    projects_root = tmp_path / "projects_root"
    projects_root.mkdir()

    # Source 1: a git repository whose `package.json` declares the
    # lowercase name `myproj`.
    encoded_dir = projects_root / "-home-u-src-myproj"
    encoded_dir.mkdir()
    manifest_text = json.dumps({"name": "myproj"})
    (encoded_dir / "package.json").write_text(manifest_text)
    _init_git_repo(encoded_dir)

    # Source 2: the same directory also holds a session `.jsonl` file, so the
    # root is intended to look like a Claude Code `.claude/projects/` dir.
    # The record's `cwd` is the CamelCase spelling of the same project path.
    session_record = json.dumps({"type": "user", "cwd": "/home/u/src/MyProj"})
    (encoded_dir / "abc.jsonl").write_text(session_record + "\n")

    discovered = discover_entities(str(projects_root))

    # Exactly one project must remain. Names are compared lowercased so the
    # check does not depend on which spelling survives the dedup.
    lowered_names = [entry["name"].lower() for entry in discovered["projects"]]
    assert lowered_names == ["myproj"]
# ── _UnionFind basics ──────────────────────────────────────────────────