fix(miner): use token-boundary matching in detect_room

Substring checks in path/filename routing caused systemic misrouting in large monorepos — e.g., "views" ⊂ "interviews" sent every file under views/ to the interviews room. Switch to separator-bounded token matching (-, _, ., /) via a _name_matches helper, applied to priority 1 (path parts) and priority 2 (filename).
2026-04-18 18:50:21 +01:00
parent ba30ab6951
commit ead2c5d299
2 changed files with 123 additions and 3 deletions
@@ -8,6 +8,7 @@ Stores verbatim chunks as drawers. No summaries. Ever.
 """
 import os
 import re
 import sys
 import shlex
 import hashlib
@@ -332,6 +333,28 @@ def load_config(project_dir: str) -> dict:
 # FILE ROUTING — which room does this file belong to?
 # =============================================================================
 _TOKEN_SPLIT = re.compile(r"[-_./]+")
 def _tokens(value: str) -> set:
    """Return the set of lowercase tokens contained in ``value``.

    ``value`` is lowercased and then cut at every run of ``-``, ``_``,
    ``.`` or ``/``. Empty pieces (produced by leading/trailing separators
    or by an empty input) are discarded.
    """
    pieces = _TOKEN_SPLIT.split(value.lower())
    # filter(None, ...) drops the empty strings re.split leaves at the edges.
    return set(filter(None, pieces))
 def _name_matches(a: str, b: str) -> bool:
    """Tell whether ``a`` and ``b`` name the same thing, ignoring case.

    The two names match when they are equal, or when either one appears
    as a whole separator-bounded token of the other (tokenization is
    delegated to ``_tokens``).

    A bare substring is not enough: ``"views"`` does not match
    ``"interviews"``, whereas ``"frontend"`` does match ``"frontend-app"``.
    """
    left, right = a.lower(), b.lower()
    if left == right:
        return True
    # Check both directions: either name may be the longer, compound one.
    if right in _tokens(left):
        return True
    return left in _tokens(right)
 def detect_room(filepath: Path, content: str, rooms: list, project_path: Path) -> str:
    """
@@ -351,12 +374,12 @@ def detect_room(filepath: Path, content: str, rooms: list, project_path: Path) -
    for part in path_parts[:-1]:  # skip filename itself
        for room in rooms:
            candidates = [room["name"].lower()] + [k.lower() for k in room.get("keywords", [])]
-            if any(part == c or c in part or part in c for c in candidates):
+            if any(_name_matches(part, c) for c in candidates):
                return room["name"]
    # Priority 2: filename matches room name
    for room in rooms:
-        if room["name"].lower() in filename or filename in room["name"].lower():
+        if _name_matches(filename, room["name"]):
            return room["name"]
    # Priority 3: keyword scoring from room keywords + name
@@ -7,7 +7,7 @@ from pathlib import Path
 import chromadb
 import yaml
-from mempalace.miner import load_config, mine, scan_project, status
+from mempalace.miner import detect_room, load_config, mine, scan_project, status
 from mempalace.palace import NORMALIZE_VERSION, file_already_mined
@@ -491,6 +491,103 @@ def test_file_already_mined_returns_false_for_stale_normalize_version():
        shutil.rmtree(tmpdir, ignore_errors=True)
 def test_detect_room_uses_token_boundary_matching(tmp_path):
    """A directory that is only a substring of a room keyword must not match.

    Regression: the old substring check saw "views" inside "interviews"
    and routed every file under views/ into the room keyed by
    "interviews". Token-boundary matching must stop that accident while a
    directory literally named "interviews" still routes to that room.
    """
    def make_file(*parts):
        # Create the file, plus its missing parent directories, under tmp_path.
        target = tmp_path.joinpath(*parts)
        target.parent.mkdir(parents=True)
        target.write_text("content")
        return target
    room_defs = [
        {"name": name, "keywords": keywords}
        for name, keywords in (
            ("billing-page", ["billing-page"]),
            ("interviews", ["interviews"]),
            ("general", []),
        )
    ]
    # "views" must not drag the file into "interviews"; expected room is "billing-page".
    in_views = make_file("views", "billing-page", "Foo.test.tsx")
    assert detect_room(in_views, "content", room_defs, tmp_path) == "billing-page"
    # A real "interviews" directory is a genuine token match.
    in_interviews = make_file("data", "interviews", "index.ts")
    assert detect_room(in_interviews, "content", room_defs, tmp_path) == "interviews"
 def test_detect_room_preserves_token_matches(tmp_path):
    """Genuine separator-bounded tokens keep matching, whichever side holds them."""
    def rooms_for(name):
        # One room keyed by its own name, followed by the catch-all "general" room.
        return [
            {"name": name, "keywords": [name]},
            {"name": "general", "keywords": []},
        ]
    def make_file(*parts):
        # Create the file, plus its missing parent directories, under tmp_path.
        target = tmp_path.joinpath(*parts)
        target.parent.mkdir(parents=True)
        target.write_text("x")
        return target
    # Forward: the directory "frontend-app" carries the keyword "frontend" as a token.
    forward_file = make_file("frontend-app", "main.ts")
    assert detect_room(forward_file, "x", rooms_for("frontend"), tmp_path) == "frontend"
    # Reverse: the directory "data" is itself a token of the keyword "data-retention".
    reverse_file = make_file("data", "data-retention", "policy.ts")
    assert (
        detect_room(reverse_file, "x", rooms_for("data-retention"), tmp_path)
        == "data-retention"
    )
 def test_detect_room_matches_keyword_distinct_from_name(tmp_path):
    """A path part must match a room keyword even if the room name does not.

    Regression for PR #145: the folder ``docs/`` has to route to the room
    named ``documentation`` because that room declares ``"docs"`` as a
    keyword, although ``"docs"`` is not a token of the room name itself.
    """
    room_defs = [
        {"name": "documentation", "keywords": ["docs"]},
        {"name": "general", "keywords": []},
    ]
    docs_dir = tmp_path / "docs"
    docs_dir.mkdir(parents=True)
    readme = docs_dir / "readme.md"
    readme.write_text("x")
    routed = detect_room(readme, "x", room_defs, tmp_path)
    assert routed == "documentation"
 def test_detect_room_filename_match_uses_token_boundary(tmp_path):
    """Filename routing (priority 2) follows the same token-boundary rules."""
    def rooms_named(name):
        # A keyword-less room with the given name, plus the catch-all "general" room.
        return [
            {"name": name, "keywords": []},
            {"name": "general", "keywords": []},
        ]
    def make_file(filename):
        # Files live directly in tmp_path, so no directory needs creating.
        target = tmp_path / filename
        target.write_text("x")
        return target
    # "reviewmodule" merely contains "review" as a substring, so it must NOT match.
    glued = make_file("reviewmodule.ts")
    assert detect_room(glued, "x", rooms_named("review"), tmp_path) != "review"
    # In "review-page", "review" is a real token, so it must match.
    hyphenated = make_file("review-page.ts")
    assert detect_room(hyphenated, "x", rooms_named("review"), tmp_path) == "review"
    # "." is a separator too, so "foo.test.ts" must route to the room "foo".
    dotted = make_file("foo.test.ts")
    assert detect_room(dotted, "x", rooms_named("foo"), tmp_path) == "foo"
 def test_add_drawer_stamps_normalize_version(tmp_path):
    """Fresh drawers carry the current schema version so future upgrades work."""
    from mempalace.miner import add_drawer