Merge pull request #1004 from coogie/coogie/fix/miner-routing
Tests / test-linux (3.11) (push) Successful in 13m29s
Tests / test-linux (3.13) (push) Successful in 13m6s
Tests / test-linux (3.9) (push) Successful in 13m14s
Tests / lint (push) Successful in 9m24s
Tests / test-windows (push) Has been cancelled
Tests / test-macos (push) Has been cancelled
Tests / test-linux (3.11) (push) Successful in 13m29s
Tests / test-linux (3.13) (push) Successful in 13m6s
Tests / test-linux (3.9) (push) Successful in 13m14s
Tests / lint (push) Successful in 9m24s
Tests / test-windows (push) Has been cancelled
Tests / test-macos (push) Has been cancelled
fix(miner): use token-boundary matching in detect_room
This commit is contained in:
+98
-1
@@ -7,7 +7,7 @@ from pathlib import Path
|
||||
import chromadb
|
||||
import yaml
|
||||
|
||||
from mempalace.miner import load_config, mine, scan_project, status
|
||||
from mempalace.miner import detect_room, load_config, mine, scan_project, status
|
||||
from mempalace.palace import NORMALIZE_VERSION, file_already_mined
|
||||
|
||||
|
||||
@@ -491,6 +491,103 @@ def test_file_already_mined_returns_false_for_stale_normalize_version():
|
||||
shutil.rmtree(tmpdir, ignore_errors=True)
|
||||
|
||||
|
||||
def test_detect_room_uses_token_boundary_matching(tmp_path):
    """Directory routing must match whole tokens, never incidental substrings.

    Regression: "views" is a substring of "interviews", so the old
    substring check routed every file under views/ into a room keyed
    by "interviews". With token-boundary matching that accident no longer
    fires, while a directory literally named "interviews" still routes
    to the "interviews" room.
    """
    room_defs = [
        {"name": "billing-page", "keywords": ["billing-page"]},
        {"name": "interviews", "keywords": ["interviews"]},
        {"name": "general", "keywords": []},
    ]

    # A file below views/ must be claimed by its real directory token
    # ("billing-page"), not by "interviews" through the shared substring.
    under_views = tmp_path / "views" / "billing-page" / "Foo.test.tsx"
    under_views.parent.mkdir(parents=True)
    under_views.write_text("content")
    routed_view = detect_room(under_views, "content", room_defs, tmp_path)
    assert routed_view == "billing-page"

    # A file below data/interviews/ has "interviews" as a genuine path token.
    under_interviews = tmp_path / "data" / "interviews" / "index.ts"
    under_interviews.parent.mkdir(parents=True)
    under_interviews.write_text("content")
    routed_data = detect_room(under_interviews, "content", room_defs, tmp_path)
    assert routed_data == "interviews"
|
||||
|
||||
|
||||
def test_detect_room_preserves_token_matches(tmp_path):
    """Separator-bounded tokens keep matching in both directions."""
    # Forward direction: the directory "frontend-app" carries the keyword
    # "frontend" as one of its tokens.
    frontend_rooms = [
        {"name": "frontend", "keywords": ["frontend"]},
        {"name": "general", "keywords": []},
    ]
    app_entry = tmp_path / "frontend-app" / "main.ts"
    app_entry.parent.mkdir(parents=True)
    app_entry.write_text("x")
    assert detect_room(app_entry, "x", frontend_rooms, tmp_path) == "frontend"

    # Reverse direction: the keyword "data-retention" carries the path part
    # "data" as one of its tokens.
    retention_rooms = [
        {"name": "data-retention", "keywords": ["data-retention"]},
        {"name": "general", "keywords": []},
    ]
    policy_file = tmp_path / "data" / "data-retention" / "policy.ts"
    policy_file.parent.mkdir(parents=True)
    policy_file.write_text("x")
    assert detect_room(policy_file, "x", retention_rooms, tmp_path) == "data-retention"
|
||||
|
||||
|
||||
def test_detect_room_matches_keyword_distinct_from_name(tmp_path):
    """Regression: PR #145 — a path part must be matched against a room's
    keywords even when the room name does not contain that path part as
    a token.

    Scenario: a ``docs/`` folder should land in the room named
    ``documentation``, which lists ``"docs"`` among its keywords.
    """
    room_defs = [
        {"name": "documentation", "keywords": ["docs"]},
        {"name": "general", "keywords": []},
    ]

    readme = tmp_path / "docs" / "readme.md"
    readme.parent.mkdir(parents=True)
    readme.write_text("x")

    routed = detect_room(readme, "x", room_defs, tmp_path)
    assert routed == "documentation"
|
||||
|
||||
|
||||
def test_detect_room_filename_match_uses_token_boundary(tmp_path):
    """Priority 2 (filename match) must follow token-boundary rules as well."""
    review_rooms = [
        {"name": "review", "keywords": []},
        {"name": "general", "keywords": []},
    ]

    # "reviewmodule" merely contains "review" as a substring; with no
    # separator there is no "review" token, so the room must not be chosen.
    fused_name = tmp_path / "reviewmodule.ts"
    fused_name.write_text("x")
    assert detect_room(fused_name, "x", review_rooms, tmp_path) != "review"

    # "review-page" does have "review" as a separator-bounded token.
    hyphenated_name = tmp_path / "review-page.ts"
    hyphenated_name.write_text("x")
    assert detect_room(hyphenated_name, "x", review_rooms, tmp_path) == "review"

    # Dotted stems such as "foo.test" are expected to split on "." too.
    foo_rooms = [{"name": "foo", "keywords": []}, {"name": "general", "keywords": []}]
    dotted_name = tmp_path / "foo.test.ts"
    dotted_name.write_text("x")
    assert detect_room(dotted_name, "x", foo_rooms, tmp_path) == "foo"
|
||||
|
||||
|
||||
def test_add_drawer_stamps_normalize_version(tmp_path):
|
||||
"""Fresh drawers carry the current schema version so future upgrades work."""
|
||||
from mempalace.miner import add_drawer
|
||||
|
||||
Reference in New Issue
Block a user