feat(graph): cross-wing tunnels by shared topics (#1180)

When two wings have at least `topic_tunnel_min_count` (default 1)
confirmed TOPIC labels in common, the miner now creates a symmetric
tunnel between them for each shared topic at mine time, so the palace
graph reflects shared themes (frameworks, vendors, recurring concepts).

- llm_refine: TOPIC label routes to a dedicated `topics` bucket so the
  signal survives confirmation instead of getting collapsed into
  `uncertain` and dropped.
- entity_detector / project_scanner: bucket plumbed through the
  detection pipeline; `confirm_entities` returns confirmed topics
  alongside people/projects.
- miner.add_to_known_entities: optional `wing` parameter records the
  confirmed topics under `topics_by_wing` in
  `~/.mempalace/known_entities.json`. Wing names do NOT leak into the
  flat known-name set used by drawer-tagging.
- palace_graph: `compute_topic_tunnels` and `topic_tunnels_for_wing`
  create symmetric tunnels via the existing `create_tunnel` API so they
  share dedup and persistence with explicit tunnels.
- miner.mine: post-file-loop pass calls `topic_tunnels_for_wing` for
  the freshly-mined wing. Failures are logged but never abort the mine.
- config: `topic_tunnel_min_count` knob (env
  `MEMPALACE_TOPIC_TUNNEL_MIN_COUNT` or `~/.mempalace/config.json`),
  default 1.

Tests cover topic persistence through init->mine, tunnel creation when
wings share a topic, no tunnel below threshold, cross-wing tunnel
retrieval via `list_tunnels`, dedup on recompute, case-insensitive
overlap, and the end-to-end mine-time wiring.

Out of scope for this PR (called out in the PR body): manifest-
dependency overlap, per-topic allow/deny lists, search-result surfacing.
This commit is contained in:
Igor Lins e Silva
2026-04-24 19:19:58 -03:00
parent ed2ba726c9
commit fe051adc73
14 changed files with 678 additions and 28 deletions
+2 -2
View File
@@ -235,13 +235,13 @@ def test_detect_entities_empty_files(tmp_path):
f = tmp_path / "empty.txt"
f.write_text("")
result = detect_entities([f])
assert result == {"people": [], "projects": [], "uncertain": []}
assert result == {"people": [], "projects": [], "topics": [], "uncertain": []}
def test_detect_entities_handles_missing_file(tmp_path):
missing = tmp_path / "nonexistent.txt"
result = detect_entities([missing])
assert result == {"people": [], "projects": [], "uncertain": []}
assert result == {"people": [], "projects": [], "topics": [], "uncertain": []}
def test_detect_entities_respects_max_files(tmp_path):
+68
View File
@@ -206,3 +206,71 @@ def test_populated_registry_improves_miner_recall(temp_registry):
# All four registered entities should land in the metadata string
for expected in ("Julia Grib", "Kevin Heifner", "hyperion-history", "mempalace"):
assert expected in tagged, f"expected '{expected}' in metadata {tagged!r}"
# ── topics_by_wing — cross-wing tunnel signal source (issue #1180) ──
def test_topics_persisted_under_topics_by_wing(temp_registry):
    """Topics passed with a ``wing`` are written to both the flat
    ``topics`` list and the ``topics_by_wing`` map in the registry file."""
    detected = {"people": ["Alice"], "topics": ["Angular", "OpenAPI"]}
    miner.add_to_known_entities(detected, wing="wing_alpha")

    stored = json.loads(temp_registry.read_text())
    # Flat aggregate list (same style as the other registry categories).
    assert "Angular" in stored["topics"]
    # Per-wing record, which is what tunnel computation reads.
    assert stored["topics_by_wing"]["wing_alpha"] == ["Angular", "OpenAPI"]
def test_topics_by_wing_replaces_on_reinit(temp_registry):
    """A second call for the same wing overwrites that wing's topic list
    instead of accumulating stale topics."""
    first_run = ["Angular", "OpenAPI"]
    second_run = ["OpenAPI", "Postgres"]
    for topics in (first_run, second_run):
        miner.add_to_known_entities({"topics": list(topics)}, wing="wing_alpha")

    stored = json.loads(temp_registry.read_text())
    assert stored["topics_by_wing"]["wing_alpha"] == ["OpenAPI", "Postgres"]
def test_topics_by_wing_multiple_wings_coexist(temp_registry):
    """Recording topics for a second wing keeps the first wing's entry."""
    expected = {"wing_a": ["foo"], "wing_b": ["foo", "bar"]}
    for wing_name, topics in expected.items():
        # Pass a copy so the expectation cannot be altered by the call.
        miner.add_to_known_entities({"topics": list(topics)}, wing=wing_name)

    stored = json.loads(temp_registry.read_text())
    assert stored["topics_by_wing"] == {"wing_a": ["foo"], "wing_b": ["foo", "bar"]}
def test_topics_by_wing_skipped_without_wing(temp_registry):
    """Omitting ``wing`` still saves the flat ``topics`` list but writes
    no ``topics_by_wing`` key."""
    miner.add_to_known_entities({"topics": ["foo"]})

    stored = json.loads(temp_registry.read_text())
    # No wing was given, so the per-wing map must be absent entirely.
    assert "topics_by_wing" not in stored
    assert stored["topics"] == ["foo"]
def test_topics_by_wing_dedupes_case_insensitive(temp_registry):
    """Case variants of one topic collapse to a single per-wing entry."""
    case_variants = ["OpenAPI", "openapi", "OPENAPI"]
    miner.add_to_known_entities({"topics": case_variants}, wing="wing_a")

    stored = json.loads(temp_registry.read_text())
    # The surviving entry keeps the casing of the first name seen.
    assert stored["topics_by_wing"]["wing_a"] == ["OpenAPI"]
def test_get_topics_by_wing_reads_registry(temp_registry):
    """``get_topics_by_wing`` returns the per-wing topics recorded earlier."""
    recorded = (("wing_a", ["foo"]), ("wing_b", ["foo", "bar"]))
    for wing_name, topics in recorded:
        miner.add_to_known_entities({"topics": list(topics)}, wing=wing_name)

    topics_by_wing = miner.get_topics_by_wing()
    assert topics_by_wing == {"wing_a": ["foo"], "wing_b": ["foo", "bar"]}
def test_get_topics_by_wing_empty_when_missing(temp_registry):
    """With only people recorded, ``get_topics_by_wing`` returns ``{}``."""
    people_only = {"people": ["Alice"]}
    miner.add_to_known_entities(people_only)

    assert miner.get_topics_by_wing() == {}
def test_topics_by_wing_does_not_pollute_known_names(temp_registry):
    """Only topic strings may enter the flat known-names set used by
    ``_extract_entities_for_metadata``; the wing names that key
    ``topics_by_wing`` must stay out of it."""
    wing_name = "wing_super_secret_project"
    miner.add_to_known_entities({"topics": ["Angular"]}, wing=wing_name)

    known_names = miner._load_known_entities()
    assert "Angular" in known_names
    assert "wing_super_secret_project" not in known_names
+31 -3
View File
@@ -272,7 +272,9 @@ def test_apply_classifications_appends_reason_signal():
assert any("spoken of by name" in s for s in new["people"][0]["signals"])
def test_apply_classifications_topic_goes_to_uncertain():
def test_apply_classifications_topic_goes_to_topics_bucket():
"""TOPIC classifications now route to a dedicated ``topics`` bucket so the
miner can use them as cross-wing tunnel signal (issue #1180)."""
detected = {
"people": [],
"projects": [
@@ -289,8 +291,32 @@ def test_apply_classifications_topic_goes_to_uncertain():
decisions = {"Paris": ("TOPIC", "city, not a project")}
new, reclass, _ = _apply_classifications(detected, decisions)
assert len(new["projects"]) == 0
assert len(new["uncertain"]) == 0
assert len(new["topics"]) == 1
assert new["topics"][0]["name"] == "Paris"
assert new["topics"][0]["type"] == "topic"
assert reclass == 1
def test_apply_classifications_ambiguous_still_goes_to_uncertain():
detected = {
"people": [],
"projects": [
{
"name": "Foo",
"type": "project",
"confidence": 0.7,
"frequency": 5,
"signals": ["regex"],
}
],
"uncertain": [],
}
decisions = {"Foo": ("AMBIGUOUS", "context insufficient")}
new, reclass, _ = _apply_classifications(detected, decisions)
assert len(new["projects"]) == 0
assert len(new["uncertain"]) == 1
assert new["uncertain"][0]["name"] == "Paris"
assert new["uncertain"][0]["name"] == "Foo"
assert reclass == 1
@@ -469,7 +495,9 @@ def test_refine_entities_refines_high_confidence_regex_projects():
assert provider.call_count == 1
assert result.reclassified == 1
assert result.merged["projects"] == []
assert result.merged["uncertain"][0]["name"] == "OpenAPI"
# TOPIC labels go to the dedicated ``topics`` bucket so the miner can
# use them for cross-wing tunnel computation (issue #1180).
assert result.merged["topics"][0]["name"] == "OpenAPI"
def test_refine_entities_refines_regex_people_but_skips_git_people():
+101
View File
@@ -496,3 +496,104 @@ def test_add_drawer_stamps_normalize_version(tmp_path):
assert meta["normalize_version"] == NORMALIZE_VERSION
finally:
del col, client
def test_mine_creates_topic_tunnels_for_shared_topics(tmp_path, monkeypatch):
    """End-to-end check for issue #1180.

    Two wings already have overlapping confirmed topics in the registry;
    mining one of them must leave a cross-wing tunnel for the shared topic.
    """
    from mempalace import miner, palace_graph

    # Point the entity registry and the tunnel store at files under
    # tmp_path so the developer's real ~/.mempalace directory is untouched.
    registry_path = tmp_path / "known_entities.json"
    monkeypatch.setattr(miner, "_ENTITY_REGISTRY_PATH", str(registry_path))
    blank_cache = {"mtime": None, "names": frozenset(), "raw": {}}
    miner._ENTITY_REGISTRY_CACHE.update(blank_cache)
    tunnel_store = tmp_path / "tunnels.json"
    monkeypatch.setattr(palace_graph, "_TUNNEL_FILE", str(tunnel_store))

    # Seed the registry as though init had already run for both wings;
    # "foo" is the only topic they have in common.
    miner.add_to_known_entities({"topics": ["foo", "bar"]}, wing="wing_one")
    miner.add_to_known_entities({"topics": ["foo", "baz"]}, wing="wing_two")

    # Build a minimal wing_two project and mine it.
    project_dir = tmp_path / "wing_two_project"
    project_dir.mkdir()
    write_file(
        project_dir / "notes.md",
        "Some prose long enough to make a chunk. " * 20,
    )
    wing_config = {"wing": "wing_two", "rooms": [{"name": "general"}]}
    with open(project_dir / "mempalace.yaml", "w") as config_file:
        yaml.dump(wing_config, config_file)
    palace_dir = tmp_path / "palace"
    mine(str(project_dir), str(palace_dir))

    # Exactly one tunnel: room "foo" on both ends, joining the two wings.
    tunnels = palace_graph.list_tunnels()
    assert len(tunnels) == 1
    source = tunnels[0]["source"]
    target = tunnels[0]["target"]
    assert {source["room"], target["room"]} == {"foo"}
    assert {source["wing"], target["wing"]} == {"wing_one", "wing_two"}
def test_mine_no_tunnel_when_threshold_blocks_overlap(tmp_path, monkeypatch):
    """Setting ``MEMPALACE_TOPIC_TUNNEL_MIN_COUNT`` higher than the number
    of shared topics prevents any tunnel from being created."""
    from mempalace import miner, palace_graph

    # Isolate registry and tunnel storage under tmp_path.
    registry_path = tmp_path / "known_entities.json"
    monkeypatch.setattr(miner, "_ENTITY_REGISTRY_PATH", str(registry_path))
    blank_cache = {"mtime": None, "names": frozenset(), "raw": {}}
    miner._ENTITY_REGISTRY_CACHE.update(blank_cache)
    tunnel_store = tmp_path / "tunnels.json"
    monkeypatch.setattr(palace_graph, "_TUNNEL_FILE", str(tunnel_store))
    # Require two shared topics before a tunnel is allowed.
    monkeypatch.setenv("MEMPALACE_TOPIC_TUNNEL_MIN_COUNT", "2")

    # Both wings share exactly one topic.
    for wing_name in ("wing_one", "wing_two"):
        miner.add_to_known_entities({"topics": ["foo"]}, wing=wing_name)

    project_dir = tmp_path / "wing_two_project"
    project_dir.mkdir()
    write_file(
        project_dir / "notes.md",
        "Some prose long enough to make a chunk. " * 20,
    )
    wing_config = {"wing": "wing_two", "rooms": [{"name": "general"}]}
    with open(project_dir / "mempalace.yaml", "w") as config_file:
        yaml.dump(wing_config, config_file)
    palace_dir = tmp_path / "palace"
    mine(str(project_dir), str(palace_dir))

    # One shared topic is below the threshold of two, so nothing is stored.
    assert palace_graph.list_tunnels() == []
def test_mine_no_tunnel_when_only_one_wing_has_topics(tmp_path, monkeypatch):
    """Mining the only wing with confirmed topics creates no tunnels,
    since there is no second wing to pair with."""
    from mempalace import miner, palace_graph

    # Isolate registry and tunnel storage under tmp_path.
    registry_path = tmp_path / "known_entities.json"
    monkeypatch.setattr(miner, "_ENTITY_REGISTRY_PATH", str(registry_path))
    blank_cache = {"mtime": None, "names": frozenset(), "raw": {}}
    miner._ENTITY_REGISTRY_CACHE.update(blank_cache)
    tunnel_store = tmp_path / "tunnels.json"
    monkeypatch.setattr(palace_graph, "_TUNNEL_FILE", str(tunnel_store))

    miner.add_to_known_entities({"topics": ["foo"]}, wing="wing_one")

    project_dir = tmp_path / "wing_one_project"
    project_dir.mkdir()
    write_file(
        project_dir / "notes.md",
        "Some prose long enough to make a chunk. " * 20,
    )
    wing_config = {"wing": "wing_one", "rooms": [{"name": "general"}]}
    with open(project_dir / "mempalace.yaml", "w") as config_file:
        yaml.dump(wing_config, config_file)
    palace_dir = tmp_path / "palace"
    mine(str(project_dir), str(palace_dir))

    assert palace_graph.list_tunnels() == []
+123
View File
@@ -135,3 +135,126 @@ class TestExplicitTunnels:
connections = palace_graph.follow_tunnels("wing_code", "auth", col=col)
assert len(connections) == 1
assert "drawer_preview" not in connections[0]
class TestTopicTunnels:
    """Cross-wing topic tunnels (issue #1180).

    When two wings share confirmed TOPIC labels above a configurable
    threshold, a symmetric tunnel is created between them. Tunnels are
    routed through the existing ``create_tunnel`` storage so they share
    dedup and persistence with explicit tunnels.

    Every test first redirects tunnel storage into ``tmp_path`` via
    ``_use_tmp_tunnel_file``.
    """

    def test_compute_topic_tunnels_creates_link_for_shared_topic(self, tmp_path, monkeypatch):
        """One shared topic between two wings yields exactly one tunnel."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        topics_by_wing = {
            "wing_alpha": ["Angular", "OpenAPI"],
            "wing_beta": ["OpenAPI", "Kubernetes"],
        }
        created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1)
        assert len(created) == 1
        tunnel = created[0]
        # Compare the endpoint *set* against both wings: checking each
        # endpoint for membership separately would also accept a self-loop
        # (source and target on the same wing).
        endpoints = {tunnel["source"]["wing"], tunnel["target"]["wing"]}
        assert endpoints == {"wing_alpha", "wing_beta"}
        # Room is the topic itself (case preserved from the first wing).
        assert tunnel["source"]["room"] == "OpenAPI"
        assert "OpenAPI" in tunnel["label"]

        # Tunnel is retrievable via the standard list_tunnels API.
        listed = palace_graph.list_tunnels()
        assert len(listed) == 1
        assert listed[0]["id"] == tunnel["id"]

    def test_compute_topic_tunnels_no_link_below_threshold(self, tmp_path, monkeypatch):
        """Fewer shared topics than ``min_count`` creates nothing."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        topics_by_wing = {
            "wing_alpha": ["Angular", "OpenAPI"],
            "wing_beta": ["OpenAPI", "Kubernetes"],
        }
        # min_count=2 requires two overlapping topics — only one shared.
        created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=2)
        assert created == []
        assert palace_graph.list_tunnels() == []

    def test_compute_topic_tunnels_above_threshold_creates_per_topic_links(
        self, tmp_path, monkeypatch
    ):
        """Once the threshold is met, each shared topic gets its own tunnel."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        topics_by_wing = {
            "wing_alpha": ["Angular", "OpenAPI", "Postgres"],
            "wing_beta": ["Angular", "OpenAPI", "Redis"],
        }
        created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=2)
        # Two shared topics × one wing pair = two tunnels.
        rooms = sorted(t["source"]["room"] for t in created)
        assert rooms == ["Angular", "OpenAPI"]

    def test_compute_topic_tunnels_case_insensitive_overlap(self, tmp_path, monkeypatch):
        """Topics that differ only by case still count as shared."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        topics_by_wing = {
            "wing_alpha": ["openapi"],
            "wing_beta": ["OpenAPI"],
        }
        created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1)
        assert len(created) == 1

    def test_compute_topic_tunnels_empty_input_is_noop(self, tmp_path, monkeypatch):
        """No wings, or a single wing without topics, produces no tunnels."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        assert palace_graph.compute_topic_tunnels({}) == []
        assert palace_graph.compute_topic_tunnels({"wing_a": []}) == []
        assert palace_graph.list_tunnels() == []

    def test_compute_topic_tunnels_three_wings_pairwise(self, tmp_path, monkeypatch):
        """Three wings sharing one topic are linked pairwise."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        topics_by_wing = {
            "wing_a": ["foo"],
            "wing_b": ["foo"],
            "wing_c": ["foo"],
        }
        created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1)
        # 3 wings sharing the same topic → C(3,2) = 3 pairs → 3 tunnels.
        assert len(created) == 3
        endpoint_pairs = {
            tuple(sorted([t["source"]["wing"], t["target"]["wing"]])) for t in created
        }
        assert endpoint_pairs == {
            ("wing_a", "wing_b"),
            ("wing_a", "wing_c"),
            ("wing_b", "wing_c"),
        }

    def test_topic_tunnels_for_wing_only_links_that_wing(self, tmp_path, monkeypatch):
        """The per-wing pass only creates pairs that include the given wing."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        topics_by_wing = {
            "wing_a": ["foo", "bar"],
            "wing_b": ["foo"],
            "wing_c": ["bar"],
        }
        # wing_a should link to both b (via foo) and c (via bar).
        created = palace_graph.topic_tunnels_for_wing("wing_a", topics_by_wing)
        endpoint_pairs = {
            tuple(sorted([t["source"]["wing"], t["target"]["wing"]])) for t in created
        }
        assert endpoint_pairs == {("wing_a", "wing_b"), ("wing_a", "wing_c")}
        # The b-c pair is NOT created because wing_a's incremental pass
        # only computes pairs that include wing_a.
        assert len(palace_graph.list_tunnels()) == 2

    def test_topic_tunnels_for_wing_unknown_wing_is_noop(self, tmp_path, monkeypatch):
        """A wing absent from ``topics_by_wing`` yields no tunnels."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        topics_by_wing = {"wing_a": ["foo"], "wing_b": ["foo"]}
        assert palace_graph.topic_tunnels_for_wing("wing_missing", topics_by_wing) == []
        assert palace_graph.list_tunnels() == []

    def test_compute_topic_tunnels_dedupe_on_recompute(self, tmp_path, monkeypatch):
        """Recomputing with identical input reuses the stored tunnel."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        topics_by_wing = {
            "wing_alpha": ["OpenAPI"],
            "wing_beta": ["OpenAPI"],
        }
        first = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1)
        second = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1)
        # create_tunnel is symmetric/dedupe — repeated computation should
        # not multiply the stored tunnels.
        assert first[0]["id"] == second[0]["id"]
        assert len(palace_graph.list_tunnels()) == 1
+4 -1
View File
@@ -363,11 +363,14 @@ def test_to_detected_dict_shape():
projects = [ProjectInfo(name="p", repo_root=Path("."), is_mine=True, manifest="package.json")]
people = [PersonInfo(name="Jane Doe", total_commits=5, repos={"r"})]
d = to_detected_dict(projects, people)
assert set(d.keys()) == {"people", "projects", "uncertain"}
# ``topics`` is the LLM-refine bucket for cross-wing tunnel signal —
# always present even when empty so callers can rely on the shape.
assert set(d.keys()) == {"people", "projects", "topics", "uncertain"}
assert d["projects"][0]["name"] == "p"
assert d["projects"][0]["type"] == "project"
assert d["people"][0]["name"] == "Jane Doe"
assert d["people"][0]["type"] == "person"
assert d["topics"] == []
assert d["uncertain"] == []