feat(graph): cross-wing tunnels by shared topics (#1180)

When two wings have one or more confirmed TOPIC labels in common, the
miner now drops a symmetric tunnel between them at mine time so the
palace graph reflects shared themes (frameworks, vendors, recurring
concepts).

- llm_refine: TOPIC label routes to a dedicated `topics` bucket so the
  signal survives confirmation instead of getting collapsed into
  `uncertain` and dropped.
- entity_detector / project_scanner: bucket plumbed through the
  detection pipeline; `confirm_entities` returns confirmed topics
  alongside people/projects.
- miner.add_to_known_entities: optional `wing` parameter records the
  confirmed topics under `topics_by_wing` in
  `~/.mempalace/known_entities.json`. Wing names do NOT leak into the
  flat known-name set used by drawer-tagging.
- palace_graph: `compute_topic_tunnels` and `topic_tunnels_for_wing`
  create symmetric tunnels via the existing `create_tunnel` API so they
  share dedup and persistence with explicit tunnels.
- miner.mine: post-file-loop pass calls `topic_tunnels_for_wing` for
  the freshly-mined wing. Failures are logged but never abort the mine.
- config: `topic_tunnel_min_count` knob (env
  `MEMPALACE_TOPIC_TUNNEL_MIN_COUNT` or `~/.mempalace/config.json`),
  default 1.

Tests cover topic persistence through init->mine, tunnel creation when
wings share a topic, no tunnel below threshold, cross-wing tunnel
retrieval via `list_tunnels`, dedup on recompute, case-insensitive
overlap, and the end-to-end mine-time wiring.

Out of scope for this PR (called out in the PR body): manifest-
dependency overlap, per-topic allow/deny lists, search-result surfacing.
This commit is contained in:
Igor Lins e Silva
2026-04-24 19:19:58 -03:00
parent ed2ba726c9
commit fe051adc73
14 changed files with 678 additions and 28 deletions
+8
View File
@@ -6,6 +6,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
--- ---
## [3.3.4] — unreleased
### Added
- **Cross-wing topic tunnels.** When two wings have confirmed `TOPIC` labels in common (the LLM-refine bucket from `mempalace init --llm`), the miner now drops a symmetric tunnel between them at mine time so the palace graph reflects shared themes (frameworks, vendors, recurring concepts). Tunnels are routed through the existing `create_tunnel` storage so they share dedup and persistence with explicit tunnels. Threshold is configurable via `MEMPALACE_TOPIC_TUNNEL_MIN_COUNT` env var or `topic_tunnel_min_count` in `~/.mempalace/config.json` (default `1`). Manifest-dependency overlap and per-topic allow/deny lists remain out of scope. (#1180)
---
## [3.3.3] — 2026-04-23 ## [3.3.3] — 2026-04-23
### Bug Fixes ### Bug Fixes
+18 -5
View File
@@ -117,21 +117,34 @@ def cmd_init(args):
if languages_tuple != ("en",): if languages_tuple != ("en",):
print(f" Languages: {', '.join(languages_tuple)}") print(f" Languages: {', '.join(languages_tuple)}")
detected = discover_entities(args.dir, languages=languages_tuple, llm_provider=llm_provider) detected = discover_entities(args.dir, languages=languages_tuple, llm_provider=llm_provider)
total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"]) total = (
len(detected["people"])
+ len(detected["projects"])
+ len(detected.get("topics", []))
+ len(detected["uncertain"])
)
if total > 0: if total > 0:
confirmed = confirm_entities(detected, yes=getattr(args, "yes", False)) confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
# Save confirmed entities to <project>/entities.json (per-project # Save confirmed entities to <project>/entities.json (per-project
# audit trail — user can inspect or hand-edit) AND merge into the # audit trail — user can inspect or hand-edit) AND merge into the
# global registry the miner reads at mine time. # global registry the miner reads at mine time. Topics are kept
if confirmed["people"] or confirmed["projects"]: # separately so the miner can later compute cross-wing tunnels
entities_path = Path(args.dir).expanduser().resolve() / "entities.json" # from shared topics (see palace_graph.compute_topic_tunnels).
if confirmed["people"] or confirmed["projects"] or confirmed.get("topics"):
project_path = Path(args.dir).expanduser().resolve()
entities_path = project_path / "entities.json"
with open(entities_path, "w", encoding="utf-8") as f: with open(entities_path, "w", encoding="utf-8") as f:
json.dump(confirmed, f, indent=2, ensure_ascii=False) json.dump(confirmed, f, indent=2, ensure_ascii=False)
print(f" Entities saved: {entities_path}") print(f" Entities saved: {entities_path}")
from .miner import add_to_known_entities from .miner import add_to_known_entities
registry_path = add_to_known_entities(confirmed) # Wing matches the default produced by ``room_detector_local``
# (folder basename) and the miner fallback in ``load_config``.
# Used by the topics_by_wing map so cross-wing tunnels can be
# computed at mine time.
wing = project_path.name
registry_path = add_to_known_entities(confirmed, wing=wing)
print(f" Registry updated: {registry_path}") print(f" Registry updated: {registry_path}")
else: else:
print(" No entities detected — proceeding with directory-based rooms.") print(" No entities detected — proceeding with directory-based rooms.")
+26
View File
@@ -253,6 +253,32 @@ class MempalaceConfig:
return env_val.strip().lower() return env_val.strip().lower()
return str(self._file_config.get("embedding_device", "auto")).strip().lower() return str(self._file_config.get("embedding_device", "auto")).strip().lower()
@property
def topic_tunnel_min_count(self):
    """Return how many shared confirmed topics two wings need for a tunnel.

    Resolution order:

    1. ``MEMPALACE_TOPIC_TUNNEL_MIN_COUNT`` from the environment, when it
       parses as an integer >= 1.
    2. ``topic_tunnel_min_count`` from the config file, clamped to >= 1.
    3. ``1`` (any single shared topic produces a tunnel).

    Raise the value to ``2+`` when projects share many generic labels
    (Python, Docker, Git) and only meaningfully overlapping wings should
    be linked.
    """
    from_env = os.environ.get("MEMPALACE_TOPIC_TUNNEL_MIN_COUNT")
    if from_env:
        try:
            env_count = int(from_env)
        except ValueError:
            # Unparseable env value: ignore it and consult the config file.
            env_count = 0
        if env_count >= 1:
            return env_count

    from_file = self._file_config.get("topic_tunnel_min_count")
    if from_file is None:
        return 1
    try:
        file_count = int(from_file)
    except (TypeError, ValueError):
        return 1
    # Values below 1 are meaningless as a threshold; clamp them.
    return file_count if file_count > 1 else 1
@property @property
def hook_silent_save(self): def hook_silent_save(self):
"""Whether the stop hook saves directly (True) or blocks for MCP calls (False).""" """Whether the stop hook saves directly (True) or blocks for MCP calls (False)."""
+24 -4
View File
@@ -440,7 +440,7 @@ def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) ->
candidates = extract_candidates(combined_text, languages=langs) candidates = extract_candidates(combined_text, languages=langs)
if not candidates: if not candidates:
return {"people": [], "projects": [], "uncertain": []} return {"people": [], "projects": [], "topics": [], "uncertain": []}
# Score and classify each candidate # Score and classify each candidate
people = [] people = []
@@ -467,6 +467,7 @@ def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) ->
return { return {
"people": people[:15], "people": people[:15],
"projects": projects[:10], "projects": projects[:10],
"topics": [],
"uncertain": uncertain[:8], "uncertain": uncertain[:8],
} }
@@ -489,7 +490,13 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict:
""" """
Interactive confirmation step. Interactive confirmation step.
User reviews detected entities, removes wrong ones, adds missing ones. User reviews detected entities, removes wrong ones, adds missing ones.
Returns confirmed {people: [names], projects: [names]} Returns confirmed {people: [names], projects: [names], topics: [names]}.
Topics are not surfaced for interactive review — they come from the
LLM-refined ``TOPIC`` bucket and are passed through verbatim. They
feed cross-wing tunnel computation at mine time (see
``palace_graph.compute_topic_tunnels``); a wrong topic at worst adds
a low-traffic tunnel and never alters drawer storage.
Pass yes=True to auto-accept all detected entities without prompting. Pass yes=True to auto-accept all detected entities without prompting.
""" """
@@ -501,18 +508,28 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict:
_print_entity_list(detected["people"], "PEOPLE") _print_entity_list(detected["people"], "PEOPLE")
_print_entity_list(detected["projects"], "PROJECTS") _print_entity_list(detected["projects"], "PROJECTS")
if detected.get("topics"):
_print_entity_list(detected["topics"], "TOPICS (cross-wing tunnel signal)")
if detected["uncertain"]: if detected["uncertain"]:
_print_entity_list(detected["uncertain"], "UNCERTAIN (need your call)") _print_entity_list(detected["uncertain"], "UNCERTAIN (need your call)")
confirmed_people = [e["name"] for e in detected["people"]] confirmed_people = [e["name"] for e in detected["people"]]
confirmed_projects = [e["name"] for e in detected["projects"]] confirmed_projects = [e["name"] for e in detected["projects"]]
confirmed_topics = [e["name"] for e in detected.get("topics", [])]
if yes: if yes:
# Auto-accept: include all detected (skip uncertain — ambiguous without user input) # Auto-accept: include all detected (skip uncertain — ambiguous without user input)
print( print(
f"\n Auto-accepting {len(confirmed_people)} people, {len(confirmed_projects)} projects." f"\n Auto-accepting {len(confirmed_people)} people, "
f"{len(confirmed_projects)} projects, "
f"{len(confirmed_topics)} topics."
) )
return {"people": confirmed_people, "projects": confirmed_projects} return {
"people": confirmed_people,
"projects": confirmed_projects,
"topics": confirmed_topics,
}
print(f"\n{'' * 58}") print(f"\n{'' * 58}")
print(" Options:") print(" Options:")
@@ -570,11 +587,14 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict:
print(" Confirmed:") print(" Confirmed:")
print(f" People: {', '.join(confirmed_people) or '(none)'}") print(f" People: {', '.join(confirmed_people) or '(none)'}")
print(f" Projects: {', '.join(confirmed_projects) or '(none)'}") print(f" Projects: {', '.join(confirmed_projects) or '(none)'}")
if confirmed_topics:
print(f" Topics: {', '.join(confirmed_topics)}")
print(f"{'=' * 58}\n") print(f"{'=' * 58}\n")
return { return {
"people": confirmed_people, "people": confirmed_people,
"projects": confirmed_projects, "projects": confirmed_projects,
"topics": confirmed_topics,
} }
+14 -9
View File
@@ -197,13 +197,23 @@ def _apply_classifications(
"""Merge LLM decisions back into the detected dict. """Merge LLM decisions back into the detected dict.
Returns (new_detected, reclassified_count, dropped_count). Returns (new_detected, reclassified_count, dropped_count).
Topics get their own bucket so the caller can persist them as
cross-wing tunnel signal. ``AMBIGUOUS`` still falls back to
``uncertain`` for human review.
""" """
label_to_bucket = { label_to_bucket = {
"PERSON": "people", "PERSON": "people",
"PROJECT": "projects", "PROJECT": "projects",
"TOPIC": "uncertain", "TOPIC": "topics",
"AMBIGUOUS": "uncertain", "AMBIGUOUS": "uncertain",
} }
bucket_to_type = {
"people": "person",
"projects": "project",
"topics": "topic",
"uncertain": "uncertain",
}
# Index every entity by name for in-place update # Index every entity by name for in-place update
all_entries: list[tuple[str, dict]] = [] all_entries: list[tuple[str, dict]] = []
@@ -216,6 +226,7 @@ def _apply_classifications(
new_detected: dict[str, list[dict]] = { new_detected: dict[str, list[dict]] = {
"people": [], "people": [],
"projects": [], "projects": [],
"topics": [],
"uncertain": [], "uncertain": [],
} }
@@ -223,7 +234,7 @@ def _apply_classifications(
decision = decisions.get(entry["name"]) decision = decisions.get(entry["name"])
if decision is None: if decision is None:
# No LLM opinion — keep as-is # No LLM opinion — keep as-is
new_detected[old_bucket].append(entry) new_detected.setdefault(old_bucket, []).append(entry)
continue continue
label, reason = decision label, reason = decision
@@ -245,13 +256,7 @@ def _apply_classifications(
updated["signals"] = signals updated["signals"] = signals
if target_bucket != old_bucket: if target_bucket != old_bucket:
reclassified += 1 reclassified += 1
updated["type"] = ( updated["type"] = bucket_to_type.get(target_bucket, "uncertain")
"person"
if target_bucket == "people"
else "project"
if target_bucket == "projects"
else "uncertain"
)
new_detected[target_bucket].append(updated) new_detected[target_bucket].append(updated)
return new_detected, reclassified, dropped return new_detected, reclassified, dropped
+118 -2
View File
@@ -439,7 +439,16 @@ def _refresh_known_entities_cache() -> None:
data = json.load(f) data = json.load(f)
if isinstance(data, dict): if isinstance(data, dict):
raw = data raw = data
for cat in data.values(): for cat_key, cat in data.items():
# Special wing-keyed map — its inner values are topic
# names but its outer keys are wings, which must NOT be
# surfaced as known entities. Pull the topic names out
# explicitly instead of treating it as a generic category.
if cat_key == "topics_by_wing" and isinstance(cat, dict):
for topic_list in cat.values():
if isinstance(topic_list, list):
names.update(str(n) for n in topic_list if n)
continue
if isinstance(cat, list): if isinstance(cat, list):
names.update(str(n) for n in cat if n) names.update(str(n) for n in cat if n)
elif isinstance(cat, dict): elif isinstance(cat, dict):
@@ -474,7 +483,39 @@ def _load_known_entities_raw() -> dict:
return dict(_ENTITY_REGISTRY_CACHE["raw"]) return dict(_ENTITY_REGISTRY_CACHE["raw"])
def add_to_known_entities(entities_by_category: dict) -> str: def _set_wing_topics(existing: dict, wing_key: str, topics_for_wing: list, coerce) -> None:
"""Update ``existing['topics_by_wing'][wing_key]`` to the deduped list.
Replaces (does not union) the wing's topic list — re-running ``init``
should reflect the user's latest confirmation rather than accumulate
stale labels. Empty input drops the wing entry; an empty map drops
the ``topics_by_wing`` key entirely.
"""
topics_map = existing.get("topics_by_wing")
if not isinstance(topics_map, dict):
topics_map = {}
seen_lower: set = set()
ordered: list = []
for n in topics_for_wing:
name = coerce(n)
if not name:
continue
key = name.lower()
if key in seen_lower:
continue
seen_lower.add(key)
ordered.append(name)
if ordered:
topics_map[wing_key] = ordered
else:
topics_map.pop(wing_key, None)
if topics_map:
existing["topics_by_wing"] = topics_map
else:
existing.pop("topics_by_wing", None)
def add_to_known_entities(entities_by_category: dict, wing: str = None) -> str:
"""Union ``entities_by_category`` into ``~/.mempalace/known_entities.json``. """Union ``entities_by_category`` into ``~/.mempalace/known_entities.json``.
Accepts ``{category: [names]}`` shape as produced by ``mempalace init`` Accepts ``{category: [names]}`` shape as produced by ``mempalace init``
@@ -488,6 +529,15 @@ def add_to_known_entities(entities_by_category: dict) -> str:
added as keys with ``None`` values so existing code mappings aren't added as keys with ``None`` values so existing code mappings aren't
overwritten. A later compress pass can assign codes. overwritten. A later compress pass can assign codes.
When ``wing`` is provided AND ``entities_by_category`` contains a
``topics`` list, those topics are also recorded under
``topics_by_wing[wing]`` (case-insensitive dedup, preserving the
casing of the first observed name). This is the signal source for
``palace_graph.compute_topic_tunnels`` at mine time. Topics for a
wing are *replaced*, not unioned, so a re-run of ``init`` reflects
the user's latest confirmation rather than accumulating stale labels
indefinitely.
The in-process cache is invalidated on write so same-process callers The in-process cache is invalidated on write so same-process callers
(notably ``cmd_init`` → ``cmd_mine`` in sequence) see the update (notably ``cmd_init`` → ``cmd_mine`` in sequence) see the update
immediately instead of waiting for a mtime re-check. immediately instead of waiting for a mtime re-check.
@@ -515,7 +565,16 @@ def add_to_known_entities(entities_by_category: dict) -> str:
name = str(value) name = str(value)
return name if name else None return name if name else None
# Separate the topics_by_wing key from regular categories so we don't
# treat it as a flat name-list elsewhere in this function.
topics_for_wing = None
if wing and isinstance(wing, str) and wing.strip():
topics_for_wing = entities_by_category.get("topics") or []
for category, names in entities_by_category.items(): for category, names in entities_by_category.items():
if category == "topics_by_wing":
# Reserved key — managed separately below.
continue
if not isinstance(names, list) or not names: if not isinstance(names, list) or not names:
continue continue
current = existing.get(category) current = existing.get(category)
@@ -551,6 +610,9 @@ def add_to_known_entities(entities_by_category: dict) -> str:
ordered.append(name) ordered.append(name)
existing[category] = ordered existing[category] = ordered
if topics_for_wing is not None:
_set_wing_topics(existing, wing.strip(), topics_for_wing, _coerce_name)
registry_path.write_text(_json.dumps(existing, indent=2, ensure_ascii=False), encoding="utf-8") registry_path.write_text(_json.dumps(existing, indent=2, ensure_ascii=False), encoding="utf-8")
try: try:
registry_path.chmod(0o600) registry_path.chmod(0o600)
@@ -565,6 +627,28 @@ def add_to_known_entities(entities_by_category: dict) -> str:
return str(registry_path) return str(registry_path)
def get_topics_by_wing() -> dict:
    """Read the ``topics_by_wing`` map out of the global entity registry.

    Yields ``{}`` when the loaded registry has no ``topics_by_wing`` entry
    or that entry is not a dict. Wing keys that are not non-blank strings
    are dropped, as are topic values that are not lists; within a list only
    non-blank string topics are kept, and wings left with no topics are
    omitted. Wing names are stripped; topic strings are returned exactly as
    stored, so callers needing case-insensitive matching must normalize.
    """
    registry = _load_known_entities_raw()
    stored = registry.get("topics_by_wing")
    if not isinstance(stored, dict):
        return {}

    result: dict = {}
    for wing_name, topic_list in stored.items():
        if not (isinstance(wing_name, str) and wing_name.strip()):
            continue
        if not isinstance(topic_list, list):
            continue
        usable = [str(t) for t in topic_list if isinstance(t, str) and t.strip()]
        if usable:
            result[wing_name.strip()] = usable
    return result
_HALL_KEYWORDS_CACHE = None _HALL_KEYWORDS_CACHE = None
@@ -962,6 +1046,19 @@ def mine(
if not dry_run: if not dry_run:
print(f" + [{i:4}/{len(files)}] {filepath.name[:50]:50} +{drawers}") print(f" + [{i:4}/{len(files)}] {filepath.name[:50]:50} +{drawers}")
if not dry_run:
# Cross-wing topic tunnels: after every file in this wing has been
# processed, link this wing to any other wing that shares a
# confirmed TOPIC label. Out of scope for v1: manifest-dependency
# overlap, per-topic allow/deny lists, search-result surfacing.
try:
tunnels_added = _compute_topic_tunnels_for_wing(wing)
if tunnels_added:
print(f"\n Topic tunnels: +{tunnels_added} cross-wing link(s)")
except Exception as e:
# Tunnel computation must never fail a mine — degrade quietly.
print(f"\n WARNING: topic tunnel computation skipped — {e}", file=sys.stderr)
print(f"\n{'=' * 55}") print(f"\n{'=' * 55}")
print(" Done.") print(" Done.")
print(f" Files processed: {len(files) - files_skipped}") print(f" Files processed: {len(files) - files_skipped}")
@@ -974,6 +1071,25 @@ def mine(
print(f"{'=' * 55}\n") print(f"{'=' * 55}\n")
def _compute_topic_tunnels_for_wing(wing: str) -> int:
"""Drop tunnels between ``wing`` and every other wing that shares
confirmed topics, honoring the ``topic_tunnel_min_count`` config knob.
Returns the number of tunnels created or refreshed. Zero means no
overlap found (or the registry has no ``topics_by_wing`` map yet).
"""
from .config import MempalaceConfig
from .palace_graph import topic_tunnels_for_wing
topics_map = get_topics_by_wing()
if not topics_map or wing not in topics_map:
return 0
cfg = MempalaceConfig()
min_count = cfg.topic_tunnel_min_count
created = topic_tunnels_for_wing(wing, topics_map, min_count=min_count)
return len(created)
# ============================================================================= # =============================================================================
# STATUS # STATUS
# ============================================================================= # =============================================================================
+138
View File
@@ -499,3 +499,141 @@ def follow_tunnels(wing: str, room: str, col=None, config=None):
pass pass
return connections return connections
# =============================================================================
# TOPIC TUNNELS — auto-link wings that share confirmed TOPIC labels
# =============================================================================
# When two wings have one or more confirmed topics in common (e.g. both
# discuss "Angular" or "OpenAPI"), drop a symmetric tunnel between them.
# Topics come from the LLM-refined ``TOPIC`` bucket in the per-project
# ``entities.json`` and are persisted by wing in
# ``~/.mempalace/known_entities.json`` under ``topics_by_wing``.
#
# Tunnels are created via the existing ``create_tunnel`` API so they share
# storage and dedup with explicit tunnels. The room is the topic name —
# this matches the "two wings share an idea" mental model and keeps the
# graph homogeneous.
def _normalize_topic(name: str) -> str:
"""Lowercase + strip topics for case-insensitive overlap detection."""
return str(name).strip().lower()
def compute_topic_tunnels(
    topics_by_wing: dict,
    min_count: int = 1,
    label_prefix: str = "shared topic",
) -> list[dict]:
    """Link every pair of wings that has at least ``min_count`` topics in common.

    Args:
        topics_by_wing: mapping of ``wing_name -> [topic_name, ...]``.
            Topics are matched case-insensitively; within a wing the first
            spelling encountered is the one kept for display.
        min_count: number of shared topics a wing pair needs before any
            tunnel is dropped between them. Values below 1 are treated
            as 1.
        label_prefix: text placed before the topic in each tunnel label.

    Returns:
        The ``create_tunnel`` results, one per (wing_a, wing_b, topic)
        triple of a qualifying pair. Pairs under the threshold contribute
        nothing, not even their single shared topic. An empty or ``None``
        mapping returns ``[]``. Wings with blank/non-string names,
        non-list topic values, or no usable topics are ignored.
    """
    if not topics_by_wing:
        return []
    threshold = max(1, int(min_count))

    # wing -> {normalized topic: display spelling}
    casing_by_wing: dict[str, dict[str, str]] = {}
    for raw_wing, raw_topics in topics_by_wing.items():
        if not (isinstance(raw_wing, str) and raw_wing.strip()):
            continue
        if not isinstance(raw_topics, (list, tuple)):
            continue
        display: dict[str, str] = {}
        for topic in raw_topics:
            if isinstance(topic, str):
                folded = _normalize_topic(topic)
                if folded:
                    display.setdefault(folded, topic.strip())
        if display:
            casing_by_wing[raw_wing.strip()] = display

    tunnels: list[dict] = []
    ordered_wings = sorted(casing_by_wing)
    for idx, first in enumerate(ordered_wings):
        first_topics = casing_by_wing[first]
        for second in ordered_wings[idx + 1 :]:
            second_topics = casing_by_wing[second]
            overlap = first_topics.keys() & second_topics.keys()
            if len(overlap) < threshold:
                continue
            # Sorted so tunnels come out in the same order on every run.
            for folded in sorted(overlap):
                # The alphabetically-first wing's spelling names the room.
                room = first_topics[folded] or second_topics[folded]
                tunnels.append(
                    create_tunnel(
                        source_wing=first,
                        source_room=room,
                        target_wing=second,
                        target_room=room,
                        label=f"{label_prefix}: {room}",
                    )
                )
    return tunnels
def topic_tunnels_for_wing(
    wing: str,
    topics_by_wing: dict,
    min_count: int = 1,
    label_prefix: str = "shared topic",
) -> list[dict]:
    """Create topic tunnels for the pairs that include ``wing`` only.

    The miner calls this after finishing a wing so that pairs not
    involving that wing are left alone. Each (wing, other) pair is handed
    to ``compute_topic_tunnels`` as a two-entry mapping, which keeps the
    threshold and casing rules in a single place.

    Returns the tunnels created or refreshed; ``[]`` when the mapping is
    empty, ``wing`` is not a non-blank string, or ``wing`` has no topics.
    """
    if not topics_by_wing:
        return []
    if not isinstance(wing, str):
        return []
    wing = wing.strip()
    if not wing:
        return []

    mine = topics_by_wing.get(wing)
    if not (isinstance(mine, (list, tuple)) and mine):
        return []

    results: list[dict] = []
    for peer, peer_topics in topics_by_wing.items():
        is_named_peer = isinstance(peer, str) and peer.strip() and peer != wing
        has_topics = isinstance(peer_topics, (list, tuple)) and peer_topics
        if not (is_named_peer and has_topics):
            continue
        pair = {wing: list(mine), peer: list(peer_topics)}
        results += compute_topic_tunnels(
            pair,
            min_count=min_count,
            label_prefix=label_prefix,
        )
    return results
+3 -2
View File
@@ -558,6 +558,7 @@ def to_detected_dict(
return { return {
"people": people_entries, "people": people_entries,
"projects": proj_entries, "projects": proj_entries,
"topics": [],
"uncertain": [], "uncertain": [],
} }
@@ -577,7 +578,7 @@ def _merge_detected(primary: dict, secondary: dict, drop_secondary_uncertain: bo
""" """
seen = {e["name"].lower() for cat in primary.values() for e in cat} seen = {e["name"].lower() for cat in primary.values() for e in cat}
merged = {k: list(v) for k, v in primary.items()} merged = {k: list(v) for k, v in primary.items()}
for cat_key in ("people", "projects", "uncertain"): for cat_key in ("people", "projects", "topics", "uncertain"):
if cat_key == "uncertain" and drop_secondary_uncertain: if cat_key == "uncertain" and drop_secondary_uncertain:
continue continue
for e in secondary.get(cat_key, []): for e in secondary.get(cat_key, []):
@@ -654,7 +655,7 @@ def discover_entities(
prose_detected = ( prose_detected = (
detect_entities(prose_files, languages=languages) detect_entities(prose_files, languages=languages)
if prose_files if prose_files
else {"people": [], "projects": [], "uncertain": []} else {"people": [], "projects": [], "topics": [], "uncertain": []}
) )
# Without LLM refinement, suppress regex "uncertain" noise when real # Without LLM refinement, suppress regex "uncertain" noise when real
+2 -2
View File
@@ -235,13 +235,13 @@ def test_detect_entities_empty_files(tmp_path):
f = tmp_path / "empty.txt" f = tmp_path / "empty.txt"
f.write_text("") f.write_text("")
result = detect_entities([f]) result = detect_entities([f])
assert result == {"people": [], "projects": [], "uncertain": []} assert result == {"people": [], "projects": [], "topics": [], "uncertain": []}
def test_detect_entities_handles_missing_file(tmp_path): def test_detect_entities_handles_missing_file(tmp_path):
missing = tmp_path / "nonexistent.txt" missing = tmp_path / "nonexistent.txt"
result = detect_entities([missing]) result = detect_entities([missing])
assert result == {"people": [], "projects": [], "uncertain": []} assert result == {"people": [], "projects": [], "topics": [], "uncertain": []}
def test_detect_entities_respects_max_files(tmp_path): def test_detect_entities_respects_max_files(tmp_path):
+68
View File
@@ -206,3 +206,71 @@ def test_populated_registry_improves_miner_recall(temp_registry):
# All four registered entities should land in the metadata string # All four registered entities should land in the metadata string
for expected in ("Julia Grib", "Kevin Heifner", "hyperion-history", "mempalace"): for expected in ("Julia Grib", "Kevin Heifner", "hyperion-history", "mempalace"):
assert expected in tagged, f"expected '{expected}' in metadata {tagged!r}" assert expected in tagged, f"expected '{expected}' in metadata {tagged!r}"
# ── topics_by_wing — cross-wing tunnel signal source (issue #1180) ──
def test_topics_persisted_under_topics_by_wing(temp_registry):
    """Topics passed with a wing land in both the flat list and the wing map."""
    confirmed = {"people": ["Alice"], "topics": ["Angular", "OpenAPI"]}
    miner.add_to_known_entities(confirmed, wing="wing_alpha")

    stored = json.loads(temp_registry.read_text())
    # Flat aggregate, same shape as the other categories.
    assert "Angular" in stored["topics"]
    # Per-wing record consumed by tunnel computation.
    assert stored["topics_by_wing"]["wing_alpha"] == ["Angular", "OpenAPI"]
def test_topics_by_wing_replaces_on_reinit(temp_registry):
    """A second init for the same wing overwrites its topic list rather
    than unioning it with the earlier one."""
    for batch in (["Angular", "OpenAPI"], ["OpenAPI", "Postgres"]):
        miner.add_to_known_entities({"topics": batch}, wing="wing_alpha")

    stored = json.loads(temp_registry.read_text())
    assert stored["topics_by_wing"]["wing_alpha"] == ["OpenAPI", "Postgres"]
def test_topics_by_wing_multiple_wings_coexist(temp_registry):
    """Recording topics for one wing leaves another wing's entry intact."""
    per_wing = {"wing_a": ["foo"], "wing_b": ["foo", "bar"]}
    for wing_name, topics in per_wing.items():
        miner.add_to_known_entities({"topics": topics}, wing=wing_name)

    stored = json.loads(temp_registry.read_text())
    assert stored["topics_by_wing"] == {"wing_a": ["foo"], "wing_b": ["foo", "bar"]}
def test_topics_by_wing_skipped_without_wing(temp_registry):
    """Without a wing argument only the flat ``topics`` list is written."""
    miner.add_to_known_entities({"topics": ["foo"]})

    stored = json.loads(temp_registry.read_text())
    assert stored["topics"] == ["foo"]
    assert "topics_by_wing" not in stored
def test_topics_by_wing_dedupes_case_insensitive(temp_registry):
    """Case variants of one topic collapse to the first spelling seen."""
    variants = ["OpenAPI", "openapi", "OPENAPI"]
    miner.add_to_known_entities({"topics": variants}, wing="wing_a")

    stored = json.loads(temp_registry.read_text())
    assert stored["topics_by_wing"]["wing_a"] == ["OpenAPI"]
def test_get_topics_by_wing_reads_registry(temp_registry):
    """``get_topics_by_wing`` returns what ``add_to_known_entities`` stored."""
    miner.add_to_known_entities({"topics": ["foo"]}, wing="wing_a")
    miner.add_to_known_entities({"topics": ["foo", "bar"]}, wing="wing_b")

    loaded = miner.get_topics_by_wing()
    assert loaded == {"wing_a": ["foo"], "wing_b": ["foo", "bar"]}
def test_get_topics_by_wing_empty_when_missing(temp_registry):
    """A registry holding no wing-scoped topics reads back as an empty dict."""
    miner.add_to_known_entities({"people": ["Alice"]})

    loaded = miner.get_topics_by_wing()
    assert loaded == {}
def test_topics_by_wing_does_not_pollute_known_names(temp_registry):
    """Only topic strings may reach the flat known-names set returned by
    ``_load_known_entities``; the wing keys of ``topics_by_wing`` must
    stay out of it."""
    secret_wing = "wing_super_secret_project"
    miner.add_to_known_entities({"topics": ["Angular"]}, wing=secret_wing)

    known_names = miner._load_known_entities()
    assert "Angular" in known_names
    assert "wing_super_secret_project" not in known_names
+31 -3
View File
@@ -272,7 +272,9 @@ def test_apply_classifications_appends_reason_signal():
assert any("spoken of by name" in s for s in new["people"][0]["signals"]) assert any("spoken of by name" in s for s in new["people"][0]["signals"])
def test_apply_classifications_topic_goes_to_uncertain(): def test_apply_classifications_topic_goes_to_topics_bucket():
"""TOPIC classifications now route to a dedicated ``topics`` bucket so the
miner can use them as cross-wing tunnel signal (issue #1180)."""
detected = { detected = {
"people": [], "people": [],
"projects": [ "projects": [
@@ -289,8 +291,32 @@ def test_apply_classifications_topic_goes_to_uncertain():
decisions = {"Paris": ("TOPIC", "city, not a project")} decisions = {"Paris": ("TOPIC", "city, not a project")}
new, reclass, _ = _apply_classifications(detected, decisions) new, reclass, _ = _apply_classifications(detected, decisions)
assert len(new["projects"]) == 0 assert len(new["projects"]) == 0
assert len(new["uncertain"]) == 0
assert len(new["topics"]) == 1
assert new["topics"][0]["name"] == "Paris"
assert new["topics"][0]["type"] == "topic"
assert reclass == 1
def test_apply_classifications_ambiguous_still_goes_to_uncertain():
    """An AMBIGUOUS decision moves the entry from ``projects`` to ``uncertain``.

    Guards the pre-existing routing: only TOPIC labels were redirected to the
    ``topics`` bucket, so an ambiguous project must still end up in
    ``uncertain`` and be counted as one reclassification.
    """
    detected = {
        "people": [],
        "projects": [
            {
                "name": "Foo",
                "type": "project",
                "confidence": 0.7,
                "frequency": 5,
                "signals": ["regex"],
            }
        ],
        "uncertain": [],
    }
    decisions = {"Foo": ("AMBIGUOUS", "context insufficient")}

    new, reclass, _ = _apply_classifications(detected, decisions)

    assert len(new["projects"]) == 0
    assert len(new["uncertain"]) == 1
    # The entry under test is "Foo"; "Paris" belongs to the separate TOPIC test.
    assert new["uncertain"][0]["name"] == "Foo"
    assert reclass == 1
@@ -469,7 +495,9 @@ def test_refine_entities_refines_high_confidence_regex_projects():
assert provider.call_count == 1 assert provider.call_count == 1
assert result.reclassified == 1 assert result.reclassified == 1
assert result.merged["projects"] == [] assert result.merged["projects"] == []
assert result.merged["uncertain"][0]["name"] == "OpenAPI" # TOPIC labels go to the dedicated ``topics`` bucket so the miner can
# use them for cross-wing tunnel computation (issue #1180).
assert result.merged["topics"][0]["name"] == "OpenAPI"
def test_refine_entities_refines_regex_people_but_skips_git_people(): def test_refine_entities_refines_regex_people_but_skips_git_people():
+101
View File
@@ -496,3 +496,104 @@ def test_add_drawer_stamps_normalize_version(tmp_path):
assert meta["normalize_version"] == NORMALIZE_VERSION assert meta["normalize_version"] == NORMALIZE_VERSION
finally: finally:
del col, client del col, client
def test_mine_creates_topic_tunnels_for_shared_topics(tmp_path, monkeypatch):
    """Mining a wing links it to another wing that shares a confirmed topic.

    End-to-end check for issue #1180: the registry already holds confirmed
    topics for two wings, and the mine-time pass is expected to store one
    cross-wing tunnel for the single topic they have in common.
    """
    from mempalace import miner, palace_graph

    # Point the registry and the tunnel store at tmp_path so the developer's
    # real ~/.mempalace directory is never touched.
    registry_file = tmp_path / "known_entities.json"
    tunnel_store = tmp_path / "tunnels.json"
    monkeypatch.setattr(miner, "_ENTITY_REGISTRY_PATH", str(registry_file))
    miner._ENTITY_REGISTRY_CACHE.update({"mtime": None, "names": frozenset(), "raw": {}})
    monkeypatch.setattr(palace_graph, "_TUNNEL_FILE", str(tunnel_store))

    # Seed the registry as if init had already run for two wings whose only
    # common topic is "foo".
    for wing_name, topics in (("wing_one", ["foo", "bar"]), ("wing_two", ["foo", "baz"])):
        miner.add_to_known_entities({"topics": topics}, wing=wing_name)

    # Build a minimal wing_two project and mine it.
    wing_dir = tmp_path / "wing_two_project"
    wing_dir.mkdir()
    write_file(wing_dir / "notes.md", "Some prose long enough to make a chunk. " * 20)
    wing_config = {"wing": "wing_two", "rooms": [{"name": "general"}]}
    with open(wing_dir / "mempalace.yaml", "w") as config_fh:
        yaml.dump(wing_config, config_fh)

    mine(str(wing_dir), str(tmp_path / "palace"))

    # Exactly one tunnel: room "foo" on both sides, joining the two wings.
    tunnels = palace_graph.list_tunnels()
    assert len(tunnels) == 1
    source, target = tunnels[0]["source"], tunnels[0]["target"]
    assert {source["room"], target["room"]} == {"foo"}
    assert {source["wing"], target["wing"]} == {"wing_one", "wing_two"}
def test_mine_no_tunnel_when_threshold_blocks_overlap(tmp_path, monkeypatch):
    """A ``MEMPALACE_TOPIC_TUNNEL_MIN_COUNT`` above the overlap size blocks tunnels."""
    from mempalace import miner, palace_graph

    # Keep the registry and tunnel store inside tmp_path.
    registry_file = tmp_path / "known_entities.json"
    tunnel_store = tmp_path / "tunnels.json"
    monkeypatch.setattr(miner, "_ENTITY_REGISTRY_PATH", str(registry_file))
    miner._ENTITY_REGISTRY_CACHE.update({"mtime": None, "names": frozenset(), "raw": {}})
    monkeypatch.setattr(palace_graph, "_TUNNEL_FILE", str(tunnel_store))
    monkeypatch.setenv("MEMPALACE_TOPIC_TUNNEL_MIN_COUNT", "2")

    # Both wings know exactly one topic, and it is the same one.
    for wing_name in ("wing_one", "wing_two"):
        miner.add_to_known_entities({"topics": ["foo"]}, wing=wing_name)

    wing_dir = tmp_path / "wing_two_project"
    wing_dir.mkdir()
    write_file(wing_dir / "notes.md", "Some prose long enough to make a chunk. " * 20)
    wing_config = {"wing": "wing_two", "rooms": [{"name": "general"}]}
    with open(wing_dir / "mempalace.yaml", "w") as config_fh:
        yaml.dump(wing_config, config_fh)

    mine(str(wing_dir), str(tmp_path / "palace"))

    # Threshold is 2 but only one topic is shared, so nothing gets linked.
    stored = palace_graph.list_tunnels()
    assert stored == []
def test_mine_no_tunnel_when_only_one_wing_has_topics(tmp_path, monkeypatch):
    """Mining the only wing with confirmed topics stores no tunnels."""
    from mempalace import miner, palace_graph

    # Keep the registry and tunnel store inside tmp_path.
    registry_file = tmp_path / "known_entities.json"
    tunnel_store = tmp_path / "tunnels.json"
    monkeypatch.setattr(miner, "_ENTITY_REGISTRY_PATH", str(registry_file))
    miner._ENTITY_REGISTRY_CACHE.update({"mtime": None, "names": frozenset(), "raw": {}})
    monkeypatch.setattr(palace_graph, "_TUNNEL_FILE", str(tunnel_store))

    # wing_one is the sole wing in the registry, so it has no possible partner.
    miner.add_to_known_entities({"topics": ["foo"]}, wing="wing_one")

    wing_dir = tmp_path / "wing_one_project"
    wing_dir.mkdir()
    write_file(wing_dir / "notes.md", "Some prose long enough to make a chunk. " * 20)
    wing_config = {"wing": "wing_one", "rooms": [{"name": "general"}]}
    with open(wing_dir / "mempalace.yaml", "w") as config_fh:
        yaml.dump(wing_config, config_fh)

    mine(str(wing_dir), str(tmp_path / "palace"))

    stored = palace_graph.list_tunnels()
    assert stored == []
+123
View File
@@ -135,3 +135,126 @@ class TestExplicitTunnels:
connections = palace_graph.follow_tunnels("wing_code", "auth", col=col) connections = palace_graph.follow_tunnels("wing_code", "auth", col=col)
assert len(connections) == 1 assert len(connections) == 1
assert "drawer_preview" not in connections[0] assert "drawer_preview" not in connections[0]
class TestTopicTunnels:
    """Cross-wing topic tunnels (issue #1180).

    When two wings share confirmed TOPIC labels above a configurable
    threshold, a symmetric tunnel is created between them. Tunnels are
    routed through the existing ``create_tunnel`` storage so they share
    dedup and persistence with explicit tunnels.
    """

    def test_compute_topic_tunnels_creates_link_for_shared_topic(self, tmp_path, monkeypatch):
        """One shared topic at ``min_count=1`` yields exactly one stored tunnel."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        topics_by_wing = {
            "wing_alpha": ["Angular", "OpenAPI"],
            "wing_beta": ["OpenAPI", "Kubernetes"],
        }
        created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1)
        assert len(created) == 1
        tunnel = created[0]
        # The endpoints must be the two *distinct* wings. Checking each side
        # for membership on its own would also accept a wing linked to itself.
        endpoint_wings = {tunnel["source"]["wing"], tunnel["target"]["wing"]}
        assert endpoint_wings == {"wing_alpha", "wing_beta"}
        # Room is the topic itself (case preserved from the first wing).
        assert tunnel["source"]["room"] == "OpenAPI"
        assert "OpenAPI" in tunnel["label"]

        # Tunnel is retrievable via the standard list_tunnels API.
        listed = palace_graph.list_tunnels()
        assert len(listed) == 1
        assert listed[0]["id"] == tunnel["id"]

    def test_compute_topic_tunnels_no_link_below_threshold(self, tmp_path, monkeypatch):
        """Fewer shared topics than ``min_count`` creates and stores nothing."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        topics_by_wing = {
            "wing_alpha": ["Angular", "OpenAPI"],
            "wing_beta": ["OpenAPI", "Kubernetes"],
        }
        # min_count=2 requires two overlapping topics — only one shared.
        created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=2)
        assert created == []
        assert palace_graph.list_tunnels() == []

    def test_compute_topic_tunnels_above_threshold_creates_per_topic_links(
        self, tmp_path, monkeypatch
    ):
        """Once the threshold is met, every shared topic gets its own tunnel."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        topics_by_wing = {
            "wing_alpha": ["Angular", "OpenAPI", "Postgres"],
            "wing_beta": ["Angular", "OpenAPI", "Redis"],
        }
        created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=2)
        # Two shared topics × one wing pair = two tunnels.
        rooms = sorted(t["source"]["room"] for t in created)
        assert rooms == ["Angular", "OpenAPI"]

    def test_compute_topic_tunnels_case_insensitive_overlap(self, tmp_path, monkeypatch):
        """Topics differing only in letter case count as the same topic."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        topics_by_wing = {
            "wing_alpha": ["openapi"],
            "wing_beta": ["OpenAPI"],
        }
        created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1)
        assert len(created) == 1

    def test_compute_topic_tunnels_empty_input_is_noop(self, tmp_path, monkeypatch):
        """An empty mapping, or a single wing without topics, creates nothing."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        assert palace_graph.compute_topic_tunnels({}) == []
        assert palace_graph.compute_topic_tunnels({"wing_a": []}) == []
        assert palace_graph.list_tunnels() == []

    def test_compute_topic_tunnels_three_wings_pairwise(self, tmp_path, monkeypatch):
        """Three wings sharing one topic are linked pairwise."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        topics_by_wing = {
            "wing_a": ["foo"],
            "wing_b": ["foo"],
            "wing_c": ["foo"],
        }
        created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1)
        # 3 wings sharing the same topic → C(3,2) = 3 pairs → 3 tunnels.
        assert len(created) == 3
        endpoint_pairs = {
            tuple(sorted([t["source"]["wing"], t["target"]["wing"]])) for t in created
        }
        assert endpoint_pairs == {
            ("wing_a", "wing_b"),
            ("wing_a", "wing_c"),
            ("wing_b", "wing_c"),
        }

    def test_topic_tunnels_for_wing_only_links_that_wing(self, tmp_path, monkeypatch):
        """The per-wing pass only creates tunnels that include the given wing."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        topics_by_wing = {
            "wing_a": ["foo", "bar"],
            "wing_b": ["foo"],
            "wing_c": ["bar"],
        }
        # wing_a should link to both b (via foo) and c (via bar).
        created = palace_graph.topic_tunnels_for_wing("wing_a", topics_by_wing)
        endpoint_pairs = {
            tuple(sorted([t["source"]["wing"], t["target"]["wing"]])) for t in created
        }
        assert endpoint_pairs == {("wing_a", "wing_b"), ("wing_a", "wing_c")}
        # The b-c pair is NOT created because wing_a's incremental pass
        # only computes pairs that include wing_a.
        assert len(palace_graph.list_tunnels()) == 2

    def test_topic_tunnels_for_wing_unknown_wing_is_noop(self, tmp_path, monkeypatch):
        """A wing absent from the mapping yields no tunnels and stores none."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        topics_by_wing = {"wing_a": ["foo"], "wing_b": ["foo"]}
        assert palace_graph.topic_tunnels_for_wing("wing_missing", topics_by_wing) == []
        assert palace_graph.list_tunnels() == []

    def test_compute_topic_tunnels_dedupe_on_recompute(self, tmp_path, monkeypatch):
        """Recomputing with identical input reuses the already stored tunnel."""
        _use_tmp_tunnel_file(monkeypatch, tmp_path)
        topics_by_wing = {
            "wing_alpha": ["OpenAPI"],
            "wing_beta": ["OpenAPI"],
        }
        first = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1)
        second = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1)
        # create_tunnel is symmetric/dedupe — repeated computation should
        # not multiply the stored tunnels.
        assert first[0]["id"] == second[0]["id"]
        assert len(palace_graph.list_tunnels()) == 1
+4 -1
View File
@@ -363,11 +363,14 @@ def test_to_detected_dict_shape():
projects = [ProjectInfo(name="p", repo_root=Path("."), is_mine=True, manifest="package.json")] projects = [ProjectInfo(name="p", repo_root=Path("."), is_mine=True, manifest="package.json")]
people = [PersonInfo(name="Jane Doe", total_commits=5, repos={"r"})] people = [PersonInfo(name="Jane Doe", total_commits=5, repos={"r"})]
d = to_detected_dict(projects, people) d = to_detected_dict(projects, people)
assert set(d.keys()) == {"people", "projects", "uncertain"} # ``topics`` is the LLM-refine bucket for cross-wing tunnel signal —
# always present even when empty so callers can rely on the shape.
assert set(d.keys()) == {"people", "projects", "topics", "uncertain"}
assert d["projects"][0]["name"] == "p" assert d["projects"][0]["name"] == "p"
assert d["projects"][0]["type"] == "project" assert d["projects"][0]["type"] == "project"
assert d["people"][0]["name"] == "Jane Doe" assert d["people"][0]["name"] == "Jane Doe"
assert d["people"][0]["type"] == "person" assert d["people"][0]["type"] == "person"
assert d["topics"] == []
assert d["uncertain"] == [] assert d["uncertain"] == []