feat(graph): cross-wing tunnels by shared topics (#1180)

When two wings have one or more confirmed TOPIC labels in common, the
miner now drops a symmetric tunnel between them at mine time so the
palace graph reflects shared themes (frameworks, vendors, recurring
concepts).

- llm_refine: TOPIC label routes to a dedicated `topics` bucket so the
  signal survives confirmation instead of getting collapsed into
  `uncertain` and dropped.
- entity_detector / project_scanner: bucket plumbed through the
  detection pipeline; `confirm_entities` returns confirmed topics
  alongside people/projects.
- miner.add_to_known_entities: optional `wing` parameter records the
  confirmed topics under `topics_by_wing` in
  `~/.mempalace/known_entities.json`. Wing names do NOT leak into the
  flat known-name set used by drawer-tagging.
- palace_graph: `compute_topic_tunnels` and `topic_tunnels_for_wing`
  create symmetric tunnels via the existing `create_tunnel` API so they
  share dedup and persistence with explicit tunnels.
- miner.mine: post-file-loop pass calls `topic_tunnels_for_wing` for
  the freshly-mined wing. Failures are logged but never abort the mine.
- config: `topic_tunnel_min_count` knob (env
  `MEMPALACE_TOPIC_TUNNEL_MIN_COUNT` or `~/.mempalace/config.json`),
  default 1.

Tests cover topic persistence through init->mine, tunnel creation when
wings share a topic, no tunnel below threshold, cross-wing tunnel
retrieval via `list_tunnels`, dedup on recompute, case-insensitive
overlap, and the end-to-end mine-time wiring.

Out of scope for this PR (called out in the PR body): manifest-
dependency overlap, per-topic allow/deny lists, search-result surfacing.
This commit is contained in:
Igor Lins e Silva
2026-04-24 19:19:58 -03:00
parent ed2ba726c9
commit fe051adc73
14 changed files with 678 additions and 28 deletions
+18 -5
View File
@@ -117,21 +117,34 @@ def cmd_init(args):
if languages_tuple != ("en",):
print(f" Languages: {', '.join(languages_tuple)}")
detected = discover_entities(args.dir, languages=languages_tuple, llm_provider=llm_provider)
total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
total = (
len(detected["people"])
+ len(detected["projects"])
+ len(detected.get("topics", []))
+ len(detected["uncertain"])
)
if total > 0:
confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
# Save confirmed entities to <project>/entities.json (per-project
# audit trail — user can inspect or hand-edit) AND merge into the
# global registry the miner reads at mine time.
if confirmed["people"] or confirmed["projects"]:
entities_path = Path(args.dir).expanduser().resolve() / "entities.json"
# global registry the miner reads at mine time. Topics are kept
# separately so the miner can later compute cross-wing tunnels
# from shared topics (see palace_graph.compute_topic_tunnels).
if confirmed["people"] or confirmed["projects"] or confirmed.get("topics"):
project_path = Path(args.dir).expanduser().resolve()
entities_path = project_path / "entities.json"
with open(entities_path, "w", encoding="utf-8") as f:
json.dump(confirmed, f, indent=2, ensure_ascii=False)
print(f" Entities saved: {entities_path}")
from .miner import add_to_known_entities
registry_path = add_to_known_entities(confirmed)
# Wing matches the default produced by ``room_detector_local``
# (folder basename) and the miner fallback in ``load_config``.
# Used by the topics_by_wing map so cross-wing tunnels can be
# computed at mine time.
wing = project_path.name
registry_path = add_to_known_entities(confirmed, wing=wing)
print(f" Registry updated: {registry_path}")
else:
print(" No entities detected — proceeding with directory-based rooms.")
+26
View File
@@ -253,6 +253,32 @@ class MempalaceConfig:
return env_val.strip().lower()
return str(self._file_config.get("embedding_device", "auto")).strip().lower()
@property
def topic_tunnel_min_count(self):
    """Return how many shared confirmed topics two wings need to be linked.

    Resolution order:

    1. ``MEMPALACE_TOPIC_TUNNEL_MIN_COUNT`` from the environment, used only
       when it parses as an integer ``>= 1``.
    2. ``topic_tunnel_min_count`` from the file config, clamped to a
       minimum of ``1``; a missing or unparseable value yields ``1``.

    With the default of ``1`` any single shared topic links two wings.
    Raising it to ``2`` or more helps when projects share many generic
    technology labels (Python, Docker, Git) and only substantially
    overlapping wings should be connected.
    """
    from_env = os.environ.get("MEMPALACE_TOPIC_TUNNEL_MIN_COUNT")
    if from_env:
        try:
            env_count = int(from_env)
        except ValueError:
            # Unparseable override: ignore it and consult the config file.
            env_count = 0
        if env_count >= 1:
            return env_count

    from_file = self._file_config.get("topic_tunnel_min_count")
    if from_file is None:
        return 1
    try:
        file_count = int(from_file)
    except (TypeError, ValueError):
        return 1
    # Never report a threshold below one shared topic.
    return file_count if file_count > 1 else 1
@property
def hook_silent_save(self):
"""Whether the stop hook saves directly (True) or blocks for MCP calls (False)."""
+24 -4
View File
@@ -440,7 +440,7 @@ def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) ->
candidates = extract_candidates(combined_text, languages=langs)
if not candidates:
return {"people": [], "projects": [], "uncertain": []}
return {"people": [], "projects": [], "topics": [], "uncertain": []}
# Score and classify each candidate
people = []
@@ -467,6 +467,7 @@ def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) ->
return {
"people": people[:15],
"projects": projects[:10],
"topics": [],
"uncertain": uncertain[:8],
}
@@ -489,7 +490,13 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict:
"""
Interactive confirmation step.
User reviews detected entities, removes wrong ones, adds missing ones.
Returns confirmed {people: [names], projects: [names]}
Returns confirmed {people: [names], projects: [names], topics: [names]}.
Topics are not surfaced for interactive review — they come from the
LLM-refined ``TOPIC`` bucket and are passed through verbatim. They
feed cross-wing tunnel computation at mine time (see
``palace_graph.compute_topic_tunnels``); a wrong topic at worst adds
a low-traffic tunnel and never alters drawer storage.
Pass yes=True to auto-accept all detected entities without prompting.
"""
@@ -501,18 +508,28 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict:
_print_entity_list(detected["people"], "PEOPLE")
_print_entity_list(detected["projects"], "PROJECTS")
if detected.get("topics"):
_print_entity_list(detected["topics"], "TOPICS (cross-wing tunnel signal)")
if detected["uncertain"]:
_print_entity_list(detected["uncertain"], "UNCERTAIN (need your call)")
confirmed_people = [e["name"] for e in detected["people"]]
confirmed_projects = [e["name"] for e in detected["projects"]]
confirmed_topics = [e["name"] for e in detected.get("topics", [])]
if yes:
# Auto-accept: include all detected (skip uncertain — ambiguous without user input)
print(
f"\n Auto-accepting {len(confirmed_people)} people, {len(confirmed_projects)} projects."
f"\n Auto-accepting {len(confirmed_people)} people, "
f"{len(confirmed_projects)} projects, "
f"{len(confirmed_topics)} topics."
)
return {"people": confirmed_people, "projects": confirmed_projects}
return {
"people": confirmed_people,
"projects": confirmed_projects,
"topics": confirmed_topics,
}
print(f"\n{'' * 58}")
print(" Options:")
@@ -570,11 +587,14 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict:
print(" Confirmed:")
print(f" People: {', '.join(confirmed_people) or '(none)'}")
print(f" Projects: {', '.join(confirmed_projects) or '(none)'}")
if confirmed_topics:
print(f" Topics: {', '.join(confirmed_topics)}")
print(f"{'=' * 58}\n")
return {
"people": confirmed_people,
"projects": confirmed_projects,
"topics": confirmed_topics,
}
+14 -9
View File
@@ -197,13 +197,23 @@ def _apply_classifications(
"""Merge LLM decisions back into the detected dict.
Returns (new_detected, reclassified_count, dropped_count).
Topics get their own bucket so the caller can persist them as
cross-wing tunnel signal. ``AMBIGUOUS`` still falls back to
``uncertain`` for human review.
"""
label_to_bucket = {
"PERSON": "people",
"PROJECT": "projects",
"TOPIC": "uncertain",
"TOPIC": "topics",
"AMBIGUOUS": "uncertain",
}
bucket_to_type = {
"people": "person",
"projects": "project",
"topics": "topic",
"uncertain": "uncertain",
}
# Index every entity by name for in-place update
all_entries: list[tuple[str, dict]] = []
@@ -216,6 +226,7 @@ def _apply_classifications(
new_detected: dict[str, list[dict]] = {
"people": [],
"projects": [],
"topics": [],
"uncertain": [],
}
@@ -223,7 +234,7 @@ def _apply_classifications(
decision = decisions.get(entry["name"])
if decision is None:
# No LLM opinion — keep as-is
new_detected[old_bucket].append(entry)
new_detected.setdefault(old_bucket, []).append(entry)
continue
label, reason = decision
@@ -245,13 +256,7 @@ def _apply_classifications(
updated["signals"] = signals
if target_bucket != old_bucket:
reclassified += 1
updated["type"] = (
"person"
if target_bucket == "people"
else "project"
if target_bucket == "projects"
else "uncertain"
)
updated["type"] = bucket_to_type.get(target_bucket, "uncertain")
new_detected[target_bucket].append(updated)
return new_detected, reclassified, dropped
+118 -2
View File
@@ -439,7 +439,16 @@ def _refresh_known_entities_cache() -> None:
data = json.load(f)
if isinstance(data, dict):
raw = data
for cat in data.values():
for cat_key, cat in data.items():
# Special wing-keyed map — its inner values are topic
# names but its outer keys are wings, which must NOT be
# surfaced as known entities. Pull the topic names out
# explicitly instead of treating it as a generic category.
if cat_key == "topics_by_wing" and isinstance(cat, dict):
for topic_list in cat.values():
if isinstance(topic_list, list):
names.update(str(n) for n in topic_list if n)
continue
if isinstance(cat, list):
names.update(str(n) for n in cat if n)
elif isinstance(cat, dict):
@@ -474,7 +483,39 @@ def _load_known_entities_raw() -> dict:
return dict(_ENTITY_REGISTRY_CACHE["raw"])
def add_to_known_entities(entities_by_category: dict) -> str:
def _set_wing_topics(existing: dict, wing_key: str, topics_for_wing: list, coerce) -> None:
    """Overwrite ``existing['topics_by_wing'][wing_key]`` with a deduped list.

    The wing's previous topics are replaced rather than merged, so the
    stored list always mirrors the most recent call. Names are passed
    through ``coerce`` first; falsy results are discarded. Duplicates are
    detected case-insensitively and the first casing encountered wins.

    If nothing survives, the wing's entry is removed; if that leaves the
    map empty, the ``topics_by_wing`` key itself is removed. A non-dict
    value already stored under ``topics_by_wing`` is discarded and replaced
    by a fresh map.

    Args:
        existing: registry dict, mutated in place.
        wing_key: wing name used as the key inside ``topics_by_wing``.
        topics_for_wing: candidate topic names for this wing.
        coerce: callable turning a raw entry into a name (or a falsy value
            to drop it).
    """
    by_wing = existing.get("topics_by_wing")
    if not isinstance(by_wing, dict):
        by_wing = {}

    # Insertion-ordered dict keyed by the lowercased name: keeps the first
    # casing seen and preserves the caller's ordering.
    first_seen: dict = {}
    for raw in topics_for_wing:
        label = coerce(raw)
        if label:
            first_seen.setdefault(label.lower(), label)
    deduped = list(first_seen.values())

    if deduped:
        by_wing[wing_key] = deduped
    else:
        by_wing.pop(wing_key, None)

    if not by_wing:
        existing.pop("topics_by_wing", None)
        return
    existing["topics_by_wing"] = by_wing
def add_to_known_entities(entities_by_category: dict, wing: str = None) -> str:
"""Union ``entities_by_category`` into ``~/.mempalace/known_entities.json``.
Accepts ``{category: [names]}`` shape as produced by ``mempalace init``
@@ -488,6 +529,15 @@ def add_to_known_entities(entities_by_category: dict) -> str:
added as keys with ``None`` values so existing code mappings aren't
overwritten. A later compress pass can assign codes.
When ``wing`` is provided AND ``entities_by_category`` contains a
``topics`` list, those topics are also recorded under
``topics_by_wing[wing]`` (case-insensitive dedup, preserving the
casing of the first observed name). This is the signal source for
``palace_graph.compute_topic_tunnels`` at mine time. Topics for a
wing are *replaced*, not unioned, so a re-run of ``init`` reflects
the user's latest confirmation rather than accumulating stale labels
indefinitely.
The in-process cache is invalidated on write so same-process callers
(notably ``cmd_init`` → ``cmd_mine`` in sequence) see the update
immediately instead of waiting for a mtime re-check.
@@ -515,7 +565,16 @@ def add_to_known_entities(entities_by_category: dict) -> str:
name = str(value)
return name if name else None
# Separate the topics_by_wing key from regular categories so we don't
# treat it as a flat name-list elsewhere in this function.
topics_for_wing = None
if wing and isinstance(wing, str) and wing.strip():
topics_for_wing = entities_by_category.get("topics") or []
for category, names in entities_by_category.items():
if category == "topics_by_wing":
# Reserved key — managed separately below.
continue
if not isinstance(names, list) or not names:
continue
current = existing.get(category)
@@ -551,6 +610,9 @@ def add_to_known_entities(entities_by_category: dict) -> str:
ordered.append(name)
existing[category] = ordered
if topics_for_wing is not None:
_set_wing_topics(existing, wing.strip(), topics_for_wing, _coerce_name)
registry_path.write_text(_json.dumps(existing, indent=2, ensure_ascii=False), encoding="utf-8")
try:
registry_path.chmod(0o600)
@@ -565,6 +627,28 @@ def add_to_known_entities(entities_by_category: dict) -> str:
return str(registry_path)
def get_topics_by_wing() -> dict:
    """Read the ``topics_by_wing`` map out of the known-entities registry.

    Only well-formed entries are returned: wing keys must be non-blank
    strings (returned stripped), values must be lists, and only non-blank
    string topics are kept. Wings left with no topics are omitted. Topic
    casing is returned as stored, so callers needing case-insensitive
    matching must normalize on their side.

    Returns:
        ``{wing: [topic, ...]}``, or ``{}`` when the registry has no dict
        under ``topics_by_wing``.
    """
    stored = _load_known_entities_raw().get("topics_by_wing")
    if not isinstance(stored, dict):
        return {}

    result: dict = {}
    for wing_name, topic_list in stored.items():
        # Evaluates to the stripped wing name when usable, else a falsy value.
        usable_wing = isinstance(wing_name, str) and wing_name.strip()
        if not usable_wing or not isinstance(topic_list, list):
            continue
        kept = [str(item) for item in topic_list if isinstance(item, str) and item.strip()]
        if kept:
            result[usable_wing] = kept
    return result
_HALL_KEYWORDS_CACHE = None
@@ -962,6 +1046,19 @@ def mine(
if not dry_run:
print(f" + [{i:4}/{len(files)}] {filepath.name[:50]:50} +{drawers}")
if not dry_run:
# Cross-wing topic tunnels: after every file in this wing has been
# processed, link this wing to any other wing that shares a
# confirmed TOPIC label. Out of scope for v1: manifest-dependency
# overlap, per-topic allow/deny lists, search-result surfacing.
try:
tunnels_added = _compute_topic_tunnels_for_wing(wing)
if tunnels_added:
print(f"\n Topic tunnels: +{tunnels_added} cross-wing link(s)")
except Exception as e:
# Tunnel computation must never fail a mine — degrade quietly.
print(f"\n WARNING: topic tunnel computation skipped — {e}", file=sys.stderr)
print(f"\n{'=' * 55}")
print(" Done.")
print(f" Files processed: {len(files) - files_skipped}")
@@ -974,6 +1071,25 @@ def mine(
print(f"{'=' * 55}\n")
def _compute_topic_tunnels_for_wing(wing: str) -> int:
    """Link ``wing`` to other wings that share its confirmed topics.

    Loads the wing-to-topics map via ``get_topics_by_wing`` and, when
    ``wing`` has an entry, delegates to
    ``palace_graph.topic_tunnels_for_wing`` using the configured
    ``topic_tunnel_min_count`` threshold.

    Returns:
        The length of the list returned by ``topic_tunnels_for_wing``, or
        ``0`` when the map is empty or has no entry for ``wing``.
    """
    from .config import MempalaceConfig
    from .palace_graph import topic_tunnels_for_wing

    registry_topics = get_topics_by_wing()
    if registry_topics and wing in registry_topics:
        threshold = MempalaceConfig().topic_tunnel_min_count
        return len(topic_tunnels_for_wing(wing, registry_topics, min_count=threshold))
    return 0
# =============================================================================
# STATUS
# =============================================================================
+138
View File
@@ -499,3 +499,141 @@ def follow_tunnels(wing: str, room: str, col=None, config=None):
pass
return connections
# =============================================================================
# TOPIC TUNNELS — auto-link wings that share confirmed TOPIC labels
# =============================================================================
# When two wings have one or more confirmed topics in common (e.g. both
# discuss "Angular" or "OpenAPI"), drop a symmetric tunnel between them.
# Topics come from the LLM-refined ``TOPIC`` bucket in the per-project
# ``entities.json`` and are persisted by wing in
# ``~/.mempalace/known_entities.json`` under ``topics_by_wing``.
#
# Tunnels are created via the existing ``create_tunnel`` API so they share
# storage and dedup with explicit tunnels. The room is the topic name —
# this matches the "two wings share an idea" mental model and keeps the
# graph homogeneous.
def _normalize_topic(name: str) -> str:
    """Return ``name`` as text, trimmed and lowercased, for overlap matching."""
    text = str(name)
    return text.strip().lower()
def compute_topic_tunnels(
    topics_by_wing: dict,
    min_count: int = 1,
    label_prefix: str = "shared topic",
) -> list[dict]:
    """Create a tunnel per shared topic for every sufficiently overlapping wing pair.

    Args:
        topics_by_wing: ``{wing_name: [topic_name, ...]}``. Wing names that
            are not non-blank strings, topic collections that are not a
            list or tuple, and topic entries that are not non-blank strings
            are all ignored. Topics are matched case-insensitively.
        min_count: number of shared topics a wing pair needs before *any*
            tunnel is created for it. Values below ``1`` are treated as
            ``1``.
        label_prefix: text placed before the topic in each tunnel label
            (``"<label_prefix>: <topic>"``).

    Returns:
        The values returned by ``create_tunnel``, one per
        ``(wing_a, wing_b, topic)`` combination that met the threshold.
        Pairs are visited in sorted wing order and topics in sorted
        (normalized) order, so the result order is deterministic. A pair
        below the threshold contributes nothing. An empty or ``None``
        ``topics_by_wing`` yields ``[]``.
    """
    if not topics_by_wing:
        return []
    threshold = max(1, int(min_count))

    # wing -> {normalized topic: display casing}; the first spelling seen
    # within a wing is the one kept for display.
    casing_by_wing: dict[str, dict[str, str]] = {}
    for wing_name, raw_topics in topics_by_wing.items():
        if not (isinstance(wing_name, str) and wing_name.strip()):
            continue
        if not isinstance(raw_topics, (list, tuple)):
            continue
        display: dict[str, str] = {}
        for raw in raw_topics:
            if isinstance(raw, str):
                folded = _normalize_topic(raw)
                if folded and folded not in display:
                    display[folded] = raw.strip()
        if display:
            casing_by_wing[wing_name.strip()] = display

    ordered_wings = sorted(casing_by_wing)
    tunnels: list[dict] = []
    for idx, first in enumerate(ordered_wings):
        first_topics = casing_by_wing[first]
        # Only look at wings later in the ordering so each pair is seen once.
        for second in ordered_wings[idx + 1 :]:
            second_topics = casing_by_wing[second]
            overlap = first_topics.keys() & second_topics.keys()
            if len(overlap) < threshold:
                continue
            for folded in sorted(overlap):
                # The room is named after the topic, using the casing from
                # the wing that sorts first.
                room = first_topics[folded] or second_topics[folded]
                tunnels.append(
                    create_tunnel(
                        source_wing=first,
                        source_room=room,
                        target_wing=second,
                        target_room=room,
                        label=f"{label_prefix}: {room}",
                    )
                )
    return tunnels
def topic_tunnels_for_wing(
    wing: str,
    topics_by_wing: dict,
    min_count: int = 1,
    label_prefix: str = "shared topic",
) -> list[dict]:
    """Create topic tunnels only for pairs that include ``wing``.

    For every other usable wing in ``topics_by_wing``, a two-wing map is
    handed to ``compute_topic_tunnels`` so the threshold and casing rules
    are applied in exactly one place. Pairs not involving ``wing`` are
    never evaluated.

    Args:
        wing: the wing to link; surrounding whitespace is ignored.
        topics_by_wing: ``{wing_name: [topic_name, ...]}`` mapping.
        min_count: forwarded to ``compute_topic_tunnels``.
        label_prefix: forwarded to ``compute_topic_tunnels``.

    Returns:
        The concatenated results of the ``compute_topic_tunnels`` calls.
        ``[]`` when the mapping is empty, ``wing`` is not a non-blank
        string, or ``wing`` has no non-empty list/tuple of topics.
    """
    if not topics_by_wing or not isinstance(wing, str):
        return []
    wing = wing.strip()
    if not wing:
        return []

    own_topics = topics_by_wing.get(wing)
    if not (isinstance(own_topics, (list, tuple)) and own_topics):
        return []

    tunnels: list[dict] = []
    for other_wing, their_topics in topics_by_wing.items():
        is_named = isinstance(other_wing, str) and other_wing.strip()
        if not is_named or other_wing == wing:
            continue
        if not isinstance(their_topics, (list, tuple)) or not their_topics:
            continue
        pair_map = {wing: list(own_topics), other_wing: list(their_topics)}
        tunnels.extend(
            compute_topic_tunnels(
                pair_map,
                min_count=min_count,
                label_prefix=label_prefix,
            )
        )
    return tunnels
+3 -2
View File
@@ -558,6 +558,7 @@ def to_detected_dict(
return {
"people": people_entries,
"projects": proj_entries,
"topics": [],
"uncertain": [],
}
@@ -577,7 +578,7 @@ def _merge_detected(primary: dict, secondary: dict, drop_secondary_uncertain: bo
"""
seen = {e["name"].lower() for cat in primary.values() for e in cat}
merged = {k: list(v) for k, v in primary.items()}
for cat_key in ("people", "projects", "uncertain"):
for cat_key in ("people", "projects", "topics", "uncertain"):
if cat_key == "uncertain" and drop_secondary_uncertain:
continue
for e in secondary.get(cat_key, []):
@@ -654,7 +655,7 @@ def discover_entities(
prose_detected = (
detect_entities(prose_files, languages=languages)
if prose_files
else {"people": [], "projects": [], "uncertain": []}
else {"people": [], "projects": [], "topics": [], "uncertain": []}
)
# Without LLM refinement, suppress regex "uncertain" noise when real