feat(graph): cross-wing tunnels by shared topics (#1180)

When two wings have one or more confirmed TOPIC labels in common, the miner now drops a symmetric tunnel between them at mine time so the palace graph reflects shared themes (frameworks, vendors, recurring concepts).

- llm_refine: TOPIC label routes to a dedicated `topics` bucket so the signal survives confirmation instead of getting collapsed into `uncertain` and dropped.
- entity_detector / project_scanner: bucket plumbed through the detection pipeline; `confirm_entities` returns confirmed topics alongside people/projects.
- miner.add_to_known_entities: optional `wing` parameter records the confirmed topics under `topics_by_wing` in `~/.mempalace/known_entities.json`. Wing names do NOT leak into the flat known-name set used by drawer-tagging.
- palace_graph: `compute_topic_tunnels` and `topic_tunnels_for_wing` create symmetric tunnels via the existing `create_tunnel` API so they share dedup and persistence with explicit tunnels.
- miner.mine: post-file-loop pass calls `topic_tunnels_for_wing` for the freshly-mined wing. Failures are logged but never abort the mine.
- config: `topic_tunnel_min_count` knob (env `MEMPALACE_TOPIC_TUNNEL_MIN_COUNT` or `~/.mempalace/config.json`), default 1.

Tests cover topic persistence through init->mine, tunnel creation when wings share a topic, no tunnel below threshold, cross-wing tunnel retrieval via `list_tunnels`, dedup on recompute, case-insensitive overlap, and the end-to-end mine-time wiring.

Out of scope for this PR (called out in the PR body): manifest-dependency overlap, per-topic allow/deny lists, search-result surfacing.
2026-04-24 19:19:58 -03:00
parent ed2ba726c9
commit fe051adc73
14 changed files with 678 additions and 28 deletions
@@ -117,21 +117,34 @@ def cmd_init(args):
    if languages_tuple != ("en",):
        print(f"  Languages: {', '.join(languages_tuple)}")
    detected = discover_entities(args.dir, languages=languages_tuple, llm_provider=llm_provider)
-    total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
+    total = (
+        len(detected["people"])
+        + len(detected["projects"])
+        + len(detected.get("topics", []))
+        + len(detected["uncertain"])
+    )
    if total > 0:
        confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
        # Save confirmed entities to <project>/entities.json (per-project
        # audit trail — user can inspect or hand-edit) AND merge into the
-        # global registry the miner reads at mine time.
-        if confirmed["people"] or confirmed["projects"]:
-            entities_path = Path(args.dir).expanduser().resolve() / "entities.json"
+        # global registry the miner reads at mine time. Topics are kept
+        # separately so the miner can later compute cross-wing tunnels
+        # from shared topics (see palace_graph.compute_topic_tunnels).
+        if confirmed["people"] or confirmed["projects"] or confirmed.get("topics"):
+            project_path = Path(args.dir).expanduser().resolve()
+            entities_path = project_path / "entities.json"
            with open(entities_path, "w", encoding="utf-8") as f:
                json.dump(confirmed, f, indent=2, ensure_ascii=False)
            print(f"  Entities saved: {entities_path}")

            from .miner import add_to_known_entities

-            registry_path = add_to_known_entities(confirmed)
+            # Wing matches the default produced by ``room_detector_local``
+            # (folder basename) and the miner fallback in ``load_config``.
+            # Used by the topics_by_wing map so cross-wing tunnels can be
+            # computed at mine time.
+            wing = project_path.name
+            registry_path = add_to_known_entities(confirmed, wing=wing)
            print(f"  Registry updated: {registry_path}")
    else:
        print("  No entities detected — proceeding with directory-based rooms.")
@@ -253,6 +253,32 @@ class MempalaceConfig:
            return env_val.strip().lower()
        return str(self._file_config.get("embedding_device", "auto")).strip().lower()

+    @property
+    def topic_tunnel_min_count(self):
+        """Return how many confirmed topics two wings must share to be tunneled.
+
+        Lookup order:
+
+        1. ``MEMPALACE_TOPIC_TUNNEL_MIN_COUNT`` from the environment, used
+           only when it parses as an integer ``>= 1``; an empty, non-numeric
+           or smaller value is ignored rather than treated as an error.
+        2. ``topic_tunnel_min_count`` from the file config, clamped so the
+           result is never below ``1``; a value ``int()`` rejects counts as
+           unset.
+        3. ``1`` — a single shared topic is enough to link two wings.
+
+        Raise the threshold to ``2`` or more when many projects share
+        generic labels (Python, Docker, Git) and only substantially
+        overlapping wings should be linked.
+        """
+        raw_env = os.environ.get("MEMPALACE_TOPIC_TUNNEL_MIN_COUNT")
+        if raw_env:
+            try:
+                from_env = int(raw_env)
+            except ValueError:
+                from_env = None
+            # An unusable override falls through to the file config below.
+            if from_env is not None and from_env >= 1:
+                return from_env
+
+        raw_cfg = self._file_config.get("topic_tunnel_min_count")
+        if raw_cfg is None:
+            return 1
+        try:
+            from_cfg = int(raw_cfg)
+        except (TypeError, ValueError):
+            return 1
+        return from_cfg if from_cfg > 1 else 1
+
    @property
    def hook_silent_save(self):
        """Whether the stop hook saves directly (True) or blocks for MCP calls (False)."""
@@ -440,7 +440,7 @@ def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) ->
    candidates = extract_candidates(combined_text, languages=langs)

    if not candidates:
-        return {"people": [], "projects": [], "uncertain": []}
+        return {"people": [], "projects": [], "topics": [], "uncertain": []}

    # Score and classify each candidate
    people = []
@@ -467,6 +467,7 @@ def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) ->
    return {
        "people": people[:15],
        "projects": projects[:10],
+        "topics": [],
        "uncertain": uncertain[:8],
    }

@@ -489,7 +490,13 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict:
    """
    Interactive confirmation step.
    User reviews detected entities, removes wrong ones, adds missing ones.
-    Returns confirmed {people: [names], projects: [names]}
+    Returns confirmed {people: [names], projects: [names], topics: [names]}.
+
+    Topics are not surfaced for interactive review — they come from the
+    LLM-refined ``TOPIC`` bucket and are passed through verbatim. They
+    feed cross-wing tunnel computation at mine time (see
+    ``palace_graph.compute_topic_tunnels``); a wrong topic at worst adds
+    a low-traffic tunnel and never alters drawer storage.

    Pass yes=True to auto-accept all detected entities without prompting.
    """
@@ -501,18 +508,28 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict:
    _print_entity_list(detected["people"], "PEOPLE")
    _print_entity_list(detected["projects"], "PROJECTS")

+    if detected.get("topics"):
+        _print_entity_list(detected["topics"], "TOPICS (cross-wing tunnel signal)")
+
    if detected["uncertain"]:
        _print_entity_list(detected["uncertain"], "UNCERTAIN (need your call)")

    confirmed_people = [e["name"] for e in detected["people"]]
    confirmed_projects = [e["name"] for e in detected["projects"]]
+    confirmed_topics = [e["name"] for e in detected.get("topics", [])]

    if yes:
        # Auto-accept: include all detected (skip uncertain — ambiguous without user input)
        print(
-            f"\n  Auto-accepting {len(confirmed_people)} people, {len(confirmed_projects)} projects."
+            f"\n  Auto-accepting {len(confirmed_people)} people, "
+            f"{len(confirmed_projects)} projects, "
+            f"{len(confirmed_topics)} topics."
        )
-        return {"people": confirmed_people, "projects": confirmed_projects}
+        return {
+            "people": confirmed_people,
+            "projects": confirmed_projects,
+            "topics": confirmed_topics,
+        }

    print(f"\n{'─' * 58}")
    print("  Options:")
@@ -570,11 +587,14 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict:
    print("  Confirmed:")
    print(f"  People:   {', '.join(confirmed_people) or '(none)'}")
    print(f"  Projects: {', '.join(confirmed_projects) or '(none)'}")
+    if confirmed_topics:
+        print(f"  Topics:   {', '.join(confirmed_topics)}")
    print(f"{'=' * 58}\n")

    return {
        "people": confirmed_people,
        "projects": confirmed_projects,
+        "topics": confirmed_topics,
    }


@@ -197,13 +197,23 @@ def _apply_classifications(
    """Merge LLM decisions back into the detected dict.

    Returns (new_detected, reclassified_count, dropped_count).
+
+    Topics get their own bucket so the caller can persist them as
+    cross-wing tunnel signal. ``AMBIGUOUS`` still falls back to
+    ``uncertain`` for human review.
    """
    label_to_bucket = {
        "PERSON": "people",
        "PROJECT": "projects",
-        "TOPIC": "uncertain",
+        "TOPIC": "topics",
        "AMBIGUOUS": "uncertain",
    }
+    bucket_to_type = {
+        "people": "person",
+        "projects": "project",
+        "topics": "topic",
+        "uncertain": "uncertain",
+    }

    # Index every entity by name for in-place update
    all_entries: list[tuple[str, dict]] = []
@@ -216,6 +226,7 @@ def _apply_classifications(
    new_detected: dict[str, list[dict]] = {
        "people": [],
        "projects": [],
+        "topics": [],
        "uncertain": [],
    }

@@ -223,7 +234,7 @@ def _apply_classifications(
        decision = decisions.get(entry["name"])
        if decision is None:
            # No LLM opinion — keep as-is
-            new_detected[old_bucket].append(entry)
+            new_detected.setdefault(old_bucket, []).append(entry)
            continue

        label, reason = decision
@@ -245,13 +256,7 @@ def _apply_classifications(
        updated["signals"] = signals
        if target_bucket != old_bucket:
            reclassified += 1
-            updated["type"] = (
-                "person"
-                if target_bucket == "people"
-                else "project"
-                if target_bucket == "projects"
-                else "uncertain"
-            )
+            updated["type"] = bucket_to_type.get(target_bucket, "uncertain")
        new_detected[target_bucket].append(updated)

    return new_detected, reclassified, dropped
@@ -439,7 +439,16 @@ def _refresh_known_entities_cache() -> None:
            data = json.load(f)
        if isinstance(data, dict):
            raw = data
-            for cat in data.values():
+            for cat_key, cat in data.items():
+                # Special wing-keyed map — its inner values are topic
+                # names but its outer keys are wings, which must NOT be
+                # surfaced as known entities. Pull the topic names out
+                # explicitly instead of treating it as a generic category.
+                if cat_key == "topics_by_wing" and isinstance(cat, dict):
+                    for topic_list in cat.values():
+                        if isinstance(topic_list, list):
+                            names.update(str(n) for n in topic_list if n)
+                    continue
                if isinstance(cat, list):
                    names.update(str(n) for n in cat if n)
                elif isinstance(cat, dict):
@@ -474,7 +483,39 @@ def _load_known_entities_raw() -> dict:
    return dict(_ENTITY_REGISTRY_CACHE["raw"])


-def add_to_known_entities(entities_by_category: dict) -> str:
+def _set_wing_topics(existing: dict, wing_key: str, topics_for_wing: list, coerce) -> None:
+    """Overwrite one wing's entry in ``existing['topics_by_wing']`` in place.
+
+    The wing's previous list is replaced rather than merged, so the stored
+    topics always mirror the most recent confirmation.
+
+    Args:
+        existing: registry dict to mutate.
+        wing_key: key under ``topics_by_wing`` to write.
+        topics_for_wing: candidate topic values, in priority order.
+        coerce: callable applied to each candidate; a falsy result drops
+            the candidate.
+
+    Names that differ only by case collapse to the first one seen. When
+    nothing survives, the wing entry is removed; when the map ends up
+    empty, the ``topics_by_wing`` key is removed as well. A non-dict
+    ``topics_by_wing`` value is discarded and rebuilt.
+    """
+    by_wing = existing.get("topics_by_wing")
+    if not isinstance(by_wing, dict):
+        by_wing = {}
+
+    # Insertion-ordered dict: lowercase key -> first casing encountered.
+    first_seen: dict = {}
+    for candidate in topics_for_wing:
+        label = coerce(candidate)
+        if label:
+            first_seen.setdefault(label.lower(), label)
+    deduped = list(first_seen.values())
+
+    if deduped:
+        by_wing[wing_key] = deduped
+    else:
+        by_wing.pop(wing_key, None)
+
+    if by_wing:
+        existing["topics_by_wing"] = by_wing
+    else:
+        existing.pop("topics_by_wing", None)
+
+
+def add_to_known_entities(entities_by_category: dict, wing: str = None) -> str:
    """Union ``entities_by_category`` into ``~/.mempalace/known_entities.json``.

    Accepts ``{category: [names]}`` shape as produced by ``mempalace init``
@@ -488,6 +529,15 @@ def add_to_known_entities(entities_by_category: dict) -> str:
    added as keys with ``None`` values so existing code mappings aren't
    overwritten. A later compress pass can assign codes.

+    When ``wing`` is provided AND ``entities_by_category`` contains a
+    ``topics`` list, those topics are also recorded under
+    ``topics_by_wing[wing]`` (case-insensitive dedup, preserving the
+    casing of the first observed name). This is the signal source for
+    ``palace_graph.compute_topic_tunnels`` at mine time. Topics for a
+    wing are *replaced*, not unioned, so a re-run of ``init`` reflects
+    the user's latest confirmation rather than accumulating stale labels
+    indefinitely.
+
    The in-process cache is invalidated on write so same-process callers
    (notably ``cmd_init`` → ``cmd_mine`` in sequence) see the update
    immediately instead of waiting for a mtime re-check.
@@ -515,7 +565,16 @@ def add_to_known_entities(entities_by_category: dict) -> str:
        name = str(value)
        return name if name else None

+    # Separate the topics_by_wing key from regular categories so we don't
+    # treat it as a flat name-list elsewhere in this function.
+    topics_for_wing = None
+    if wing and isinstance(wing, str) and wing.strip():
+        topics_for_wing = entities_by_category.get("topics") or []
+
    for category, names in entities_by_category.items():
+        if category == "topics_by_wing":
+            # Reserved key — managed separately below.
+            continue
        if not isinstance(names, list) or not names:
            continue
        current = existing.get(category)
@@ -551,6 +610,9 @@ def add_to_known_entities(entities_by_category: dict) -> str:
                ordered.append(name)
            existing[category] = ordered

+    if topics_for_wing is not None:
+        _set_wing_topics(existing, wing.strip(), topics_for_wing, _coerce_name)
+
    registry_path.write_text(_json.dumps(existing, indent=2, ensure_ascii=False), encoding="utf-8")
    try:
        registry_path.chmod(0o600)
@@ -565,6 +627,28 @@ def add_to_known_entities(entities_by_category: dict) -> str:
    return str(registry_path)


+def get_topics_by_wing() -> dict:
+    """Return a cleaned copy of the registry's ``topics_by_wing`` map.
+
+    The result is ``{wing: [topic, ...]}`` with wing names stripped of
+    surrounding whitespace. Entries are dropped when the wing key is not
+    a non-blank string, when the value is not a list, or when no
+    non-blank string topics remain. Topic casing is returned as stored;
+    callers needing case-insensitive comparison must normalize themselves.
+
+    Returns ``{}`` when the loaded registry has no dict under
+    ``topics_by_wing``.
+    """
+    stored = _load_known_entities_raw().get("topics_by_wing")
+    if not isinstance(stored, dict):
+        return {}
+
+    result: dict = {}
+    for wing_name, topic_list in stored.items():
+        if not (isinstance(wing_name, str) and wing_name.strip()):
+            continue
+        if not isinstance(topic_list, list):
+            continue
+        kept = [str(t) for t in topic_list if isinstance(t, str) and t.strip()]
+        if kept:
+            result[wing_name.strip()] = kept
+    return result
+
+
 _HALL_KEYWORDS_CACHE = None


@@ -962,6 +1046,19 @@ def mine(
            if not dry_run:
                print(f"  + [{i:4}/{len(files)}] {filepath.name[:50]:50} +{drawers}")

+    if not dry_run:
+        # Cross-wing topic tunnels: after every file in this wing has been
+        # processed, link this wing to any other wing that shares a
+        # confirmed TOPIC label. Out of scope for v1: manifest-dependency
+        # overlap, per-topic allow/deny lists, search-result surfacing.
+        try:
+            tunnels_added = _compute_topic_tunnels_for_wing(wing)
+            if tunnels_added:
+                print(f"\n  Topic tunnels: +{tunnels_added} cross-wing link(s)")
+        except Exception as e:
+            # Tunnel computation must never fail a mine — degrade quietly.
+            print(f"\n  WARNING: topic tunnel computation skipped — {e}", file=sys.stderr)
+
    print(f"\n{'=' * 55}")
    print("  Done.")
    print(f"  Files processed: {len(files) - files_skipped}")
@@ -974,6 +1071,25 @@ def mine(
    print(f"{'=' * 55}\n")


+def _compute_topic_tunnels_for_wing(wing: str) -> int:
+    """Link ``wing`` to every other wing it shares confirmed topics with.
+
+    Reads the wing -> topics map via ``get_topics_by_wing`` and the
+    ``topic_tunnel_min_count`` threshold from ``MempalaceConfig``, then
+    delegates to ``palace_graph.topic_tunnels_for_wing``.
+
+    Args:
+        wing: name of the wing to link. Surrounding whitespace is ignored
+            so the lookup agrees with the stripped keys returned by
+            ``get_topics_by_wing``.
+
+    Returns:
+        The number of tunnel records returned by
+        ``topic_tunnels_for_wing``. ``0`` when ``wing`` is not a non-blank
+        string or has no entry in the topics map.
+    """
+    from .config import MempalaceConfig
+    from .palace_graph import topic_tunnels_for_wing
+
+    if not isinstance(wing, str):
+        return 0
+    # Registry keys come back stripped, so normalize before the lookup;
+    # otherwise a padded wing name would silently never match.
+    wing = wing.strip()
+    if not wing:
+        return 0
+
+    topics_map = get_topics_by_wing()
+    if wing not in topics_map:
+        return 0
+
+    min_count = MempalaceConfig().topic_tunnel_min_count
+    return len(topic_tunnels_for_wing(wing, topics_map, min_count=min_count))
+
+
 # =============================================================================
 # STATUS
 # =============================================================================
@@ -499,3 +499,141 @@ def follow_tunnels(wing: str, room: str, col=None, config=None):
                pass

    return connections
+
+
+# =============================================================================
+# TOPIC TUNNELS — auto-link wings that share confirmed TOPIC labels
+# =============================================================================
+# When two wings have one or more confirmed topics in common (e.g. both
+# discuss "Angular" or "OpenAPI"), drop a symmetric tunnel between them.
+# Topics come from the LLM-refined ``TOPIC`` bucket in the per-project
+# ``entities.json`` and are persisted by wing in
+# ``~/.mempalace/known_entities.json`` under ``topics_by_wing``.
+#
+# Tunnels are created via the existing ``create_tunnel`` API so they share
+# storage and dedup with explicit tunnels. The room is the topic name —
+# this matches the "two wings share an idea" mental model and keeps the
+# graph homogeneous.
+
+
+def _normalize_topic(name: str) -> str:
+    """Return the comparison key for a topic: stringified, trimmed, lowercased."""
+    text = str(name)
+    return text.strip().lower()
+
+
+def compute_topic_tunnels(
+    topics_by_wing: dict,
+    min_count: int = 1,
+    label_prefix: str = "shared topic",
+) -> list[dict]:
+    """Create one tunnel per shared topic for each sufficiently similar wing pair.
+
+    Args:
+        topics_by_wing: ``{wing_name: [topic_name, ...]}``. Wing keys that
+            are not non-blank strings, values that are not lists/tuples,
+            and topics that are not non-blank strings are ignored. Topics
+            are matched case-insensitively after trimming.
+        min_count: how many topics a pair must share before *any* tunnel
+            is created for it. Values below ``1`` are treated as ``1``.
+        label_prefix: text placed before the topic in each tunnel label
+            (``"<label_prefix>: <topic>"``).
+
+    Returns:
+        The values returned by ``create_tunnel``, one per
+        (wing_a, wing_b, topic) triple, ordered by wing name and then by
+        normalized topic. A pair below ``min_count`` contributes nothing.
+        Empty or ``None`` input yields ``[]``.
+
+    The tunnel's room (used on both ends) is the topic as first spelled
+    in the alphabetically earlier wing of the pair.
+    """
+    if not topics_by_wing:
+        return []
+
+    threshold = max(1, int(min_count))
+
+    # wing -> {normalized topic: first-seen display casing}
+    display_by_wing: dict[str, dict[str, str]] = {}
+    for raw_wing, raw_topics in topics_by_wing.items():
+        usable_wing = isinstance(raw_wing, str) and raw_wing.strip()
+        if not usable_wing or not isinstance(raw_topics, (list, tuple)):
+            continue
+        display: dict[str, str] = {}
+        for topic in raw_topics:
+            if not isinstance(topic, str):
+                continue
+            norm = _normalize_topic(topic)
+            if norm:
+                display.setdefault(norm, topic.strip())
+        if display:
+            display_by_wing[raw_wing.strip()] = display
+
+    # Sorted wings and sorted topics keep the output order stable across runs.
+    ordered_wings = sorted(display_by_wing)
+    tunnels: list[dict] = []
+    for pos, first in enumerate(ordered_wings):
+        first_topics = display_by_wing[first]
+        for second in ordered_wings[pos + 1 :]:
+            common = first_topics.keys() & display_by_wing[second].keys()
+            if len(common) < threshold:
+                continue
+            for norm in sorted(common):
+                # Stored casings are always non-blank, so the earlier
+                # wing's spelling can be used directly.
+                room = first_topics[norm]
+                tunnels.append(
+                    create_tunnel(
+                        source_wing=first,
+                        source_room=room,
+                        target_wing=second,
+                        target_room=room,
+                        label=f"{label_prefix}: {room}",
+                    )
+                )
+    return tunnels
+
+
+def topic_tunnels_for_wing(
+    wing: str,
+    topics_by_wing: dict,
+    min_count: int = 1,
+    label_prefix: str = "shared topic",
+) -> list[dict]:
+    """Create topic tunnels only for pairs that include ``wing``.
+
+    Lets the miner refresh links for the wing it just processed without
+    revisiting unrelated wing pairs.
+
+    Args:
+        wing: the wing to link; surrounding whitespace is ignored.
+        topics_by_wing: ``{wing_name: [topic_name, ...]}`` for all wings.
+        min_count: forwarded to ``compute_topic_tunnels``.
+        label_prefix: forwarded to ``compute_topic_tunnels``.
+
+    Returns:
+        The concatenated results of ``compute_topic_tunnels`` for each
+        (wing, other) pair. ``[]`` when the map is empty, ``wing`` is not
+        a non-blank string, or ``wing`` has no non-empty list/tuple of
+        topics in the map.
+    """
+    if not topics_by_wing:
+        return []
+    if not isinstance(wing, str) or not wing.strip():
+        return []
+
+    wing = wing.strip()
+    own_topics = topics_by_wing.get(wing)
+    if not (isinstance(own_topics, (list, tuple)) and own_topics):
+        return []
+
+    # Each peer is handed to ``compute_topic_tunnels`` as a two-wing map so
+    # threshold and casing rules live in exactly one place.
+    results: list[dict] = []
+    for peer, peer_topics in topics_by_wing.items():
+        if not isinstance(peer, str) or not peer.strip() or peer == wing:
+            continue
+        if not (isinstance(peer_topics, (list, tuple)) and peer_topics):
+            continue
+        results += compute_topic_tunnels(
+            {wing: list(own_topics), peer: list(peer_topics)},
+            min_count=min_count,
+            label_prefix=label_prefix,
+        )
+    return results
@@ -558,6 +558,7 @@ def to_detected_dict(
    return {
        "people": people_entries,
        "projects": proj_entries,
+        "topics": [],
        "uncertain": [],
    }

@@ -577,7 +578,7 @@ def _merge_detected(primary: dict, secondary: dict, drop_secondary_uncertain: bo
    """
    seen = {e["name"].lower() for cat in primary.values() for e in cat}
    merged = {k: list(v) for k, v in primary.items()}
-    for cat_key in ("people", "projects", "uncertain"):
+    for cat_key in ("people", "projects", "topics", "uncertain"):
        if cat_key == "uncertain" and drop_secondary_uncertain:
            continue
        for e in secondary.get(cat_key, []):
@@ -654,7 +655,7 @@ def discover_entities(
    prose_detected = (
        detect_entities(prose_files, languages=languages)
        if prose_files
-        else {"people": [], "projects": [], "uncertain": []}
+        else {"people": [], "projects": [], "topics": [], "uncertain": []}
    )

    # Without LLM refinement, suppress regex "uncertain" noise when real