diff --git a/CHANGELOG.md b/CHANGELOG.md index 2051ab3..efd233e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), --- +## [3.3.4] — unreleased + +### Added + +- **Cross-wing topic tunnels.** When two wings have confirmed `TOPIC` labels in common (the LLM-refine bucket from `mempalace init --llm`), the miner now drops a symmetric tunnel between them at mine time so the palace graph reflects shared themes (frameworks, vendors, recurring concepts). Tunnels are routed through the existing `create_tunnel` storage so they share dedup and persistence with explicit tunnels. Threshold is configurable via `MEMPALACE_TOPIC_TUNNEL_MIN_COUNT` env var or `topic_tunnel_min_count` in `~/.mempalace/config.json` (default `1`). Manifest-dependency overlap and per-topic allow/deny lists remain out of scope. (#1180) + +--- + ## [3.3.3] — 2026-04-23 ### Bug Fixes diff --git a/mempalace/cli.py b/mempalace/cli.py index 714c64c..3dff964 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -117,21 +117,34 @@ def cmd_init(args): if languages_tuple != ("en",): print(f" Languages: {', '.join(languages_tuple)}") detected = discover_entities(args.dir, languages=languages_tuple, llm_provider=llm_provider) - total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"]) + total = ( + len(detected["people"]) + + len(detected["projects"]) + + len(detected.get("topics", [])) + + len(detected["uncertain"]) + ) if total > 0: confirmed = confirm_entities(detected, yes=getattr(args, "yes", False)) # Save confirmed entities to /entities.json (per-project # audit trail — user can inspect or hand-edit) AND merge into the - # global registry the miner reads at mine time. - if confirmed["people"] or confirmed["projects"]: - entities_path = Path(args.dir).expanduser().resolve() / "entities.json" + # global registry the miner reads at mine time. 
Topics are kept + # separately so the miner can later compute cross-wing tunnels + # from shared topics (see palace_graph.compute_topic_tunnels). + if confirmed["people"] or confirmed["projects"] or confirmed.get("topics"): + project_path = Path(args.dir).expanduser().resolve() + entities_path = project_path / "entities.json" with open(entities_path, "w", encoding="utf-8") as f: json.dump(confirmed, f, indent=2, ensure_ascii=False) print(f" Entities saved: {entities_path}") from .miner import add_to_known_entities - registry_path = add_to_known_entities(confirmed) + # Wing matches the default produced by ``room_detector_local`` + # (folder basename) and the miner fallback in ``load_config``. + # Used by the topics_by_wing map so cross-wing tunnels can be + # computed at mine time. + wing = project_path.name + registry_path = add_to_known_entities(confirmed, wing=wing) print(f" Registry updated: {registry_path}") else: print(" No entities detected — proceeding with directory-based rooms.") diff --git a/mempalace/config.py b/mempalace/config.py index 86aa90f..8e12b6b 100644 --- a/mempalace/config.py +++ b/mempalace/config.py @@ -253,6 +253,32 @@ class MempalaceConfig: return env_val.strip().lower() return str(self._file_config.get("embedding_device", "auto")).strip().lower() + @property + def topic_tunnel_min_count(self): + """Minimum number of overlapping confirmed topics required to create + a cross-wing tunnel between two wings. + + Default is ``1`` — any single shared topic produces a tunnel. Bump + to ``2+`` if your projects share lots of common-tech labels (Python, + Docker, Git) and you want only meaningfully overlapping wings to + link. Reads ``MEMPALACE_TOPIC_TUNNEL_MIN_COUNT`` env first, then the + config-file value, then ``1``. 
+ """ + env_val = os.environ.get("MEMPALACE_TOPIC_TUNNEL_MIN_COUNT") + if env_val: + try: + parsed = int(env_val) + if parsed >= 1: + return parsed + except ValueError: + pass + cfg_val = self._file_config.get("topic_tunnel_min_count") + try: + parsed = int(cfg_val) if cfg_val is not None else 1 + except (TypeError, ValueError): + parsed = 1 + return max(1, parsed) + @property def hook_silent_save(self): """Whether the stop hook saves directly (True) or blocks for MCP calls (False).""" diff --git a/mempalace/entity_detector.py b/mempalace/entity_detector.py index 2f2aae4..5ff6b3c 100644 --- a/mempalace/entity_detector.py +++ b/mempalace/entity_detector.py @@ -440,7 +440,7 @@ def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) -> candidates = extract_candidates(combined_text, languages=langs) if not candidates: - return {"people": [], "projects": [], "uncertain": []} + return {"people": [], "projects": [], "topics": [], "uncertain": []} # Score and classify each candidate people = [] @@ -467,6 +467,7 @@ def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) -> return { "people": people[:15], "projects": projects[:10], + "topics": [], "uncertain": uncertain[:8], } @@ -489,7 +490,13 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict: """ Interactive confirmation step. User reviews detected entities, removes wrong ones, adds missing ones. - Returns confirmed {people: [names], projects: [names]} + Returns confirmed {people: [names], projects: [names], topics: [names]}. + + Topics are not surfaced for interactive review — they come from the + LLM-refined ``TOPIC`` bucket and are passed through verbatim. They + feed cross-wing tunnel computation at mine time (see + ``palace_graph.compute_topic_tunnels``); a wrong topic at worst adds + a low-traffic tunnel and never alters drawer storage. Pass yes=True to auto-accept all detected entities without prompting. 
""" @@ -501,18 +508,28 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict: _print_entity_list(detected["people"], "PEOPLE") _print_entity_list(detected["projects"], "PROJECTS") + if detected.get("topics"): + _print_entity_list(detected["topics"], "TOPICS (cross-wing tunnel signal)") + if detected["uncertain"]: _print_entity_list(detected["uncertain"], "UNCERTAIN (need your call)") confirmed_people = [e["name"] for e in detected["people"]] confirmed_projects = [e["name"] for e in detected["projects"]] + confirmed_topics = [e["name"] for e in detected.get("topics", [])] if yes: # Auto-accept: include all detected (skip uncertain — ambiguous without user input) print( - f"\n Auto-accepting {len(confirmed_people)} people, {len(confirmed_projects)} projects." + f"\n Auto-accepting {len(confirmed_people)} people, " + f"{len(confirmed_projects)} projects, " + f"{len(confirmed_topics)} topics." ) - return {"people": confirmed_people, "projects": confirmed_projects} + return { + "people": confirmed_people, + "projects": confirmed_projects, + "topics": confirmed_topics, + } print(f"\n{'─' * 58}") print(" Options:") @@ -570,11 +587,14 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict: print(" Confirmed:") print(f" People: {', '.join(confirmed_people) or '(none)'}") print(f" Projects: {', '.join(confirmed_projects) or '(none)'}") + if confirmed_topics: + print(f" Topics: {', '.join(confirmed_topics)}") print(f"{'=' * 58}\n") return { "people": confirmed_people, "projects": confirmed_projects, + "topics": confirmed_topics, } diff --git a/mempalace/llm_refine.py b/mempalace/llm_refine.py index faa737a..dda37df 100644 --- a/mempalace/llm_refine.py +++ b/mempalace/llm_refine.py @@ -197,13 +197,23 @@ def _apply_classifications( """Merge LLM decisions back into the detected dict. Returns (new_detected, reclassified_count, dropped_count). + + Topics get their own bucket so the caller can persist them as + cross-wing tunnel signal. 
``AMBIGUOUS`` still falls back to + ``uncertain`` for human review. """ label_to_bucket = { "PERSON": "people", "PROJECT": "projects", - "TOPIC": "uncertain", + "TOPIC": "topics", "AMBIGUOUS": "uncertain", } + bucket_to_type = { + "people": "person", + "projects": "project", + "topics": "topic", + "uncertain": "uncertain", + } # Index every entity by name for in-place update all_entries: list[tuple[str, dict]] = [] @@ -216,6 +226,7 @@ def _apply_classifications( new_detected: dict[str, list[dict]] = { "people": [], "projects": [], + "topics": [], "uncertain": [], } @@ -223,7 +234,7 @@ def _apply_classifications( decision = decisions.get(entry["name"]) if decision is None: # No LLM opinion — keep as-is - new_detected[old_bucket].append(entry) + new_detected.setdefault(old_bucket, []).append(entry) continue label, reason = decision @@ -245,13 +256,7 @@ def _apply_classifications( updated["signals"] = signals if target_bucket != old_bucket: reclassified += 1 - updated["type"] = ( - "person" - if target_bucket == "people" - else "project" - if target_bucket == "projects" - else "uncertain" - ) + updated["type"] = bucket_to_type.get(target_bucket, "uncertain") new_detected[target_bucket].append(updated) return new_detected, reclassified, dropped diff --git a/mempalace/miner.py b/mempalace/miner.py index 84e57ba..6dea4a1 100644 --- a/mempalace/miner.py +++ b/mempalace/miner.py @@ -439,7 +439,16 @@ def _refresh_known_entities_cache() -> None: data = json.load(f) if isinstance(data, dict): raw = data - for cat in data.values(): + for cat_key, cat in data.items(): + # Special wing-keyed map — its inner values are topic + # names but its outer keys are wings, which must NOT be + # surfaced as known entities. Pull the topic names out + # explicitly instead of treating it as a generic category. 
+ if cat_key == "topics_by_wing" and isinstance(cat, dict): + for topic_list in cat.values(): + if isinstance(topic_list, list): + names.update(str(n) for n in topic_list if n) + continue if isinstance(cat, list): names.update(str(n) for n in cat if n) elif isinstance(cat, dict): @@ -474,7 +483,39 @@ def _load_known_entities_raw() -> dict: return dict(_ENTITY_REGISTRY_CACHE["raw"]) -def add_to_known_entities(entities_by_category: dict) -> str: +def _set_wing_topics(existing: dict, wing_key: str, topics_for_wing: list, coerce) -> None: + """Update ``existing['topics_by_wing'][wing_key]`` to the deduped list. + + Replaces (does not union) the wing's topic list — re-running ``init`` + should reflect the user's latest confirmation rather than accumulate + stale labels. Empty input drops the wing entry; an empty map drops + the ``topics_by_wing`` key entirely. + """ + topics_map = existing.get("topics_by_wing") + if not isinstance(topics_map, dict): + topics_map = {} + seen_lower: set = set() + ordered: list = [] + for n in topics_for_wing: + name = coerce(n) + if not name: + continue + key = name.lower() + if key in seen_lower: + continue + seen_lower.add(key) + ordered.append(name) + if ordered: + topics_map[wing_key] = ordered + else: + topics_map.pop(wing_key, None) + if topics_map: + existing["topics_by_wing"] = topics_map + else: + existing.pop("topics_by_wing", None) + + +def add_to_known_entities(entities_by_category: dict, wing: str = None) -> str: """Union ``entities_by_category`` into ``~/.mempalace/known_entities.json``. Accepts ``{category: [names]}`` shape as produced by ``mempalace init`` @@ -488,6 +529,15 @@ def add_to_known_entities(entities_by_category: dict) -> str: added as keys with ``None`` values so existing code mappings aren't overwritten. A later compress pass can assign codes. 
+ When ``wing`` is provided AND ``entities_by_category`` contains a + ``topics`` list, those topics are also recorded under + ``topics_by_wing[wing]`` (case-insensitive dedup, preserving the + casing of the first observed name). This is the signal source for + ``palace_graph.compute_topic_tunnels`` at mine time. Topics for a + wing are *replaced*, not unioned, so a re-run of ``init`` reflects + the user's latest confirmation rather than accumulating stale labels + indefinitely. + The in-process cache is invalidated on write so same-process callers (notably ``cmd_init`` → ``cmd_mine`` in sequence) see the update immediately instead of waiting for a mtime re-check. @@ -515,7 +565,16 @@ def add_to_known_entities(entities_by_category: dict) -> str: name = str(value) return name if name else None + # Separate the topics_by_wing key from regular categories so we don't + # treat it as a flat name-list elsewhere in this function. + topics_for_wing = None + if wing and isinstance(wing, str) and wing.strip(): + topics_for_wing = entities_by_category.get("topics") or [] + for category, names in entities_by_category.items(): + if category == "topics_by_wing": + # Reserved key — managed separately below. + continue if not isinstance(names, list) or not names: continue current = existing.get(category) @@ -551,6 +610,9 @@ def add_to_known_entities(entities_by_category: dict) -> str: ordered.append(name) existing[category] = ordered + if topics_for_wing is not None: + _set_wing_topics(existing, wing.strip(), topics_for_wing, _coerce_name) + registry_path.write_text(_json.dumps(existing, indent=2, ensure_ascii=False), encoding="utf-8") try: registry_path.chmod(0o600) @@ -565,6 +627,28 @@ def add_to_known_entities(entities_by_category: dict) -> str: return str(registry_path) +def get_topics_by_wing() -> dict: + """Return ``topics_by_wing`` from the global registry as a dict. + + Returns ``{}`` if the registry is missing, malformed, or has no + ``topics_by_wing`` key. 
Casing is preserved from disk; callers that + need case-insensitive comparison should normalize themselves. + """ + raw = _load_known_entities_raw() + topics_map = raw.get("topics_by_wing") + if not isinstance(topics_map, dict): + return {} + out: dict = {} + for wing, topics in topics_map.items(): + if not isinstance(wing, str) or not wing.strip(): + continue + if isinstance(topics, list): + cleaned = [str(t) for t in topics if isinstance(t, str) and t.strip()] + if cleaned: + out[wing.strip()] = cleaned + return out + + _HALL_KEYWORDS_CACHE = None @@ -962,6 +1046,19 @@ def mine( if not dry_run: print(f" + [{i:4}/{len(files)}] {filepath.name[:50]:50} +{drawers}") + if not dry_run: + # Cross-wing topic tunnels: after every file in this wing has been + # processed, link this wing to any other wing that shares a + # confirmed TOPIC label. Out of scope for v1: manifest-dependency + # overlap, per-topic allow/deny lists, search-result surfacing. + try: + tunnels_added = _compute_topic_tunnels_for_wing(wing) + if tunnels_added: + print(f"\n Topic tunnels: +{tunnels_added} cross-wing link(s)") + except Exception as e: + # Tunnel computation must never fail a mine — degrade quietly. + print(f"\n WARNING: topic tunnel computation skipped — {e}", file=sys.stderr) + print(f"\n{'=' * 55}") print(" Done.") print(f" Files processed: {len(files) - files_skipped}") @@ -974,6 +1071,25 @@ def mine( print(f"{'=' * 55}\n") +def _compute_topic_tunnels_for_wing(wing: str) -> int: + """Drop tunnels between ``wing`` and every other wing that shares + confirmed topics, honoring the ``topic_tunnel_min_count`` config knob. + + Returns the number of tunnels created or refreshed. Zero means no + overlap found (or the registry has no ``topics_by_wing`` map yet). 
+ """ + from .config import MempalaceConfig + from .palace_graph import topic_tunnels_for_wing + + topics_map = get_topics_by_wing() + if not topics_map or wing not in topics_map: + return 0 + cfg = MempalaceConfig() + min_count = cfg.topic_tunnel_min_count + created = topic_tunnels_for_wing(wing, topics_map, min_count=min_count) + return len(created) + + # ============================================================================= # STATUS # ============================================================================= diff --git a/mempalace/palace_graph.py b/mempalace/palace_graph.py index 125ec0d..526b591 100644 --- a/mempalace/palace_graph.py +++ b/mempalace/palace_graph.py @@ -499,3 +499,141 @@ def follow_tunnels(wing: str, room: str, col=None, config=None): pass return connections + + +# ============================================================================= +# TOPIC TUNNELS — auto-link wings that share confirmed TOPIC labels +# ============================================================================= +# When two wings have one or more confirmed topics in common (e.g. both +# discuss "Angular" or "OpenAPI"), drop a symmetric tunnel between them. +# Topics come from the LLM-refined ``TOPIC`` bucket in the per-project +# ``entities.json`` and are persisted by wing in +# ``~/.mempalace/known_entities.json`` under ``topics_by_wing``. +# +# Tunnels are created via the existing ``create_tunnel`` API so they share +# storage and dedup with explicit tunnels. The room is the topic name — +# this matches the "two wings share an idea" mental model and keeps the +# graph homogeneous. + + +def _normalize_topic(name: str) -> str: + """Lowercase + strip topics for case-insensitive overlap detection.""" + return str(name).strip().lower() + + +def compute_topic_tunnels( + topics_by_wing: dict, + min_count: int = 1, + label_prefix: str = "shared topic", +) -> list[dict]: + """Create tunnels for every pair of wings that share >= ``min_count`` topics. 
+ + Args: + topics_by_wing: ``{wing_name: [topic_name, ...]}`` mapping. Topic + names are compared case-insensitively; the first observed + casing is used for the tunnel room name. + min_count: minimum number of overlapping topics required to drop + any tunnel between a wing pair. ``1`` means a single shared + topic is enough; bumping to e.g. ``2`` requires multiple + overlaps and filters out coincidental single-topic links. + label_prefix: human-readable string prefixed to the tunnel label. + + Returns: + List of tunnel dicts as returned by ``create_tunnel`` — one per + (wing_a, wing_b, topic) triple that crossed the threshold. A + wing-pair below ``min_count`` produces no tunnels at all (not + even for its single shared topic). + + No-op semantics: + - empty/None ``topics_by_wing`` returns ``[]``. + - wings whose topic list is empty are skipped. + - ``min_count <= 0`` is clamped to 1. + """ + if not topics_by_wing: + return [] + + min_count = max(1, int(min_count)) + + # Build a normalized-topic -> first-seen casing map per wing so we + # preserve display casing while still doing case-insensitive overlap. + wing_topics: dict[str, dict[str, str]] = {} + for wing, names in topics_by_wing.items(): + if not isinstance(wing, str) or not wing.strip(): + continue + if not isinstance(names, (list, tuple)): + continue + bucket: dict[str, str] = {} + for n in names: + if not isinstance(n, str): + continue + key = _normalize_topic(n) + if not key: + continue + bucket.setdefault(key, n.strip()) + if bucket: + wing_topics[wing.strip()] = bucket + + wings = sorted(wing_topics.keys()) + created: list[dict] = [] + for i, wa in enumerate(wings): + topics_a = wing_topics[wa] + for wb in wings[i + 1 :]: + topics_b = wing_topics[wb] + shared_keys = set(topics_a.keys()) & set(topics_b.keys()) + if len(shared_keys) < min_count: + continue + # Stable sort for deterministic tunnel ordering across runs. 
+ for key in sorted(shared_keys): + # Prefer the casing from whichever wing sorts first — both + # are valid; this just keeps the displayed room consistent. + room = topics_a[key] if topics_a[key] else topics_b[key] + tunnel = create_tunnel( + source_wing=wa, + source_room=room, + target_wing=wb, + target_room=room, + label=f"{label_prefix}: {room}", + ) + created.append(tunnel) + return created + + +def topic_tunnels_for_wing( + wing: str, + topics_by_wing: dict, + min_count: int = 1, + label_prefix: str = "shared topic", +) -> list[dict]: + """Compute topic tunnels involving a single wing. + + Used by the miner to incrementally update tunnels for the wing that + just finished mining without recomputing pairs that don't involve it. + Returns the list of tunnels created or refreshed. + """ + if not topics_by_wing or not isinstance(wing, str) or not wing.strip(): + return [] + + wing = wing.strip() + own = topics_by_wing.get(wing) + if not isinstance(own, (list, tuple)) or not own: + return [] + + # Restrict the pair-wise computation to (wing, other) pairs only by + # building a 2-wing slice for each other wing. Reusing + # ``compute_topic_tunnels`` keeps the threshold and casing logic in + # one place. 
+ created: list[dict] = [] + for other, other_topics in topics_by_wing.items(): + if not isinstance(other, str) or not other.strip() or other == wing: + continue + if not isinstance(other_topics, (list, tuple)) or not other_topics: + continue + slice_map = {wing: list(own), other: list(other_topics)} + created.extend( + compute_topic_tunnels( + slice_map, + min_count=min_count, + label_prefix=label_prefix, + ) + ) + return created diff --git a/mempalace/project_scanner.py b/mempalace/project_scanner.py index 741a3e2..e083dfb 100644 --- a/mempalace/project_scanner.py +++ b/mempalace/project_scanner.py @@ -558,6 +558,7 @@ def to_detected_dict( return { "people": people_entries, "projects": proj_entries, + "topics": [], "uncertain": [], } @@ -577,7 +578,7 @@ def _merge_detected(primary: dict, secondary: dict, drop_secondary_uncertain: bo """ seen = {e["name"].lower() for cat in primary.values() for e in cat} merged = {k: list(v) for k, v in primary.items()} - for cat_key in ("people", "projects", "uncertain"): + for cat_key in ("people", "projects", "topics", "uncertain"): if cat_key == "uncertain" and drop_secondary_uncertain: continue for e in secondary.get(cat_key, []): @@ -654,7 +655,7 @@ def discover_entities( prose_detected = ( detect_entities(prose_files, languages=languages) if prose_files - else {"people": [], "projects": [], "uncertain": []} + else {"people": [], "projects": [], "topics": [], "uncertain": []} ) # Without LLM refinement, suppress regex "uncertain" noise when real diff --git a/tests/test_entity_detector.py b/tests/test_entity_detector.py index afad4d7..304f68b 100644 --- a/tests/test_entity_detector.py +++ b/tests/test_entity_detector.py @@ -235,13 +235,13 @@ def test_detect_entities_empty_files(tmp_path): f = tmp_path / "empty.txt" f.write_text("") result = detect_entities([f]) - assert result == {"people": [], "projects": [], "uncertain": []} + assert result == {"people": [], "projects": [], "topics": [], "uncertain": []} def 
test_detect_entities_handles_missing_file(tmp_path): missing = tmp_path / "nonexistent.txt" result = detect_entities([missing]) - assert result == {"people": [], "projects": [], "uncertain": []} + assert result == {"people": [], "projects": [], "topics": [], "uncertain": []} def test_detect_entities_respects_max_files(tmp_path): diff --git a/tests/test_known_entities_registry.py b/tests/test_known_entities_registry.py index 300cfb6..06b81e5 100644 --- a/tests/test_known_entities_registry.py +++ b/tests/test_known_entities_registry.py @@ -206,3 +206,71 @@ def test_populated_registry_improves_miner_recall(temp_registry): # All four registered entities should land in the metadata string for expected in ("Julia Grib", "Kevin Heifner", "hyperion-history", "mempalace"): assert expected in tagged, f"expected '{expected}' in metadata {tagged!r}" + + +# ── topics_by_wing — cross-wing tunnel signal source (issue #1180) ── + + +def test_topics_persisted_under_topics_by_wing(temp_registry): + miner.add_to_known_entities( + {"people": ["Alice"], "topics": ["Angular", "OpenAPI"]}, + wing="wing_alpha", + ) + data = json.loads(temp_registry.read_text()) + # Topics also stored as a flat list (existing-style aggregate). + assert "Angular" in data["topics"] + # And recorded by wing for tunnel computation. 
+ assert data["topics_by_wing"]["wing_alpha"] == ["Angular", "OpenAPI"] + + +def test_topics_by_wing_replaces_on_reinit(temp_registry): + """Re-running init for the same wing should reflect the latest list, + not accumulate stale topics indefinitely.""" + miner.add_to_known_entities({"topics": ["Angular", "OpenAPI"]}, wing="wing_alpha") + miner.add_to_known_entities({"topics": ["OpenAPI", "Postgres"]}, wing="wing_alpha") + data = json.loads(temp_registry.read_text()) + assert data["topics_by_wing"]["wing_alpha"] == ["OpenAPI", "Postgres"] + + +def test_topics_by_wing_multiple_wings_coexist(temp_registry): + miner.add_to_known_entities({"topics": ["foo"]}, wing="wing_a") + miner.add_to_known_entities({"topics": ["foo", "bar"]}, wing="wing_b") + data = json.loads(temp_registry.read_text()) + assert data["topics_by_wing"] == {"wing_a": ["foo"], "wing_b": ["foo", "bar"]} + + +def test_topics_by_wing_skipped_without_wing(temp_registry): + miner.add_to_known_entities({"topics": ["foo"]}) + data = json.loads(temp_registry.read_text()) + # No wing → no topics_by_wing entry, but topics list still saved. + assert "topics_by_wing" not in data + assert data["topics"] == ["foo"] + + +def test_topics_by_wing_dedupes_case_insensitive(temp_registry): + miner.add_to_known_entities({"topics": ["OpenAPI", "openapi", "OPENAPI"]}, wing="wing_a") + data = json.loads(temp_registry.read_text()) + # Only one entry, casing of the first observed name preserved. 
+ assert data["topics_by_wing"]["wing_a"] == ["OpenAPI"] + + +def test_get_topics_by_wing_reads_registry(temp_registry): + miner.add_to_known_entities({"topics": ["foo"]}, wing="wing_a") + miner.add_to_known_entities({"topics": ["foo", "bar"]}, wing="wing_b") + result = miner.get_topics_by_wing() + assert result == {"wing_a": ["foo"], "wing_b": ["foo", "bar"]} + + +def test_get_topics_by_wing_empty_when_missing(temp_registry): + miner.add_to_known_entities({"people": ["Alice"]}) + assert miner.get_topics_by_wing() == {} + + +def test_topics_by_wing_does_not_pollute_known_names(temp_registry): + """Wing names in topics_by_wing must NOT leak into the flat known-names + set used by ``_extract_entities_for_metadata`` — only the topic strings + themselves should be recognized.""" + miner.add_to_known_entities({"topics": ["Angular"]}, wing="wing_super_secret_project") + known = miner._load_known_entities() + assert "Angular" in known + assert "wing_super_secret_project" not in known diff --git a/tests/test_llm_refine.py b/tests/test_llm_refine.py index b3e7d2d..823167c 100644 --- a/tests/test_llm_refine.py +++ b/tests/test_llm_refine.py @@ -272,7 +272,9 @@ def test_apply_classifications_appends_reason_signal(): assert any("spoken of by name" in s for s in new["people"][0]["signals"]) -def test_apply_classifications_topic_goes_to_uncertain(): +def test_apply_classifications_topic_goes_to_topics_bucket(): + """TOPIC classifications now route to a dedicated ``topics`` bucket so the + miner can use them as cross-wing tunnel signal (issue #1180).""" detected = { "people": [], "projects": [ @@ -289,8 +291,32 @@ def test_apply_classifications_topic_goes_to_uncertain(): decisions = {"Paris": ("TOPIC", "city, not a project")} new, reclass, _ = _apply_classifications(detected, decisions) assert len(new["projects"]) == 0 + assert len(new["uncertain"]) == 0 + assert len(new["topics"]) == 1 + assert new["topics"][0]["name"] == "Paris" + assert new["topics"][0]["type"] == "topic" + 
assert reclass == 1 + + +def test_apply_classifications_ambiguous_still_goes_to_uncertain(): + detected = { + "people": [], + "projects": [ + { + "name": "Foo", + "type": "project", + "confidence": 0.7, + "frequency": 5, + "signals": ["regex"], + } + ], + "uncertain": [], + } + decisions = {"Foo": ("AMBIGUOUS", "context insufficient")} + new, reclass, _ = _apply_classifications(detected, decisions) + assert len(new["projects"]) == 0 assert len(new["uncertain"]) == 1 - assert new["uncertain"][0]["name"] == "Paris" + assert new["uncertain"][0]["name"] == "Foo" assert reclass == 1 @@ -469,7 +495,9 @@ def test_refine_entities_refines_high_confidence_regex_projects(): assert provider.call_count == 1 assert result.reclassified == 1 assert result.merged["projects"] == [] - assert result.merged["uncertain"][0]["name"] == "OpenAPI" + # TOPIC labels go to the dedicated ``topics`` bucket so the miner can + # use them for cross-wing tunnel computation (issue #1180). + assert result.merged["topics"][0]["name"] == "OpenAPI" def test_refine_entities_refines_regex_people_but_skips_git_people(): diff --git a/tests/test_miner.py b/tests/test_miner.py index 85ed566..9b4f127 100644 --- a/tests/test_miner.py +++ b/tests/test_miner.py @@ -496,3 +496,104 @@ def test_add_drawer_stamps_normalize_version(tmp_path): assert meta["normalize_version"] == NORMALIZE_VERSION finally: del col, client + + +def test_mine_creates_topic_tunnels_for_shared_topics(tmp_path, monkeypatch): + """End-to-end: when two wings have already-confirmed topics that overlap, + the miner's mine-time pass drops a cross-wing tunnel between them. + + Issue #1180. + """ + from mempalace import miner, palace_graph + + # Redirect both the registry and tunnel-storage paths into tmp_path + # so we never touch the developer's real ~/.mempalace directory. 
+ registry = tmp_path / "known_entities.json" + monkeypatch.setattr(miner, "_ENTITY_REGISTRY_PATH", str(registry)) + miner._ENTITY_REGISTRY_CACHE.update({"mtime": None, "names": frozenset(), "raw": {}}) + tunnels_file = tmp_path / "tunnels.json" + monkeypatch.setattr(palace_graph, "_TUNNEL_FILE", str(tunnels_file)) + + # Pre-populate the registry as if init had been run for two wings that + # share a topic. + miner.add_to_known_entities({"topics": ["foo", "bar"]}, wing="wing_one") + miner.add_to_known_entities({"topics": ["foo", "baz"]}, wing="wing_two") + + # Mine wing_two — should drop tunnels between wing_two and wing_one + # for every shared topic. Just one in this case. + project_root = tmp_path / "wing_two_project" + project_root.mkdir() + write_file( + project_root / "notes.md", + "Some prose long enough to make a chunk. " * 20, + ) + with open(project_root / "mempalace.yaml", "w") as f: + yaml.dump({"wing": "wing_two", "rooms": [{"name": "general"}]}, f) + + palace_path = tmp_path / "palace" + mine(str(project_root), str(palace_path)) + + listed = palace_graph.list_tunnels() + assert len(listed) == 1 + rooms = {listed[0]["source"]["room"], listed[0]["target"]["room"]} + assert rooms == {"foo"} + wings = {listed[0]["source"]["wing"], listed[0]["target"]["wing"]} + assert wings == {"wing_one", "wing_two"} + + +def test_mine_no_tunnel_when_threshold_blocks_overlap(tmp_path, monkeypatch): + """Raising ``MEMPALACE_TOPIC_TUNNEL_MIN_COUNT`` above the number of topics + two wings share suppresses tunnel creation at mine time.""" + from mempalace import miner, palace_graph + + registry = tmp_path / "known_entities.json" + monkeypatch.setattr(miner, "_ENTITY_REGISTRY_PATH", str(registry)) + miner._ENTITY_REGISTRY_CACHE.update({"mtime": None, "names": frozenset(), "raw": {}}) + tunnels_file = tmp_path / "tunnels.json" + monkeypatch.setattr(palace_graph, "_TUNNEL_FILE", str(tunnels_file)) + monkeypatch.setenv("MEMPALACE_TOPIC_TUNNEL_MIN_COUNT", "2") + + miner.add_to_known_entities({"topics": 
["foo"]}, wing="wing_one") + miner.add_to_known_entities({"topics": ["foo"]}, wing="wing_two") + + project_root = tmp_path / "wing_two_project" + project_root.mkdir() + write_file( + project_root / "notes.md", + "Some prose long enough to make a chunk. " * 20, + ) + with open(project_root / "mempalace.yaml", "w") as f: + yaml.dump({"wing": "wing_two", "rooms": [{"name": "general"}]}, f) + + palace_path = tmp_path / "palace" + mine(str(project_root), str(palace_path)) + + # Env threshold is 2 but the wings share only "foo" → no tunnel. + assert palace_graph.list_tunnels() == [] + + +def test_mine_no_tunnel_when_only_one_wing_has_topics(tmp_path, monkeypatch): + """Mining a wing when no other wing has confirmed topics creates no tunnels.""" + from mempalace import miner, palace_graph + + registry = tmp_path / "known_entities.json" + monkeypatch.setattr(miner, "_ENTITY_REGISTRY_PATH", str(registry)) + miner._ENTITY_REGISTRY_CACHE.update({"mtime": None, "names": frozenset(), "raw": {}}) + tunnels_file = tmp_path / "tunnels.json" + monkeypatch.setattr(palace_graph, "_TUNNEL_FILE", str(tunnels_file)) + + miner.add_to_known_entities({"topics": ["foo"]}, wing="wing_one") + + project_root = tmp_path / "wing_one_project" + project_root.mkdir() + write_file( + project_root / "notes.md", + "Some prose long enough to make a chunk. 
" * 20, + ) + with open(project_root / "mempalace.yaml", "w") as f: + yaml.dump({"wing": "wing_one", "rooms": [{"name": "general"}]}, f) + + palace_path = tmp_path / "palace" + mine(str(project_root), str(palace_path)) + + assert palace_graph.list_tunnels() == [] diff --git a/tests/test_palace_graph_tunnels.py b/tests/test_palace_graph_tunnels.py index 00c7400..5048ad5 100644 --- a/tests/test_palace_graph_tunnels.py +++ b/tests/test_palace_graph_tunnels.py @@ -135,3 +135,126 @@ class TestExplicitTunnels: connections = palace_graph.follow_tunnels("wing_code", "auth", col=col) assert len(connections) == 1 assert "drawer_preview" not in connections[0] + + +class TestTopicTunnels: + """Cross-wing topic tunnels (issue #1180). + + When two wings share at least ``min_count`` confirmed TOPIC labels, one + symmetric tunnel per shared topic is created between them. Tunnels are + routed through the existing ``create_tunnel`` storage so they share + dedup and persistence with explicit tunnels. + """ + + def test_compute_topic_tunnels_creates_link_for_shared_topic(self, tmp_path, monkeypatch): + _use_tmp_tunnel_file(monkeypatch, tmp_path) + topics_by_wing = { + "wing_alpha": ["Angular", "OpenAPI"], + "wing_beta": ["OpenAPI", "Kubernetes"], + } + created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1) + assert len(created) == 1 + assert created[0]["source"]["wing"] in {"wing_alpha", "wing_beta"} + assert created[0]["target"]["wing"] in {"wing_alpha", "wing_beta"} + # Room is the topic itself (case preserved from the first wing). + assert created[0]["source"]["room"] == "OpenAPI" + assert "OpenAPI" in created[0]["label"] + + # Tunnel is retrievable via the standard list_tunnels API. 
+ listed = palace_graph.list_tunnels() + assert len(listed) == 1 + assert listed[0]["id"] == created[0]["id"] + + def test_compute_topic_tunnels_no_link_below_threshold(self, tmp_path, monkeypatch): + _use_tmp_tunnel_file(monkeypatch, tmp_path) + topics_by_wing = { + "wing_alpha": ["Angular", "OpenAPI"], + "wing_beta": ["OpenAPI", "Kubernetes"], + } + # min_count=2 requires two overlapping topics — only one shared. + created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=2) + assert created == [] + assert palace_graph.list_tunnels() == [] + + def test_compute_topic_tunnels_above_threshold_creates_per_topic_links( + self, tmp_path, monkeypatch + ): + _use_tmp_tunnel_file(monkeypatch, tmp_path) + topics_by_wing = { + "wing_alpha": ["Angular", "OpenAPI", "Postgres"], + "wing_beta": ["Angular", "OpenAPI", "Redis"], + } + created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=2) + # Two shared topics (meeting min_count=2) × one wing pair = two tunnels. + rooms = sorted(t["source"]["room"] for t in created) + assert rooms == ["Angular", "OpenAPI"] + + def test_compute_topic_tunnels_case_insensitive_overlap(self, tmp_path, monkeypatch): + _use_tmp_tunnel_file(monkeypatch, tmp_path) + topics_by_wing = { + "wing_alpha": ["openapi"], + "wing_beta": ["OpenAPI"], + } + created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1) + assert len(created) == 1 + + def test_compute_topic_tunnels_empty_input_is_noop(self, tmp_path, monkeypatch): + _use_tmp_tunnel_file(monkeypatch, tmp_path) + assert palace_graph.compute_topic_tunnels({}) == [] + assert palace_graph.compute_topic_tunnels({"wing_a": []}) == [] + assert palace_graph.list_tunnels() == [] + + def test_compute_topic_tunnels_three_wings_pairwise(self, tmp_path, monkeypatch): + _use_tmp_tunnel_file(monkeypatch, tmp_path) + topics_by_wing = { + "wing_a": ["foo"], + "wing_b": ["foo"], + "wing_c": ["foo"], + } + created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1) + # 3 wings sharing 
the same topic → C(3,2) = 3 pairs → 3 tunnels. + assert len(created) == 3 + endpoint_pairs = { + tuple(sorted([t["source"]["wing"], t["target"]["wing"]])) for t in created + } + assert endpoint_pairs == { + ("wing_a", "wing_b"), + ("wing_a", "wing_c"), + ("wing_b", "wing_c"), + } + + def test_topic_tunnels_for_wing_only_links_that_wing(self, tmp_path, monkeypatch): + _use_tmp_tunnel_file(monkeypatch, tmp_path) + topics_by_wing = { + "wing_a": ["foo", "bar"], + "wing_b": ["foo"], + "wing_c": ["bar"], + } + # wing_a should link to both b (via foo) and c (via bar). + created = palace_graph.topic_tunnels_for_wing("wing_a", topics_by_wing) + endpoint_pairs = { + tuple(sorted([t["source"]["wing"], t["target"]["wing"]])) for t in created + } + assert endpoint_pairs == {("wing_a", "wing_b"), ("wing_a", "wing_c")} + # Only pairs that include wing_a are expected. NOTE(review): wing_b and + # wing_c share no topic here, so this count cannot catch a stray b-c tunnel. + assert len(palace_graph.list_tunnels()) == 2 + + def test_topic_tunnels_for_wing_unknown_wing_is_noop(self, tmp_path, monkeypatch): + _use_tmp_tunnel_file(monkeypatch, tmp_path) + topics_by_wing = {"wing_a": ["foo"], "wing_b": ["foo"]} + assert palace_graph.topic_tunnels_for_wing("wing_missing", topics_by_wing) == [] + assert palace_graph.list_tunnels() == [] + + def test_compute_topic_tunnels_dedupe_on_recompute(self, tmp_path, monkeypatch): + _use_tmp_tunnel_file(monkeypatch, tmp_path) + topics_by_wing = { + "wing_alpha": ["OpenAPI"], + "wing_beta": ["OpenAPI"], + } + first = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1) + second = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1) + # create_tunnel is symmetric and dedupes — recomputing should return the + # same tunnel id and not multiply the stored tunnels. 
+ assert first[0]["id"] == second[0]["id"] + assert len(palace_graph.list_tunnels()) == 1 diff --git a/tests/test_project_scanner.py b/tests/test_project_scanner.py index 49126b4..45dc802 100644 --- a/tests/test_project_scanner.py +++ b/tests/test_project_scanner.py @@ -363,11 +363,14 @@ def test_to_detected_dict_shape(): projects = [ProjectInfo(name="p", repo_root=Path("."), is_mine=True, manifest="package.json")] people = [PersonInfo(name="Jane Doe", total_commits=5, repos={"r"})] d = to_detected_dict(projects, people) - assert set(d.keys()) == {"people", "projects", "uncertain"} + # ``topics`` is the LLM-refine bucket for cross-wing tunnel signal — + # always present even when empty so callers can rely on the shape. + assert set(d.keys()) == {"people", "projects", "topics", "uncertain"} assert d["projects"][0]["name"] == "p" assert d["projects"][0]["type"] == "project" assert d["people"][0]["name"] == "Jane Doe" assert d["people"][0]["type"] == "person" + assert d["topics"] == [] assert d["uncertain"] == []