feat(graph): cross-wing tunnels by shared topics (#1180)
When two wings have one or more confirmed TOPIC labels in common, the miner now drops a symmetric tunnel between them at mine time so the palace graph reflects shared themes (frameworks, vendors, recurring concepts). - llm_refine: TOPIC label routes to a dedicated `topics` bucket so the signal survives confirmation instead of getting collapsed into `uncertain` and dropped. - entity_detector / project_scanner: bucket plumbed through the detection pipeline; `confirm_entities` returns confirmed topics alongside people/projects. - miner.add_to_known_entities: optional `wing` parameter records the confirmed topics under `topics_by_wing` in `~/.mempalace/known_entities.json`. Wing names do NOT leak into the flat known-name set used by drawer-tagging. - palace_graph: `compute_topic_tunnels` and `topic_tunnels_for_wing` create symmetric tunnels via the existing `create_tunnel` API so they share dedup and persistence with explicit tunnels. - miner.mine: post-file-loop pass calls `topic_tunnels_for_wing` for the freshly-mined wing. Failures are logged but never abort the mine. - config: `topic_tunnel_min_count` knob (env `MEMPALACE_TOPIC_TUNNEL_MIN_COUNT` or `~/.mempalace/config.json`), default 1. Tests cover topic persistence through init->mine, tunnel creation when wings share a topic, no tunnel below threshold, cross-wing tunnel retrieval via `list_tunnels`, dedup on recompute, case-insensitive overlap, and the end-to-end mine-time wiring. Out of scope for this PR (called out in the PR body): manifest-dependency overlap, per-topic allow/deny lists, search-result surfacing.
This commit is contained in:
+18
-5
@@ -117,21 +117,34 @@ def cmd_init(args):
|
||||
if languages_tuple != ("en",):
|
||||
print(f" Languages: {', '.join(languages_tuple)}")
|
||||
detected = discover_entities(args.dir, languages=languages_tuple, llm_provider=llm_provider)
|
||||
total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
|
||||
total = (
|
||||
len(detected["people"])
|
||||
+ len(detected["projects"])
|
||||
+ len(detected.get("topics", []))
|
||||
+ len(detected["uncertain"])
|
||||
)
|
||||
if total > 0:
|
||||
confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
|
||||
# Save confirmed entities to <project>/entities.json (per-project
|
||||
# audit trail — user can inspect or hand-edit) AND merge into the
|
||||
# global registry the miner reads at mine time.
|
||||
if confirmed["people"] or confirmed["projects"]:
|
||||
entities_path = Path(args.dir).expanduser().resolve() / "entities.json"
|
||||
# global registry the miner reads at mine time. Topics are kept
|
||||
# separately so the miner can later compute cross-wing tunnels
|
||||
# from shared topics (see palace_graph.compute_topic_tunnels).
|
||||
if confirmed["people"] or confirmed["projects"] or confirmed.get("topics"):
|
||||
project_path = Path(args.dir).expanduser().resolve()
|
||||
entities_path = project_path / "entities.json"
|
||||
with open(entities_path, "w", encoding="utf-8") as f:
|
||||
json.dump(confirmed, f, indent=2, ensure_ascii=False)
|
||||
print(f" Entities saved: {entities_path}")
|
||||
|
||||
from .miner import add_to_known_entities
|
||||
|
||||
registry_path = add_to_known_entities(confirmed)
|
||||
# Wing matches the default produced by ``room_detector_local``
|
||||
# (folder basename) and the miner fallback in ``load_config``.
|
||||
# Used by the topics_by_wing map so cross-wing tunnels can be
|
||||
# computed at mine time.
|
||||
wing = project_path.name
|
||||
registry_path = add_to_known_entities(confirmed, wing=wing)
|
||||
print(f" Registry updated: {registry_path}")
|
||||
else:
|
||||
print(" No entities detected — proceeding with directory-based rooms.")
|
||||
|
||||
@@ -253,6 +253,32 @@ class MempalaceConfig:
|
||||
return env_val.strip().lower()
|
||||
return str(self._file_config.get("embedding_device", "auto")).strip().lower()
|
||||
|
||||
@property
def topic_tunnel_min_count(self):
    """Minimum number of overlapping confirmed topics required to create
    a cross-wing tunnel between two wings.

    Default is ``1`` — any single shared topic produces a tunnel. Bump
    to ``2+`` if your projects share lots of common-tech labels (Python,
    Docker, Git) and you want only meaningfully overlapping wings to
    link.

    Resolution order:

    1. ``MEMPALACE_TOPIC_TUNNEL_MIN_COUNT`` env var, when it parses as an
       integer ``>= 1``. An unparseable or sub-1 value is ignored and
       resolution falls through to the config file.
    2. ``topic_tunnel_min_count`` from the file config, clamped to ``>= 1``.
    3. ``1`` when the key is absent or its value cannot be converted.

    Returns:
        int: the threshold, always ``>= 1``.
    """
    env_val = os.environ.get("MEMPALACE_TOPIC_TUNNEL_MIN_COUNT")
    if env_val:
        try:
            parsed = int(env_val)
        except ValueError:
            parsed = 0  # unusable override; fall through to file config
        if parsed >= 1:
            return parsed

    cfg_val = self._file_config.get("topic_tunnel_min_count")
    if cfg_val is None:
        return 1
    try:
        parsed = int(cfg_val)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: ``json`` parses a bare ``Infinity`` token to
        # ``float('inf')``, and ``int(inf)`` raises OverflowError rather
        # than ValueError. Treat it like any other malformed value.
        return 1
    return max(1, parsed)
|
||||
|
||||
@property
|
||||
def hook_silent_save(self):
|
||||
"""Whether the stop hook saves directly (True) or blocks for MCP calls (False)."""
|
||||
|
||||
@@ -440,7 +440,7 @@ def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) ->
|
||||
candidates = extract_candidates(combined_text, languages=langs)
|
||||
|
||||
if not candidates:
|
||||
return {"people": [], "projects": [], "uncertain": []}
|
||||
return {"people": [], "projects": [], "topics": [], "uncertain": []}
|
||||
|
||||
# Score and classify each candidate
|
||||
people = []
|
||||
@@ -467,6 +467,7 @@ def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) ->
|
||||
return {
|
||||
"people": people[:15],
|
||||
"projects": projects[:10],
|
||||
"topics": [],
|
||||
"uncertain": uncertain[:8],
|
||||
}
|
||||
|
||||
@@ -489,7 +490,13 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict:
|
||||
"""
|
||||
Interactive confirmation step.
|
||||
User reviews detected entities, removes wrong ones, adds missing ones.
|
||||
Returns confirmed {people: [names], projects: [names]}
|
||||
Returns confirmed {people: [names], projects: [names], topics: [names]}.
|
||||
|
||||
Topics are not surfaced for interactive review — they come from the
|
||||
LLM-refined ``TOPIC`` bucket and are passed through verbatim. They
|
||||
feed cross-wing tunnel computation at mine time (see
|
||||
``palace_graph.compute_topic_tunnels``); a wrong topic at worst adds
|
||||
a low-traffic tunnel and never alters drawer storage.
|
||||
|
||||
Pass yes=True to auto-accept all detected entities without prompting.
|
||||
"""
|
||||
@@ -501,18 +508,28 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict:
|
||||
_print_entity_list(detected["people"], "PEOPLE")
|
||||
_print_entity_list(detected["projects"], "PROJECTS")
|
||||
|
||||
if detected.get("topics"):
|
||||
_print_entity_list(detected["topics"], "TOPICS (cross-wing tunnel signal)")
|
||||
|
||||
if detected["uncertain"]:
|
||||
_print_entity_list(detected["uncertain"], "UNCERTAIN (need your call)")
|
||||
|
||||
confirmed_people = [e["name"] for e in detected["people"]]
|
||||
confirmed_projects = [e["name"] for e in detected["projects"]]
|
||||
confirmed_topics = [e["name"] for e in detected.get("topics", [])]
|
||||
|
||||
if yes:
|
||||
# Auto-accept: include all detected (skip uncertain — ambiguous without user input)
|
||||
print(
|
||||
f"\n Auto-accepting {len(confirmed_people)} people, {len(confirmed_projects)} projects."
|
||||
f"\n Auto-accepting {len(confirmed_people)} people, "
|
||||
f"{len(confirmed_projects)} projects, "
|
||||
f"{len(confirmed_topics)} topics."
|
||||
)
|
||||
return {"people": confirmed_people, "projects": confirmed_projects}
|
||||
return {
|
||||
"people": confirmed_people,
|
||||
"projects": confirmed_projects,
|
||||
"topics": confirmed_topics,
|
||||
}
|
||||
|
||||
print(f"\n{'─' * 58}")
|
||||
print(" Options:")
|
||||
@@ -570,11 +587,14 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict:
|
||||
print(" Confirmed:")
|
||||
print(f" People: {', '.join(confirmed_people) or '(none)'}")
|
||||
print(f" Projects: {', '.join(confirmed_projects) or '(none)'}")
|
||||
if confirmed_topics:
|
||||
print(f" Topics: {', '.join(confirmed_topics)}")
|
||||
print(f"{'=' * 58}\n")
|
||||
|
||||
return {
|
||||
"people": confirmed_people,
|
||||
"projects": confirmed_projects,
|
||||
"topics": confirmed_topics,
|
||||
}
|
||||
|
||||
|
||||
|
||||
+14
-9
@@ -197,13 +197,23 @@ def _apply_classifications(
|
||||
"""Merge LLM decisions back into the detected dict.
|
||||
|
||||
Returns (new_detected, reclassified_count, dropped_count).
|
||||
|
||||
Topics get their own bucket so the caller can persist them as
|
||||
cross-wing tunnel signal. ``AMBIGUOUS`` still falls back to
|
||||
``uncertain`` for human review.
|
||||
"""
|
||||
label_to_bucket = {
|
||||
"PERSON": "people",
|
||||
"PROJECT": "projects",
|
||||
"TOPIC": "uncertain",
|
||||
"TOPIC": "topics",
|
||||
"AMBIGUOUS": "uncertain",
|
||||
}
|
||||
bucket_to_type = {
|
||||
"people": "person",
|
||||
"projects": "project",
|
||||
"topics": "topic",
|
||||
"uncertain": "uncertain",
|
||||
}
|
||||
|
||||
# Index every entity by name for in-place update
|
||||
all_entries: list[tuple[str, dict]] = []
|
||||
@@ -216,6 +226,7 @@ def _apply_classifications(
|
||||
new_detected: dict[str, list[dict]] = {
|
||||
"people": [],
|
||||
"projects": [],
|
||||
"topics": [],
|
||||
"uncertain": [],
|
||||
}
|
||||
|
||||
@@ -223,7 +234,7 @@ def _apply_classifications(
|
||||
decision = decisions.get(entry["name"])
|
||||
if decision is None:
|
||||
# No LLM opinion — keep as-is
|
||||
new_detected[old_bucket].append(entry)
|
||||
new_detected.setdefault(old_bucket, []).append(entry)
|
||||
continue
|
||||
|
||||
label, reason = decision
|
||||
@@ -245,13 +256,7 @@ def _apply_classifications(
|
||||
updated["signals"] = signals
|
||||
if target_bucket != old_bucket:
|
||||
reclassified += 1
|
||||
updated["type"] = (
|
||||
"person"
|
||||
if target_bucket == "people"
|
||||
else "project"
|
||||
if target_bucket == "projects"
|
||||
else "uncertain"
|
||||
)
|
||||
updated["type"] = bucket_to_type.get(target_bucket, "uncertain")
|
||||
new_detected[target_bucket].append(updated)
|
||||
|
||||
return new_detected, reclassified, dropped
|
||||
|
||||
+118
-2
@@ -439,7 +439,16 @@ def _refresh_known_entities_cache() -> None:
|
||||
data = json.load(f)
|
||||
if isinstance(data, dict):
|
||||
raw = data
|
||||
for cat in data.values():
|
||||
for cat_key, cat in data.items():
|
||||
# Special wing-keyed map — its inner values are topic
|
||||
# names but its outer keys are wings, which must NOT be
|
||||
# surfaced as known entities. Pull the topic names out
|
||||
# explicitly instead of treating it as a generic category.
|
||||
if cat_key == "topics_by_wing" and isinstance(cat, dict):
|
||||
for topic_list in cat.values():
|
||||
if isinstance(topic_list, list):
|
||||
names.update(str(n) for n in topic_list if n)
|
||||
continue
|
||||
if isinstance(cat, list):
|
||||
names.update(str(n) for n in cat if n)
|
||||
elif isinstance(cat, dict):
|
||||
@@ -474,7 +483,39 @@ def _load_known_entities_raw() -> dict:
|
||||
return dict(_ENTITY_REGISTRY_CACHE["raw"])
|
||||
|
||||
|
||||
def add_to_known_entities(entities_by_category: dict) -> str:
|
||||
def _set_wing_topics(existing: dict, wing_key: str, topics_for_wing: list, coerce) -> None:
|
||||
"""Update ``existing['topics_by_wing'][wing_key]`` to the deduped list.
|
||||
|
||||
Replaces (does not union) the wing's topic list — re-running ``init``
|
||||
should reflect the user's latest confirmation rather than accumulate
|
||||
stale labels. Empty input drops the wing entry; an empty map drops
|
||||
the ``topics_by_wing`` key entirely.
|
||||
"""
|
||||
topics_map = existing.get("topics_by_wing")
|
||||
if not isinstance(topics_map, dict):
|
||||
topics_map = {}
|
||||
seen_lower: set = set()
|
||||
ordered: list = []
|
||||
for n in topics_for_wing:
|
||||
name = coerce(n)
|
||||
if not name:
|
||||
continue
|
||||
key = name.lower()
|
||||
if key in seen_lower:
|
||||
continue
|
||||
seen_lower.add(key)
|
||||
ordered.append(name)
|
||||
if ordered:
|
||||
topics_map[wing_key] = ordered
|
||||
else:
|
||||
topics_map.pop(wing_key, None)
|
||||
if topics_map:
|
||||
existing["topics_by_wing"] = topics_map
|
||||
else:
|
||||
existing.pop("topics_by_wing", None)
|
||||
|
||||
|
||||
def add_to_known_entities(entities_by_category: dict, wing: str = None) -> str:
|
||||
"""Union ``entities_by_category`` into ``~/.mempalace/known_entities.json``.
|
||||
|
||||
Accepts ``{category: [names]}`` shape as produced by ``mempalace init``
|
||||
@@ -488,6 +529,15 @@ def add_to_known_entities(entities_by_category: dict) -> str:
|
||||
added as keys with ``None`` values so existing code mappings aren't
|
||||
overwritten. A later compress pass can assign codes.
|
||||
|
||||
When ``wing`` is provided AND ``entities_by_category`` contains a
|
||||
``topics`` list, those topics are also recorded under
|
||||
``topics_by_wing[wing]`` (case-insensitive dedup, preserving the
|
||||
casing of the first observed name). This is the signal source for
|
||||
``palace_graph.compute_topic_tunnels`` at mine time. Topics for a
|
||||
wing are *replaced*, not unioned, so a re-run of ``init`` reflects
|
||||
the user's latest confirmation rather than accumulating stale labels
|
||||
indefinitely.
|
||||
|
||||
The in-process cache is invalidated on write so same-process callers
|
||||
(notably ``cmd_init`` → ``cmd_mine`` in sequence) see the update
|
||||
immediately instead of waiting for a mtime re-check.
|
||||
@@ -515,7 +565,16 @@ def add_to_known_entities(entities_by_category: dict) -> str:
|
||||
name = str(value)
|
||||
return name if name else None
|
||||
|
||||
# Separate the topics_by_wing key from regular categories so we don't
|
||||
# treat it as a flat name-list elsewhere in this function.
|
||||
topics_for_wing = None
|
||||
if wing and isinstance(wing, str) and wing.strip():
|
||||
topics_for_wing = entities_by_category.get("topics") or []
|
||||
|
||||
for category, names in entities_by_category.items():
|
||||
if category == "topics_by_wing":
|
||||
# Reserved key — managed separately below.
|
||||
continue
|
||||
if not isinstance(names, list) or not names:
|
||||
continue
|
||||
current = existing.get(category)
|
||||
@@ -551,6 +610,9 @@ def add_to_known_entities(entities_by_category: dict) -> str:
|
||||
ordered.append(name)
|
||||
existing[category] = ordered
|
||||
|
||||
if topics_for_wing is not None:
|
||||
_set_wing_topics(existing, wing.strip(), topics_for_wing, _coerce_name)
|
||||
|
||||
registry_path.write_text(_json.dumps(existing, indent=2, ensure_ascii=False), encoding="utf-8")
|
||||
try:
|
||||
registry_path.chmod(0o600)
|
||||
@@ -565,6 +627,28 @@ def add_to_known_entities(entities_by_category: dict) -> str:
|
||||
return str(registry_path)
|
||||
|
||||
|
||||
def get_topics_by_wing() -> dict:
    """Read the ``topics_by_wing`` map out of the global registry.

    Returns:
        dict: ``{wing: [topic, ...]}`` with wing names stripped. Only
        string topics that are non-blank are kept, and wings left with no
        topics are omitted. ``{}`` when the loaded registry has no
        ``topics_by_wing`` key or the value is not a dict.

    Topic casing is returned as stored; callers wanting case-insensitive
    comparison must normalize on their side.
    """
    stored = _load_known_entities_raw().get("topics_by_wing")
    if not isinstance(stored, dict):
        return {}

    result: dict = {}
    for wing_name, topic_list in stored.items():
        # Skip non-string or blank wing keys.
        if not (isinstance(wing_name, str) and wing_name.strip()):
            continue
        if not isinstance(topic_list, list):
            continue
        kept = [str(t) for t in topic_list if isinstance(t, str) and t.strip()]
        if kept:
            result[wing_name.strip()] = kept
    return result
|
||||
|
||||
|
||||
_HALL_KEYWORDS_CACHE = None
|
||||
|
||||
|
||||
@@ -962,6 +1046,19 @@ def mine(
|
||||
if not dry_run:
|
||||
print(f" + [{i:4}/{len(files)}] {filepath.name[:50]:50} +{drawers}")
|
||||
|
||||
if not dry_run:
|
||||
# Cross-wing topic tunnels: after every file in this wing has been
|
||||
# processed, link this wing to any other wing that shares a
|
||||
# confirmed TOPIC label. Out of scope for v1: manifest-dependency
|
||||
# overlap, per-topic allow/deny lists, search-result surfacing.
|
||||
try:
|
||||
tunnels_added = _compute_topic_tunnels_for_wing(wing)
|
||||
if tunnels_added:
|
||||
print(f"\n Topic tunnels: +{tunnels_added} cross-wing link(s)")
|
||||
except Exception as e:
|
||||
# Tunnel computation must never fail a mine — degrade quietly.
|
||||
print(f"\n WARNING: topic tunnel computation skipped — {e}", file=sys.stderr)
|
||||
|
||||
print(f"\n{'=' * 55}")
|
||||
print(" Done.")
|
||||
print(f" Files processed: {len(files) - files_skipped}")
|
||||
@@ -974,6 +1071,25 @@ def mine(
|
||||
print(f"{'=' * 55}\n")
|
||||
|
||||
|
||||
def _compute_topic_tunnels_for_wing(wing: str) -> int:
    """Drop tunnels between ``wing`` and every other wing that shares
    confirmed topics, honoring the ``topic_tunnel_min_count`` config knob.

    The wing name is stripped before the registry lookup because
    ``topics_by_wing`` keys are stored and read back stripped; comparing
    the raw name would silently skip a wing whose configured name carries
    surrounding whitespace.

    Args:
        wing: name of the wing that was just mined.

    Returns:
        int: number of tunnel dicts returned by
        ``palace_graph.topic_tunnels_for_wing``. Zero when ``wing`` is not
        a string or has no entry in the registry's ``topics_by_wing`` map.
    """
    from .config import MempalaceConfig
    from .palace_graph import topic_tunnels_for_wing

    if not isinstance(wing, str):
        return 0
    wing_key = wing.strip()

    topics_map = get_topics_by_wing()
    # An empty map trivially fails the membership test as well.
    if wing_key not in topics_map:
        return 0

    min_count = MempalaceConfig().topic_tunnel_min_count
    created = topic_tunnels_for_wing(wing_key, topics_map, min_count=min_count)
    return len(created)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# STATUS
|
||||
# =============================================================================
|
||||
|
||||
@@ -499,3 +499,141 @@ def follow_tunnels(wing: str, room: str, col=None, config=None):
|
||||
pass
|
||||
|
||||
return connections
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# TOPIC TUNNELS — auto-link wings that share confirmed TOPIC labels
|
||||
# =============================================================================
|
||||
# When two wings have one or more confirmed topics in common (e.g. both
|
||||
# discuss "Angular" or "OpenAPI"), drop a symmetric tunnel between them.
|
||||
# Topics come from the LLM-refined ``TOPIC`` bucket in the per-project
|
||||
# ``entities.json`` and are persisted by wing in
|
||||
# ``~/.mempalace/known_entities.json`` under ``topics_by_wing``.
|
||||
#
|
||||
# Tunnels are created via the existing ``create_tunnel`` API so they share
|
||||
# storage and dedup with explicit tunnels. The room is the topic name —
|
||||
# this matches the "two wings share an idea" mental model and keeps the
|
||||
# graph homogeneous.
|
||||
|
||||
|
||||
def _normalize_topic(name: str) -> str:
|
||||
"""Lowercase + strip topics for case-insensitive overlap detection."""
|
||||
return str(name).strip().lower()
|
||||
|
||||
|
||||
def compute_topic_tunnels(
    topics_by_wing: dict,
    min_count: int = 1,
    label_prefix: str = "shared topic",
) -> list[dict]:
    """Create a tunnel per shared topic for each wing pair sharing enough topics.

    Args:
        topics_by_wing: ``{wing_name: [topic_name, ...]}``. Topics are
            matched case-insensitively; within a wing the first casing
            seen is the one kept.
        min_count: how many topics a pair of wings must share before any
            tunnel is created between them. Values ``<= 0`` are clamped
            to ``1``.
        label_prefix: text placed before the topic in the tunnel label
            (``"<label_prefix>: <topic>"``).

    Returns:
        The values returned by ``create_tunnel``, one per
        (wing_a, wing_b, topic) triple from pairs that met the threshold.
        A pair below ``min_count`` yields nothing, not even for the topics
        it does share.

    No-op cases:
        - falsy ``topics_by_wing`` returns ``[]``;
        - wings with a non-string/blank name, a non-list/tuple value, or
          no usable string topics are ignored.
    """
    if not topics_by_wing:
        return []

    threshold = max(1, int(min_count))

    # Per wing: normalized topic -> display casing (first occurrence wins).
    casing_by_wing: dict[str, dict[str, str]] = {}
    for wing_name, topic_names in topics_by_wing.items():
        if not (isinstance(wing_name, str) and wing_name.strip()):
            continue
        if not isinstance(topic_names, (list, tuple)):
            continue
        casing: dict[str, str] = {}
        for topic in topic_names:
            if isinstance(topic, str):
                norm = _normalize_topic(topic)
                if norm:
                    casing.setdefault(norm, topic.strip())
        if casing:
            casing_by_wing[wing_name.strip()] = casing

    # Sorted wing names + sorted shared keys give a deterministic order
    # of ``create_tunnel`` calls across runs.
    ordered_wings = sorted(casing_by_wing)
    tunnels: list[dict] = []
    for idx, first in enumerate(ordered_wings):
        first_topics = casing_by_wing[first]
        for second in ordered_wings[idx + 1 :]:
            second_topics = casing_by_wing[second]
            overlap = set(first_topics).intersection(second_topics)
            if len(overlap) < threshold:
                continue
            for norm in sorted(overlap):
                # Display casing comes from the wing that sorts first, so
                # the same pair always yields the same room name.
                room = first_topics[norm] or second_topics[norm]
                tunnels.append(
                    create_tunnel(
                        source_wing=first,
                        source_room=room,
                        target_wing=second,
                        target_room=room,
                        label=f"{label_prefix}: {room}",
                    )
                )
    return tunnels
|
||||
|
||||
|
||||
def topic_tunnels_for_wing(
    wing: str,
    topics_by_wing: dict,
    min_count: int = 1,
    label_prefix: str = "shared topic",
) -> list[dict]:
    """Compute topic tunnels for pairs that include one specific wing.

    The miner calls this for the wing it just finished, so pairs not
    involving that wing are never recomputed.

    Args:
        wing: wing to link; stripped before lookup.
        topics_by_wing: ``{wing_name: [topic_name, ...]}`` mapping.
        min_count: forwarded to ``compute_topic_tunnels``.
        label_prefix: forwarded to ``compute_topic_tunnels``.

    Returns:
        Concatenated results of ``compute_topic_tunnels`` over every
        (wing, other) pair. ``[]`` when the map is falsy, ``wing`` is not
        a non-blank string, or ``wing`` has no non-empty list/tuple of
        topics in the map.
    """
    if not topics_by_wing:
        return []
    if not isinstance(wing, str):
        return []
    target = wing.strip()
    if not target:
        return []

    own_topics = topics_by_wing.get(target)
    if not (isinstance(own_topics, (list, tuple)) and own_topics):
        return []

    # Feed ``compute_topic_tunnels`` a two-wing map per peer so the
    # threshold and casing rules stay defined in exactly one function.
    results: list[dict] = []
    for peer, peer_topics in topics_by_wing.items():
        usable_peer = isinstance(peer, str) and peer.strip() and peer != target
        if not usable_peer:
            continue
        if not (isinstance(peer_topics, (list, tuple)) and peer_topics):
            continue
        pair_map = {target: list(own_topics), peer: list(peer_topics)}
        results += compute_topic_tunnels(
            pair_map,
            min_count=min_count,
            label_prefix=label_prefix,
        )
    return results
|
||||
|
||||
@@ -558,6 +558,7 @@ def to_detected_dict(
|
||||
return {
|
||||
"people": people_entries,
|
||||
"projects": proj_entries,
|
||||
"topics": [],
|
||||
"uncertain": [],
|
||||
}
|
||||
|
||||
@@ -577,7 +578,7 @@ def _merge_detected(primary: dict, secondary: dict, drop_secondary_uncertain: bo
|
||||
"""
|
||||
seen = {e["name"].lower() for cat in primary.values() for e in cat}
|
||||
merged = {k: list(v) for k, v in primary.items()}
|
||||
for cat_key in ("people", "projects", "uncertain"):
|
||||
for cat_key in ("people", "projects", "topics", "uncertain"):
|
||||
if cat_key == "uncertain" and drop_secondary_uncertain:
|
||||
continue
|
||||
for e in secondary.get(cat_key, []):
|
||||
@@ -654,7 +655,7 @@ def discover_entities(
|
||||
prose_detected = (
|
||||
detect_entities(prose_files, languages=languages)
|
||||
if prose_files
|
||||
else {"people": [], "projects": [], "uncertain": []}
|
||||
else {"people": [], "projects": [], "topics": [], "uncertain": []}
|
||||
)
|
||||
|
||||
# Without LLM refinement, suppress regex "uncertain" noise when real
|
||||
|
||||
Reference in New Issue
Block a user