2e441d17a2
Without this, on ext4 (and similar) filesystems the rename ack does not guarantee durability across power loss — a crash can revert to a state where the temp file is present and the target is at the old version. Suggested by @jphein on #1215.
707 lines
26 KiB
Python
707 lines
26 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
entity_registry.py — Persistent personal entity registry for MemPalace.
|
|
|
|
Knows the difference between Riley (a person) and ever (an adverb).
|
|
Built from three sources, in priority order:
|
|
1. Onboarding — what the user explicitly told us
|
|
2. Learned — what we inferred from session history with high confidence
|
|
3. Researched — what we looked up via Wikipedia for unknown words
|
|
|
|
Usage:
|
|
from mempalace.entity_registry import EntityRegistry
|
|
registry = EntityRegistry.load()
|
|
result = registry.lookup("Riley", context="I went with Riley today")
|
|
# → {"type": "person", "confidence": 1.0, "source": "onboarding"}
|
|
"""
|
|
|
|
import json
import os
import re
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
from typing import Optional
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Common English words that could be confused with names
|
|
# These get flagged as AMBIGUOUS and require context disambiguation
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Lower-case words that double as personal names or look like names when
# capitalised. Registry names found in this set are flagged as ambiguous and
# resolved from sentence context.
COMMON_ENGLISH_WORDS = set(
    (
        # Words that are also common personal names
        "ever grace will bill mark april may june joy hope faith chance "
        "chase hunter dash flash star sky river brook lane art clay gil "
        "nat max rex ray jay rose violet lily ivy ash reed sage "
        # Words that look like names at start of sentence: weekdays ...
        "monday tuesday wednesday thursday friday saturday sunday "
        # ... and the months not already listed above as personal names
        "january february march july august september october november "
        "december"
    ).split()
)
|
|
|
|
# Context patterns that indicate a word is being used as a PERSON name
|
|
# Regex templates suggesting the word is used as a PERSON name. Each template
# carries a ``{name}`` placeholder that is filled via ``str.format`` with the
# regex-escaped, lower-cased word before matching.
PERSON_CONTEXT_PATTERNS = [
    # "<name> <verb>" — the name is the subject of a speech/state verb
    *(
        rf"\b{{name}}\s+{verb}\b"
        for verb in (
            "said",
            "told",
            "asked",
            "laughed",
            "smiled",
            "was",
            "is",
            "called",
            "texted",
        )
    ),
    # "<word> <name>" — the name directly follows a preposition or verb
    *(rf"\b{lead}\s+{{name}}\b" for lead in ("with", "saw", "called", "took")),
    r"\bpicked\s+up\s+{name}\b",
    r"\bdrop(?:ped)?\s+(?:off\s+)?{name}\b",
    r"\b{name}(?:'s|s')\b",  # Riley's, Max's
    r"\bhey\s+{name}\b",
    r"\bthanks?\s+{name}\b",
    r"^{name}[:\s]",  # dialogue: "Riley: ..."
    r"\bmy\s+(?:son|daughter|kid|child|brother|sister|friend|partner|colleague|coworker)\s+{name}\b",
]
|
|
|
|
# Context patterns that indicate a word is NOT being used as a name
|
|
# Regex templates suggesting the word is NOT used as a name. Each template
# carries a ``{name}`` placeholder that is filled via ``str.format`` with the
# regex-escaped, lower-cased word before matching.
CONCEPT_CONTEXT_PATTERNS = [
    r"\bhave\s+you\s+{name}\b",  # "have you ever"
    r"\bif\s+you\s+{name}\b",  # "if you ever"
    r"\b{name}\s+since\b",  # "ever since"
    r"\b{name}\s+again\b",  # "ever again"
    r"\bnot\s+{name}\b",  # "not ever"
    r"\b{name}\s+more\b",  # "ever more"
    r"\bwould\s+{name}\b",  # "would ever"
    r"\bcould\s+{name}\b",  # "could ever"
    r"\bwill\s+{name}\b",  # "will ever"
    # Anchored with a leading \b so the word is matched whole: without it
    # "grace of" also matched inside "disgrace of" and "art of" inside
    # "start of", wrongly counting as concept evidence.
    r"\b(?:the\s+)?{name}\s+(?:of|in|at|for|to)\b",  # "the grace of", "the mark of"
]
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Wikipedia lookup for unknown words
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Phrases in Wikipedia summaries that indicate a personal name
|
|
# Lower-case substrings searched for in a Wikipedia summary; any hit marks
# the looked-up word as a personal name.
NAME_INDICATOR_PHRASES = [
    *(f"{kind} name" for kind in ("given", "personal", "first")),
    "forename",
    *(
        f"{qualifier} name"
        for qualifier in (
            "masculine",
            "feminine",
            "boy's",
            "girl's",
            "male",
            "female",
            "irish",
            "welsh",
            "scottish",
            "gaelic",
            "hebrew",
            "arabic",
            "norse",
            "old english",
            "is a",
            "as a",
        )
    ),
    "name meaning",
    "name derived from",
    *(f"legendary {origin}" for origin in ("irish", "welsh", "scottish")),
]
|
|
|
|
# Lower-case substrings searched for in a Wikipedia summary; any hit marks
# the looked-up word as a place (checked only after the name indicators).
PLACE_INDICATOR_PHRASES = (
    "city in,town in,village in,municipality,capital of,district of,"
    "county,province,region of,island of,mountain in,river in"
).split(",")
|
|
|
|
|
|
def _wikipedia_lookup(word: str) -> dict:
    """
    Look up a word via Wikipedia REST API.
    Returns inferred type (person/place/concept/ambiguous/unknown) +
    confidence + summary. Free, no API key, handles disambiguation pages.

    **Privacy warning:** This function makes an outbound HTTPS request to
    en.wikipedia.org, sending the queried word over the network. It should
    only be called when the caller has explicitly opted in via
    ``allow_network=True`` in :meth:`EntityRegistry.research`. The default
    behaviour of ``research()`` is local-only (no network calls).

    The result always carries ``inferred_type``, ``confidence``,
    ``wiki_summary`` and ``wiki_title``; some results add a ``note``.
    Network, HTTP and decoding failures are reported as
    ``inferred_type="unknown"`` with ``confidence=0.0`` rather than raised;
    a 404 is reported as ``"unknown"`` with ``confidence=0.3``.
    """
    # Fresh dict per call: the caller mutates the result (adds keys).
    lookup_failed = {
        "inferred_type": "unknown",
        "confidence": 0.0,
        "wiki_summary": None,
        "wiki_title": None,
    }

    # safe="" also percent-encodes "/", so the word stays a single path
    # segment instead of altering the requested route.
    url = (
        "https://en.wikipedia.org/api/rest_v1/page/summary/"
        f"{urllib.parse.quote(word, safe='')}"
    )
    req = urllib.request.Request(url, headers={"User-Agent": "MemPalace/1.0"})

    try:
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
    except urllib.error.HTTPError as e:
        if e.code == 404:
            # Not in Wikipedia — this tells us nothing definitive about
            # the word. Return "unknown" so the caller can decide.
            return {
                "inferred_type": "unknown",
                "confidence": 0.3,
                "wiki_summary": None,
                "wiki_title": None,
                "note": "not found in Wikipedia",
            }
        return lookup_failed
    except (urllib.error.URLError, OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return lookup_failed

    if not isinstance(data, dict):
        # Valid JSON, but not the object shape the classification needs.
        return lookup_failed

    # ``or`` guards against JSON nulls, which ``dict.get`` defaults miss.
    page_type = data.get("type") or ""
    extract = str(data.get("extract") or "").lower()
    title = data.get("title") or word
    summary = extract[:200]

    # Disambiguation — look at description
    if page_type == "disambiguation":
        desc = str(data.get("description") or "").lower()
        if "name" in desc:
            return {
                "inferred_type": "person",
                "confidence": 0.65,
                "wiki_summary": summary,
                "wiki_title": title,
                "note": "disambiguation page with name entries",
            }
        return {
            "inferred_type": "ambiguous",
            "confidence": 0.4,
            "wiki_summary": summary,
            "wiki_title": title,
        }

    # Check for name indicators
    if any(phrase in extract for phrase in NAME_INDICATOR_PHRASES):
        # Higher confidence if the word itself is described as a name
        word_lower = word.lower()
        described_as_name = (
            f"{word_lower} is a" in extract or f"{word_lower} (name" in extract
        )
        return {
            "inferred_type": "person",
            "confidence": 0.90 if described_as_name else 0.80,
            "wiki_summary": summary,
            "wiki_title": title,
        }

    # Check for place indicators
    if any(phrase in extract for phrase in PLACE_INDICATOR_PHRASES):
        return {
            "inferred_type": "place",
            "confidence": 0.80,
            "wiki_summary": summary,
            "wiki_title": title,
        }

    # Found but doesn't match name/place patterns
    return {
        "inferred_type": "concept",
        "confidence": 0.60,
        "wiki_summary": summary,
        "wiki_title": title,
    }
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Entity Registry
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class EntityRegistry:
    """
    Persistent personal entity registry.

    Stored at ~/.mempalace/entity_registry.json
    Schema:
    {
      "mode": "personal",   # work | personal | combo
      "version": 1,
      "people": {
        "Riley": {
          "source": "onboarding",
          "contexts": ["personal"],
          "aliases": [],
          "relationship": "daughter",
          "confidence": 1.0
        }
      },
      "projects": ["MemPalace", "Acme"],
      "ambiguous_flags": ["riley", "max"],
      "wiki_cache": {
        "Sam": {"inferred_type": "person", "confidence": 0.9, "confirmed": true, ...}
      }
    }
    """

    DEFAULT_PATH = Path.home() / ".mempalace" / "entity_registry.json"

    def __init__(self, data: dict, path: Path):
        """Wrap an already-parsed registry dict and the path it is saved to."""
        self._data = data
        self._path = path

    # ── Load / Save ──────────────────────────────────────────────────────────

    @classmethod
    def load(cls, config_dir: Optional[Path] = None) -> "EntityRegistry":
        """
        Load the registry from ``config_dir/entity_registry.json``
        (``DEFAULT_PATH`` when no directory is given).

        A missing, unreadable, undecodable or non-object file yields an empty
        registry bound to the same path. Top-level keys absent from a loaded
        file are filled with their empty defaults so later writes such as
        ``self._data["people"][...]`` cannot fail on an older/partial file.
        """
        path = (Path(config_dir) / "entity_registry.json") if config_dir else cls.DEFAULT_PATH
        if path.exists():
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
            except (ValueError, OSError):
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                data = None
            if isinstance(data, dict):
                for key, default in cls._empty().items():
                    data.setdefault(key, default)
                return cls(data, path)
        return cls(cls._empty(), path)

    def save(self) -> None:
        """
        Atomically write the registry as indented JSON to ``self._path``.

        Raises whatever ``json.dumps`` / the file operations raise; only the
        permission tightening and the directory fsync are best-effort.
        """
        parent = self._path.parent
        parent.mkdir(parents=True, exist_ok=True)
        try:
            parent.chmod(0o700)
        except (OSError, NotImplementedError):
            pass

        # Atomic write: serialize to a sibling temp file in the same dir
        # (so os.replace stays on one filesystem), fsync, then rename over
        # the target. A crash mid-write leaves the previous registry intact
        # instead of a half-written file or an empty file from the truncate.
        payload = json.dumps(self._data, indent=2)
        tmp_path = self._path.with_name(self._path.name + ".tmp")
        committed = False
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                f.write(payload)
                f.flush()
                os.fsync(f.fileno())
            try:
                tmp_path.chmod(0o600)
            except (OSError, NotImplementedError):
                pass
            os.replace(tmp_path, self._path)
            committed = True
        finally:
            if not committed:
                # Don't leave a stale/partial temp file behind on failure.
                try:
                    tmp_path.unlink()
                except OSError:
                    pass

        # On ext4 (and similar) the rename's durability across power loss
        # requires an additional fsync on the parent directory. Without it,
        # the kernel can ack the rename and a crash reverts to the state
        # where the temp file is present and the target is at the old version.
        try:
            dir_fd = os.open(str(parent), os.O_RDONLY)
            try:
                os.fsync(dir_fd)
            finally:
                os.close(dir_fd)
        except OSError:
            # Windows and some special filesystems reject directory fds — they
            # have different durability semantics on rename anyway.
            pass

    @staticmethod
    def _empty() -> dict:
        """Return a fresh registry dict with every top-level key at its empty default."""
        return {
            "version": 1,
            "mode": "personal",
            "people": {},
            "projects": [],
            "ambiguous_flags": [],
            "wiki_cache": {},
        }

    # ── Properties ───────────────────────────────────────────────────────────

    @property
    def mode(self) -> str:
        """Registry mode; ``"personal"`` when the key is absent."""
        return self._data.get("mode", "personal")

    @property
    def people(self) -> dict:
        """Mapping of person name → info dict; empty when the key is absent."""
        return self._data.get("people", {})

    @property
    def projects(self) -> list:
        """List of project names; empty when the key is absent."""
        return self._data.get("projects", [])

    @property
    def ambiguous_flags(self) -> list:
        """Lower-cased names that are also common English words."""
        return self._data.get("ambiguous_flags", [])

    # ── Seed from onboarding ─────────────────────────────────────────────────

    def seed(self, mode: str, people: list, projects: list, aliases: Optional[dict] = None):
        """
        Seed the registry from onboarding data.

        people: list of dicts {"name": str, "relationship": str, "context": str}
        projects: list of str
        aliases: dict {"Max": "Maxwell", ...}

        Sets the mode, replaces the project list, adds/overwrites the given
        people (existing entries for other names are kept), recomputes the
        ambiguous flags from all registered people, and saves.
        """
        self._data["mode"] = mode
        self._data["projects"] = list(projects)
        registry = self._data.setdefault("people", {})

        # aliases maps short form → full name; invert it so a full name can
        # be asked for its short form (Maxwell → Max).
        reverse_aliases = {full: short for short, full in (aliases or {}).items()}

        for entry in people:
            name = entry["name"].strip()
            if not name:
                continue
            context = entry.get("context", "personal")
            relationship = entry.get("relationship", "")
            has_alias = name in reverse_aliases

            registry[name] = {
                "source": "onboarding",
                "contexts": [context],
                "aliases": [reverse_aliases[name]] if has_alias else [],
                "relationship": relationship,
                "confidence": 1.0,
            }

            # Also register aliases as their own entry pointing back at the
            # canonical name.
            if has_alias:
                registry[reverse_aliases[name]] = {
                    "source": "onboarding",
                    "contexts": [context],
                    "aliases": [name],
                    "relationship": relationship,
                    "confidence": 1.0,
                    "canonical": name,
                }

        # Flag ambiguous names (also common English words)
        self._data["ambiguous_flags"] = [
            name.lower() for name in registry if name.lower() in COMMON_ENGLISH_WORDS
        ]

        self.save()

    # ── Lookup ───────────────────────────────────────────────────────────────

    def lookup(self, word: str, context: str = "") -> dict:
        """
        Look up a word. Returns entity classification.

        context: surrounding sentence (used for disambiguation of ambiguous words)

        Matching is case-insensitive and checks, in order: people (names and
        aliases), projects, then confirmed wiki-cache entries.

        Returns:
            {"type": "person"|"project"|"concept"|"unknown",
             "confidence": float,
             "source": "onboarding"|"learned"|"wiki"|"inferred",
             "name": canonical name if found,
             "needs_disambiguation": bool}
        """
        needle = word.lower()

        # 1. Exact match in people registry
        for canonical, info in self.people.items():
            known_as = [canonical.lower(), *(a.lower() for a in info.get("aliases", []))]
            if needle not in known_as:
                continue
            # Check if this is an ambiguous word
            if needle in self.ambiguous_flags and context:
                resolved = self._disambiguate(word, context, info)
                if resolved is not None:
                    return resolved
            return {
                "type": "person",
                "confidence": info["confidence"],
                "source": info["source"],
                "name": canonical,
                "context": info.get("contexts", ["personal"]),
                "needs_disambiguation": False,
            }

        # 2. Project match
        for proj in self.projects:
            if needle == proj.lower():
                return {
                    "type": "project",
                    "confidence": 1.0,
                    "source": "onboarding",
                    "name": proj,
                    "needs_disambiguation": False,
                }

        # 3. Wiki cache (only entries the user has confirmed)
        for cached_word, cached_result in self._data.get("wiki_cache", {}).items():
            if needle == cached_word.lower() and cached_result.get("confirmed"):
                return {
                    "type": cached_result["inferred_type"],
                    "confidence": cached_result["confidence"],
                    "source": "wiki",
                    "name": word,
                    "needs_disambiguation": False,
                }

        return {
            "type": "unknown",
            "confidence": 0.0,
            "source": "none",
            "name": word,
            "needs_disambiguation": False,
        }

    def _disambiguate(self, word: str, context: str, person_info: dict) -> Optional[dict]:
        """
        When a word is both a name and a common word, check context.

        Counts how many person patterns and how many concept patterns match
        the lower-cased context. Returns a person result when person patterns
        win, a concept result when concept patterns win, and None on a tie
        (including no matches at all).
        """
        escaped = re.escape(word.lower())
        ctx_lower = context.lower()

        def count_hits(patterns) -> int:
            return sum(
                1 for pat in patterns if re.search(pat.format(name=escaped), ctx_lower)
            )

        person_score = count_hits(PERSON_CONTEXT_PATTERNS)
        concept_score = count_hits(CONCEPT_CONTEXT_PATTERNS)

        if person_score > concept_score:
            return {
                "type": "person",
                "confidence": min(0.95, 0.7 + person_score * 0.1),
                "source": person_info["source"],
                "name": word,
                "context": person_info.get("contexts", ["personal"]),
                "needs_disambiguation": False,
                "disambiguated_by": "context_patterns",
            }
        if concept_score > person_score:
            return {
                "type": "concept",
                "confidence": min(0.90, 0.7 + concept_score * 0.1),
                "source": "context_disambiguated",
                "name": word,
                "needs_disambiguation": False,
                "disambiguated_by": "context_patterns",
            }

        # Truly ambiguous — return None to fall through to person (registered name)
        return None

    # ── Research unknown words ───────────────────────────────────────────────

    def research(self, word: str, auto_confirm: bool = False, allow_network: bool = False) -> dict:
        """
        Research an unknown word.

        By default this is **local-only**: it checks the wiki cache and
        returns ``"unknown"`` for uncached words. Pass
        ``allow_network=True`` to explicitly opt in to an outbound
        Wikipedia lookup. This design honours the project's
        *local-first, zero API* and *privacy by architecture* principles
        — no data leaves the machine unless the caller requests it.

        Caches result. If *auto_confirm* is ``False``, marks the entry
        as unconfirmed (needs user review). A lookup that failed outright
        (``"unknown"`` with zero confidence) is returned but not cached, so
        a transient network error does not permanently mark the word.
        """
        # Check cache (read-only — no mutation when allow_network is False)
        cache = self._data.get("wiki_cache", {})
        if word in cache:
            return cache[word]

        if not allow_network:
            return {
                "inferred_type": "unknown",
                "confidence": 0.0,
                "wiki_summary": None,
                "wiki_title": None,
                "word": word,
                "confirmed": False,
                "note": "network lookup disabled — pass allow_network=True to query Wikipedia",
            }

        result = _wikipedia_lookup(word)
        result.setdefault("word", word)
        result.setdefault("confirmed", auto_confirm)

        lookup_failed = result.get("inferred_type") == "unknown" and not result.get("confidence")
        if not lookup_failed:
            # Network path — ensure wiki_cache key exists before writing
            self._data.setdefault("wiki_cache", {})[word] = result
            self.save()
        return result

    def confirm_research(
        self, word: str, entity_type: str, relationship: str = "", context: str = "personal"
    ):
        """
        Mark a researched word as confirmed and add to people registry.

        The cache entry (if any) is flagged confirmed with *entity_type*.
        When *entity_type* is ``"person"`` the word is also added to the
        people registry and, if it is a common English word, to the
        ambiguous flags. Always saves.
        """
        cache = self._data.get("wiki_cache", {})
        if word in cache:
            cache[word]["confirmed"] = True
            cache[word]["confirmed_type"] = entity_type

        if entity_type == "person":
            self._data.setdefault("people", {})[word] = {
                "source": "wiki",
                "contexts": [context],
                "aliases": [],
                "relationship": relationship,
                "confidence": 0.90,
            }
            if word.lower() in COMMON_ENGLISH_WORDS:
                flags = self._data.setdefault("ambiguous_flags", [])
                if word.lower() not in flags:
                    flags.append(word.lower())

        self.save()

    # ── Learn from sessions ──────────────────────────────────────────────────

    def learn_from_text(self, text: str, min_confidence: float = 0.75, languages=("en",)) -> list:
        """
        Scan session text for new entity candidates.
        Returns list of newly discovered candidates for review.

        Candidates classified as a person with at least *min_confidence*
        are added to the people registry with source ``"learned"``; the
        registry is saved only when something was added.

        ``languages`` is forwarded to entity detection — pass the user's
        configured ``MempalaceConfig().entity_languages`` to match the
        locales used at ``mempalace init`` time.
        """
        from mempalace.entity_detector import extract_candidates, score_entity, classify_entity

        lines = text.splitlines()
        candidates = extract_candidates(text, languages=languages)
        people = self._data.setdefault("people", {})
        new_candidates = []

        for name, frequency in candidates.items():
            # Skip if already known
            if name in people or name in self.projects:
                continue

            scores = score_entity(name, text, lines, languages=languages)
            entity = classify_entity(name, frequency, scores)
            if entity["type"] != "person" or entity["confidence"] < min_confidence:
                continue

            people[name] = {
                "source": "learned",
                "contexts": [self.mode if self.mode != "combo" else "personal"],
                "aliases": [],
                "relationship": "",
                "confidence": entity["confidence"],
                "seen_count": frequency,
            }
            if name.lower() in COMMON_ENGLISH_WORDS:
                flags = self._data.setdefault("ambiguous_flags", [])
                if name.lower() not in flags:
                    flags.append(name.lower())
            new_candidates.append(entity)

        if new_candidates:
            self.save()

        return new_candidates

    # ── Query helpers for retrieval ──────────────────────────────────────────

    def extract_people_from_query(self, query: str) -> list:
        """
        Extract known person names from a query string.
        Returns list of canonical names found.

        Names and aliases are matched case-insensitively on word boundaries.
        A match on an ambiguous word only counts when context disambiguation
        positively resolves it to a person.
        """
        found = []
        flagged = self.ambiguous_flags

        for canonical, info in self.people.items():
            for name in [canonical, *info.get("aliases", [])]:
                # Word boundary match
                if not re.search(rf"\b{re.escape(name)}\b", query, re.IGNORECASE):
                    continue
                # For ambiguous words, check context
                if name.lower() in flagged:
                    verdict = self._disambiguate(name, query, info)
                    if not (verdict and verdict["type"] == "person"):
                        continue
                found.append(canonical)
                break  # one hit per person is enough
        return found

    def extract_unknown_candidates(self, query: str) -> list:
        """
        Find capitalized words in query that aren't in registry or common words.
        These are candidates for Wikipedia research.

        Each candidate is reported once, in first-seen order.
        """
        from .palace import _candidate_entity_words

        candidates = _candidate_entity_words(query)
        unknown = []
        # dict.fromkeys dedupes like set() but keeps a deterministic order.
        for word in dict.fromkeys(candidates):
            if word.lower() in COMMON_ENGLISH_WORDS:
                continue
            if self.lookup(word)["type"] == "unknown":
                unknown.append(word)
        return unknown

    # ── Summary ──────────────────────────────────────────────────────────────

    def summary(self) -> str:
        """Return a five-line, human-readable overview of the registry contents."""
        names = list(self.people)
        shown = ", ".join(names[:8])
        ellipsis = "..." if len(names) > 8 else ""
        lines = [
            f"Mode: {self.mode}",
            f"People: {len(names)} ({shown}{ellipsis})",
            f"Projects: {', '.join(self.projects) or '(none)'}",
            f"Ambiguous flags: {', '.join(self.ambiguous_flags) or '(none)'}",
            f"Wiki cache: {len(self._data.get('wiki_cache', {}))} entries",
        ]
        return "\n".join(lines)
|