From f20f45a2dab6510c17d9d9cb85434bce048eb903 Mon Sep 17 00:00:00 2001 From: "Marcio E. Heiderscheidt" Date: Wed, 15 Apr 2026 04:26:24 -0300 Subject: [PATCH] fix: make entity_registry.research() local-only by default (#811) * fix: make entity_registry.research() local-only by default research() previously called _wikipedia_lookup() unconditionally, sending entity names to en.wikipedia.org on every uncached lookup. This violates the project's local-first and privacy-by-architecture principles documented in CLAUDE.md. Changes: - research() now returns "unknown" for uncached words by default - New allow_network=True parameter required for Wikipedia lookups - Wikipedia 404 now returns "unknown" instead of asserting "person" with 0.70 confidence, preventing entity registry poisoning - Added privacy warning docstring to _wikipedia_lookup() - Added tests for local-only default, opt-in network, 404 handling, and cache-not-persisted-on-local-only behaviour Refs: MemPalace/mempalace#809 * fix: improve research() cache read path and deduplicate test mocks - Use .get() instead of .setdefault() for cache reads in research() so the local-only path never mutates _data unnecessarily - Move .setdefault() to the network-write path only - Use result.setdefault() for word/confirmed keys to ensure consistent return shape across all _wikipedia_lookup error paths - Extract duplicated mock_result dict into _MOCK_SAOIRSE_PERSON constant shared by 3 test functions --- mempalace/entity_registry.py | 52 ++++++++++++++----- tests/test_entity_registry.py | 96 ++++++++++++++++++++++++++++------- 2 files changed, 118 insertions(+), 30 deletions(-) diff --git a/mempalace/entity_registry.py b/mempalace/entity_registry.py index 2a4ad8d..d28de54 100644 --- a/mempalace/entity_registry.py +++ b/mempalace/entity_registry.py @@ -178,6 +178,12 @@ def _wikipedia_lookup(word: str) -> dict: Look up a word via Wikipedia REST API. Returns inferred type (person/place/concept/unknown) + confidence + summary. Free, no API key, handles disambiguation pages. + + **Privacy warning:** This function makes an outbound HTTPS request to + en.wikipedia.org, sending the queried word over the network. It should + only be called when the caller has explicitly opted in via + ``allow_network=True`` in :meth:`EntityRegistry.research`. The default + behaviour of ``research()`` is local-only (no network calls). """ try: url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{urllib.parse.quote(word)}" @@ -244,13 +250,14 @@ def _wikipedia_lookup(word: str) -> dict: except urllib.error.HTTPError as e: if e.code == 404: - # Not in Wikipedia — strong signal it's a proper noun (unusual name, nickname) + # Not in Wikipedia — this tells us nothing definitive about + # the word. Return "unknown" so the caller can decide. return { - "inferred_type": "person", - "confidence": 0.70, + "inferred_type": "unknown", + "confidence": 0.3, "wiki_summary": None, "wiki_title": None, - "note": "not found in Wikipedia — likely a proper noun or unusual name", + "note": "not found in Wikipedia", } return {"inferred_type": "unknown", "confidence": 0.0, "wiki_summary": None} except (urllib.error.URLError, OSError, json.JSONDecodeError, KeyError): @@ -502,20 +509,41 @@ class EntityRegistry: # ── Research unknown words ─────────────────────────────────────────────── - def research(self, word: str, auto_confirm: bool = False) -> dict: + def research(self, word: str, auto_confirm: bool = False, allow_network: bool = False) -> dict: """ - Research an unknown word via Wikipedia. - Caches result. If auto_confirm=False, marks as unconfirmed (needs user review). - Returns the lookup result. + Research an unknown word. + + By default this is **local-only**: it checks the wiki cache and + returns ``"unknown"`` for uncached words. Pass + ``allow_network=True`` to explicitly opt in to an outbound + Wikipedia lookup. This design honours the project's + *local-first, zero API* and *privacy by architecture* principles + — no data leaves the machine unless the caller requests it. + + Caches result. If *auto_confirm* is ``False``, marks the entry + as unconfirmed (needs user review). """ - # Already cached? - cache = self._data.setdefault("wiki_cache", {}) + # Check cache (read-only — no mutation when allow_network is False) + cache = self._data.get("wiki_cache", {}) if word in cache: return cache[word] + if not allow_network: + return { + "inferred_type": "unknown", + "confidence": 0.0, + "wiki_summary": None, + "wiki_title": None, + "word": word, + "confirmed": False, + "note": "network lookup disabled — pass allow_network=True to query Wikipedia", + } + + # Network path — ensure wiki_cache key exists before writing + cache = self._data.setdefault("wiki_cache", {}) result = _wikipedia_lookup(word) - result["word"] = word - result["confirmed"] = auto_confirm + result.setdefault("word", word) + result.setdefault("confirmed", auto_confirm) cache[word] = result self.save() diff --git a/tests/test_entity_registry.py b/tests/test_entity_registry.py index b92bf84..c857a07 100644 --- a/tests/test_entity_registry.py +++ b/tests/test_entity_registry.py @@ -8,6 +8,14 @@ from mempalace.entity_registry import ( EntityRegistry, ) +# Shared mock result for Wikipedia person lookup tests +_MOCK_SAOIRSE_PERSON = { + "inferred_type": "person", + "confidence": 0.80, + "wiki_summary": "Saoirse is an Irish given name.", + "wiki_title": "Saoirse", +} + # ── COMMON_ENGLISH_WORDS ──────────────────────────────────────────────── @@ -213,22 +221,49 @@ def test_lookup_ambiguous_word_as_concept(tmp_path): assert result["type"] == "concept" -# ── research (Wikipedia) — mocked ────────────────────────────────────── +# ── research — local-only by default ─────────────────────────────────── -def test_research_caches_result(tmp_path): +def test_research_local_only_by_default(tmp_path): + """research() must NOT call Wikipedia unless allow_network=True.""" registry = EntityRegistry.load(config_dir=tmp_path) registry.seed(mode="personal", people=[], projects=[]) - mock_result = { - "inferred_type": "person", - "confidence": 0.80, - "wiki_summary": "Saoirse is an Irish given name.", - "wiki_title": "Saoirse", - } + with patch( + "mempalace.entity_registry._wikipedia_lookup", + side_effect=AssertionError("network call should not happen"), + ): + result = registry.research("Saoirse") - with patch("mempalace.entity_registry._wikipedia_lookup", return_value=mock_result): - result = registry.research("Saoirse", auto_confirm=True) + assert result["inferred_type"] == "unknown" + assert result["confidence"] == 0.0 + assert result["word"] == "Saoirse" + assert "network lookup disabled" in result.get("note", "") + + +def test_research_with_allow_network(tmp_path): + """research(allow_network=True) calls Wikipedia and caches result.""" + registry = EntityRegistry.load(config_dir=tmp_path) + registry.seed(mode="personal", people=[], projects=[]) + + with patch( + "mempalace.entity_registry._wikipedia_lookup", + return_value=dict(_MOCK_SAOIRSE_PERSON), + ): + result = registry.research("Saoirse", auto_confirm=True, allow_network=True) + assert result["inferred_type"] == "person" + + +def test_research_caches_result(tmp_path): + """Once cached via allow_network, subsequent calls use cache without network.""" + registry = EntityRegistry.load(config_dir=tmp_path) + registry.seed(mode="personal", people=[], projects=[]) + + with patch( + "mempalace.entity_registry._wikipedia_lookup", + return_value=dict(_MOCK_SAOIRSE_PERSON), + ): + result = registry.research("Saoirse", auto_confirm=True, allow_network=True) assert result["inferred_type"] == "person" # Second call should use cache, not call Wikipedia again @@ -240,24 +275,49 @@ def test_research_caches_result(tmp_path): assert cached["inferred_type"] == "person" +def test_research_local_only_not_cached(tmp_path): + """Local-only result for uncached word should NOT be persisted to cache.""" + registry = EntityRegistry.load(config_dir=tmp_path) + registry.seed(mode="personal", people=[], projects=[]) + + registry.research("Xander") # local-only, no network + assert "Xander" not in registry._data.get("wiki_cache", {}) + + def test_confirm_research_adds_to_people(tmp_path): registry = EntityRegistry.load(config_dir=tmp_path) registry.seed(mode="personal", people=[], projects=[]) - mock_result = { - "inferred_type": "person", - "confidence": 0.80, - "wiki_summary": "Saoirse is a name", - "wiki_title": "Saoirse", - } - with patch("mempalace.entity_registry._wikipedia_lookup", return_value=mock_result): - registry.research("Saoirse", auto_confirm=False) + with patch( + "mempalace.entity_registry._wikipedia_lookup", + return_value=dict(_MOCK_SAOIRSE_PERSON), + ): + registry.research("Saoirse", auto_confirm=False, allow_network=True) registry.confirm_research("Saoirse", entity_type="person", relationship="friend") assert "Saoirse" in registry.people assert registry.people["Saoirse"]["source"] == "wiki" +def test_wikipedia_404_returns_unknown(tmp_path): + """A 404 from Wikipedia should return 'unknown', not assert 'person'.""" + registry = EntityRegistry.load(config_dir=tmp_path) + registry.seed(mode="personal", people=[], projects=[]) + + mock_result = { + "inferred_type": "unknown", + "confidence": 0.3, + "wiki_summary": None, + "wiki_title": None, + "note": "not found in Wikipedia", + } + with patch("mempalace.entity_registry._wikipedia_lookup", return_value=mock_result): + result = registry.research("Zzxqy", auto_confirm=False, allow_network=True) + + assert result["inferred_type"] == "unknown" + assert result["confidence"] < 0.5 + + # ── extract_people_from_query ───────────────────────────────────────────