fix: make entity_registry.research() local-only by default (#811)
* fix: make entity_registry.research() local-only by default research() previously called _wikipedia_lookup() unconditionally, sending entity names to en.wikipedia.org on every uncached lookup. This violates the project's local-first and privacy-by-architecture principles documented in CLAUDE.md. Changes: - research() now returns "unknown" for uncached words by default - New allow_network=True parameter required for Wikipedia lookups - Wikipedia 404 now returns "unknown" instead of asserting "person" with 0.70 confidence, preventing entity registry poisoning - Added privacy warning docstring to _wikipedia_lookup() - Added tests for local-only default, opt-in network, 404 handling, and cache-not-persisted-on-local-only behaviour Refs: MemPalace/mempalace#809 * fix: improve research() cache read path and deduplicate test mocks - Use .get() instead of .setdefault() for cache reads in research() so the local-only path never mutates _data unnecessarily - Move .setdefault() to the network-write path only - Use result.setdefault() for word/confirmed keys to ensure consistent return shape across all _wikipedia_lookup error paths - Extract duplicated mock_result dict into _MOCK_SAOIRSE_PERSON constant shared by 3 test functions
This commit is contained in:
committed by
GitHub
parent
f36d04e4a4
commit
f20f45a2da
@@ -178,6 +178,12 @@ def _wikipedia_lookup(word: str) -> dict:
|
|||||||
Look up a word via Wikipedia REST API.
|
Look up a word via Wikipedia REST API.
|
||||||
Returns inferred type (person/place/concept/unknown) + confidence + summary.
|
Returns inferred type (person/place/concept/unknown) + confidence + summary.
|
||||||
Free, no API key, handles disambiguation pages.
|
Free, no API key, handles disambiguation pages.
|
||||||
|
|
||||||
|
**Privacy warning:** This function makes an outbound HTTPS request to
|
||||||
|
en.wikipedia.org, sending the queried word over the network. It should
|
||||||
|
only be called when the caller has explicitly opted in via
|
||||||
|
``allow_network=True`` in :meth:`EntityRegistry.research`. The default
|
||||||
|
behaviour of ``research()`` is local-only (no network calls).
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{urllib.parse.quote(word)}"
|
url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{urllib.parse.quote(word)}"
|
||||||
@@ -244,13 +250,14 @@ def _wikipedia_lookup(word: str) -> dict:
|
|||||||
|
|
||||||
except urllib.error.HTTPError as e:
|
except urllib.error.HTTPError as e:
|
||||||
if e.code == 404:
|
if e.code == 404:
|
||||||
# Not in Wikipedia — strong signal it's a proper noun (unusual name, nickname)
|
# Not in Wikipedia — this tells us nothing definitive about
|
||||||
|
# the word. Return "unknown" so the caller can decide.
|
||||||
return {
|
return {
|
||||||
"inferred_type": "person",
|
"inferred_type": "unknown",
|
||||||
"confidence": 0.70,
|
"confidence": 0.3,
|
||||||
"wiki_summary": None,
|
"wiki_summary": None,
|
||||||
"wiki_title": None,
|
"wiki_title": None,
|
||||||
"note": "not found in Wikipedia — likely a proper noun or unusual name",
|
"note": "not found in Wikipedia",
|
||||||
}
|
}
|
||||||
return {"inferred_type": "unknown", "confidence": 0.0, "wiki_summary": None}
|
return {"inferred_type": "unknown", "confidence": 0.0, "wiki_summary": None}
|
||||||
except (urllib.error.URLError, OSError, json.JSONDecodeError, KeyError):
|
except (urllib.error.URLError, OSError, json.JSONDecodeError, KeyError):
|
||||||
@@ -502,20 +509,41 @@ class EntityRegistry:
|
|||||||
|
|
||||||
# ── Research unknown words ───────────────────────────────────────────────
|
# ── Research unknown words ───────────────────────────────────────────────
|
||||||
|
|
||||||
def research(self, word: str, auto_confirm: bool = False) -> dict:
|
def research(self, word: str, auto_confirm: bool = False, allow_network: bool = False) -> dict:
|
||||||
"""
|
"""
|
||||||
Research an unknown word via Wikipedia.
|
Research an unknown word.
|
||||||
Caches result. If auto_confirm=False, marks as unconfirmed (needs user review).
|
|
||||||
Returns the lookup result.
|
By default this is **local-only**: it checks the wiki cache and
|
||||||
|
returns ``"unknown"`` for uncached words. Pass
|
||||||
|
``allow_network=True`` to explicitly opt in to an outbound
|
||||||
|
Wikipedia lookup. This design honours the project's
|
||||||
|
*local-first, zero API* and *privacy by architecture* principles
|
||||||
|
— no data leaves the machine unless the caller requests it.
|
||||||
|
|
||||||
|
Caches result. If *auto_confirm* is ``False``, marks the entry
|
||||||
|
as unconfirmed (needs user review).
|
||||||
"""
|
"""
|
||||||
# Already cached?
|
# Check cache (read-only — no mutation when allow_network is False)
|
||||||
cache = self._data.setdefault("wiki_cache", {})
|
cache = self._data.get("wiki_cache", {})
|
||||||
if word in cache:
|
if word in cache:
|
||||||
return cache[word]
|
return cache[word]
|
||||||
|
|
||||||
|
if not allow_network:
|
||||||
|
return {
|
||||||
|
"inferred_type": "unknown",
|
||||||
|
"confidence": 0.0,
|
||||||
|
"wiki_summary": None,
|
||||||
|
"wiki_title": None,
|
||||||
|
"word": word,
|
||||||
|
"confirmed": False,
|
||||||
|
"note": "network lookup disabled — pass allow_network=True to query Wikipedia",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Network path — ensure wiki_cache key exists before writing
|
||||||
|
cache = self._data.setdefault("wiki_cache", {})
|
||||||
result = _wikipedia_lookup(word)
|
result = _wikipedia_lookup(word)
|
||||||
result["word"] = word
|
result.setdefault("word", word)
|
||||||
result["confirmed"] = auto_confirm
|
result.setdefault("confirmed", auto_confirm)
|
||||||
|
|
||||||
cache[word] = result
|
cache[word] = result
|
||||||
self.save()
|
self.save()
|
||||||
|
|||||||
@@ -8,6 +8,14 @@ from mempalace.entity_registry import (
|
|||||||
EntityRegistry,
|
EntityRegistry,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Shared mock result for Wikipedia person lookup tests
|
||||||
|
_MOCK_SAOIRSE_PERSON = {
|
||||||
|
"inferred_type": "person",
|
||||||
|
"confidence": 0.80,
|
||||||
|
"wiki_summary": "Saoirse is an Irish given name.",
|
||||||
|
"wiki_title": "Saoirse",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
# ── COMMON_ENGLISH_WORDS ────────────────────────────────────────────────
|
# ── COMMON_ENGLISH_WORDS ────────────────────────────────────────────────
|
||||||
|
|
||||||
@@ -213,22 +221,49 @@ def test_lookup_ambiguous_word_as_concept(tmp_path):
|
|||||||
assert result["type"] == "concept"
|
assert result["type"] == "concept"
|
||||||
|
|
||||||
|
|
||||||
# ── research (Wikipedia) — mocked ──────────────────────────────────────
|
# ── research — local-only by default ───────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
def test_research_caches_result(tmp_path):
|
def test_research_local_only_by_default(tmp_path):
|
||||||
|
"""research() must NOT call Wikipedia unless allow_network=True."""
|
||||||
registry = EntityRegistry.load(config_dir=tmp_path)
|
registry = EntityRegistry.load(config_dir=tmp_path)
|
||||||
registry.seed(mode="personal", people=[], projects=[])
|
registry.seed(mode="personal", people=[], projects=[])
|
||||||
|
|
||||||
mock_result = {
|
with patch(
|
||||||
"inferred_type": "person",
|
"mempalace.entity_registry._wikipedia_lookup",
|
||||||
"confidence": 0.80,
|
side_effect=AssertionError("network call should not happen"),
|
||||||
"wiki_summary": "Saoirse is an Irish given name.",
|
):
|
||||||
"wiki_title": "Saoirse",
|
result = registry.research("Saoirse")
|
||||||
}
|
|
||||||
|
|
||||||
with patch("mempalace.entity_registry._wikipedia_lookup", return_value=mock_result):
|
assert result["inferred_type"] == "unknown"
|
||||||
result = registry.research("Saoirse", auto_confirm=True)
|
assert result["confidence"] == 0.0
|
||||||
|
assert result["word"] == "Saoirse"
|
||||||
|
assert "network lookup disabled" in result.get("note", "")
|
||||||
|
|
||||||
|
|
||||||
|
def test_research_with_allow_network(tmp_path):
|
||||||
|
"""research(allow_network=True) calls Wikipedia and caches result."""
|
||||||
|
registry = EntityRegistry.load(config_dir=tmp_path)
|
||||||
|
registry.seed(mode="personal", people=[], projects=[])
|
||||||
|
|
||||||
|
with patch(
|
||||||
|
"mempalace.entity_registry._wikipedia_lookup",
|
||||||
|
return_value=dict(_MOCK_SAOIRSE_PERSON),
|
||||||
|
):
|
||||||
|
result = registry.research("Saoirse", auto_confirm=True, allow_network=True)
|
||||||
|
assert result["inferred_type"] == "person"
|
||||||
|
|
||||||
|
|
||||||
|
def test_research_caches_result(tmp_path):
|
||||||
|
"""Once cached via allow_network, subsequent calls use cache without network."""
|
||||||
|
registry = EntityRegistry.load(config_dir=tmp_path)
|
||||||
|
registry.seed(mode="personal", people=[], projects=[])
|
||||||
|
|
||||||
|
with patch(
|
||||||
|
"mempalace.entity_registry._wikipedia_lookup",
|
||||||
|
return_value=dict(_MOCK_SAOIRSE_PERSON),
|
||||||
|
):
|
||||||
|
result = registry.research("Saoirse", auto_confirm=True, allow_network=True)
|
||||||
assert result["inferred_type"] == "person"
|
assert result["inferred_type"] == "person"
|
||||||
|
|
||||||
# Second call should use cache, not call Wikipedia again
|
# Second call should use cache, not call Wikipedia again
|
||||||
@@ -240,24 +275,49 @@ def test_research_caches_result(tmp_path):
|
|||||||
assert cached["inferred_type"] == "person"
|
assert cached["inferred_type"] == "person"
|
||||||
|
|
||||||
|
|
||||||
|
def test_research_local_only_not_cached(tmp_path):
|
||||||
|
"""Local-only result for uncached word should NOT be persisted to cache."""
|
||||||
|
registry = EntityRegistry.load(config_dir=tmp_path)
|
||||||
|
registry.seed(mode="personal", people=[], projects=[])
|
||||||
|
|
||||||
|
registry.research("Xander") # local-only, no network
|
||||||
|
assert "Xander" not in registry._data.get("wiki_cache", {})
|
||||||
|
|
||||||
|
|
||||||
def test_confirm_research_adds_to_people(tmp_path):
|
def test_confirm_research_adds_to_people(tmp_path):
|
||||||
registry = EntityRegistry.load(config_dir=tmp_path)
|
registry = EntityRegistry.load(config_dir=tmp_path)
|
||||||
registry.seed(mode="personal", people=[], projects=[])
|
registry.seed(mode="personal", people=[], projects=[])
|
||||||
|
|
||||||
mock_result = {
|
with patch(
|
||||||
"inferred_type": "person",
|
"mempalace.entity_registry._wikipedia_lookup",
|
||||||
"confidence": 0.80,
|
return_value=dict(_MOCK_SAOIRSE_PERSON),
|
||||||
"wiki_summary": "Saoirse is a name",
|
):
|
||||||
"wiki_title": "Saoirse",
|
registry.research("Saoirse", auto_confirm=False, allow_network=True)
|
||||||
}
|
|
||||||
with patch("mempalace.entity_registry._wikipedia_lookup", return_value=mock_result):
|
|
||||||
registry.research("Saoirse", auto_confirm=False)
|
|
||||||
|
|
||||||
registry.confirm_research("Saoirse", entity_type="person", relationship="friend")
|
registry.confirm_research("Saoirse", entity_type="person", relationship="friend")
|
||||||
assert "Saoirse" in registry.people
|
assert "Saoirse" in registry.people
|
||||||
assert registry.people["Saoirse"]["source"] == "wiki"
|
assert registry.people["Saoirse"]["source"] == "wiki"
|
||||||
|
|
||||||
|
|
||||||
|
def test_wikipedia_404_returns_unknown(tmp_path):
|
||||||
|
"""A 404 from Wikipedia should return 'unknown', not assert 'person'."""
|
||||||
|
registry = EntityRegistry.load(config_dir=tmp_path)
|
||||||
|
registry.seed(mode="personal", people=[], projects=[])
|
||||||
|
|
||||||
|
mock_result = {
|
||||||
|
"inferred_type": "unknown",
|
||||||
|
"confidence": 0.3,
|
||||||
|
"wiki_summary": None,
|
||||||
|
"wiki_title": None,
|
||||||
|
"note": "not found in Wikipedia",
|
||||||
|
}
|
||||||
|
with patch("mempalace.entity_registry._wikipedia_lookup", return_value=mock_result):
|
||||||
|
result = registry.research("Zzxqy", auto_confirm=False, allow_network=True)
|
||||||
|
|
||||||
|
assert result["inferred_type"] == "unknown"
|
||||||
|
assert result["confidence"] < 0.5
|
||||||
|
|
||||||
|
|
||||||
# ── extract_people_from_query ───────────────────────────────────────────
|
# ── extract_people_from_query ───────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user