Files
mempalace/tests/test_entity_registry.py
T
Arnold Wender 4f36145c2e fix(entity_registry): atomic write to prevent partial corruption on crash
EntityRegistry.save() called Path.write_text() directly, which truncates
the target file and then writes — so a crash mid-write (power loss, OOM,
filesystem-full mid-flush) leaves an empty or half-written
entity_registry.json. The whole people/projects map is lost; the system
falls back to an empty registry on next load.

Switch to the standard atomic-write pattern: serialize to a sibling
.tmp file in the same directory (so os.replace stays on one filesystem),
fsync, chmod 0o600, then os.replace over the target. The replace is
atomic on POSIX and Windows, so any crash leaves the previous registry
intact instead of a truncated file.

Tests cover: no leftover .tmp on success, and previous content preserved
when os.replace itself raises mid-save.
2026-05-04 11:08:14 +02:00

420 lines
15 KiB
Python

"""Tests for mempalace.entity_registry."""
from unittest.mock import patch
import pytest
from mempalace.entity_registry import (
COMMON_ENGLISH_WORDS,
PERSON_CONTEXT_PATTERNS,
EntityRegistry,
)
# Canned return value standing in for a Wikipedia lookup that classifies
# "Saoirse" as a person. Tests copy it with dict(...) before handing it to a
# mock so this shared template is never the object the registry receives.
_MOCK_SAOIRSE_PERSON = {
    "inferred_type": "person",
    "confidence": 0.80,
    "wiki_summary": "Saoirse is an Irish given name.",
    "wiki_title": "Saoirse",
}
# ── COMMON_ENGLISH_WORDS ────────────────────────────────────────────────
def test_common_english_words_has_expected_entries():
    """Spot-check that several ordinary words are present in COMMON_ENGLISH_WORDS."""
    for expected_word in ("ever", "grace", "will", "may", "monday"):
        assert expected_word in COMMON_ENGLISH_WORDS
def test_common_english_words_is_lowercase():
    """Every entry in COMMON_ENGLISH_WORDS must equal its own lowercased form."""
    for word in COMMON_ENGLISH_WORDS:
        lowered = word.lower()
        assert lowered == word, f"{word} should be lowercase"
# ── PERSON_CONTEXT_PATTERNS ─────────────────────────────────────────────
def test_person_context_patterns_is_nonempty():
    """PERSON_CONTEXT_PATTERNS must contain at least one entry."""
    pattern_count = len(PERSON_CONTEXT_PATTERNS)
    assert pattern_count > 0
# ── EntityRegistry creation and empty state ─────────────────────────────
def test_load_from_nonexistent_dir(tmp_path):
    """Loading from a fresh tmp_path (nothing saved yet) yields the empty defaults."""
    fresh = EntityRegistry.load(config_dir=tmp_path)

    # Defaults: no people, no projects, "personal" mode, nothing flagged.
    assert fresh.people == {}
    assert fresh.projects == []
    assert fresh.mode == "personal"
    assert fresh.ambiguous_flags == []
def test_save_and_load_roundtrip(tmp_path):
    """Data seeded through one registry is visible to a second load of the same dir."""
    alice = {"name": "Alice", "relationship": "colleague", "context": "work"}
    writer = EntityRegistry.load(config_dir=tmp_path)
    # NOTE(review): there is no explicit save() here, so this test presumably
    # relies on seed() persisting to disk — confirm against EntityRegistry.seed.
    writer.seed(mode="work", people=[alice], projects=["MemPalace"])

    # Re-load from the same directory with a brand-new instance.
    reader = EntityRegistry.load(config_dir=tmp_path)
    assert reader.mode == "work"
    assert "Alice" in reader.people
    assert "MemPalace" in reader.projects
def test_save_creates_file(tmp_path):
    """save() must produce entity_registry.json inside the config directory."""
    EntityRegistry.load(config_dir=tmp_path).save()
    registry_file = tmp_path / "entity_registry.json"
    assert registry_file.exists()
def test_save_is_atomic_does_not_leave_tmp(tmp_path):
    """A successful save() leaves no entity_registry.json.tmp* sidecar behind."""
    EntityRegistry.load(config_dir=tmp_path).save()

    # Collect anything in the config dir that still matches the temp-file pattern.
    leftover = [entry for entry in tmp_path.glob("entity_registry.json.tmp*")]
    assert not leftover, f"atomic write leaked tmp file(s): {leftover}"
def test_save_preserves_previous_on_serialization_failure(tmp_path, monkeypatch):
    """A failed save must leave the previously saved registry file untouched.

    Despite the name, the failure injected here is not a serialization error:
    ``os.replace`` is patched to raise ``OSError``, which models a failure
    after the temp file has been written but before the rename completes.
    The on-disk registry must still hold the content from the last good save.
    """
    import os as _os

    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(
        mode="personal",
        people=[{"name": "Alice", "relationship": "friend", "context": "personal"}],
        projects=[],
    )
    registry.save()
    target = tmp_path / "entity_registry.json"
    original = target.read_text(encoding="utf-8")

    def boom(*args, **kwargs):
        # Accept any call signature so the stub always fails with OSError,
        # never with a TypeError caused by how the caller passes arguments.
        raise OSError("simulated rename failure")

    # Scope the patch to the failing write only; monkeypatch.context() puts
    # the real os.replace back on exit, so no manual restore is required.
    with monkeypatch.context() as scoped:
        scoped.setattr(_os, "replace", boom)
        with pytest.raises(OSError):
            # NOTE(review): seed() may itself persist and raise first; the
            # explicit save() covers the case where it does not — confirm.
            registry.seed(
                mode="personal",
                people=[{"name": "Bob", "relationship": "friend", "context": "personal"}],
                projects=[],
            )
            registry.save()

    # The failed write must not have altered the previously saved file.
    assert target.read_text(encoding="utf-8") == original
# ── seed ────────────────────────────────────────────────────────────────
def test_seed_registers_people(tmp_path):
    """seed() stores each person under their name with onboarding metadata."""
    family = [
        {"name": "Riley", "relationship": "daughter", "context": "personal"},
        {"name": "Devon", "relationship": "friend", "context": "personal"},
    ]
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=family, projects=["MemPalace"])

    for name in ("Riley", "Devon"):
        assert name in registry.people

    # Seeded entries carry the relationship given plus fixed source/confidence.
    riley = registry.people["Riley"]
    assert riley["relationship"] == "daughter"
    assert riley["source"] == "onboarding"
    assert riley["confidence"] == 1.0
def test_seed_registers_projects(tmp_path):
    """seed() exposes the given project names, in order, via .projects."""
    project_names = ("Acme", "Widget")
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="work", people=[], projects=list(project_names))
    assert registry.projects == list(project_names)
def test_seed_sets_mode(tmp_path):
    """The mode passed to seed() is reflected by the registry's .mode."""
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="combo", people=[], projects=[])
    observed_mode = registry.mode
    assert observed_mode == "combo"
def test_seed_flags_ambiguous_names(tmp_path):
    """Names that are also common words get flagged (lowercased); others do not."""
    seeded_people = [
        {"name": "Grace", "relationship": "friend", "context": "personal"},
        {"name": "Riley", "relationship": "daughter", "context": "personal"},
    ]
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=seeded_people, projects=[])

    flags = registry.ambiguous_flags
    assert "grace" in flags
    # "riley" is not an everyday English word, so it must stay unflagged.
    assert "riley" not in flags
def test_seed_with_aliases(tmp_path):
    """An alias becomes its own people entry pointing at the canonical name."""
    maxwell = {"name": "Maxwell", "relationship": "friend", "context": "personal"}
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(
        mode="personal",
        people=[maxwell],
        projects=[],
        aliases={"Max": "Maxwell"},
    )

    people = registry.people
    assert "Maxwell" in people
    assert "Max" in people
    # The alias record links back to the full name through "canonical".
    assert people["Max"].get("canonical") == "Maxwell"
def test_seed_skips_empty_names(tmp_path):
    """A person record whose name is the empty string is not registered."""
    nameless = {"name": "", "relationship": "", "context": "personal"}
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=[nameless], projects=[])

    registered_count = len(registry.people)
    assert registered_count == 0
# ── lookup ──────────────────────────────────────────────────────────────
def test_lookup_known_person(tmp_path):
    """lookup() of a seeded person reports type, full confidence and the name."""
    riley = {"name": "Riley", "relationship": "daughter", "context": "personal"}
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=[riley], projects=[])

    hit = registry.lookup("Riley")
    expected_fields = (("type", "person"), ("confidence", 1.0), ("name", "Riley"))
    for key, expected in expected_fields:
        assert hit[key] == expected
def test_lookup_known_project(tmp_path):
    """lookup() of a seeded project name reports a fully confident project."""
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="work", people=[], projects=["MemPalace"])

    hit = registry.lookup("MemPalace")
    assert hit["type"] == "project"
    assert hit["confidence"] == 1.0
def test_lookup_unknown_word(tmp_path):
    """lookup() of a word that was never seeded reports unknown, zero confidence."""
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=[], projects=[])

    miss = registry.lookup("Xyzzy")
    assert miss["type"] == "unknown"
    assert miss["confidence"] == 0.0
def test_lookup_case_insensitive(tmp_path):
    """A person seeded as "Riley" is still found when queried as "riley"."""
    riley = {"name": "Riley", "relationship": "daughter", "context": "personal"}
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=[riley], projects=[])

    lowercase_hit = registry.lookup("riley")
    assert lowercase_hit["type"] == "person"
def test_lookup_alias(tmp_path):
    """Looking up a registered alias resolves to a person."""
    maxwell = {"name": "Maxwell", "relationship": "friend", "context": "personal"}
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(
        mode="personal",
        people=[maxwell],
        projects=[],
        aliases={"Max": "Maxwell"},
    )

    alias_hit = registry.lookup("Max")
    assert alias_hit["type"] == "person"
# ── disambiguation ──────────────────────────────────────────────────────
def test_lookup_ambiguous_word_as_person(tmp_path):
    """With person-like context, the ambiguous name "Grace" resolves to a person."""
    grace = {"name": "Grace", "relationship": "friend", "context": "personal"}
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=[grace], projects=[])

    sentence = "I went with Grace today"
    resolved = registry.lookup("Grace", context=sentence)
    assert resolved["type"] == "person"
def test_lookup_ambiguous_word_as_concept(tmp_path):
    """When the context uses "ever" as an ordinary word, lookup reports a concept."""
    ever = {"name": "Ever", "relationship": "friend", "context": "personal"}
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=[ever], projects=[])

    sentence = "have you ever tried this"
    resolved = registry.lookup("Ever", context=sentence)
    assert resolved["type"] == "concept"
# ── research — local-only by default ───────────────────────────────────
def test_research_local_only_by_default(tmp_path):
    """research() must not hit the Wikipedia lookup unless allow_network=True."""
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=[], projects=[])

    # Any call into the lookup helper blows up, failing the test immediately.
    forbid_network = patch(
        "mempalace.entity_registry._wikipedia_lookup",
        side_effect=AssertionError("network call should not happen"),
    )
    with forbid_network:
        outcome = registry.research("Saoirse")

    assert outcome["inferred_type"] == "unknown"
    assert outcome["confidence"] == 0.0
    assert outcome["word"] == "Saoirse"
    assert "network lookup disabled" in outcome.get("note", "")
def test_research_with_allow_network(tmp_path):
    """With allow_network=True, research() returns the (mocked) lookup's person type."""
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=[], projects=[])

    # Hand the mock a copy so the shared template dict is never passed along.
    fake_wikipedia = patch(
        "mempalace.entity_registry._wikipedia_lookup",
        return_value=dict(_MOCK_SAOIRSE_PERSON),
    )
    with fake_wikipedia:
        outcome = registry.research("Saoirse", auto_confirm=True, allow_network=True)

    assert outcome["inferred_type"] == "person"
def test_research_caches_result(tmp_path):
    """A networked research result is reused later without calling the lookup again."""
    lookup_target = "mempalace.entity_registry._wikipedia_lookup"
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=[], projects=[])

    # First pass: network allowed, lookup mocked to classify the word as a person.
    with patch(lookup_target, return_value=dict(_MOCK_SAOIRSE_PERSON)):
        first = registry.research("Saoirse", auto_confirm=True, allow_network=True)
    assert first["inferred_type"] == "person"

    # Second pass: any lookup call raises, so the answer must come from the cache.
    with patch(lookup_target, side_effect=AssertionError("should not be called")):
        second = registry.research("Saoirse")
    assert second["inferred_type"] == "person"
def test_research_local_only_not_cached(tmp_path):
    """A local-only research() of an uncached word adds nothing to wiki_cache."""
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=[], projects=[])

    # Default call: allow_network is not passed, so this stays local-only.
    registry.research("Xander")

    cached_words = registry._data.get("wiki_cache", {})
    assert "Xander" not in cached_words
def test_confirm_research_adds_to_people(tmp_path):
    """confirm_research() promotes a researched word into people with source "wiki"."""
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=[], projects=[])

    fake_wikipedia = patch(
        "mempalace.entity_registry._wikipedia_lookup",
        return_value=dict(_MOCK_SAOIRSE_PERSON),
    )
    # Both registry calls run under the mock so no real lookup can happen.
    with fake_wikipedia:
        registry.research("Saoirse", auto_confirm=False, allow_network=True)
        registry.confirm_research("Saoirse", entity_type="person", relationship="friend")

    people = registry.people
    assert "Saoirse" in people
    assert people["Saoirse"]["source"] == "wiki"
def test_wikipedia_404_returns_unknown(tmp_path):
    """A not-found lookup result surfaces as low-confidence "unknown", not "person"."""
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=[], projects=[])

    # Shape of the result the mocked lookup hands back for a missing article.
    not_found = {
        "inferred_type": "unknown",
        "confidence": 0.3,
        "wiki_summary": None,
        "wiki_title": None,
        "note": "not found in Wikipedia",
    }
    fake_wikipedia = patch(
        "mempalace.entity_registry._wikipedia_lookup", return_value=not_found
    )
    with fake_wikipedia:
        outcome = registry.research("Zzxqy", auto_confirm=False, allow_network=True)

    assert outcome["inferred_type"] == "unknown"
    assert outcome["confidence"] < 0.5
# ── extract_people_from_query ───────────────────────────────────────────
def test_extract_people_from_query(tmp_path):
    """Only known people actually mentioned in the query are extracted."""
    known_people = [
        {"name": "Riley", "relationship": "daughter", "context": "personal"},
        {"name": "Devon", "relationship": "friend", "context": "personal"},
    ]
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=known_people, projects=[])

    question = "What did Riley say about the weather?"
    mentioned = registry.extract_people_from_query(question)
    assert "Riley" in mentioned
    # Devon is registered but does not appear in the question.
    assert "Devon" not in mentioned
# ── extract_unknown_candidates ──────────────────────────────────────────
def test_extract_unknown_candidates(tmp_path):
    """A capitalized word absent from the registry is returned as a candidate."""
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=[], projects=[])

    candidates = registry.extract_unknown_candidates("Saoirse went to the store")
    assert "Saoirse" in candidates
def test_extract_unknown_candidates_skips_known(tmp_path):
    """A name already registered as a person is not offered as an unknown candidate."""
    riley = {"name": "Riley", "relationship": "daughter", "context": "personal"}
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=[riley], projects=[])

    candidates = registry.extract_unknown_candidates("Riley went to the store")
    assert "Riley" not in candidates
# ── summary ─────────────────────────────────────────────────────────────
def test_summary(tmp_path):
    """summary() mentions the mode, the seeded person and the seeded project."""
    riley = {"name": "Riley", "relationship": "daughter", "context": "personal"}
    registry = EntityRegistry.load(config_dir=tmp_path)
    registry.seed(mode="personal", people=[riley], projects=["MemPalace"])

    text = registry.summary()
    for fragment in ("personal", "Riley", "MemPalace"):
        assert fragment in text