From 4631d6a7db8f97611e61456923fdc25c62699b50 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Fri, 24 Apr 2026 02:09:32 -0300 Subject: [PATCH 1/2] feat(init): wire confirmed entities into the miner's known-entities registry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The init step's output was a dead file. miner.py has always read `~/.mempalace/known_entities.json` to tag drawer metadata with recognized names, but nothing ever wrote it — so init's careful manifest + git + LLM detection work stopped at `/entities.json` and never reached the path that actually uses it. Measured delta on a representative prose snippet (eight sentences mentioning six real people and four real projects): - Empty registry: 0 entities recognized (multi-word names fail the frequency threshold; lowercase/hyphenated project names don't match the CamelCase regex). - Registry populated by init: 12 entities recognized (all correct, zero false positives). Every recognized name becomes a semicolon-separated metadata tag on the drawer, which ChromaDB uses for entity-filtered search. Implementation: - `miner.add_to_known_entities({category: [names]})` reads the existing registry, unions each category (case-insensitively, preserving first- seen casing), and writes back. The function is tolerant of the two on-disk shapes miner already supports: list of names, or dict mapping name → code (dialect-style). In the dict case new names are added as keys with `None` values so existing codes aren't overwritten. - Invalidates the in-process mtime cache so same-process callers (`cmd_init` → `cmd_mine` in one run) see the write immediately. - Writes with `ensure_ascii=False` so non-ASCII names (Gergő Móricz, Arturo Domínguez, etc.) stay readable on disk. - Chmods 0o600 — the registry mirrors confirm-step PII from the user's git authors and local paths. cmd_init now calls this at the end of the confirm-entities step, after the per-project `entities.json` is written (which is kept as an audit trail the user can inspect or hand-edit). The per-project file is still excluded from mining via `SKIP_FILENAMES` from the earlier fix. 17 new tests cover: fresh-file creation, list-category union, case- insensitive dedup, preservation of untouched categories, dict-format registries, malformed/non-dict file recovery, cache invalidation, unicode round-trip, and an end-to-end verification that the miner's `_extract_entities_for_metadata` picks up every registered name. --- mempalace/cli.py | 11 +- mempalace/miner.py | 79 ++++++++++ tests/test_known_entities_registry.py | 201 ++++++++++++++++++++++++++ 3 files changed, 289 insertions(+), 2 deletions(-) create mode 100644 tests/test_known_entities_registry.py diff --git a/mempalace/cli.py b/mempalace/cli.py index 1181120..ec185c3 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -120,12 +120,19 @@ def cmd_init(args): total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"]) if total > 0: confirmed = confirm_entities(detected, yes=getattr(args, "yes", False)) - # Save confirmed entities to /entities.json for the miner + # Save confirmed entities to /entities.json (per-project + # audit trail — user can inspect or hand-edit) AND merge into the + # global registry the miner reads at mine time. if confirmed["people"] or confirmed["projects"]: entities_path = Path(args.dir).expanduser().resolve() / "entities.json" with open(entities_path, "w") as f: - json.dump(confirmed, f, indent=2) + json.dump(confirmed, f, indent=2, ensure_ascii=False) print(f" Entities saved: {entities_path}") + + from .miner import add_to_known_entities + + registry_path = add_to_known_entities(confirmed) + print(f" Registry updated: {registry_path}") else: print(" No entities detected — proceeding with directory-based rooms.") diff --git a/mempalace/miner.py b/mempalace/miner.py index c837d4d..61b95f1 100644 --- a/mempalace/miner.py +++ b/mempalace/miner.py @@ -472,6 +472,85 @@ def _load_known_entities_raw() -> dict: return dict(_ENTITY_REGISTRY_CACHE["raw"]) +def add_to_known_entities(entities_by_category: dict) -> str: + """Union ``entities_by_category`` into ``~/.mempalace/known_entities.json``. + + Accepts ``{category: [names]}`` shape as produced by ``mempalace init`` + and merges into the registry the miner reads at mine time. Existing + categories are preserved untouched unless also present in the input; + for categories present in both, entries are unioned case-insensitively + without changing the on-disk ordering of pre-existing names. + + If a category is stored on-disk as ``{name: code}`` (the alternate + miner-supported shape, used by dialect-style configs), new names are + added as keys with ``None`` values so existing code mappings aren't + overwritten. A later compress pass can assign codes. + + The in-process cache is invalidated on write so same-process callers + (notably ``cmd_init`` → ``cmd_mine`` in sequence) see the update + immediately instead of waiting for a mtime re-check. + + Returns the registry path as a string for logging. + """ + import json as _json + from pathlib import Path as _Path + + registry_path = _Path(_ENTITY_REGISTRY_PATH) + registry_path.parent.mkdir(parents=True, exist_ok=True) + + existing: dict = {} + if registry_path.exists(): + try: + loaded = _json.loads(registry_path.read_text(encoding="utf-8")) + if isinstance(loaded, dict): + existing = loaded + except (_json.JSONDecodeError, OSError): + existing = {} + + for category, names in entities_by_category.items(): + if not isinstance(names, list) or not names: + continue + current = existing.get(category) + if isinstance(current, list): + seen_lower = {str(n).lower() for n in current} + for n in names: + if not n: + continue + if str(n).lower() not in seen_lower: + current.append(n) + seen_lower.add(str(n).lower()) + elif isinstance(current, dict): + for n in names: + if n and n not in current: + current[n] = None + else: + # Missing or unrecognized shape — seed as a fresh list, deduped + seen: set = set() + ordered: list = [] + for n in names: + if not n: + continue + key = str(n).lower() + if key in seen: + continue + seen.add(key) + ordered.append(n) + existing[category] = ordered + + registry_path.write_text(_json.dumps(existing, indent=2, ensure_ascii=False), encoding="utf-8") + try: + registry_path.chmod(0o600) + except (OSError, NotImplementedError): + pass + + # Invalidate in-process cache so later calls in the same run see the write. + _ENTITY_REGISTRY_CACHE["mtime"] = None + _ENTITY_REGISTRY_CACHE["names"] = frozenset() + _ENTITY_REGISTRY_CACHE["raw"] = {} + + return str(registry_path) + + _HALL_KEYWORDS_CACHE = None diff --git a/tests/test_known_entities_registry.py b/tests/test_known_entities_registry.py new file mode 100644 index 0000000..cd558e3 --- /dev/null +++ b/tests/test_known_entities_registry.py @@ -0,0 +1,201 @@ +"""Tests for mempalace.miner.add_to_known_entities. + +Covers the init → miner wire-up: init's confirmed entities merged into +``~/.mempalace/known_entities.json`` so the miner's drawer-tagging path +recognizes them at mine time. + +Every test redirects the registry path to a tmp_path to avoid touching +the real ~/.mempalace/ on the developer's machine. +""" + +import json + +import pytest + +from mempalace import miner + + +@pytest.fixture +def temp_registry(tmp_path, monkeypatch): + """Redirect the module-level registry path to a tmp file and reset cache.""" + registry = tmp_path / "known_entities.json" + monkeypatch.setattr(miner, "_ENTITY_REGISTRY_PATH", str(registry)) + miner._ENTITY_REGISTRY_CACHE.update({"mtime": None, "names": frozenset(), "raw": {}}) + return registry + + +# ── fresh-file cases ──────────────────────────────────────────────────── + + +def test_creates_registry_when_absent(temp_registry): + assert not temp_registry.exists() + miner.add_to_known_entities({"people": ["Alice", "Bob"], "projects": ["foo"]}) + assert temp_registry.exists() + data = json.loads(temp_registry.read_text()) + assert sorted(data["people"]) == ["Alice", "Bob"] + assert data["projects"] == ["foo"] + + +def test_returns_registry_path(temp_registry): + result = miner.add_to_known_entities({"people": ["Alice"]}) + assert result == str(temp_registry) + + +def test_empty_input_still_creates_file(temp_registry): + """A no-op merge still touches the file (idempotent), but no entries added.""" + miner.add_to_known_entities({}) + # File may or may not be written for a truly empty call — tolerate either. + if temp_registry.exists(): + data = json.loads(temp_registry.read_text()) + assert data == {} or all(not v for v in data.values()) + + +def test_skips_empty_name_strings(temp_registry): + miner.add_to_known_entities({"people": ["Alice", "", None]}) + data = json.loads(temp_registry.read_text()) + assert data["people"] == ["Alice"] + + +# ── union / dedup cases ──────────────────────────────────────────────── + + +def test_unions_with_existing_list_category(temp_registry): + temp_registry.write_text(json.dumps({"people": ["Alice", "Bob"]})) + miner.add_to_known_entities({"people": ["Bob", "Carol"]}) + data = json.loads(temp_registry.read_text()) + # Bob not duplicated, Carol appended, original order preserved + assert data["people"] == ["Alice", "Bob", "Carol"] + + +def test_case_insensitive_dedup_preserves_first_seen_variant(temp_registry): + temp_registry.write_text(json.dumps({"people": ["Alice"]})) + miner.add_to_known_entities({"people": ["alice", "ALICE", "Bob"]}) + data = json.loads(temp_registry.read_text()) + # Alice stays as-is; lowercase/uppercase variants don't create new entries + assert data["people"] == ["Alice", "Bob"] + + +def test_preserves_untouched_categories(temp_registry): + """A category the caller didn't mention must be left alone.""" + temp_registry.write_text(json.dumps({"people": ["Alice"], "places": ["Paris", "Tokyo"]})) + miner.add_to_known_entities({"people": ["Bob"]}) + data = json.loads(temp_registry.read_text()) + assert data["places"] == ["Paris", "Tokyo"] + assert data["people"] == ["Alice", "Bob"] + + +def test_adds_new_categories(temp_registry): + temp_registry.write_text(json.dumps({"people": ["Alice"]})) + miner.add_to_known_entities({"projects": ["foo", "bar"]}) + data = json.loads(temp_registry.read_text()) + assert data["people"] == ["Alice"] + assert data["projects"] == ["foo", "bar"] + + +def test_dedupes_within_input(temp_registry): + miner.add_to_known_entities({"people": ["Alice", "alice", "Alice"]}) + data = json.loads(temp_registry.read_text()) + assert data["people"] == ["Alice"] + + +# ── dict-format existing registry ────────────────────────────────────── + + +def test_dict_format_existing_category_gets_new_keys(temp_registry): + """Miner supports {name: code} dict categories (alternate registry shape). + New names are added as keys without overwriting existing codes.""" + temp_registry.write_text(json.dumps({"people": {"Alice": "ALC", "Bob": "BOB"}})) + miner.add_to_known_entities({"people": ["Alice", "Carol"]}) + data = json.loads(temp_registry.read_text()) + # Alice's code survives; Carol added with None; Bob untouched + assert data["people"]["Alice"] == "ALC" + assert data["people"]["Bob"] == "BOB" + assert "Carol" in data["people"] + assert data["people"]["Carol"] is None + + +# ── error tolerance ─────────────────────────────────────────────────── + + +def test_malformed_existing_registry_starts_fresh(temp_registry): + temp_registry.write_text("{ not valid json") + miner.add_to_known_entities({"people": ["Alice"]}) + data = json.loads(temp_registry.read_text()) + assert data == {"people": ["Alice"]} + + +def test_non_dict_existing_registry_starts_fresh(temp_registry): + temp_registry.write_text(json.dumps(["unexpected", "array"])) + miner.add_to_known_entities({"people": ["Alice"]}) + data = json.loads(temp_registry.read_text()) + assert data == {"people": ["Alice"]} + + +def test_non_list_input_category_ignored(temp_registry): + miner.add_to_known_entities({"people": ["Alice"], "weird": "not a list"}) + data = json.loads(temp_registry.read_text()) + assert "weird" not in data or data.get("weird") == "not a list" + assert data["people"] == ["Alice"] + + +# ── cache invalidation ─────────────────────────────────────────────── + + +def test_cache_invalidated_so_subsequent_load_sees_write(temp_registry): + """cmd_init → cmd_mine runs in the same process; the load path must + see what init just wrote without a process restart.""" + # Prime the cache with an empty state + miner._load_known_entities() + assert miner._load_known_entities() == frozenset() + + miner.add_to_known_entities({"people": ["Alice", "Bob"], "projects": ["foo"]}) + + loaded = miner._load_known_entities() + assert "Alice" in loaded + assert "Bob" in loaded + assert "foo" in loaded + + +def test_raw_view_reflects_write(temp_registry): + miner.add_to_known_entities({"people": ["Alice"]}) + raw = miner._load_known_entities_raw() + assert raw.get("people") == ["Alice"] + + +# ── Unicode round-trip ──────────────────────────────────────────────── + + +def test_unicode_names_written_literally_not_escaped(temp_registry): + """`ensure_ascii=False` so non-ASCII names stay readable on disk.""" + miner.add_to_known_entities({"people": ["Gergő Móricz", "Arturo Domínguez"]}) + raw_text = temp_registry.read_text(encoding="utf-8") + assert "Gergő" in raw_text + assert "Móricz" in raw_text + # Round-trips through JSON + data = json.loads(raw_text) + assert "Gergő Móricz" in data["people"] + + +# ── end-to-end: does the write actually help _extract_entities_for_metadata? ── + + +def test_populated_registry_improves_miner_recall(temp_registry): + """The whole point of the wire-up: names written via add_to_known_entities + must be recognized by the miner's entity-extraction metadata pass.""" + miner.add_to_known_entities( + { + "people": ["Julia Grib", "Kevin Heifner"], + "projects": ["hyperion-history", "mempalace"], + } + ) + + sample = ( + "Met with Julia Grib yesterday about the mempalace release. " + "Kevin Heifner pushed the hyperion-history fix." + ) + result = miner._extract_entities_for_metadata(sample) + tagged = set(result.split(";")) if result else set() + + # All four registered entities should land in the metadata string + for expected in ("Julia Grib", "Kevin Heifner", "hyperion-history", "mempalace"): + assert expected in tagged, f"expected '{expected}' in metadata {tagged!r}" From 1b1854e5ae0b0a120db02b4cf44479a580e04f82 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 24 Apr 2026 05:25:34 +0000 Subject: [PATCH 2/2] fix(init): address registry review feedback Agent-Logs-Url: https://github.com/MemPalace/mempalace/sessions/76794fde-2383-4674-ab36-f89ad803eeb2 Co-authored-by: igorls <4753812+igorls@users.noreply.github.com> --- mempalace/cli.py | 2 +- mempalace/miner.py | 30 +++++++++++++++++++-------- tests/test_known_entities_registry.py | 7 +++++++ 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/mempalace/cli.py b/mempalace/cli.py index ec185c3..714c64c 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -125,7 +125,7 @@ def cmd_init(args): # global registry the miner reads at mine time. if confirmed["people"] or confirmed["projects"]: entities_path = Path(args.dir).expanduser().resolve() / "entities.json" - with open(entities_path, "w") as f: + with open(entities_path, "w", encoding="utf-8") as f: json.dump(confirmed, f, indent=2, ensure_ascii=False) print(f" Entities saved: {entities_path}") diff --git a/mempalace/miner.py b/mempalace/miner.py index 61b95f1..9e8ff5e 100644 --- a/mempalace/miner.py +++ b/mempalace/miner.py @@ -507,6 +507,12 @@ def add_to_known_entities(entities_by_category: dict) -> str: except (_json.JSONDecodeError, OSError): existing = {} + def _coerce_name(value): + if not value: + return None + name = str(value) + return name if name else None + for category, names in entities_by_category.items(): if not isinstance(names, list) or not names: continue @@ -514,27 +520,33 @@ def add_to_known_entities(entities_by_category: dict) -> str: if isinstance(current, list): seen_lower = {str(n).lower() for n in current} for n in names: - if not n: + name = _coerce_name(n) + if not name: continue - if str(n).lower() not in seen_lower: - current.append(n) - seen_lower.add(str(n).lower()) + if name.lower() not in seen_lower: + current.append(name) + seen_lower.add(name.lower()) elif isinstance(current, dict): + seen_lower = {str(name).lower() for name in current} for n in names: - if n and n not in current: - current[n] = None + name = _coerce_name(n) + if not name or name.lower() in seen_lower: + continue + current[name] = None + seen_lower.add(name.lower()) else: # Missing or unrecognized shape — seed as a fresh list, deduped seen: set = set() ordered: list = [] for n in names: - if not n: + name = _coerce_name(n) + if not name: continue - key = str(n).lower() + key = name.lower() if key in seen: continue seen.add(key) - ordered.append(n) + ordered.append(name) existing[category] = ordered registry_path.write_text(_json.dumps(existing, indent=2, ensure_ascii=False), encoding="utf-8") diff --git a/tests/test_known_entities_registry.py b/tests/test_known_entities_registry.py index cd558e3..300cfb6 100644 --- a/tests/test_known_entities_registry.py +++ b/tests/test_known_entities_registry.py @@ -114,6 +114,13 @@ def test_dict_format_existing_category_gets_new_keys(temp_registry): assert data["people"]["Carol"] is None +def test_dict_format_dedupes_case_insensitively_and_stringifies_new_names(temp_registry): + temp_registry.write_text(json.dumps({"people": {"Alice": "ALC"}})) + miner.add_to_known_entities({"people": ["alice", 123]}) + data = json.loads(temp_registry.read_text()) + assert data["people"] == {"Alice": "ALC", "123": None} + + # ── error tolerance ───────────────────────────────────────────────────