Merge pull request #1224 from MemPalace/feat/privacy-warn-external-llm

feat(privacy): warn when LLM tier sends content to external API
This commit is contained in:
Igor Lins e Silva
2026-04-26 19:18:13 -03:00
committed by GitHub
4 changed files with 248 additions and 0 deletions
+13
View File
@@ -250,6 +250,19 @@ def cmd_init(args):
if ok: if ok:
llm_provider = candidate llm_provider = candidate
print(f" LLM enabled: {provider_name}/{provider_model}") print(f" LLM enabled: {provider_name}/{provider_model}")
# Privacy warning (issue #24): if the configured endpoint
# sends data off the user's machine/network, surface that
# before init proceeds. URL-based — Ollama on localhost,
# LM Studio on LAN, etc. won't trigger; Anthropic /
# cloud OpenAI-compat / any non-local endpoint will.
if candidate.is_external_service:
print(
f"{provider_name} is an EXTERNAL API. Your folder "
f"content will be sent to the provider during init. "
f"MemPalace does not control how the provider logs, "
f"retains, or uses your data. Pass --no-llm to keep "
f"init fully local."
)
else: else:
print( print(
f" No LLM provider reachable ({msg}). " f" No LLM provider reachable ({msg}). "
+70
View File
@@ -28,9 +28,65 @@ import os
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional from typing import Optional
from urllib.error import HTTPError, URLError from urllib.error import HTTPError, URLError
from urllib.parse import urlparse
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
# ── External-service heuristic (issue #24 — privacy warning support) ─────
# Used by ``LLMProvider.is_external_service`` to decide whether the
# provider's configured endpoint will send user content off the local
# machine/network. Single source of truth so all three providers share
# identical "local vs external" semantics.
_LOCALHOST_HOSTS = frozenset({"localhost", "127.0.0.1", "::1"})
def _endpoint_is_local(url: Optional[str]) -> bool:
"""Return True if ``url``'s hostname is on the user's machine or
private network.
Local includes:
- localhost, 127.0.0.1, ::1
- hostnames ending in .local (mDNS/Bonjour)
- IPv4 RFC1918: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
- IPv6 unique-local addresses (fc00::/7) — fc.../fd... prefixes
None / empty / unparseable URLs are treated as local (defensive default —
no endpoint means no external request can happen yet).
Anything else (including public IPs and FQDNs) is external.
"""
if not url:
return True
try:
host = (urlparse(url).hostname or "").lower()
except (ValueError, AttributeError):
return False
if not host:
return True
if host in _LOCALHOST_HOSTS:
return True
if host.endswith(".local"):
return True
if host.startswith("10."):
return True
if host.startswith("192.168."):
return True
if host.startswith("172."):
# 172.16.0.0 - 172.31.255.255
parts = host.split(".")
if len(parts) >= 2:
try:
if 16 <= int(parts[1]) <= 31:
return True
except ValueError:
pass
# IPv6 unique-local addresses fc00::/7 — match leading hex chars
if host.startswith("fc") or host.startswith("fd"):
return True
return False
class LLMError(RuntimeError): class LLMError(RuntimeError):
"""Raised for any provider failure — transport, parse, auth, missing model.""" """Raised for any provider failure — transport, parse, auth, missing model."""
@@ -68,6 +124,20 @@ class LLMProvider:
"""Return ``(ok, message)``. Fast probe that the provider is reachable.""" """Return ``(ok, message)``. Fast probe that the provider is reachable."""
raise NotImplementedError raise NotImplementedError
@property
def is_external_service(self) -> bool:
    """Whether using this provider sends user content off the local
    machine/network.

    ``mempalace init`` reads this to decide if a privacy warning should
    be printed before first use (issue #24). The answer is derived
    purely from ``self.endpoint`` via the module's URL heuristic; the
    provider class itself plays no part in the decision.

    Subclasses that resolve their endpoint dynamically may override
    this property if the default heuristic does not fit.
    """
    stays_local = _endpoint_is_local(self.endpoint)
    return not stays_local
def _http_post_json(url: str, body: dict, headers: dict, timeout: int) -> dict: def _http_post_json(url: str, body: dict, headers: dict, timeout: int) -> dict:
"""POST JSON and return the parsed response. Raises LLMError on any failure.""" """POST JSON and return the parsed response. Raises LLMError on any failure."""
+112
View File
@@ -1629,3 +1629,115 @@ def test_merge_tier_fields_no_llm_provider_returns_heuristic_only():
assert res["agent_persona_names"] == [] assert res["agent_persona_names"] == []
assert res["user_name"] is None assert res["user_name"] is None
assert res["primary_platform"] is None assert res["primary_platform"] is None
# ─────────────────────────────────────────────────────────────────────────
# External-API privacy warning (issue #24).
#
# When mempalace init resolves an LLM provider whose endpoint will send
# user content off the local machine/network, init MUST print a clear
# warning naming the provider, stating that MemPalace doesn't control
# how the provider logs/retains/uses the data, and pointing at --no-llm.
# Local providers (Ollama on localhost, LM Studio on LAN, etc.) MUST NOT
# trigger the warning.
# ─────────────────────────────────────────────────────────────────────────
def test_init_prints_privacy_warning_when_provider_is_external(
    ai_dialogue_corpus: Path, tmp_path: Path, capsys
):
    """cmd_init must print the privacy warning for an external provider.

    The provider is a mock that reports itself available and external.
    The captured stdout has to contain the 'EXTERNAL API' marker, the
    ``--no-llm`` opt-out hint, and at least one phrase about the
    provider's handling of data being outside MemPalace's control.
    """
    from mempalace.cli import cmd_init

    palace_dir = tmp_path / "palace"
    init_args = _init_args(ai_dialogue_corpus)  # default = LLM ON

    external_provider = MagicMock()
    external_provider.check_available.return_value = (True, "ok")
    external_provider.is_external_service = True
    external_provider.classify.return_value = MagicMock(text='{"classifications": []}')

    with (
        patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace_dir)),
        patch("mempalace.cli.get_provider", return_value=external_provider),
        patch("mempalace.cli._maybe_run_mine_after_init"),
        patch("mempalace.room_detector_local.detect_rooms_local"),
    ):
        cmd_init(init_args)

    out = capsys.readouterr().out
    lowered = out.lower()

    assert "EXTERNAL API" in out, (
        "Privacy warning must mention 'EXTERNAL API' when provider is external. "
        f"Got: {out!r}"
    )
    assert "--no-llm" in out, (
        f"Privacy warning must point users at --no-llm to opt out. Got: {out!r}"
    )

    # Any one of these phrases is accepted as the "not our responsibility"
    # disclaimer about downstream provider behavior.
    disclaimer_phrases = ("does not control", "not responsible", "logs", "retains")
    assert any(phrase in lowered for phrase in disclaimer_phrases), (
        f"Privacy warning must clarify MemPalace doesn't control how the "
        f"provider handles the data. Got: {out!r}"
    )
def test_init_no_privacy_warning_when_provider_is_local(
    ai_dialogue_corpus: Path, tmp_path: Path, capsys
):
    """cmd_init must stay silent about privacy for a LOCAL provider.

    The mocked provider is available but reports
    ``is_external_service = False`` (think Ollama on localhost or
    LM Studio on the LAN), so the 'EXTERNAL API' marker must not appear
    in the captured output.
    """
    from mempalace.cli import cmd_init

    palace_dir = tmp_path / "palace"
    init_args = _init_args(ai_dialogue_corpus)  # default = LLM ON

    local_provider = MagicMock()
    local_provider.check_available.return_value = (True, "ok")
    local_provider.is_external_service = False  # Local provider, so no warning
    local_provider.classify.return_value = MagicMock(text='{"classifications": []}')

    with (
        patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace_dir)),
        patch("mempalace.cli.get_provider", return_value=local_provider),
        patch("mempalace.cli._maybe_run_mine_after_init"),
        patch("mempalace.room_detector_local.detect_rooms_local"),
    ):
        cmd_init(init_args)

    out = capsys.readouterr().out
    warning_present = "EXTERNAL API" in out
    assert not warning_present, (
        "Privacy warning fired for a LOCAL provider — should not have. "
        f"Got: {out!r}"
    )
def test_init_no_privacy_warning_with_no_llm_flag(ai_dialogue_corpus: Path, tmp_path: Path, capsys):
    """With --no-llm, no provider is acquired at all, so the privacy
    warning has nothing to fire on. Output must not contain it.

    Two things are checked: ``get_provider`` is never called, and the
    'EXTERNAL API' marker is absent from the captured stdout.
    """
    from mempalace.cli import cmd_init

    palace = tmp_path / "palace"
    args = _init_args(ai_dialogue_corpus, no_llm=True)

    with (
        patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
        patch("mempalace.cli.get_provider") as mock_get,
        patch("mempalace.cli._maybe_run_mine_after_init"),
        patch("mempalace.room_detector_local.detect_rooms_local"),
    ):
        cmd_init(args)
        # A real assert statement, so the explanatory message is actually
        # reported on failure. Writing `mock.assert_not_called(), "msg"`
        # only builds a throwaway tuple and drops the message.
        assert (
            not mock_get.called
        ), "--no-llm must short-circuit before provider acquisition"

    out = capsys.readouterr().out
    assert (
        "EXTERNAL API" not in out
    ), f"Privacy warning fired on --no-llm path — should not have. Got: {out!r}"
+53
View File
@@ -325,3 +325,56 @@ def test_anthropic_no_key_raises_on_classify(monkeypatch):
p = AnthropicProvider(model="claude-haiku") p = AnthropicProvider(model="claude-haiku")
with pytest.raises(LLMError, match="requires ANTHROPIC_API_KEY"): with pytest.raises(LLMError, match="requires ANTHROPIC_API_KEY"):
p.classify("s", "u") p.classify("s", "u")
# ── is_external_service property (issue #24 — privacy warning support) ──
#
# `is_external_service` is True when this provider's endpoint sends data
# off the user's machine/network. Used by mempalace init to print a
# privacy warning before first run when an external API will receive
# folder content. URL-based heuristic: localhost, 127.x, ::1, .local,
# RFC1918 (10/8, 172.16/12, 192.168/16), and IPv6 ULA (fc00::/7) are
# all treated as local. Everything else is treated as external.
def test_ollama_provider_default_endpoint_is_local():
    """An OllamaProvider built without an explicit endpoint must be
    classified as local, so the typical user running Ollama on their
    own machine never sees the privacy warning. (The default endpoint
    is expected to be http://localhost:11434.)"""
    provider = OllamaProvider(model="gemma4:e4b")
    assert provider.is_external_service is False, (
        "Default OllamaProvider endpoint must be local; got "
        f"is_external_service={provider.is_external_service} "
        f"for endpoint={provider.endpoint}"
    )
def test_openai_compat_provider_localhost_endpoint_is_local():
    """OpenAI-compatible servers bound to localhost, 127.0.0.1, or a LAN
    address (LM Studio, llama.cpp server, vLLM, ...) must NOT be flagged
    as external."""
    via_localhost = OpenAICompatProvider(model="any", endpoint="http://localhost:1234")
    assert via_localhost.is_external_service is False

    via_loopback_ip = OpenAICompatProvider(model="any", endpoint="http://127.0.0.1:8000")
    assert via_loopback_ip.is_external_service is False

    via_lan = OpenAICompatProvider(model="any", endpoint="http://192.168.1.50:1234")
    assert via_lan.is_external_service is False, "LAN (RFC1918) endpoints must be local"
def test_openai_compat_provider_cloud_endpoint_is_external():
    """Pointing the openai-compat provider at a hosted, non-local
    endpoint (here OpenAI's own API) MUST be classified as external so
    the privacy warning fires."""
    provider = OpenAICompatProvider(model="gpt-4o", endpoint="https://api.openai.com")
    assert provider.is_external_service is True, (
        "https://api.openai.com must be classified external; got "
        f"is_external_service={provider.is_external_service}"
    )
def test_anthropic_provider_default_endpoint_is_external():
    """An AnthropicProvider built without an explicit endpoint must be
    classified as external, so the privacy warning fires by default for
    users who pass --llm-provider anthropic. (The default endpoint is
    expected to be https://api.anthropic.com.)"""
    provider = AnthropicProvider(model="claude-haiku-4-5", api_key="sk-test")
    assert provider.is_external_service is True, (
        "Default AnthropicProvider endpoint must be external; got "
        f"is_external_service={provider.is_external_service} "
        f"for endpoint={provider.endpoint}"
    )