tests/benchmarks/test_mcp_bench.py

"""
MCP server tool performance benchmarks.

Validates production readiness findings:
  - Finding #3: tool_status() unbounded col.get(include=["metadatas"]) → OOM
  - Finding #7: _get_collection() re-instantiates PersistentClient every call
  - Finding #3 variants: tool_list_wings(), tool_get_taxonomy() same pattern

Calls MCP tool handler functions directly with monkeypatched _config.
"""

import time

import chromadb
import pytest

from tests.benchmarks.data_generator import PalaceDataGenerator
from tests.benchmarks.report import record_metric


# ── Helpers ──────────────────────────────────────────────────────────────


def _make_palace(tmp_path, n_drawers, scale="small"):
    """Create a palace with exactly n_drawers, return palace_path."""
    gen = PalaceDataGenerator(seed=42, scale=scale)
    palace_path = str(tmp_path / "palace")
    gen.populate_palace_directly(palace_path, n_drawers=n_drawers, include_needles=False)
    return palace_path


def _patch_mcp_config(monkeypatch, palace_path, tmp_path):
    """Monkeypatch mcp_server._config and _kg to point at test dirs."""
    from mempalace.config import MempalaceConfig
    from mempalace.knowledge_graph import KnowledgeGraph

    cfg = MempalaceConfig(config_dir=str(tmp_path / "cfg"))
    # Override palace_path directly on the object
    monkeypatch.setattr(cfg, "_file_config", {"palace_path": palace_path})

    import mempalace.mcp_server as mcp_mod

    kg = KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3"))
    monkeypatch.setattr(mcp_mod, "_config", cfg)
    monkeypatch.setattr(mcp_mod, "_get_kg", lambda: kg)


def _get_rss_mb():
    """Get current process RSS in MB."""
    try:
        import psutil

        return psutil.Process().memory_info().rss / (1024 * 1024)
    except ImportError:
        import resource

        # ru_maxrss is in KB on Linux, bytes on macOS
        import platform

        usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        if platform.system() == "Darwin":
            return usage / (1024 * 1024)
        return usage / 1024


# ── Tests ────────────────────────────────────────────────────────────────


@pytest.mark.benchmark
class TestToolStatusOOM:
    """Finding #3: tool_status loads ALL metadata into memory."""

    SIZES = [500, 1_000, 2_500, 5_000]

    @pytest.mark.parametrize("n_drawers", SIZES)
    def test_tool_status_rss_growth(self, n_drawers, tmp_path, monkeypatch):
        """Measure RSS growth from tool_status at different palace sizes."""
        palace_path = _make_palace(tmp_path, n_drawers)
        _patch_mcp_config(monkeypatch, palace_path, tmp_path)

        from mempalace.mcp_server import tool_status

        rss_before = _get_rss_mb()
        result = tool_status()
        rss_after = _get_rss_mb()

        rss_delta = rss_after - rss_before
        assert "error" not in result, f"tool_status failed: {result}"
        assert result["total_drawers"] == n_drawers

        record_metric("mcp_status", f"rss_delta_mb_at_{n_drawers}", round(rss_delta, 2))

    @pytest.mark.parametrize("n_drawers", SIZES)
    def test_tool_status_latency(self, n_drawers, tmp_path, monkeypatch):
        """Measure tool_status response time at different palace sizes."""
        palace_path = _make_palace(tmp_path, n_drawers)
        _patch_mcp_config(monkeypatch, palace_path, tmp_path)

        from mempalace.mcp_server import tool_status

        # Warm up
        tool_status()

        start = time.perf_counter()
        result = tool_status()
        elapsed_ms = (time.perf_counter() - start) * 1000

        assert "error" not in result
        record_metric("mcp_status", f"latency_ms_at_{n_drawers}", round(elapsed_ms, 1))


@pytest.mark.benchmark
class TestToolListWingsUnbounded:
    """Finding #3 variant: tool_list_wings also fetches ALL metadata."""

    @pytest.mark.parametrize("n_drawers", [500, 1_000, 2_500, 5_000])
    def test_list_wings_latency(self, n_drawers, tmp_path, monkeypatch):
        palace_path = _make_palace(tmp_path, n_drawers)
        _patch_mcp_config(monkeypatch, palace_path, tmp_path)

        from mempalace.mcp_server import tool_list_wings

        start = time.perf_counter()
        result = tool_list_wings()
        elapsed_ms = (time.perf_counter() - start) * 1000

        assert "wings" in result
        record_metric("mcp_list_wings", f"latency_ms_at_{n_drawers}", round(elapsed_ms, 1))


@pytest.mark.benchmark
class TestToolGetTaxonomyUnbounded:
    """Finding #3 variant: tool_get_taxonomy also fetches ALL metadata."""

    @pytest.mark.parametrize("n_drawers", [500, 1_000, 2_500, 5_000])
    def test_get_taxonomy_latency(self, n_drawers, tmp_path, monkeypatch):
        palace_path = _make_palace(tmp_path, n_drawers)
        _patch_mcp_config(monkeypatch, palace_path, tmp_path)

        from mempalace.mcp_server import tool_get_taxonomy

        start = time.perf_counter()
        result = tool_get_taxonomy()
        elapsed_ms = (time.perf_counter() - start) * 1000

        assert "taxonomy" in result
        record_metric("mcp_taxonomy", f"latency_ms_at_{n_drawers}", round(elapsed_ms, 1))


@pytest.mark.benchmark
class TestClientReinstantiation:
    """Finding #7: _get_collection() creates new PersistentClient every call."""

    def test_reinstantiation_overhead(self, tmp_path, monkeypatch):
        """Measure cost of 50 _get_collection() calls vs a cached client."""
        palace_path = _make_palace(tmp_path, 500)
        _patch_mcp_config(monkeypatch, palace_path, tmp_path)

        from mempalace.mcp_server import _get_collection

        n_calls = 50

        # Measure re-instantiation (current behavior)
        start = time.perf_counter()
        for _ in range(n_calls):
            col = _get_collection()
            assert col is not None
        uncached_ms = (time.perf_counter() - start) * 1000

        # Measure cached client (what it should be)
        client = chromadb.PersistentClient(path=palace_path)
        cached_col = client.get_collection("mempalace_drawers")
        start = time.perf_counter()
        for _ in range(n_calls):
            _ = cached_col.count()
        cached_ms = (time.perf_counter() - start) * 1000

        overhead_ratio = uncached_ms / max(cached_ms, 0.01)

        record_metric("client_reinstantiation", "uncached_total_ms", round(uncached_ms, 1))
        record_metric("client_reinstantiation", "cached_total_ms", round(cached_ms, 1))
        record_metric("client_reinstantiation", "overhead_ratio", round(overhead_ratio, 2))
        record_metric("client_reinstantiation", "n_calls", n_calls)


@pytest.mark.benchmark
class TestToolSearchLatency:
    """tool_search uses query() not get(), should scale better."""

    @pytest.mark.parametrize("n_drawers", [500, 1_000, 2_500, 5_000])
    def test_search_latency(self, n_drawers, tmp_path, monkeypatch):
        palace_path = _make_palace(tmp_path, n_drawers)
        _patch_mcp_config(monkeypatch, palace_path, tmp_path)

        from mempalace.mcp_server import tool_search

        queries = ["authentication middleware", "database migration", "error handling"]
        latencies = []
        for q in queries:
            start = time.perf_counter()
            result = tool_search(query=q, limit=5)
            elapsed_ms = (time.perf_counter() - start) * 1000
            latencies.append(elapsed_ms)
            assert "error" not in result

        avg_ms = sum(latencies) / len(latencies)
        record_metric("mcp_search", f"avg_latency_ms_at_{n_drawers}", round(avg_ms, 1))


@pytest.mark.benchmark
class TestDuplicateCheckCost:
    """tool_add_drawer calls tool_check_duplicate first — measure overhead."""

    @pytest.mark.parametrize("n_drawers", [500, 1_000, 2_500])
    def test_duplicate_check_latency(self, n_drawers, tmp_path, monkeypatch):
        palace_path = _make_palace(tmp_path, n_drawers)
        _patch_mcp_config(monkeypatch, palace_path, tmp_path)

        from mempalace.mcp_server import tool_check_duplicate

        test_content = "This is unique test content for duplicate checking benchmark."
        start = time.perf_counter()
        result = tool_check_duplicate(content=test_content)
        elapsed_ms = (time.perf_counter() - start) * 1000

        assert "error" not in result
        record_metric("mcp_duplicate_check", f"latency_ms_at_{n_drawers}", round(elapsed_ms, 1))
bench: add scale benchmark suite (94 tests) 2026-04-07 19:39:06 -03:00			`"""`
			`MCP server tool performance benchmarks.`

			`Validates production readiness findings:`
			`- Finding #3: tool_status() unbounded col.get(include=["metadatas"]) → OOM`
			`- Finding #7: _get_collection() re-instantiates PersistentClient every call`
			`- Finding #3 variants: tool_list_wings(), tool_get_taxonomy() same pattern`

			`Calls MCP tool handler functions directly with monkeypatched _config.`
			`"""`

			`import time`

			`import chromadb`
			`import pytest`

fix: resolve ruff lint errors in benchmark suite 2026-04-08 05:10:26 -03:00			`from tests.benchmarks.data_generator import PalaceDataGenerator`
bench: add scale benchmark suite (94 tests) 2026-04-07 19:39:06 -03:00			`from tests.benchmarks.report import record_metric`


			`# ── Helpers ──────────────────────────────────────────────────────────────`


			`def _make_palace(tmp_path, n_drawers, scale="small"):`
			`"""Create a palace with exactly n_drawers, return palace_path."""`
			`gen = PalaceDataGenerator(seed=42, scale=scale)`
			`palace_path = str(tmp_path / "palace")`
			`gen.populate_palace_directly(palace_path, n_drawers=n_drawers, include_needles=False)`
			`return palace_path`


			`def _patch_mcp_config(monkeypatch, palace_path, tmp_path):`
			`"""Monkeypatch mcp_server._config and _kg to point at test dirs."""`
			`from mempalace.config import MempalaceConfig`
			`from mempalace.knowledge_graph import KnowledgeGraph`

			`cfg = MempalaceConfig(config_dir=str(tmp_path / "cfg"))`
			`# Override palace_path directly on the object`
			`monkeypatch.setattr(cfg, "_file_config", {"palace_path": palace_path})`

			`import mempalace.mcp_server as mcp_mod`

test(mcp): migrate _kg monkeypatches to _get_kg (#1136 ) 2026-04-24 12:48:00 +05:00			`kg = KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3"))`
bench: add scale benchmark suite (94 tests) 2026-04-07 19:39:06 -03:00			`monkeypatch.setattr(mcp_mod, "_config", cfg)`
test(mcp): migrate _kg monkeypatches to _get_kg (#1136 ) 2026-04-24 12:48:00 +05:00			`monkeypatch.setattr(mcp_mod, "_get_kg", lambda: kg)`
bench: add scale benchmark suite (94 tests) 2026-04-07 19:39:06 -03:00

			`def _get_rss_mb():`
			`"""Get current process RSS in MB."""`
			`try:`
			`import psutil`

			`return psutil.Process().memory_info().rss / (1024 * 1024)`
			`except ImportError:`
			`import resource`

			`# ru_maxrss is in KB on Linux, bytes on macOS`
			`import platform`

			`usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss`
			`if platform.system() == "Darwin":`
			`return usage / (1024 * 1024)`
			`return usage / 1024`


			`# ── Tests ────────────────────────────────────────────────────────────────`


			`@pytest.mark.benchmark`
			`class TestToolStatusOOM:`
			`"""Finding #3: tool_status loads ALL metadata into memory."""`

			`SIZES = [500, 1_000, 2_500, 5_000]`

			`@pytest.mark.parametrize("n_drawers", SIZES)`
			`def test_tool_status_rss_growth(self, n_drawers, tmp_path, monkeypatch):`
			`"""Measure RSS growth from tool_status at different palace sizes."""`
			`palace_path = _make_palace(tmp_path, n_drawers)`
			`_patch_mcp_config(monkeypatch, palace_path, tmp_path)`

			`from mempalace.mcp_server import tool_status`

			`rss_before = _get_rss_mb()`
			`result = tool_status()`
			`rss_after = _get_rss_mb()`

			`rss_delta = rss_after - rss_before`
			`assert "error" not in result, f"tool_status failed: {result}"`
			`assert result["total_drawers"] == n_drawers`

			`record_metric("mcp_status", f"rss_delta_mb_at_{n_drawers}", round(rss_delta, 2))`

			`@pytest.mark.parametrize("n_drawers", SIZES)`
			`def test_tool_status_latency(self, n_drawers, tmp_path, monkeypatch):`
			`"""Measure tool_status response time at different palace sizes."""`
			`palace_path = _make_palace(tmp_path, n_drawers)`
			`_patch_mcp_config(monkeypatch, palace_path, tmp_path)`

			`from mempalace.mcp_server import tool_status`

			`# Warm up`
			`tool_status()`

			`start = time.perf_counter()`
			`result = tool_status()`
			`elapsed_ms = (time.perf_counter() - start) * 1000`

			`assert "error" not in result`
			`record_metric("mcp_status", f"latency_ms_at_{n_drawers}", round(elapsed_ms, 1))`


			`@pytest.mark.benchmark`
			`class TestToolListWingsUnbounded:`
			`"""Finding #3 variant: tool_list_wings also fetches ALL metadata."""`

			`@pytest.mark.parametrize("n_drawers", [500, 1_000, 2_500, 5_000])`
			`def test_list_wings_latency(self, n_drawers, tmp_path, monkeypatch):`
			`palace_path = _make_palace(tmp_path, n_drawers)`
			`_patch_mcp_config(monkeypatch, palace_path, tmp_path)`

			`from mempalace.mcp_server import tool_list_wings`

			`start = time.perf_counter()`
			`result = tool_list_wings()`
			`elapsed_ms = (time.perf_counter() - start) * 1000`

			`assert "wings" in result`
			`record_metric("mcp_list_wings", f"latency_ms_at_{n_drawers}", round(elapsed_ms, 1))`


			`@pytest.mark.benchmark`
			`class TestToolGetTaxonomyUnbounded:`
			`"""Finding #3 variant: tool_get_taxonomy also fetches ALL metadata."""`

			`@pytest.mark.parametrize("n_drawers", [500, 1_000, 2_500, 5_000])`
			`def test_get_taxonomy_latency(self, n_drawers, tmp_path, monkeypatch):`
			`palace_path = _make_palace(tmp_path, n_drawers)`
			`_patch_mcp_config(monkeypatch, palace_path, tmp_path)`

			`from mempalace.mcp_server import tool_get_taxonomy`

			`start = time.perf_counter()`
			`result = tool_get_taxonomy()`
			`elapsed_ms = (time.perf_counter() - start) * 1000`

			`assert "taxonomy" in result`
			`record_metric("mcp_taxonomy", f"latency_ms_at_{n_drawers}", round(elapsed_ms, 1))`


			`@pytest.mark.benchmark`
			`class TestClientReinstantiation:`
			`"""Finding #7: _get_collection() creates new PersistentClient every call."""`

			`def test_reinstantiation_overhead(self, tmp_path, monkeypatch):`
			`"""Measure cost of 50 _get_collection() calls vs a cached client."""`
			`palace_path = _make_palace(tmp_path, 500)`
			`_patch_mcp_config(monkeypatch, palace_path, tmp_path)`

			`from mempalace.mcp_server import _get_collection`

			`n_calls = 50`

			`# Measure re-instantiation (current behavior)`
			`start = time.perf_counter()`
			`for _ in range(n_calls):`
			`col = _get_collection()`
			`assert col is not None`
			`uncached_ms = (time.perf_counter() - start) * 1000`

			`# Measure cached client (what it should be)`
			`client = chromadb.PersistentClient(path=palace_path)`
			`cached_col = client.get_collection("mempalace_drawers")`
			`start = time.perf_counter()`
			`for _ in range(n_calls):`
			`_ = cached_col.count()`
			`cached_ms = (time.perf_counter() - start) * 1000`

			`overhead_ratio = uncached_ms / max(cached_ms, 0.01)`

			`record_metric("client_reinstantiation", "uncached_total_ms", round(uncached_ms, 1))`
			`record_metric("client_reinstantiation", "cached_total_ms", round(cached_ms, 1))`
			`record_metric("client_reinstantiation", "overhead_ratio", round(overhead_ratio, 2))`
			`record_metric("client_reinstantiation", "n_calls", n_calls)`


			`@pytest.mark.benchmark`
			`class TestToolSearchLatency:`
			`"""tool_search uses query() not get(), should scale better."""`

			`@pytest.mark.parametrize("n_drawers", [500, 1_000, 2_500, 5_000])`
			`def test_search_latency(self, n_drawers, tmp_path, monkeypatch):`
			`palace_path = _make_palace(tmp_path, n_drawers)`
			`_patch_mcp_config(monkeypatch, palace_path, tmp_path)`

			`from mempalace.mcp_server import tool_search`

			`queries = ["authentication middleware", "database migration", "error handling"]`
			`latencies = []`
			`for q in queries:`
			`start = time.perf_counter()`
			`result = tool_search(query=q, limit=5)`
			`elapsed_ms = (time.perf_counter() - start) * 1000`
			`latencies.append(elapsed_ms)`
			`assert "error" not in result`

			`avg_ms = sum(latencies) / len(latencies)`
			`record_metric("mcp_search", f"avg_latency_ms_at_{n_drawers}", round(avg_ms, 1))`


			`@pytest.mark.benchmark`
			`class TestDuplicateCheckCost:`
			`"""tool_add_drawer calls tool_check_duplicate first — measure overhead."""`

			`@pytest.mark.parametrize("n_drawers", [500, 1_000, 2_500])`
			`def test_duplicate_check_latency(self, n_drawers, tmp_path, monkeypatch):`
			`palace_path = _make_palace(tmp_path, n_drawers)`
			`_patch_mcp_config(monkeypatch, palace_path, tmp_path)`

			`from mempalace.mcp_server import tool_check_duplicate`

			`test_content = "This is unique test content for duplicate checking benchmark."`
			`start = time.perf_counter()`
			`result = tool_check_duplicate(content=test_content)`
			`elapsed_ms = (time.perf_counter() - start) * 1000`

			`assert "error" not in result`
			`record_metric("mcp_duplicate_check", f"latency_ms_at_{n_drawers}", round(elapsed_ms, 1))`