benchmarks: add --llm-backend ollama for non-Anthropic rerank

The rerank pipeline was hardcoded to Anthropic's /v1/messages. Add a backend flag so the same code path can be exercised with any OpenAI-compatible endpoint — local Ollama, Ollama Cloud, or any gateway that speaks /v1/chat/completions. Enables independent verification of the "100% with Haiku rerank" claim by running the full benchmark with a different LLM family (e.g. minimax-m2.7:cloud) and zero Anthropic dependency. Both longmemeval_bench.py and locomo_bench.py: - llm_rerank*() gain backend= / base_url= kwargs - CLI: --llm-backend {anthropic,ollama}, --llm-base-url - API key required only when backend=anthropic (diary/palace modes still require it) - Parse last integer in response (reasoning models emit multi-int output) - Fallback to message.reasoning when content is empty - Raise max_tokens to 1024 for reasoning models
2026-04-14 21:20:14 -03:00
parent 4aa7e1eebd
commit 8df7b9bf2c
3 changed files with 169 additions and 66 deletions
@@ -510,11 +510,20 @@ def palace_assign_rooms(sessions, sample_id, api_key, cache, model="claude-haiku


 def llm_rerank_locomo(
-    question, retrieved_ids, retrieved_docs, api_key, top_k=10, model="claude-sonnet-4-6"
+    question,
+    retrieved_ids,
+    retrieved_docs,
+    api_key,
+    top_k=10,
+    model="claude-sonnet-4-6",
+    backend="anthropic",
+    base_url="",
 ):
    """
    Ask LLM to pick the single most relevant document for this question.
    Returns reordered retrieved_ids with the best candidate first.
+
+    Supports backend="anthropic" (default) or "ollama" (OpenAI-compat endpoint).
    """
    candidates = retrieved_ids[:top_k]
    candidate_docs = retrieved_docs[:top_k]
@@ -522,7 +531,6 @@ def llm_rerank_locomo(
    if len(candidates) <= 1:
        return retrieved_ids

-    # Build numbered list of candidates
    lines = []
    for i, (cid, doc) in enumerate(zip(candidates, candidate_docs), 1):
        snippet = doc[:300].replace("\n", " ")
@@ -534,6 +542,21 @@ def llm_rerank_locomo(
        f"Reply with just the number (1-{len(candidates)}).\n\n" + "\n".join(lines)
    )

+    if backend == "ollama":
+        url = (base_url or "http://localhost:11434").rstrip("/") + "/v1/chat/completions"
+        payload = json.dumps(
+            {
+                "model": model,
+                "messages": [{"role": "user", "content": prompt}],
+                "max_tokens": 1024,
+                "temperature": 0.0,
+            }
+        ).encode("utf-8")
+        headers = {"content-type": "application/json"}
+        if api_key:
+            headers["authorization"] = f"Bearer {api_key}"
+    else:
+        url = "https://api.anthropic.com/v1/messages"
        payload = json.dumps(
            {
                "model": model,
@@ -541,28 +564,29 @@ def llm_rerank_locomo(
                "messages": [{"role": "user", "content": prompt}],
            }
        ).encode("utf-8")
-
-    req = urllib.request.Request(
-        "https://api.anthropic.com/v1/messages",
-        data=payload,
        headers = {
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
-        },
-        method="POST",
-    )
+        }
+
+    req = urllib.request.Request(url, data=payload, headers=headers, method="POST")

    import socket as _socket

    for _attempt in range(3):
        try:
-            with urllib.request.urlopen(req, timeout=30) as resp:
+            with urllib.request.urlopen(req, timeout=120 if backend == "ollama" else 30) as resp:
                result = json.loads(resp.read())
+            if backend == "ollama":
+                msg = result["choices"][0]["message"]
+                raw = (msg.get("content") or "").strip() or (msg.get("reasoning") or "").strip()
+            else:
                raw = result["content"][0]["text"].strip()
-            m = re.search(r"\b(\d+)\b", raw)
+            # Take LAST integer — reasoning models often count candidates first
+            m = re.search(r"\b(\d+)\b", raw[::-1])
            if m:
-                pick = int(m.group(1))
+                pick = int(m.group(1)[::-1])
                if 1 <= pick <= len(candidates):
                    chosen_id = candidates[pick - 1]
                    reordered = [chosen_id] + [cid for cid in retrieved_ids if cid != chosen_id]
@@ -608,6 +632,8 @@ def run_benchmark(
    palace_cache_file=None,
    palace_model="claude-haiku-4-5-20251001",
    embed_model="default",
+    llm_backend="anthropic",
+    llm_base_url="",
 ):
    """Run LoCoMo retrieval benchmark."""
    with open(data_file) as f:
@@ -619,8 +645,12 @@ def run_benchmark(
    api_key = ""
    if llm_rerank_enabled or mode == "palace":
        api_key = _load_api_key(llm_key)
-        if not api_key:
-            print(f"ERROR: --mode {mode} requires an API key (--llm-key or ANTHROPIC_API_KEY).")
+        # Ollama backend doesn't require an Anthropic key. Palace mode still does
+        # (it uses Anthropic for room-assignment indexing) — so only relax the
+        # requirement when rerank is the ONLY llm use and backend is ollama.
+        needs_key = mode == "palace" or (llm_rerank_enabled and llm_backend == "anthropic")
+        if needs_key and not api_key:
+            print(f"ERROR: --mode {mode} / --llm-rerank (anthropic) requires an API key.")
            sys.exit(1)

    # Palace mode: load or create room assignment cache
@@ -888,6 +918,8 @@ def run_benchmark(
                        api_key,
                        top_k=rerank_pool,
                        model=llm_model,
+                        backend=llm_backend,
+                        base_url=llm_base_url,
                    )

                # Compute recall
@@ -1013,6 +1045,18 @@ if __name__ == "__main__":
        help="Model for LLM rerank (default: claude-sonnet-4-6)",
    )
    parser.add_argument("--llm-key", default="", help="API key (or set ANTHROPIC_API_KEY env var)")
+    parser.add_argument(
+        "--llm-backend",
+        choices=["anthropic", "ollama"],
+        default="anthropic",
+        help="Which API for --llm-rerank. 'anthropic' (default) or 'ollama' "
+        "(OpenAI-compat /v1/chat/completions — works for local + Ollama Cloud).",
+    )
+    parser.add_argument(
+        "--llm-base-url",
+        default="",
+        help="Override base URL for --llm-backend ollama. Default: http://localhost:11434.",
+    )
    parser.add_argument(
        "--hybrid-weight",
        type=float,
@@ -1049,4 +1093,6 @@ if __name__ == "__main__":
        palace_cache_file=args.palace_cache,
        palace_model=args.palace_model,
        embed_model=args.embed_model,
+        llm_backend=args.llm_backend,
+        llm_base_url=args.llm_base_url,
    )
@@ -2763,7 +2763,15 @@ def build_palace_and_retrieve_diary(


 def llm_rerank(
-    question, rankings, corpus, corpus_ids, api_key, top_k=10, model="claude-haiku-4-5-20251001"
+    question,
+    rankings,
+    corpus,
+    corpus_ids,
+    api_key,
+    top_k=10,
+    model="claude-haiku-4-5-20251001",
+    backend="anthropic",
+    base_url="",
 ):
    """
    Use an LLM to re-rank the top-k retrieved sessions.
@@ -2772,19 +2780,22 @@ def llm_rerank(
    which single session is most relevant to the question. That session
    is promoted to rank 1; the rest stay in their existing order.

-    This closes the gap for "preference" and jargon-dense "assistant"
-    failures where the right session is in top-10 semantically but not
-    top-5 — because the semantic gap (battery life ↔ phone hardware) is
-    too large for embeddings to bridge.
+    Supports two backends:
+      - "anthropic": hits https://api.anthropic.com/v1/messages with x-api-key.
+      - "ollama":    hits {base_url}/v1/chat/completions (OpenAI-compat) —
+                     works for local Ollama (default http://localhost:11434)
+                     and Ollama Cloud (:cloud model tags).

    Args:
        question:   The benchmark question string
        rankings:   Current ranked list of corpus indices (from any mode)
        corpus:     List of document strings
        corpus_ids: List of corpus IDs (parallel to corpus)
-        api_key:     Anthropic API key string
+        api_key:    Anthropic API key (only required for backend="anthropic")
        top_k:      How many top sessions to send to LLM (default: 10)
-        model:       Claude model ID for reranking (default: haiku)
+        model:      Model id (Claude model for anthropic, e.g. "minimax-m2.7:cloud" for ollama)
+        backend:    "anthropic" or "ollama"
+        base_url:   Override base URL (ollama default: http://localhost:11434)

    Returns:
        Reordered rankings list with LLM's best pick promoted to rank 1.
@@ -2796,7 +2807,6 @@ def llm_rerank(
    if not candidates:
        return rankings

-    # Format sessions for the prompt — first 500 chars each, labelled 1..N
    session_blocks = []
    for rank, idx in enumerate(candidates):
        text = corpus[idx][:500].replace("\n", " ").strip()
@@ -2813,6 +2823,21 @@ def llm_rerank(
        f"Most relevant session number:"
    )

+    if backend == "ollama":
+        url = (base_url or "http://localhost:11434").rstrip("/") + "/v1/chat/completions"
+        payload = json.dumps(
+            {
+                "model": model,
+                "messages": [{"role": "user", "content": prompt}],
+                "max_tokens": 1024,
+                "temperature": 0.0,
+            }
+        ).encode("utf-8")
+        headers = {"content-type": "application/json"}
+        if api_key:
+            headers["authorization"] = f"Bearer {api_key}"
+    else:
+        url = "https://api.anthropic.com/v1/messages"
        payload = json.dumps(
            {
                "model": model,
@@ -2820,42 +2845,44 @@ def llm_rerank(
                "messages": [{"role": "user", "content": prompt}],
            }
        ).encode("utf-8")
-
-    req = urllib.request.Request(
-        "https://api.anthropic.com/v1/messages",
-        data=payload,
        headers = {
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
-        },
-        method="POST",
-    )
+        }
+
+    req = urllib.request.Request(url, data=payload, headers=headers, method="POST")

    import socket as _socket

    for _attempt in range(3):
        try:
-            with urllib.request.urlopen(req, timeout=20) as resp:
+            with urllib.request.urlopen(req, timeout=120 if backend == "ollama" else 20) as resp:
                result = json.loads(resp.read())
+            if backend == "ollama":
+                msg = result["choices"][0]["message"]
+                # Reasoning models (e.g. minimax-m2.7) may emit final answer in "content"
+                # or embed it in "reasoning". Try content first, fall back to reasoning.
+                raw = (msg.get("content") or "").strip()
+                if not raw:
+                    raw = (msg.get("reasoning") or "").strip()
+            else:
                raw = result["content"][0]["text"].strip()
-            # Parse just the first integer from Haiku's response
-            m = re.search(r"\b(\d+)\b", raw)
+            m = re.search(r"\b(\d+)\b", raw[::-1])  # take LAST integer (rerank models often reason first)
            if m:
-                pick = int(m.group(1))
+                pick = int(m.group(1)[::-1])
                if 1 <= pick <= len(candidates):
                    chosen_idx = candidates[pick - 1]
                    reordered = [chosen_idx] + [i for i in rankings if i != chosen_idx]
                    return reordered
-            break  # Got a response, even if unparseable — don't retry
+            break
        except (_socket.timeout, TimeoutError):
            if _attempt < 2:
                import time as _time

-                _time.sleep(3)  # brief pause then retry
-            # else fall through to return rankings
+                _time.sleep(3)
        except (urllib.error.URLError, KeyError, ValueError, IndexError, OSError):
-            break  # Non-timeout error — fall back immediately
+            break

    return rankings

@@ -2919,6 +2946,8 @@ def run_benchmark(
    skip_precompute=False,
    split_file=None,
    split_subset=None,
+    llm_backend="anthropic",
+    llm_base_url="",
 ):
    """Run the full benchmark.

@@ -2947,10 +2976,14 @@ def run_benchmark(
    api_key = ""
    if llm_rerank_enabled or mode == "diary":
        api_key = _load_api_key(llm_key)
-        if not api_key:
+        # Ollama backend doesn't require an Anthropic API key; a local/cloud Ollama
+        # daemon with the requested model pulled is enough. Diary mode is always anthropic.
+        needs_key = (llm_backend == "anthropic") or (mode == "diary")
+        if needs_key and not api_key:
            print(
-                "ERROR: --llm-rerank / --mode diary requires an API key. "
-                "Set ANTHROPIC_API_KEY or use --llm-key."
+                "ERROR: --llm-rerank (anthropic backend) / --mode diary requires an API key. "
+                "Set ANTHROPIC_API_KEY or use --llm-key. For ollama backend, pass "
+                "--llm-backend ollama."
            )
            sys.exit(1)

@@ -3100,7 +3133,15 @@ def run_benchmark(
        if llm_rerank_enabled:
            rerank_pool = 20 if mode in ("hybrid_v3", "hybrid_v4", "palace") else 10
            rankings = llm_rerank(
-                question, rankings, corpus, corpus_ids, api_key, top_k=rerank_pool, model=llm_model
+                question,
+                rankings,
+                corpus,
+                corpus_ids,
+                api_key,
+                top_k=rerank_pool,
+                model=llm_model,
+                backend=llm_backend,
+                base_url=llm_base_url,
            )

        # Evaluate at session level
@@ -3276,7 +3317,21 @@ if __name__ == "__main__":
        default="claude-haiku-4-5-20251001",
        help="Model for LLM re-ranking and diary ingest "
        "(default: claude-haiku-4-5-20251001). "
-        "Use 'claude-sonnet-4-6' for Sonnet comparison.",
+        "Use 'claude-sonnet-4-6' for Sonnet comparison. "
+        "With --llm-backend ollama, use an Ollama model tag like 'minimax-m2.7:cloud'.",
+    )
+    parser.add_argument(
+        "--llm-backend",
+        choices=["anthropic", "ollama"],
+        default="anthropic",
+        help="Which API to hit for --llm-rerank. 'anthropic' (default) uses Anthropic's "
+        "/v1/messages endpoint. 'ollama' uses Ollama's OpenAI-compatible "
+        "/v1/chat/completions endpoint (works with local Ollama and Ollama Cloud).",
+    )
+    parser.add_argument(
+        "--llm-base-url",
+        default="",
+        help="Override base URL for --llm-backend ollama. Defaults to http://localhost:11434.",
    )
    parser.add_argument(
        "--diary-cache",
@@ -3380,4 +3435,6 @@ if __name__ == "__main__":
        args.skip_precompute,
        split_file=args.split_file,
        split_subset=split_subset,
+        llm_backend=args.llm_backend,
+        llm_base_url=args.llm_base_url,
    )
@@ -1239,7 +1239,7 @@ dev = [
 [package.metadata]
 requires-dist = [
    { name = "autocorrect", marker = "extra == 'spellcheck'", specifier = ">=2.0" },
-    { name = "chromadb", specifier = ">=0.5.0,<0.7" },
+    { name = "chromadb", specifier = ">=0.5.0" },
    { name = "psutil", marker = "extra == 'dev'", specifier = ">=5.9" },
    { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0" },
    { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0" },