perf(mining): batch per-chunk upserts and add optional GPU acceleration

The miner upserted one drawer per ChromaDB call, paying tokenizer +
ONNX session setup per chunk. The embedding device was CPU-only because
no EmbeddingFunction was ever wired through the backend.

Two changes, each a speedup in its own right; stacked they give ~10x
end-to-end on a medium corpus (20 files, 568 drawers):

1. Batched upsert. `process_file` and `_file_chunks_locked` now collect
   all chunks of a file into a single `collection.upsert(...)` so the
   embedding model runs one forward pass per file instead of N.

2. Hardware-accelerated embedding function. New `mempalace/embedding.py`
   wraps `ONNXMiniLM_L6_V2` with configurable `preferred_providers`.
   `MEMPALACE_EMBEDDING_DEVICE` (or `embedding_device` in config.json)
   selects auto / cpu / cuda / coreml / dml. Unavailable accelerators
   log a warning and fall back to CPU.

   The factory subclasses `ONNXMiniLM_L6_V2` and spoofs its `name()` to
   `"default"` so the persisted EF identity matches existing palaces
   created with ChromaDB's bare `DefaultEmbeddingFunction` -- same
   model, same 384-dim vectors, no rebuild needed when turning GPU on.

   `ChromaBackend.get_collection` / `create_collection` now pass the
   resolved EF on every call so miner writes and searcher reads agree.

Benchmarks (i9-12900KF + RTX 3090, medium scenario, 568 drawers):

  per-chunk + CPU   19.77s ·  29 drw/s   (baseline)
  batched   + CPU    8.07s ·  70 drw/s   (2.4x)
  batched   + CUDA   2.15s · 264 drw/s   (9.2x)

Reproducible via `benchmarks/mine_bench.py`.

Install paths:
  pip install mempalace[gpu]       # NVIDIA CUDA
  pip install mempalace[dml]       # DirectML (Windows)
  pip install mempalace[coreml]    # macOS Neural Engine

Mine header now prints `Device: cpu|cuda|...` so users can confirm the
accelerator engaged.
This commit is contained in:
Igor Lins e Silva
2026-04-24 19:42:35 -03:00
parent 7a757916b3
commit a4868a3589
8 changed files with 784 additions and 61 deletions
+25 -3
View File
@@ -405,6 +405,23 @@ class ChromaBackend(BaseBackend):
self._freshness: dict[str, tuple[int, float]] = {}
self._closed = False
@staticmethod
def _resolve_embedding_function():
"""Return the EF for the user's ``embedding_device`` setting.
Both ``get_collection`` and ``get_or_create_collection`` must receive
the EF explicitly — ChromaDB 1.x does not persist it with the
collection, so a reader that omits the argument silently gets the
library default and its queries won't match the writer's vectors.
"""
try:
from ..embedding import get_embedding_function
return get_embedding_function()
except Exception:
logger.exception("Failed to build embedding function; using chromadb default")
return None
# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
@@ -532,12 +549,15 @@ class ChromaBackend(BaseBackend):
if options and isinstance(options, dict):
hnsw_space = options.get("hnsw_space", hnsw_space)
ef = self._resolve_embedding_function()
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
if create:
collection = client.get_or_create_collection(
collection_name, metadata={"hnsw:space": hnsw_space}
collection_name, metadata={"hnsw:space": hnsw_space}, **ef_kwargs
)
else:
collection = client.get_collection(collection_name)
collection = client.get_collection(collection_name, **ef_kwargs)
return ChromaCollection(collection)
def close_palace(self, palace) -> None:
@@ -578,8 +598,10 @@ class ChromaBackend(BaseBackend):
self, palace_path: str, collection_name: str, hnsw_space: str = "cosine"
) -> ChromaCollection:
"""Create (not get-or-create) ``collection_name`` with the given HNSW space."""
ef = self._resolve_embedding_function()
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
collection = self._client(palace_path).create_collection(
collection_name, metadata={"hnsw:space": hnsw_space}
collection_name, metadata={"hnsw:space": hnsw_space}, **ef_kwargs
)
return ChromaCollection(collection)
+17
View File
@@ -236,6 +236,23 @@ class MempalaceConfig:
pass
return normalized
@property
def embedding_device(self):
"""Hardware device for the ONNX embedding model.
Values: ``"auto"`` (default), ``"cpu"``, ``"cuda"``, ``"coreml"``,
``"dml"``. Read from env ``MEMPALACE_EMBEDDING_DEVICE`` first, then
``embedding_device`` in ``config.json``, then ``"auto"``.
``auto`` resolves to the first available accelerator at runtime via
:mod:`mempalace.embedding`; requesting an unavailable accelerator
logs a warning and falls back to CPU.
"""
env_val = os.environ.get("MEMPALACE_EMBEDDING_DEVICE")
if env_val:
return env_val.strip().lower()
return str(self._file_config.get("embedding_device", "auto")).strip().lower()
@property
def hook_silent_save(self):
"""Whether the stop hook saves directly (True) or blocks for MCP calls (False)."""
+30 -17
View File
@@ -332,31 +332,44 @@ def _file_chunks_locked(collection, source_file, chunks, wing, room, agent, extr
except Exception:
pass
# Batch the whole file into one upsert so the embedding model runs
# a single forward pass for all chunks — dramatically faster than
# one call per chunk, especially on GPU where per-call overhead
# dominates over the actual matmul.
batch_docs: list = []
batch_ids: list = []
batch_metas: list = []
filed_at = datetime.now().isoformat()
for chunk in chunks:
chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
if extract_mode == "general":
room_counts_delta[chunk_room] += 1
drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
batch_docs.append(chunk["content"])
batch_ids.append(drawer_id)
batch_metas.append(
{
"wing": wing,
"room": chunk_room,
"hall": _detect_hall_cached(chunk["content"]),
"source_file": source_file,
"chunk_index": chunk["chunk_index"],
"added_by": agent,
"filed_at": filed_at,
"ingest_mode": "convos",
"extract_mode": extract_mode,
"normalize_version": NORMALIZE_VERSION,
}
)
if batch_docs:
try:
collection.upsert(
documents=[chunk["content"]],
ids=[drawer_id],
metadatas=[
{
"wing": wing,
"room": chunk_room,
"hall": _detect_hall_cached(chunk["content"]),
"source_file": source_file,
"chunk_index": chunk["chunk_index"],
"added_by": agent,
"filed_at": datetime.now().isoformat(),
"ingest_mode": "convos",
"extract_mode": extract_mode,
"normalize_version": NORMALIZE_VERSION,
}
],
documents=batch_docs,
ids=batch_ids,
metadatas=batch_metas,
)
drawers_added += 1
drawers_added = len(batch_docs)
except Exception as e:
if "already exists" not in str(e).lower():
raise
+147
View File
@@ -0,0 +1,147 @@
"""Embedding function factory with hardware acceleration.
Returns a ChromaDB-compatible embedding function bound to a user-selected
ONNX Runtime execution provider. The same ``all-MiniLM-L6-v2`` model and
384-dim vectors ChromaDB ships by default are reused, so switching device
does not invalidate existing palaces.
Supported devices (env ``MEMPALACE_EMBEDDING_DEVICE`` or ``embedding_device``
in ``~/.mempalace/config.json``):
* ``auto`` — prefer CUDA ▸ CoreML ▸ DirectML, fall back to CPU
* ``cpu`` — force CPU (the historical default)
* ``cuda`` — NVIDIA GPU via ``onnxruntime-gpu`` (``pip install mempalace[gpu]``)
* ``coreml`` — Apple Neural Engine (macOS)
* ``dml`` — DirectML (Windows / AMD / Intel GPUs)
Requesting an unavailable accelerator emits a warning and falls back to CPU
rather than hard-failing — mining must still work on a laptop without CUDA.
"""
from __future__ import annotations
import logging
from typing import Optional
logger = logging.getLogger(__name__)
_PROVIDER_MAP = {
"cpu": ["CPUExecutionProvider"],
"cuda": ["CUDAExecutionProvider", "CPUExecutionProvider"],
"coreml": ["CoreMLExecutionProvider", "CPUExecutionProvider"],
"dml": ["DmlExecutionProvider", "CPUExecutionProvider"],
}
_AUTO_ORDER = [
("CUDAExecutionProvider", "cuda"),
("CoreMLExecutionProvider", "coreml"),
("DmlExecutionProvider", "dml"),
]
_EF_CACHE: dict = {}
_WARNED: set = set()
def _resolve_providers(device: str) -> tuple[list, str]:
"""Return ``(provider_list, effective_device)`` for ``device``.
Falls back to CPU (with a one-shot warning) when the requested
accelerator is not compiled into the installed ``onnxruntime``.
"""
device = (device or "auto").strip().lower()
try:
import onnxruntime as ort
available = set(ort.get_available_providers())
except ImportError:
return (["CPUExecutionProvider"], "cpu")
if device == "auto":
for provider, name in _AUTO_ORDER:
if provider in available:
return ([provider, "CPUExecutionProvider"], name)
return (["CPUExecutionProvider"], "cpu")
requested = _PROVIDER_MAP.get(device)
if requested is None:
if device not in _WARNED:
logger.warning("Unknown embedding_device %r — falling back to cpu", device)
_WARNED.add(device)
return (["CPUExecutionProvider"], "cpu")
preferred = requested[0]
if preferred == "CPUExecutionProvider":
return (requested, "cpu")
if preferred not in available:
if device not in _WARNED:
logger.warning(
"embedding_device=%r requested but %s is not installed — "
"falling back to CPU. Install mempalace[gpu] for CUDA.",
device,
preferred,
)
_WARNED.add(device)
return (["CPUExecutionProvider"], "cpu")
return (requested, device)
def _build_ef_class():
"""Subclass ``ONNXMiniLM_L6_V2`` with name ``"default"``.
Why the rename: ChromaDB 1.5 persists the EF identity on the collection
and rejects reads that pass a differently-named EF (``onnx_mini_lm_l6_v2``
vs ``default``). The vectors and model are identical — only the
``name()`` tag differs — so spoofing the name lets one EF class serve
palaces created with ``DefaultEmbeddingFunction`` *and* palaces we
create ourselves, with the same GPU-capable ``preferred_providers``.
"""
from chromadb.utils.embedding_functions import ONNXMiniLM_L6_V2
class _MempalaceONNX(ONNXMiniLM_L6_V2):
@staticmethod
def name() -> str:
return "default"
return _MempalaceONNX
def get_embedding_function(device: Optional[str] = None):
"""Return a cached embedding function bound to the requested device.
``device=None`` reads from :class:`MempalaceConfig.embedding_device`.
The returned function is shared across calls with the same resolved
provider list so we only pay model-load cost once per process.
"""
if device is None:
from .config import MempalaceConfig
device = MempalaceConfig().embedding_device
providers, effective = _resolve_providers(device)
cache_key = tuple(providers)
cached = _EF_CACHE.get(cache_key)
if cached is not None:
return cached
ef_cls = _build_ef_class()
ef = ef_cls(preferred_providers=providers)
_EF_CACHE[cache_key] = ef
logger.info("Embedding function initialized (device=%s providers=%s)", effective, providers)
return ef
def describe_device(device: Optional[str] = None) -> str:
"""Return a short human-readable label for the resolved device.
Used by the miner CLI header so users can see at a glance whether GPU
acceleration actually engaged.
"""
if device is None:
from .config import MempalaceConfig
device = MempalaceConfig().embedding_device
_, effective = _resolve_providers(device)
return effective
+89 -40
View File
@@ -14,6 +14,7 @@ import fnmatch
from pathlib import Path
from datetime import datetime
from collections import defaultdict
from typing import Optional
from .palace import (
NORMALIZE_VERSION,
@@ -633,40 +634,62 @@ def _extract_entities_for_metadata(content: str) -> str:
return ";".join(capped)
def _build_drawer_metadata(
wing: str,
room: str,
source_file: str,
chunk_index: int,
agent: str,
content: str,
source_mtime: Optional[float],
) -> dict:
"""Build the metadata dict for one drawer without upserting.
Split out from ``add_drawer`` so ``process_file`` can batch all chunks
of a file into a single ``collection.upsert`` — one embedding forward
pass per batch instead of per chunk.
"""
metadata = {
"wing": wing,
"room": room,
"source_file": source_file,
"chunk_index": chunk_index,
"added_by": agent,
"filed_at": datetime.now().isoformat(),
"normalize_version": NORMALIZE_VERSION,
}
if source_mtime is not None:
metadata["source_mtime"] = source_mtime
metadata["hall"] = detect_hall(content)
entities = _extract_entities_for_metadata(content)
if entities:
metadata["entities"] = entities
return metadata
def add_drawer(
collection, wing: str, room: str, content: str, source_file: str, chunk_index: int, agent: str
):
"""Add one drawer to the palace."""
"""Add one drawer to the palace.
Kept for backward compatibility with external callers. In-tree the
miner uses ``_build_drawer_metadata`` + a batched ``collection.upsert``
to amortize the embedding model's forward-pass cost across chunks.
"""
drawer_id = f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(chunk_index)).encode()).hexdigest()[:24]}"
try:
metadata = {
"wing": wing,
"room": room,
"source_file": source_file,
"chunk_index": chunk_index,
"added_by": agent,
"filed_at": datetime.now().isoformat(),
"normalize_version": NORMALIZE_VERSION,
}
# Store file mtime so we can detect modifications later.
try:
metadata["source_mtime"] = os.path.getmtime(source_file)
except OSError:
pass
# Tag with hall for graph connectivity within wings
metadata["hall"] = detect_hall(content)
# Tag with entity names for filterable search
entities = _extract_entities_for_metadata(content)
if entities:
metadata["entities"] = entities
collection.upsert(
documents=[content],
ids=[drawer_id],
metadatas=[metadata],
)
return True
except Exception:
raise
source_mtime = os.path.getmtime(source_file)
except OSError:
source_mtime = None
metadata = _build_drawer_metadata(
wing, room, source_file, chunk_index, agent, content, source_mtime
)
collection.upsert(
documents=[content],
ids=[drawer_id],
metadatas=[metadata],
)
return True
# =============================================================================
@@ -725,19 +748,42 @@ def process_file(
except Exception:
pass
drawers_added = 0
# Batch all chunks of this file into a single upsert so the embedding
# model runs one forward pass over the whole file instead of N passes
# of one chunk each. On CPU this is typically a 10-30x speedup; on
# GPU the speedup is larger because per-call overhead dominates.
try:
source_mtime = os.path.getmtime(source_file)
except OSError:
source_mtime = None
batch_docs: list = []
batch_ids: list = []
batch_metas: list = []
for chunk in chunks:
added = add_drawer(
collection=collection,
wing=wing,
room=room,
content=chunk["content"],
source_file=source_file,
chunk_index=chunk["chunk_index"],
agent=agent,
drawer_id = f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
batch_docs.append(chunk["content"])
batch_ids.append(drawer_id)
batch_metas.append(
_build_drawer_metadata(
wing,
room,
source_file,
chunk["chunk_index"],
agent,
chunk["content"],
source_mtime,
)
)
if added:
drawers_added += 1
drawers_added = 0
if batch_docs:
collection.upsert(
documents=batch_docs,
ids=batch_ids,
metadatas=batch_metas,
)
drawers_added = len(batch_docs)
# Build closet — the searchable index pointing to these drawers.
# Purge first: a re-mine (mtime change or normalize_version bump) must
@@ -868,6 +914,8 @@ def mine(
if limit > 0:
files = files[:limit]
from .embedding import describe_device
print(f"\n{'=' * 55}")
print(" MemPalace Mine")
print(f"{'=' * 55}")
@@ -875,6 +923,7 @@ def mine(
print(f" Rooms: {', '.join(r['name'] for r in rooms)}")
print(f" Files: {len(files)}")
print(f" Palace: {palace_path}")
print(f" Device: {describe_device()}")
if dry_run:
print(" DRY RUN — nothing will be filed")
if not respect_gitignore: