test: cover embedding device fallback and bounded upserts

Agent-Logs-Url: https://github.com/MemPalace/mempalace/sessions/3213a67a-6871-4bb2-9ae0-23fa11001a22
Co-authored-by: igorls <4753812+igorls@users.noreply.github.com>
2026-04-24 23:06:50 +00:00
parent a4868a3589
commit fbd0904799
7 changed files with 268 additions and 57 deletions
@@ -55,6 +55,7 @@ CONVO_EXTENSIONS = {

 MIN_CHUNK_SIZE = 30
 CHUNK_SIZE = 800  # chars per drawer — align with miner.py
+DRAWER_UPSERT_BATCH_SIZE = 1000
 MAX_FILE_SIZE = 500 * 1024 * 1024  # 500 MB — skip files larger than this.
 # Matches miner.py at 500 MB. Long Claude Code sessions, multi-year
 # ChatGPT exports, and lifetime Slack dumps routinely exceed 10 MB; the
@@ -332,44 +333,43 @@ def _file_chunks_locked(collection, source_file, chunks, wing, room, agent, extr
        except Exception:
            pass

-        # Batch the whole file into one upsert so the embedding model runs
-        # a single forward pass for all chunks — dramatically faster than
-        # one call per chunk, especially on GPU where per-call overhead
-        # dominates over the actual matmul.
-        batch_docs: list = []
-        batch_ids: list = []
-        batch_metas: list = []
+        # Batch chunks into bounded upserts so large transcripts keep most of
+        # the embedding speedup without one huge Chroma/SQLite request. Keep
+        # one filed_at per source file so all transcript drawers share an
+        # ingest timestamp.
        filed_at = datetime.now().isoformat()
-        for chunk in chunks:
-            chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
-            if extract_mode == "general":
-                room_counts_delta[chunk_room] += 1
-            drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
-            batch_docs.append(chunk["content"])
-            batch_ids.append(drawer_id)
-            batch_metas.append(
-                {
-                    "wing": wing,
-                    "room": chunk_room,
-                    "hall": _detect_hall_cached(chunk["content"]),
-                    "source_file": source_file,
-                    "chunk_index": chunk["chunk_index"],
-                    "added_by": agent,
-                    "filed_at": filed_at,
-                    "ingest_mode": "convos",
-                    "extract_mode": extract_mode,
-                    "normalize_version": NORMALIZE_VERSION,
-                }
-            )
-
-        if batch_docs:
+        for batch_start in range(0, len(chunks), DRAWER_UPSERT_BATCH_SIZE):
+            batch_docs: list = []
+            batch_ids: list = []
+            batch_metas: list = []
+            for chunk in chunks[batch_start : batch_start + DRAWER_UPSERT_BATCH_SIZE]:
+                chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
+                if extract_mode == "general":
+                    room_counts_delta[chunk_room] += 1
+                drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
+                batch_docs.append(chunk["content"])
+                batch_ids.append(drawer_id)
+                batch_metas.append(
+                    {
+                        "wing": wing,
+                        "room": chunk_room,
+                        "hall": _detect_hall_cached(chunk["content"]),
+                        "source_file": source_file,
+                        "chunk_index": chunk["chunk_index"],
+                        "added_by": agent,
+                        "filed_at": filed_at,
+                        "ingest_mode": "convos",
+                        "extract_mode": extract_mode,
+                        "normalize_version": NORMALIZE_VERSION,
+                    }
+                )
            try:
                collection.upsert(
                    documents=batch_docs,
                    ids=batch_ids,
                    metadatas=batch_metas,
                )
-                drawers_added = len(batch_docs)
+                drawers_added += len(batch_docs)
            except Exception as e:
                if "already exists" not in str(e).lower():
                    raise
@@ -32,6 +32,12 @@ _PROVIDER_MAP = {
    "dml": ["DmlExecutionProvider", "CPUExecutionProvider"],
 }

+_DEVICE_EXTRA = {
+    "cuda": "mempalace[gpu]",
+    "coreml": "mempalace[coreml]",
+    "dml": "mempalace[dml]",
+}
+
 _AUTO_ORDER = [
    ("CUDAExecutionProvider", "cuda"),
    ("CoreMLExecutionProvider", "coreml"),
@@ -76,11 +82,13 @@ def _resolve_providers(device: str) -> tuple[list, str]:

    if preferred not in available:
        if device not in _WARNED:
+            extra = _DEVICE_EXTRA.get(device, "the matching mempalace extra for your device")
            logger.warning(
                "embedding_device=%r requested but %s is not installed — "
-                "falling back to CPU. Install mempalace[gpu] for CUDA.",
+                "falling back to CPU. Install %s.",
                device,
                preferred,
+                extra,
            )
            _WARNED.add(device)
        return (["CPUExecutionProvider"], "cpu")
@@ -65,6 +65,7 @@ SKIP_FILENAMES = {
 CHUNK_SIZE = 800  # chars per drawer
 CHUNK_OVERLAP = 100  # overlap between chunks
 MIN_CHUNK_SIZE = 50  # skip tiny chunks
+DRAWER_UPSERT_BATCH_SIZE = 1000
 MAX_FILE_SIZE = 500 * 1024 * 1024  # 500 MB — skip files larger than this.
 # Long Claude Code sessions and large transcript exports routinely exceed
 # 10 MB. The cap exists as a defensive rail against pathological binary
@@ -748,42 +749,41 @@ def process_file(
        except Exception:
            pass

-        # Batch all chunks of this file into a single upsert so the embedding
-        # model runs one forward pass over the whole file instead of N passes
-        # of one chunk each. On CPU this is typically a 10-30x speedup; on
-        # GPU the speedup is larger because per-call overhead dominates.
+        # Batch chunks into bounded upserts so the embedding model sees many
+        # chunks per forward pass without building one huge Chroma/SQLite
+        # request for pathological files. A bad chunk can fail its sub-batch;
+        # that is the deliberate trade-off for amortizing embedding overhead.
        try:
            source_mtime = os.path.getmtime(source_file)
        except OSError:
            source_mtime = None

-        batch_docs: list = []
-        batch_ids: list = []
-        batch_metas: list = []
-        for chunk in chunks:
-            drawer_id = f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
-            batch_docs.append(chunk["content"])
-            batch_ids.append(drawer_id)
-            batch_metas.append(
-                _build_drawer_metadata(
-                    wing,
-                    room,
-                    source_file,
-                    chunk["chunk_index"],
-                    agent,
-                    chunk["content"],
-                    source_mtime,
-                )
-            )
-
        drawers_added = 0
-        if batch_docs:
+        for batch_start in range(0, len(chunks), DRAWER_UPSERT_BATCH_SIZE):
+            batch_docs: list = []
+            batch_ids: list = []
+            batch_metas: list = []
+            for chunk in chunks[batch_start : batch_start + DRAWER_UPSERT_BATCH_SIZE]:
+                drawer_id = f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
+                batch_docs.append(chunk["content"])
+                batch_ids.append(drawer_id)
+                batch_metas.append(
+                    _build_drawer_metadata(
+                        wing,
+                        room,
+                        source_file,
+                        chunk["chunk_index"],
+                        agent,
+                        chunk["content"],
+                        source_mtime,
+                    )
+                )
            collection.upsert(
                documents=batch_docs,
                ids=batch_ids,
                metadatas=batch_metas,
            )
-            drawers_added = len(batch_docs)
+            drawers_added += len(batch_docs)

        # Build closet — the searchable index pointing to these drawers.
        # Purge first: a re-mine (mtime change or normalize_version bump) must