test: cover embedding device fallback and bounded upserts

Agent-Logs-Url: https://github.com/MemPalace/mempalace/sessions/3213a67a-6871-4bb2-9ae0-23fa11001a22

Co-authored-by: igorls <4753812+igorls@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot]
2026-04-24 23:06:50 +00:00
committed by GitHub
parent a4868a3589
commit fbd0904799
7 changed files with 268 additions and 57 deletions
+31 -31
View File
@@ -55,6 +55,7 @@ CONVO_EXTENSIONS = {
MIN_CHUNK_SIZE = 30
CHUNK_SIZE = 800 # chars per drawer — align with miner.py
DRAWER_UPSERT_BATCH_SIZE = 1000  # max chunks sent in a single collection.upsert() call
MAX_FILE_SIZE = 500 * 1024 * 1024 # 500 MB — skip files larger than this.
# Matches miner.py at 500 MB. Long Claude Code sessions, multi-year
# ChatGPT exports, and lifetime Slack dumps routinely exceed 10 MB; the
@@ -332,44 +333,43 @@ def _file_chunks_locked(collection, source_file, chunks, wing, room, agent, extr
except Exception:
pass
# Batch the whole file into one upsert so the embedding model runs
# a single forward pass for all chunks — dramatically faster than
# one call per chunk, especially on GPU where per-call overhead
# dominates over the actual matmul.
batch_docs: list = []
batch_ids: list = []
batch_metas: list = []
# Batch chunks into bounded upserts so large transcripts keep most of
# the embedding speedup without one huge Chroma/SQLite request. Keep
# one filed_at per source file so all transcript drawers share an
# ingest timestamp.
filed_at = datetime.now().isoformat()
for chunk in chunks:
chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
if extract_mode == "general":
room_counts_delta[chunk_room] += 1
drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
batch_docs.append(chunk["content"])
batch_ids.append(drawer_id)
batch_metas.append(
{
"wing": wing,
"room": chunk_room,
"hall": _detect_hall_cached(chunk["content"]),
"source_file": source_file,
"chunk_index": chunk["chunk_index"],
"added_by": agent,
"filed_at": filed_at,
"ingest_mode": "convos",
"extract_mode": extract_mode,
"normalize_version": NORMALIZE_VERSION,
}
)
if batch_docs:
for batch_start in range(0, len(chunks), DRAWER_UPSERT_BATCH_SIZE):
batch_docs: list = []
batch_ids: list = []
batch_metas: list = []
for chunk in chunks[batch_start : batch_start + DRAWER_UPSERT_BATCH_SIZE]:
chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
if extract_mode == "general":
room_counts_delta[chunk_room] += 1
drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
batch_docs.append(chunk["content"])
batch_ids.append(drawer_id)
batch_metas.append(
{
"wing": wing,
"room": chunk_room,
"hall": _detect_hall_cached(chunk["content"]),
"source_file": source_file,
"chunk_index": chunk["chunk_index"],
"added_by": agent,
"filed_at": filed_at,
"ingest_mode": "convos",
"extract_mode": extract_mode,
"normalize_version": NORMALIZE_VERSION,
}
)
try:
collection.upsert(
documents=batch_docs,
ids=batch_ids,
metadatas=batch_metas,
)
drawers_added = len(batch_docs)
drawers_added += len(batch_docs)
except Exception as e:
if "already exists" not in str(e).lower():
raise
+9 -1
View File
@@ -32,6 +32,12 @@ _PROVIDER_MAP = {
"dml": ["DmlExecutionProvider", "CPUExecutionProvider"],
}
# Maps a requested embedding device name to the pip extra that is suggested
# in the CPU-fallback warning when that device's execution provider is not
# available.
_DEVICE_EXTRA = {
"cuda": "mempalace[gpu]",
"coreml": "mempalace[coreml]",
"dml": "mempalace[dml]",
}
_AUTO_ORDER = [
("CUDAExecutionProvider", "cuda"),
("CoreMLExecutionProvider", "coreml"),
@@ -76,11 +82,13 @@ def _resolve_providers(device: str) -> tuple[list, str]:
if preferred not in available:
if device not in _WARNED:
extra = _DEVICE_EXTRA.get(device, "the matching mempalace extra for your device")
logger.warning(
"embedding_device=%r requested but %s is not installed — "
"falling back to CPU. Install mempalace[gpu] for CUDA.",
"falling back to CPU. Install %s.",
device,
preferred,
extra,
)
_WARNED.add(device)
return (["CPUExecutionProvider"], "cpu")
+25 -25
View File
@@ -65,6 +65,7 @@ SKIP_FILENAMES = {
CHUNK_SIZE = 800 # chars per drawer
CHUNK_OVERLAP = 100 # overlap between chunks
MIN_CHUNK_SIZE = 50 # skip tiny chunks
DRAWER_UPSERT_BATCH_SIZE = 1000  # max chunks sent in a single collection.upsert() call
MAX_FILE_SIZE = 500 * 1024 * 1024 # 500 MB — skip files larger than this.
# Long Claude Code sessions and large transcript exports routinely exceed
# 10 MB. The cap exists as a defensive rail against pathological binary
@@ -748,42 +749,41 @@ def process_file(
except Exception:
pass
# Batch all chunks of this file into a single upsert so the embedding
# model runs one forward pass over the whole file instead of N passes
# of one chunk each. On CPU this is typically a 10-30x speedup; on
# GPU the speedup is larger because per-call overhead dominates.
# Batch chunks into bounded upserts so the embedding model sees many
# chunks per forward pass without building one huge Chroma/SQLite
# request for pathological files. A bad chunk can fail its sub-batch;
# that is the deliberate trade-off for amortizing embedding overhead.
try:
source_mtime = os.path.getmtime(source_file)
except OSError:
source_mtime = None
batch_docs: list = []
batch_ids: list = []
batch_metas: list = []
for chunk in chunks:
drawer_id = f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
batch_docs.append(chunk["content"])
batch_ids.append(drawer_id)
batch_metas.append(
_build_drawer_metadata(
wing,
room,
source_file,
chunk["chunk_index"],
agent,
chunk["content"],
source_mtime,
)
)
drawers_added = 0
if batch_docs:
for batch_start in range(0, len(chunks), DRAWER_UPSERT_BATCH_SIZE):
batch_docs: list = []
batch_ids: list = []
batch_metas: list = []
for chunk in chunks[batch_start : batch_start + DRAWER_UPSERT_BATCH_SIZE]:
drawer_id = f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
batch_docs.append(chunk["content"])
batch_ids.append(drawer_id)
batch_metas.append(
_build_drawer_metadata(
wing,
room,
source_file,
chunk["chunk_index"],
agent,
chunk["content"],
source_mtime,
)
)
collection.upsert(
documents=batch_docs,
ids=batch_ids,
metadatas=batch_metas,
)
drawers_added = len(batch_docs)
drawers_added += len(batch_docs)
# Build closet — the searchable index pointing to these drawers.
# Purge first: a re-mine (mtime change or normalize_version bump) must