test: cover embedding device fallback and bounded upserts
Agent-Logs-Url: https://github.com/MemPalace/mempalace/sessions/3213a67a-6871-4bb2-9ae0-23fa11001a22
Co-authored-by: igorls <4753812+igorls@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
a4868a3589
commit
fbd0904799
+31
-31
@@ -55,6 +55,7 @@ CONVO_EXTENSIONS = {
|
||||
|
||||
MIN_CHUNK_SIZE = 30
|
||||
CHUNK_SIZE = 800 # chars per drawer — align with miner.py
|
||||
DRAWER_UPSERT_BATCH_SIZE = 1000
|
||||
MAX_FILE_SIZE = 500 * 1024 * 1024 # 500 MB — skip files larger than this.
|
||||
# Matches miner.py at 500 MB. Long Claude Code sessions, multi-year
|
||||
# ChatGPT exports, and lifetime Slack dumps routinely exceed 10 MB; the
|
||||
@@ -332,44 +333,43 @@ def _file_chunks_locked(collection, source_file, chunks, wing, room, agent, extr
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Batch the whole file into one upsert so the embedding model runs
|
||||
# a single forward pass for all chunks — dramatically faster than
|
||||
# one call per chunk, especially on GPU where per-call overhead
|
||||
# dominates over the actual matmul.
|
||||
batch_docs: list = []
|
||||
batch_ids: list = []
|
||||
batch_metas: list = []
|
||||
# Batch chunks into bounded upserts so large transcripts keep most of
|
||||
# the embedding speedup without one huge Chroma/SQLite request. Keep
|
||||
# one filed_at per source file so all transcript drawers share an
|
||||
# ingest timestamp.
|
||||
filed_at = datetime.now().isoformat()
|
||||
for chunk in chunks:
|
||||
chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
|
||||
if extract_mode == "general":
|
||||
room_counts_delta[chunk_room] += 1
|
||||
drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
|
||||
batch_docs.append(chunk["content"])
|
||||
batch_ids.append(drawer_id)
|
||||
batch_metas.append(
|
||||
{
|
||||
"wing": wing,
|
||||
"room": chunk_room,
|
||||
"hall": _detect_hall_cached(chunk["content"]),
|
||||
"source_file": source_file,
|
||||
"chunk_index": chunk["chunk_index"],
|
||||
"added_by": agent,
|
||||
"filed_at": filed_at,
|
||||
"ingest_mode": "convos",
|
||||
"extract_mode": extract_mode,
|
||||
"normalize_version": NORMALIZE_VERSION,
|
||||
}
|
||||
)
|
||||
|
||||
if batch_docs:
|
||||
for batch_start in range(0, len(chunks), DRAWER_UPSERT_BATCH_SIZE):
|
||||
batch_docs: list = []
|
||||
batch_ids: list = []
|
||||
batch_metas: list = []
|
||||
for chunk in chunks[batch_start : batch_start + DRAWER_UPSERT_BATCH_SIZE]:
|
||||
chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
|
||||
if extract_mode == "general":
|
||||
room_counts_delta[chunk_room] += 1
|
||||
drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
|
||||
batch_docs.append(chunk["content"])
|
||||
batch_ids.append(drawer_id)
|
||||
batch_metas.append(
|
||||
{
|
||||
"wing": wing,
|
||||
"room": chunk_room,
|
||||
"hall": _detect_hall_cached(chunk["content"]),
|
||||
"source_file": source_file,
|
||||
"chunk_index": chunk["chunk_index"],
|
||||
"added_by": agent,
|
||||
"filed_at": filed_at,
|
||||
"ingest_mode": "convos",
|
||||
"extract_mode": extract_mode,
|
||||
"normalize_version": NORMALIZE_VERSION,
|
||||
}
|
||||
)
|
||||
try:
|
||||
collection.upsert(
|
||||
documents=batch_docs,
|
||||
ids=batch_ids,
|
||||
metadatas=batch_metas,
|
||||
)
|
||||
drawers_added = len(batch_docs)
|
||||
drawers_added += len(batch_docs)
|
||||
except Exception as e:
|
||||
if "already exists" not in str(e).lower():
|
||||
raise
|
||||
|
||||
@@ -32,6 +32,12 @@ _PROVIDER_MAP = {
|
||||
"dml": ["DmlExecutionProvider", "CPUExecutionProvider"],
|
||||
}
|
||||
|
||||
_DEVICE_EXTRA = {
|
||||
"cuda": "mempalace[gpu]",
|
||||
"coreml": "mempalace[coreml]",
|
||||
"dml": "mempalace[dml]",
|
||||
}
|
||||
|
||||
_AUTO_ORDER = [
|
||||
("CUDAExecutionProvider", "cuda"),
|
||||
("CoreMLExecutionProvider", "coreml"),
|
||||
@@ -76,11 +82,13 @@ def _resolve_providers(device: str) -> tuple[list, str]:
|
||||
|
||||
if preferred not in available:
|
||||
if device not in _WARNED:
|
||||
extra = _DEVICE_EXTRA.get(device, "the matching mempalace extra for your device")
|
||||
logger.warning(
|
||||
"embedding_device=%r requested but %s is not installed — "
|
||||
"falling back to CPU. Install mempalace[gpu] for CUDA.",
|
||||
"falling back to CPU. Install %s.",
|
||||
device,
|
||||
preferred,
|
||||
extra,
|
||||
)
|
||||
_WARNED.add(device)
|
||||
return (["CPUExecutionProvider"], "cpu")
|
||||
|
||||
+25
-25
@@ -65,6 +65,7 @@ SKIP_FILENAMES = {
|
||||
CHUNK_SIZE = 800 # chars per drawer
|
||||
CHUNK_OVERLAP = 100 # overlap between chunks
|
||||
MIN_CHUNK_SIZE = 50 # skip tiny chunks
|
||||
DRAWER_UPSERT_BATCH_SIZE = 1000
|
||||
MAX_FILE_SIZE = 500 * 1024 * 1024 # 500 MB — skip files larger than this.
|
||||
# Long Claude Code sessions and large transcript exports routinely exceed
|
||||
# 10 MB. The cap exists as a defensive rail against pathological binary
|
||||
@@ -748,42 +749,41 @@ def process_file(
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Batch all chunks of this file into a single upsert so the embedding
|
||||
# model runs one forward pass over the whole file instead of N passes
|
||||
# of one chunk each. On CPU this is typically a 10-30x speedup; on
|
||||
# GPU the speedup is larger because per-call overhead dominates.
|
||||
# Batch chunks into bounded upserts so the embedding model sees many
|
||||
# chunks per forward pass without building one huge Chroma/SQLite
|
||||
# request for pathological files. A bad chunk can fail its sub-batch;
|
||||
# that is the deliberate trade-off for amortizing embedding overhead.
|
||||
try:
|
||||
source_mtime = os.path.getmtime(source_file)
|
||||
except OSError:
|
||||
source_mtime = None
|
||||
|
||||
batch_docs: list = []
|
||||
batch_ids: list = []
|
||||
batch_metas: list = []
|
||||
for chunk in chunks:
|
||||
drawer_id = f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
|
||||
batch_docs.append(chunk["content"])
|
||||
batch_ids.append(drawer_id)
|
||||
batch_metas.append(
|
||||
_build_drawer_metadata(
|
||||
wing,
|
||||
room,
|
||||
source_file,
|
||||
chunk["chunk_index"],
|
||||
agent,
|
||||
chunk["content"],
|
||||
source_mtime,
|
||||
)
|
||||
)
|
||||
|
||||
drawers_added = 0
|
||||
if batch_docs:
|
||||
for batch_start in range(0, len(chunks), DRAWER_UPSERT_BATCH_SIZE):
|
||||
batch_docs: list = []
|
||||
batch_ids: list = []
|
||||
batch_metas: list = []
|
||||
for chunk in chunks[batch_start : batch_start + DRAWER_UPSERT_BATCH_SIZE]:
|
||||
drawer_id = f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
|
||||
batch_docs.append(chunk["content"])
|
||||
batch_ids.append(drawer_id)
|
||||
batch_metas.append(
|
||||
_build_drawer_metadata(
|
||||
wing,
|
||||
room,
|
||||
source_file,
|
||||
chunk["chunk_index"],
|
||||
agent,
|
||||
chunk["content"],
|
||||
source_mtime,
|
||||
)
|
||||
)
|
||||
collection.upsert(
|
||||
documents=batch_docs,
|
||||
ids=batch_ids,
|
||||
metadatas=batch_metas,
|
||||
)
|
||||
drawers_added = len(batch_docs)
|
||||
drawers_added += len(batch_docs)
|
||||
|
||||
# Build closet — the searchable index pointing to these drawers.
|
||||
# Purge first: a re-mine (mtime change or normalize_version bump) must
|
||||
|
||||
Reference in New Issue
Block a user