fix: batch ChromaDB reads to avoid SQLite variable limit

Calling col.get() without a limit makes ChromaDB generate
SELECT ... WHERE id IN (...) with every document ID, which exceeds
SQLite's ~999 bound-variable limit once a palace has more than ~1000
drawers.

This breaks both `mempalace compress` and `mempalace wake-up` on large
palaces. Reproduced on a 13880-file codebase (242K+ drawers).

Fix: paginate reads in batches of 500 using ChromaDB's offset/limit
parameters in both Layer1.generate() and cmd_compress().
This commit is contained in:
Maurice Wen
2026-04-07 21:40:12 +08:00
parent 1782628b8a
commit 0e77981dec
2 changed files with 45 additions and 25 deletions
+24 -13
View File
@@ -177,20 +177,31 @@ def cmd_compress(args):
print(" Run: mempalace init <dir> then mempalace mine <dir>")
sys.exit(1)
# Query drawers in the wing
# Query drawers in batches to avoid SQLite variable limit (~999)
where = {"wing": args.wing} if args.wing else None
try:
kwargs = {"include": ["documents", "metadatas"]}
if where:
kwargs["where"] = where
results = col.get(**kwargs)
except Exception as e:
print(f"\n Error reading drawers: {e}")
sys.exit(1)
docs = results["documents"]
metas = results["metadatas"]
ids = results["ids"]
_BATCH = 500
docs, metas, ids = [], [], []
offset = 0
while True:
try:
kwargs = {"include": ["documents", "metadatas"], "limit": _BATCH, "offset": offset}
if where:
kwargs["where"] = where
batch = col.get(**kwargs)
except Exception as e:
if not docs:
print(f"\n Error reading drawers: {e}")
sys.exit(1)
break
batch_docs = batch.get("documents", [])
if not batch_docs:
break
docs.extend(batch_docs)
metas.extend(batch.get("metadatas", []))
ids.extend(batch.get("ids", []))
offset += len(batch_docs)
if len(batch_docs) < _BATCH:
break
if not docs:
wing_label = f" in wing '{args.wing}'" if args.wing else ""
+21 -12
View File
@@ -96,18 +96,27 @@ class Layer1:
except Exception:
return "## L1 — No palace found. Run: mempalace mine <dir>"
# Fetch all drawers (with optional wing filter)
kwargs = {"include": ["documents", "metadatas"]}
if self.wing:
kwargs["where"] = {"wing": self.wing}
try:
results = col.get(**kwargs)
except Exception:
return "## L1 — No drawers found."
docs = results.get("documents", [])
metas = results.get("metadatas", [])
# Fetch all drawers in batches to avoid SQLite variable limit (~999)
_BATCH = 500
docs, metas = [], []
offset = 0
while True:
kwargs = {"include": ["documents", "metadatas"], "limit": _BATCH, "offset": offset}
if self.wing:
kwargs["where"] = {"wing": self.wing}
try:
batch = col.get(**kwargs)
except Exception:
break
batch_docs = batch.get("documents", [])
batch_metas = batch.get("metadatas", [])
if not batch_docs:
break
docs.extend(batch_docs)
metas.extend(batch_metas)
offset += len(batch_docs)
if len(batch_docs) < _BATCH:
break
if not docs:
return "## L1 — No memories yet."