security: harden inputs, fix shell injection, optimize DB access

- Fix command injection in hook script (pass paths via sys.argv)
- Add sanitize_name/sanitize_content validators in config.py
- Add 10MB file size guard + symlink skip in miners
- Fix SQLite connection leak in knowledge_graph.py (reuse connection)
- Use `with conn:` for proper transaction handling
- Consolidate shared palace operations into palace.py
- Add write-ahead log for audit trail on writes/deletes
- Add metadata cache with 30s TTL for status/taxonomy calls
- Upgrade md5 → sha256 for drawer/triple IDs
- Harden file permissions (0o700/0o600)
- Pin chromadb>=0.5.0,<0.7

Based on PR #252 by @anthonyonazure with lint fixes applied.

Co-Authored-By: anthonyonazure <anthonyonazure@users.noreply.github.com>
This commit is contained in:
bensig
2026-04-09 08:06:30 -07:00
parent 963c04cf45
commit 1d19dfc9d5
8 changed files with 389 additions and 203 deletions
+11 -35
View File
@@ -15,9 +15,8 @@ from pathlib import Path
from datetime import datetime
from collections import defaultdict
import chromadb
from .normalize import normalize
from .palace import SKIP_DIRS, get_collection, file_already_mined
# File types that might contain conversations
@@ -28,22 +27,8 @@ CONVO_EXTENSIONS = {
".jsonl",
}
# Directory names to skip, kept in alphabetical order.
# NOTE(review): the code that consults this set is outside this view —
# presumably the directory walk that collects conversation files; confirm.
SKIP_DIRS = {
    ".git", ".mempalace", ".next", ".venv",
    "__pycache__", "build", "dist", "env",
    "memory", "node_modules", "tool-results", "venv",
}

# Lower bound for a chunk's size.
# NOTE(review): unit (characters?) is not established here — confirm at use site.
MIN_CHUNK_SIZE = 30

# 10 MB, in bytes — files whose on-disk size exceeds this are skipped.
MAX_FILE_SIZE = 10 * 1024**2
# =============================================================================
@@ -211,23 +196,6 @@ def detect_convo_room(content: str) -> str:
# =============================================================================
def get_collection(palace_path: str):
    """Open the ``mempalace_drawers`` collection stored under *palace_path*.

    The directory is created if it does not exist, a persistent ChromaDB
    client is opened on it, and the collection is created on first use.

    Args:
        palace_path: Filesystem directory that holds the ChromaDB data.

    Returns:
        The ``mempalace_drawers`` collection object from the client.
    """
    os.makedirs(palace_path, exist_ok=True)
    client = chromadb.PersistentClient(path=palace_path)
    # A single get-or-create call replaces the earlier get/except/create
    # pair, which treated *any* failure of the lookup (not only a missing
    # collection) as a cue to create a new collection.
    return client.get_or_create_collection("mempalace_drawers")
def file_already_mined(collection, source_file: str) -> bool:
    """Return True if *collection* already holds an entry for *source_file*.

    Asks the collection for at most one record whose ``source_file`` filter
    matches the given value.

    Args:
        collection: Object exposing ``get(where=..., limit=...)`` that
            returns a mapping with an ``"ids"`` entry.
        source_file: Value to match against the ``source_file`` field.

    Returns:
        True when at least one id comes back; False when none do or when
        the lookup raises.
    """
    try:
        results = collection.get(where={"source_file": source_file}, limit=1)
        # Truthiness treats a missing, None, or empty "ids" entry alike.
        return bool(results.get("ids"))
    except Exception:
        # Deliberate best-effort: a failed lookup is reported as "not mined".
        return False
# =============================================================================
# SCAN FOR CONVERSATION FILES
# =============================================================================
@@ -244,6 +212,14 @@ def scan_convos(convo_dir: str) -> list:
continue
filepath = Path(root) / filename
if filepath.suffix.lower() in CONVO_EXTENSIONS:
# Skip symlinks and oversized files
if filepath.is_symlink():
continue
try:
if filepath.stat().st_size > MAX_FILE_SIZE:
continue
except OSError:
continue
files.append(filepath)
return files
@@ -356,7 +332,7 @@ def mine_convos(
chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
if extract_mode == "general":
room_counts[chunk_room] += 1
drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.md5((source_file + str(chunk['chunk_index'])).encode(), usedforsecurity=False).hexdigest()[:16]}"
drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
try:
collection.add(
documents=[chunk["content"]],