The _chunk_by_exchange() function was silently truncating AI responses to 8 lines via ai_lines[:8]; any content beyond line 8 was discarded, violating the project's verbatim storage principle. The full AI response is now preserved. When a combined exchange (user turn + AI response) exceeds CHUNK_SIZE (800 chars, aligned with miner.py), it is split across consecutive CHUNK_SIZE-sized drawers instead of being truncated. Note that a split piece is still skipped when its stripped length does not exceed MIN_CHUNK_SIZE (30 chars).
This commit is contained in:
@@ -28,6 +28,7 @@ CONVO_EXTENSIONS = {
|
||||
}
|
||||
|
||||
MIN_CHUNK_SIZE = 30
|
||||
CHUNK_SIZE = 800 # chars per drawer — align with miner.py
|
||||
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB — skip files larger than this
|
||||
|
||||
|
||||
@@ -51,7 +52,12 @@ def chunk_exchanges(content: str) -> list:
|
||||
|
||||
|
||||
def _chunk_by_exchange(lines: list) -> list:
|
||||
"""One user turn (>) + the AI response that follows = one chunk."""
|
||||
"""One user turn (>) + the AI response that follows = one or more chunks.
|
||||
|
||||
The full AI response is preserved verbatim. When the combined
|
||||
user-turn + response exceeds CHUNK_SIZE the response is split across
|
||||
consecutive drawers so nothing is silently discarded.
|
||||
"""
|
||||
chunks = []
|
||||
i = 0
|
||||
|
||||
@@ -73,7 +79,24 @@ def _chunk_by_exchange(lines: list) -> list:
|
||||
ai_response = " ".join(ai_lines)
|
||||
content = f"{user_turn}\n{ai_response}" if ai_response else user_turn
|
||||
|
||||
if len(content.strip()) > MIN_CHUNK_SIZE:
|
||||
# Split into multiple drawers when the exchange exceeds CHUNK_SIZE
|
||||
if len(content) > CHUNK_SIZE:
|
||||
# First chunk: user turn + as much response as fits
|
||||
first_part = content[:CHUNK_SIZE]
|
||||
if len(first_part.strip()) > MIN_CHUNK_SIZE:
|
||||
chunks.append(
|
||||
{"content": first_part, "chunk_index": len(chunks)}
|
||||
)
|
||||
# Remaining response in CHUNK_SIZE-sized continuation drawers
|
||||
remainder = content[CHUNK_SIZE:]
|
||||
while remainder:
|
||||
part = remainder[:CHUNK_SIZE]
|
||||
remainder = remainder[CHUNK_SIZE:]
|
||||
if len(part.strip()) > MIN_CHUNK_SIZE:
|
||||
chunks.append(
|
||||
{"content": part, "chunk_index": len(chunks)}
|
||||
)
|
||||
elif len(content.strip()) > MIN_CHUNK_SIZE:
|
||||
chunks.append(
|
||||
{
|
||||
"content": content,
|
||||
|
||||
Reference in New Issue
Block a user