fix: remove 8-line AI response truncation in convo_miner (#692) (#708)

The _chunk_by_exchange() function was silently truncating AI responses
to 8 lines via ai_lines[:8]. Any content beyond line 8 was discarded,
violating the project's verbatim storage principle.

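Now the AI response is no longer cut off after 8 lines. When a combined
exchange (user turn + AI response) exceeds CHUNK_SIZE (800 chars, aligned
with miner.py), it is sliced into consecutive CHUNK_SIZE-character drawers
instead of being truncated. Note that each slice is still subject to the
MIN_CHUNK_SIZE filter: a slice whose stripped length is 30 chars or fewer
(typically a short final slice) is not stored.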
This commit is contained in:
Sanjay Ramadugu
2026-04-12 17:23:57 -04:00
committed by GitHub
parent d52d6c9622
commit 9b60c6edd7
+25 -2
View File
@@ -28,6 +28,7 @@ CONVO_EXTENSIONS = {
}
# A chunk is stored only when its stripped length is greater than this many chars.
MIN_CHUNK_SIZE = 30
CHUNK_SIZE = 800 # chars per drawer — align with miner.py
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB — skip files larger than this
@@ -51,7 +52,12 @@ def chunk_exchanges(content: str) -> list:
def _chunk_by_exchange(lines: list) -> list:
"""One user turn (>) + the AI response that follows = one chunk."""
"""One user turn (>) + the AI response that follows = one or more chunks.
The full AI response is preserved verbatim. When the combined
user-turn + response exceeds CHUNK_SIZE the response is split across
consecutive drawers so nothing is silently discarded.
"""
chunks = []
i = 0
@@ -73,7 +79,24 @@ def _chunk_by_exchange(lines: list) -> list:
ai_response = " ".join(ai_lines)
content = f"{user_turn}\n{ai_response}" if ai_response else user_turn
if len(content.strip()) > MIN_CHUNK_SIZE:
# Split into multiple drawers when the exchange exceeds CHUNK_SIZE
if len(content) > CHUNK_SIZE:
# First chunk: user turn + as much response as fits
first_part = content[:CHUNK_SIZE]
if len(first_part.strip()) > MIN_CHUNK_SIZE:
chunks.append(
{"content": first_part, "chunk_index": len(chunks)}
)
# Remaining response in CHUNK_SIZE-sized continuation drawers
remainder = content[CHUNK_SIZE:]
while remainder:
part = remainder[:CHUNK_SIZE]
remainder = remainder[CHUNK_SIZE:]
if len(part.strip()) > MIN_CHUNK_SIZE:
chunks.append(
{"content": part, "chunk_index": len(chunks)}
)
elif len(content.strip()) > MIN_CHUNK_SIZE:
chunks.append(
{
"content": content,