423 lines
17 KiB
Python
423 lines
17 KiB
Python
|
|
"""
|
||
|
|
corpus_origin.py — Detect whether a corpus is an AI-dialogue record and,
|
||
|
|
if so, what platform and what persona names the user has assigned to the
|
||
|
|
agent.
|
||
|
|
|
||
|
|
This is the first question any downstream Pass 2 classification needs
|
||
|
|
answered. Without it, a drawer like "my three sons" in a Claude Code
|
||
|
|
dialogue corpus can't be correctly resolved to "three AI instances"
|
||
|
|
rather than "three biological children."
|
||
|
|
|
||
|
|
Two-tier detection:
|
||
|
|
|
||
|
|
Tier 1 — detect_origin_heuristic(samples)
|
||
|
|
Cheap, no API. Grep for well-known AI brand terms + turn
|
||
|
|
markers. Always runs. Outputs a hypothesis.
|
||
|
|
|
||
|
|
Tier 2 — detect_origin_llm(samples, provider)
|
||
|
|
Uses an LLMProvider (typically Haiku via mempalace.llm_client)
|
||
|
|
with the model's pre-trained knowledge of Claude/ChatGPT/Gemini
|
||
|
|
etc. Confirms platform, extracts agent persona-names the user
|
||
|
|
has assigned. One call, ~$0.01 cost.
|
||
|
|
|
||
|
|
Design principle:
|
||
|
|
Don't make the classifier re-discover what Claude, ChatGPT, Gemini, MCP,
|
||
|
|
or other well-known entities ARE — the LLM already knows them from its
|
||
|
|
training. Only corpus-specific entities (e.g. the user's persona-name
|
||
|
|
for their Claude instance) need discovery.
|
||
|
|
|
||
|
|
Default stance (when evidence is thin):
|
||
|
|
"This IS an AI-dialogue corpus" — false-negative is catastrophic for
|
||
|
|
downstream classification; false-positive is recoverable via per-drawer
|
||
|
|
voice-profile detection in later passes.
|
||
|
|
"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import json
|
||
|
|
import re
|
||
|
|
from dataclasses import dataclass, field, asdict
|
||
|
|
from typing import Optional
|
||
|
|
|
||
|
|
|
||
|
|
# ── Well-known AI brand terms (expand as new platforms emerge) ────────────
|
||
|
|
# Detection is by PATTERN + CONTEXT, not by capitalization or English-language
|
||
|
|
# rules. Two categories:
|
||
|
|
#
|
||
|
|
# UNAMBIGUOUS — terms that have essentially no meaning outside of AI context.
|
||
|
|
# Always counted toward AI-dialogue evidence.
|
||
|
|
#
|
||
|
|
# AMBIGUOUS — terms that share a string with common English words, names,
|
||
|
|
# poetry forms, zodiac signs, animals, etc. Counted toward AI-dialogue
|
||
|
|
# evidence ONLY when at least one unambiguous AI signal also appears in
|
||
|
|
# the corpus (turn marker, unambiguous brand term, or AI infrastructure
|
||
|
|
# term). This avoids false-positives on French novels with characters
|
||
|
|
# named "Claude", astrology corpora discussing "Gemini", poetry corpora
|
||
|
|
# full of "haiku" / "sonnet", etc.
|
||
|
|
#
|
||
|
|
# All matching is CASE-INSENSITIVE — users type lowercase constantly.
|
||
|
|
|
||
|
|
# Terms counted as AI evidence on their own, with no co-signal required.
# NOTE(review): the heuristic matches these case-insensitively with word
# boundaries, so short entries such as "RAG", "o3", "Cohere" and "embedding"
# also match the ordinary words "rag" / "cohere" / "embedding" and the
# chemical formula "O3" — confirm that is acceptable for the "unambiguous" tier.
_AI_UNAMBIGUOUS_TERMS = [
    # Anthropic-specific
    "Anthropic",
    "Claude Code",
    "Claude 3",
    "Claude 4",
    "claude mcp",
    "CLAUDE.md",
    ".claude/",
    # OpenAI-specific
    "ChatGPT",
    "GPT-4",
    "GPT-3",
    "GPT-5",
    "OpenAI",
    "gpt-4o",
    "gpt-4-turbo",
    "o1-preview",
    "o3",
    # Google-specific
    "gemini-pro",
    "gemini-1.5",
    "Google AI",
    # Meta / others (specific model identifiers, not bare common words)
    "Mixtral",
    "Cohere",
    # AI-infrastructure terms with no common-English collision
    "MCP",
    "LLM",
    "RAG",
    "fine-tune",
    "context window",
    "embedding",
]
|
||
|
|
|
||
|
|
# Bare brand/model names that are also everyday words or personal names.
# The heuristic tallies these separately and only adds them to the brand
# count when an unambiguous term or a turn marker is also present.
_AI_AMBIGUOUS_TERMS = [
    # Anthropic — bare brand/model names that collide with names + poetry
    "Claude",  # also a common French masculine name
    "Opus",  # also a musical work, comic strip, magazine
    "Sonnet",  # also a 14-line poem form
    "Haiku",  # also a 17-syllable poem form
    # Google — bare brands that collide with other meanings
    "Gemini",  # also the zodiac sign
    "Bard",  # also a poet / Shakespeare
    # Meta / others
    "Llama",  # also the South American animal
    "Mistral",  # also a Mediterranean wind
    # Note: 'prompt', 'completion', 'tokens' previously lived here but were
    # removed: they're suppressed without an unambiguous co-signal anyway,
    # and by the time a co-signal is present the corpus is already flagged.
    # Keeping them just produced noisier evidence strings.
]
|
||
|
|
|
||
|
|
# Turn-marker patterns commonly seen in AI-dialogue transcripts
|
||
|
|
# Regexes for speaker labels; the heuristic applies each one with
# re.IGNORECASE and counts every match as one turn-marker hit.
_TURN_MARKERS = [
    # "user:", "assistant:", ... — \b keeps "superuser:" or "said:" from matching.
    r"\buser\s*:\s*",
    r"\bassistant\s*:\s*",
    r"\bhuman\s*:\s*",
    r"\bai\s*:\s*",
    # ">>> User" / ">>> Assistant" — deliberately NO leading \b. A word
    # boundary in front of ">" only exists when a word character precedes it,
    # so r"\b>>>" could never match at the start of a line or after
    # whitespace, which is exactly where these markers appear.
    r">>>\s*User\b",
    r">>>\s*Assistant\b",
]
|
||
|
|
|
||
|
|
|
||
|
|
def _brand_pattern(term: str) -> str:
    r"""Return a regex source string that matches *term* literally.

    A ``\b`` anchor is added on each side of the escaped term only when the
    character on that side is a word character (alphanumeric or ``_``):

    - 'Claude' gets ``\b`` on both sides, so it does not match inside
      'Claudette'.
    - '.claude/' gets no anchors; a ``\b`` in front of the leading '.' would
      require a preceding word character and so would fail at the start of
      a string.
    """

    def _is_word_char(ch: str) -> bool:
        return ch == "_" or ch.isalnum()

    pieces = []
    if _is_word_char(term[0]):
        pieces.append(r"\b")
    pieces.append(re.escape(term))
    if _is_word_char(term[-1]):
        pieces.append(r"\b")
    return "".join(pieces)
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class CorpusOriginResult:
    """Outcome of corpus-origin detection, shared by both detection tiers.

    Attributes:
      likely_ai_dialogue  — best guess: is this corpus a record of AI dialogue?
      confidence          — strength of that guess, 0.0 to 1.0
      primary_platform    — platform label such as "Claude Code (Anthropic CLI)",
                            or None when not determined
      user_name           — the corpus author's name when identifiable, else None
      agent_persona_names — names the user has given to the AI agent(s), e.g.
                            ["Echo", "Sparrow"]; never the user's own name
      evidence            — human-readable reasons behind the classification
    """

    # Required fields (positional order is part of the constructor contract).
    likely_ai_dialogue: bool
    confidence: float
    primary_platform: Optional[str]
    # Optional fields; list defaults use factories so instances never share them.
    user_name: Optional[str] = None
    agent_persona_names: list[str] = field(default_factory=list)
    evidence: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the result as a plain dict keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping
|
||
|
|
|
||
|
|
|
||
|
|
# ── Tier 1: cheap heuristic ───────────────────────────────────────────────
|
||
|
|
|
||
|
|
|
||
|
|
def detect_origin_heuristic(samples: list[str]) -> CorpusOriginResult:
    """Tier 1 detection: regex counting only, no API calls.

    The samples are joined into one text and three things are counted, all
    case-insensitively:
      - unambiguous AI brand terms
      - ambiguous brand terms (only credited when some AI context exists)
      - turn markers (user:, assistant:, ...)

    Counts are converted to hits per 1000 characters and mapped to one of
    three outcomes: confident AI-dialogue, confident narrative, or the
    low-confidence default stance (AI-dialogue).
    """
    text = "\n\n".join(samples)
    n_chars = max(1, len(text))  # never 0, so the density division is safe
    per_thousand = n_chars / 1000

    def _tally(terms):
        # Map each term that occurs at least once to its occurrence count,
        # keeping the order of `terms` (ties in the evidence ranking rely on it).
        found = {}
        for candidate in terms:
            occurrences = len(re.findall(_brand_pattern(candidate), text, re.IGNORECASE))
            if occurrences:
                found[candidate] = occurrences
        return found

    unambiguous_hits = _tally(_AI_UNAMBIGUOUS_TERMS)
    ambiguous_hits = _tally(_AI_AMBIGUOUS_TERMS)
    total_unambiguous = sum(unambiguous_hits.values())
    total_ambiguous = sum(ambiguous_hits.values())

    # Turn markers: total occurrences plus how many distinct patterns fired.
    marker_counts = [len(re.findall(marker, text, re.IGNORECASE)) for marker in _TURN_MARKERS]
    turn_hits = sum(marker_counts)
    turn_types_found = {
        marker for marker, count in zip(_TURN_MARKERS, marker_counts) if count
    }

    # Co-occurrence rule: ambiguous terms ('Claude', 'Gemini', 'Haiku', ...)
    # are only credited when an unambiguous term or a turn marker is present,
    # so a French novel or a poetry corpus is not mistaken for AI dialogue.
    has_ai_context = total_unambiguous > 0 or turn_hits > 0
    counted_brand_hits = total_unambiguous
    if has_ai_context:
        counted_brand_hits += total_ambiguous

    # Hits per 1000 characters. The thresholds used below were tuned on a
    # small set of examples and may be revisited.
    brand_density = counted_brand_hits / per_thousand
    turn_density = turn_hits / per_thousand

    # ── Evidence strings ──
    evidence: list[str] = []
    if has_ai_context:
        shown_hits = {**unambiguous_hits, **ambiguous_hits}
    else:
        shown_hits = dict(unambiguous_hits)

    if shown_hits:
        ranked = sorted(shown_hits.items(), key=lambda pair: -pair[1])
        evidence.append(
            "AI brand terms: " + ", ".join(f"'{k}' ({v}x)" for k, v in ranked[:5])
        )
    elif not has_ai_context and ambiguous_hits:
        # Report ambiguous matches that were seen but not credited.
        ranked = sorted(ambiguous_hits.items(), key=lambda pair: -pair[1])
        evidence.append(
            "Ambiguous terms present but suppressed (no co-occurring AI signal): "
            + ", ".join(f"'{k}' ({v}x)" for k, v in ranked[:3])
        )

    if turn_hits:
        evidence.append(
            f"Turn markers detected: {turn_hits} occurrences across {len(turn_types_found)} pattern types"
        )

    # ── Decision ──
    # 1. Either density reaches its threshold      → confident AI-dialogue.
    # 2. Enough text and no counted signal at all  → confident narrative.
    # 3. Anything else (weak signal / short text)  → default stance.
    #
    # Below this many characters the absence of signals is not treated as
    # meaningful, so case 2 is not allowed to fire.
    MEANINGFUL_TEXT_FLOOR = 150

    if brand_density >= 0.5 or turn_density >= 2.0:
        return CorpusOriginResult(
            likely_ai_dialogue=True,
            confidence=min(0.95, 0.6 + 0.1 * (brand_density + turn_density)),
            primary_platform=None,  # tier 2 will refine
            evidence=evidence,
        )

    no_signal = counted_brand_hits == 0 and turn_hits == 0

    if no_signal and n_chars >= MEANINGFUL_TEXT_FLOOR:
        # Ambiguous-only corpora land here too: their matches were not
        # counted, but the suppression note is already in `evidence`.
        return CorpusOriginResult(
            likely_ai_dialogue=False,
            confidence=0.9,
            primary_platform=None,
            evidence=[
                *evidence,
                f"no unambiguous AI signal across {n_chars} chars of text — pure narrative",
            ],
        )

    # Default stance: assume AI-dialogue with explicitly low confidence and
    # recommend the Tier 2 check.
    reason = "insufficient text" if no_signal else "weak signal"
    return CorpusOriginResult(
        likely_ai_dialogue=True,
        confidence=0.4,
        primary_platform=None,
        evidence=[
            *evidence,
            f"{reason} — applying default-stance (ai_dialogue=True, low confidence). "
            "Tier 2 LLM check recommended to confirm or override.",
        ],
    )
|
||
|
|
|
||
|
|
|
||
|
|
# ── Tier 2: LLM-assisted confirmation + persona extraction ────────────────
|
||
|
|
|
||
|
|
|
||
|
|
_SYSTEM_PROMPT = """You are analyzing a corpus of text to determine whether it is a \
|
||
|
|
record of conversations with an AI agent (e.g. Claude, ChatGPT, Gemini, custom LLM \
|
||
|
|
apps), or some other kind of text (personal narrative, story, research notes, \
|
||
|
|
journal, code, etc.).
|
||
|
|
|
||
|
|
Use your pre-existing knowledge of well-known AI platforms. You don't need the \
|
||
|
|
corpus to explain what Claude or ChatGPT is — you already know. Your job is to \
|
||
|
|
detect evidence of their presence and identify what persona-names the user has \
|
||
|
|
assigned to the agent(s) they converse with.
|
||
|
|
|
||
|
|
CRITICAL distinction:
|
||
|
|
- agent_persona_names are names the USER has assigned to the AI AGENT(S)
|
||
|
|
they converse with. Example: "Echo", "Sparrow", "Henry" might be names
|
||
|
|
the user calls a Claude instance they're building a relationship with.
|
||
|
|
- Do NOT include the USER's own name in agent_persona_names. The user
|
||
|
|
is the human author of the corpus, not a persona of the agent. Even
|
||
|
|
if the user's name appears frequently in the text (writing about
|
||
|
|
themselves), that is NOT an agent persona.
|
||
|
|
- If you can identify the user's name from context, put it in user_name
|
||
|
|
(separate field). If unclear, leave user_name null.
|
||
|
|
|
||
|
|
Respond with JSON only (no prose before or after):
|
||
|
|
{
|
||
|
|
"is_ai_dialogue_corpus": <true|false>,
|
||
|
|
"confidence": <0.0 to 1.0>,
|
||
|
|
"primary_platform": <"Claude (Anthropic)" | "ChatGPT (OpenAI)" | "Gemini (Google)" | other platform name | null>,
|
||
|
|
"user_name": <user's name if clearly identifiable from context, else null>,
|
||
|
|
"agent_persona_names": [<names the user has assigned to the AI AGENT(S), NOT the user's own name>],
|
||
|
|
"evidence": [<short bullet strings explaining the decision>]
|
||
|
|
}
|
||
|
|
|
||
|
|
Default stance: if evidence is thin or mixed, return is_ai_dialogue_corpus=true \
|
||
|
|
with low confidence. False-negatives on AI-dialogue detection break downstream \
|
||
|
|
classification; false-positives are recoverable later.
|
||
|
|
"""
|
||
|
|
|
||
|
|
|
||
|
|
def _extract_json(text: str) -> Optional[dict]:
    """Return the first JSON object found in a possibly-messy LLM response.

    Strategy:
      1. Parse the whole (stripped) text; accept it if it is a JSON object.
      2. Otherwise try to decode a JSON value starting at each "{" in turn,
         left to right, and return the first one that decodes.

    Step 2 uses ``json.JSONDecoder.raw_decode``, so surrounding prose, code
    fences, trailing text and braces inside JSON strings are all tolerated,
    and a "{" that does not start valid JSON (e.g. "{placeholder}" in the
    preamble) is skipped instead of aborting the search.

    Args:
        text: Raw response text.

    Returns:
        The decoded object as a dict, or None when the text is empty or
        contains no decodable JSON object. Non-object JSON (a bare list,
        number, string, true/false/null) is never returned.
    """
    text = text.strip()
    if not text:
        return None

    # Fast path: the response is nothing but JSON.
    try:
        whole = json.loads(text)
    except json.JSONDecodeError:
        whole = None
    if isinstance(whole, dict):
        return whole

    # Slow path: probe every "{" until one starts a decodable object.
    decoder = json.JSONDecoder()
    pos = text.find("{")
    while pos >= 0:
        try:
            candidate, _end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        pos = text.find("{", pos + 1)
    return None
|
||
|
|
|
||
|
|
|
||
|
|
def _coerce_confidence(value, default: float = 0.5) -> float:
    """Convert an LLM-supplied confidence to a float clamped to [0.0, 1.0]."""
    if isinstance(value, bool):
        return default
    try:
        number = float(value)
    except (TypeError, ValueError):
        return default
    if number != number:  # NaN compares unequal to itself
        return default
    return min(1.0, max(0.0, number))


def _coerce_bool(value, default: bool = True) -> bool:
    """Interpret an LLM-supplied verdict, falling back to *default* when unclear."""
    if value is None:
        return default
    if isinstance(value, str):
        lowered = value.strip().lower()
        if lowered in ("false", "no", "0"):
            return False
        if lowered in ("true", "yes", "1"):
            return True
        return default
    return bool(value)


def _coerce_str_list(value) -> list[str]:
    """Normalise an LLM-supplied list field to a list of non-blank strings."""
    if isinstance(value, str):
        value = [value]  # a bare string is one item, not a sequence of characters
    elif not isinstance(value, (list, tuple)):
        return []
    return [item for item in value if isinstance(item, str) and item.strip()]


def detect_origin_llm(samples: list[str], provider) -> CorpusOriginResult:
    """Tier 2 detection: ask an LLM to confirm origin and extract persona names.

    Args:
        samples: Text excerpts from the corpus. At most the first 20 are sent,
            each truncated to 800 characters, to keep the call cheap.
        provider: Object exposing ``classify(system=..., user=..., json_mode=...)``
            whose return value carries the response in a ``text`` attribute.

    Returns:
        A CorpusOriginResult in the same shape the heuristic produces.

    Never raises on a provider failure or a malformed response:
      - provider exception, or no JSON object in the reply → default stance
        (likely_ai_dialogue=True, confidence 0.3) with an explanatory
        evidence entry;
      - individual fields of the wrong type are coerced or dropped:
        confidence falls back to 0.5 and is clamped to [0, 1]; a missing,
        null or unrecognisable verdict falls back to True (the default
        stance); non-string names / evidence items are discarded.
    """
    # Build the user prompt: concise excerpts, capped so we stay cheap.
    max_excerpt_chars = 800
    excerpts = "\n\n---\n\n".join(
        f"[sample {i + 1}]\n{str(s)[:max_excerpt_chars]}" for i, s in enumerate(samples[:20])
    )
    user_prompt = f"CORPUS EXCERPTS:\n\n{excerpts}\n\nAnalyze and respond with JSON."

    try:
        resp = provider.classify(system=_SYSTEM_PROMPT, user=user_prompt, json_mode=True)
        raw = getattr(resp, "text", "") or ""
    except Exception as e:  # deliberate catch-all: this function must not raise
        return CorpusOriginResult(
            likely_ai_dialogue=True,
            confidence=0.3,
            primary_platform=None,
            evidence=[f"LLM provider error (fallback to default stance): {e}"],
        )

    # A non-string payload cannot be parsed; treat it like an invalid reply.
    parsed = _extract_json(raw) if isinstance(raw, str) else None
    if not parsed or not isinstance(parsed, dict):
        return CorpusOriginResult(
            likely_ai_dialogue=True,
            confidence=0.3,
            primary_platform=None,
            evidence=["LLM response was not valid JSON (fallback to default stance)"],
        )

    # Pull fields defensively — the model may return any JSON type for any key.
    raw_user = parsed.get("user_name")
    user_name = raw_user if isinstance(raw_user, str) and raw_user.strip() else None

    raw_platform = parsed.get("primary_platform")
    platform = raw_platform if isinstance(raw_platform, str) and raw_platform.strip() else None

    # If the LLM leaked the user's name into agent_persona_names despite the
    # prompt telling it not to, filter it out (case-insensitively).
    personas = _coerce_str_list(parsed.get("agent_persona_names"))
    if user_name:
        own_name = user_name.strip().casefold()
        personas = [p for p in personas if p.strip().casefold() != own_name]

    return CorpusOriginResult(
        likely_ai_dialogue=_coerce_bool(parsed.get("is_ai_dialogue_corpus"), default=True),
        confidence=_coerce_confidence(parsed.get("confidence", 0.5)),
        primary_platform=platform,
        user_name=user_name,
        agent_persona_names=personas,
        evidence=_coerce_str_list(parsed.get("evidence")),
    )
|