6aebf458ff
The pattern-matching detector had several systematic false positives that
crowded the init review with nonsense. Concrete fixes:
- CamelCase extraction: add `[A-Z][a-z]+(?:[A-Z][a-z]+|[A-Z]{2,})+` to
candidate patterns so `MemPalace`, `ChromaDB`, `OpenAI`, `ChatGPT` are
visible. Previously `MemPalace` fragmented into `Mem` + `Palace`.
- The dialogue pattern `^NAME:\s` now requires >=2 matches to count. A single
metadata line like `Created: 2026-04-21` was scoring as dialogue and classifying
`Created` as a person.
- Versioned/hyphenated pattern tightened to `\b{name}[-_]v?\d+(?:\.\d+)*\b`
(version-only). The previous `\b{name}[-v]\w+` matched `context-manager`,
`multi-word`, etc. - every hyphenated compound.
- Skip LICENSE/COPYING/NOTICE/AUTHORS/PATENTS files during scan. They
produce pure-English-prose noise (`Contributor`, `Software`, `Covered`,
`Before`).
- Extra SKIP_DIRS: `.terraform`, `vendor`, `target`.
- Expand stopword list with capitalized participles/descriptors that
commonly appear at sentence start: `created`, `updated`, `extracted`,
`processed`, `total`, `summary`, `auto`, `multi`, `hybrid`, `context`,
`bridge`, `batch`, `local`, `native`, `never`, `before`, `after`, etc.
- classify_entity: a strong pronoun signal on its own (a single signal
category) now classifies as person. A diary's main character gets referenced
with pronouns, not dialogue markers; requiring two signal categories demoted
`Lu` (16 pronoun hits across 30 mentions) to uncertain. The rule is gated on
`pronoun_hits >= 5 AND pronoun_hits / frequency >= 0.2` so that common
sentence-start words (`Never`, `Before`) with only incidental pronoun
proximity stay uncertain.
157 lines
7.1 KiB
JSON
{
  "lang": "en",
  "label": "English",
  "terms": {
    "palace": "palace",
    "wing": "wing",
    "hall": "hall",
    "closet": "closet",
    "drawer": "drawer",
    "mine": "mine",
    "search": "search",
    "status": "status",
    "init": "init",
    "repair": "repair",
    "migrate": "migrate",
    "entity": "entity",
    "topic": "topic"
  },
  "cli": {
    "mine_start": "Mining {path}...",
    "mine_complete": "Done. {closets} closets, {drawers} drawers created.",
    "mine_skip": "Already mined. Use --force to re-mine.",
    "search_no_results": "No results for: {query}",
    "search_results": "Found {count} results:",
    "status_palace": "Palace: {path}",
    "status_wings": "{count} wings",
    "status_closets": "{count} closets",
    "status_drawers": "{count} drawers",
    "init_complete": "Palace initialized at {path}",
    "init_exists": "Palace already exists at {path}",
    "repair_complete": "Repair complete. {fixed} issues fixed.",
    "migrate_complete": "Migration complete.",
    "no_palace": "No palace found. Run: mempalace init <dir>"
  },
  "aaak": {
    "instruction": "Compress to index format. Hyphens between words, pipes between concepts. Drop articles and filler. Keep names and numbers exact."
  },
  "regex": {
    "topic_pattern": "[A-Z][a-z]{2,}|[A-Za-z][A-Za-z0-9_]{2,}",
    "stop_words": "the this that these those some many most each every other only such very will would could should must shall yeah okay also even then now already still back done make take give know think want need going come find work added saved session summary conversation topics source about once just really actually here there where good great better thank please sorry right wrong true false",
    "quote_pattern": "\"([^\"]{20,200})\"",
    "action_pattern": "(?:built|fixed|wrote|added|pushed|measured|tested|reviewed|created|deleted|updated|configured|deployed|migrated)\\s+[\\w\\s]{3,30}"
  },
  "entity": {
    "candidate_pattern": "[A-Z][a-z]+(?:[A-Z][a-z]+|[A-Z]{2,})+|[A-Z][a-z]{1,19}",
    "multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
    "person_verb_patterns": [
      "\\b{name}\\s+said\\b",
      "\\b{name}\\s+asked\\b",
      "\\b{name}\\s+told\\b",
      "\\b{name}\\s+replied\\b",
      "\\b{name}\\s+laughed\\b",
      "\\b{name}\\s+smiled\\b",
      "\\b{name}\\s+cried\\b",
      "\\b{name}\\s+felt\\b",
      "\\b{name}\\s+thinks?\\b",
      "\\b{name}\\s+wants?\\b",
      "\\b{name}\\s+loves?\\b",
      "\\b{name}\\s+hates?\\b",
      "\\b{name}\\s+knows?\\b",
      "\\b{name}\\s+decided\\b",
      "\\b{name}\\s+pushed\\b",
      "\\b{name}\\s+wrote\\b",
      "\\bhey\\s+{name}\\b",
      "\\bthanks?\\s+{name}\\b",
      "\\bhi\\s+{name}\\b",
      "\\bdear\\s+{name}\\b"
    ],
    "pronoun_patterns": [
      "\\bshe\\b",
      "\\bher\\b",
      "\\bhers\\b",
      "\\bhe\\b",
      "\\bhim\\b",
      "\\bhis\\b",
      "\\bthey\\b",
      "\\bthem\\b",
      "\\btheir\\b"
    ],
    "dialogue_patterns": [
      "^>\\s*{name}[:\\s]",
      "^{name}:\\s",
      "^\\[{name}\\]",
      "\"{name}\\s+said"
    ],
    "direct_address_pattern": "\\bhey\\s+{name}\\b|\\bthanks?\\s+{name}\\b|\\bhi\\s+{name}\\b",
    "project_verb_patterns": [
      "\\bbuilding\\s+{name}\\b",
      "\\bbuilt\\s+{name}\\b",
      "\\bship(?:ping|ped)?\\s+{name}\\b",
      "\\blaunch(?:ing|ed)?\\s+{name}\\b",
      "\\bdeploy(?:ing|ed)?\\s+{name}\\b",
      "\\binstall(?:ing|ed)?\\s+{name}\\b",
      "\\bthe\\s+{name}\\s+architecture\\b",
      "\\bthe\\s+{name}\\s+pipeline\\b",
      "\\bthe\\s+{name}\\s+system\\b",
      "\\bthe\\s+{name}\\s+repo\\b",
      "\\b{name}\\s+v\\d+\\b",
      "\\b{name}\\.py\\b",
      "\\b{name}-core\\b",
      "\\b{name}-local\\b",
      "\\bimport\\s+{name}\\b",
      "\\bpip\\s+install\\s+{name}\\b"
    ],
    "stopwords": [
      "the", "a", "an", "and", "or", "but", "in", "on", "at", "to",
      "for", "of", "with", "by", "from", "as", "is", "was", "are", "were",
      "be", "been", "being", "have", "has", "had", "do", "does", "did",
      "will", "would", "could", "should", "may", "might", "must", "shall", "can",
      "this", "that", "these", "those", "it", "its", "they", "them", "their",
      "we", "our", "you", "your", "i", "my", "me", "he", "she", "his", "her",
      "who", "what", "when", "where", "why", "how", "which",
      "if", "then", "so", "not", "no", "yes", "ok", "okay",
      "just", "very", "really", "also", "already", "still", "even", "only",
      "here", "there", "now", "too", "up", "out", "about", "like",
      "use", "get", "got", "make", "made", "take", "put", "come", "go", "see",
      "know", "think", "true", "false", "none", "null", "new", "old", "all", "any", "some",
      "return", "print", "def", "class", "import",
      "step", "usage", "run", "check", "find", "add", "set", "list",
      "args", "dict", "str", "int", "bool", "path", "file", "type", "name",
      "note", "example", "option", "result", "error", "warning", "info",
      "every", "each", "more", "less", "next", "last", "first", "second",
      "stack", "layer", "mode", "test", "stop", "start", "copy", "move",
      "source", "target", "output", "input", "data", "item", "key", "value",
      "returns", "raises", "yields", "self", "cls", "kwargs",
      "world", "well", "want", "topic", "choose", "social", "cars", "phones",
      "healthcare", "ex", "machina", "deus", "human", "humans", "people",
      "things", "something", "nothing", "everything", "anything", "someone",
      "everyone", "anyone", "way", "time", "day", "life", "place", "thing",
      "part", "kind", "sort", "case", "point", "idea", "fact", "sense",
      "question", "answer", "reason", "number", "version", "system",
      "hey", "hi", "hello", "thanks", "thank", "right", "let",
      "click", "hit", "press", "tap", "drag", "drop", "open", "close",
      "save", "load", "launch", "install", "download", "upload", "scroll",
      "select", "enter", "submit", "cancel", "confirm", "delete", "paste",
      "write", "read", "search", "show", "hide",
      "desktop", "documents", "downloads", "users", "home", "library",
      "applications", "preferences", "settings", "terminal",
      "actor", "vector", "remote", "control", "duration", "fetch",
      "agents", "tools", "others", "guards", "ethics", "regulation",
      "learning", "thinking", "memory", "language", "intelligence",
      "technology", "society", "culture", "future", "history", "science",
      "model", "models", "network", "networks", "training", "inference",
      "created", "updated", "deleted", "added", "removed", "modified",
      "extracted", "processed", "generated", "compiled", "launched", "installed",
      "deployed", "executed", "loaded", "parsed", "validated", "configured",
      "total", "summary", "covered", "included", "pending", "failed", "success",
      "ready", "active", "disabled", "enabled", "available", "completed",
      "auto", "multi", "mini", "micro", "meta", "super", "hybrid",
      "context", "bridge", "batch", "local", "global", "native", "cloud",
      "before", "after", "during", "often", "always", "never",
      "project", "contributor", "software",
      "backend", "frontend", "server", "client", "service", "app", "api"
    ]
  }
}