From 7006a6b42dfd8d4b4f3da01fd2bd50688aa4325f Mon Sep 17 00:00:00 2001 From: mvalentsev Date: Sat, 18 Apr 2026 21:53:11 +0500 Subject: [PATCH 1/4] feat(i18n): add entity detection to German locale --- mempalace/i18n/de.json | 82 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/mempalace/i18n/de.json b/mempalace/i18n/de.json index c6677b3..f2476e1 100644 --- a/mempalace/i18n/de.json +++ b/mempalace/i18n/de.json @@ -40,5 +40,87 @@ "stop_words": "der die das ein eine eines einer einem einen den dem des und oder aber denn weil wenn als ob auch noch schon sehr viel nur nicht mehr kann wird hat ist sind war waren sein haben wurde mit von zu für auf in an um über nach durch", "quote_pattern": "\\u201E([^\\u201C]{10,200})\\u201C|\"([^\"]{10,200})\"", "action_pattern": "(?:gebaut|behoben|geschrieben|hinzugefügt|gepusht|gemessen|getestet|überprüft|erstellt|gelöscht|aktualisiert|konfiguriert|bereitgestellt|migriert)\\s+[\\wÄÖÜäöüß\\s]{3,30}" + }, + "entity": { + "candidate_pattern": "[A-ZÄÖÜ][a-zäöüß]{1,19}", + "multi_word_pattern": "[A-ZÄÖÜ][a-zäöüß]+(?:\\s+[A-ZÄÖÜ][a-zäöüß]+)+", + "person_verb_patterns": [ + "\\b{name}\\s+sagte\\b", + "\\b{name}\\s+fragte\\b", + "\\b{name}\\s+antwortete\\b", + "\\b{name}\\s+erzählte\\b", + "\\b{name}\\s+lachte\\b", + "\\b{name}\\s+lächelte\\b", + "\\b{name}\\s+weinte\\b", + "\\b{name}\\s+fühlte\\b", + "\\b{name}\\s+denkt\\b", + "\\b{name}\\s+will\\b", + "\\b{name}\\s+liebt\\b", + "\\b{name}\\s+hasst\\b", + "\\b{name}\\s+weiß\\b", + "\\b{name}\\s+entschied\\b", + "\\b{name}\\s+schrieb\\b" + ], + "pronoun_patterns": [ + "\\ber\\b", + "\\bsie\\b", + "\\bes\\b", + "\\bihn\\b", + "\\bihm\\b", + "\\bihr\\b", + "\\bsein\\b", + "\\bihre\\b", + "\\bihnen\\b" + ], + "dialogue_patterns": [ + "^>\\s*{name}[:\\s]", + "^{name}:\\s", + "^\\[{name}\\]", + "\"{name}\\s+sagte" + ], + "direct_address_pattern": "\\bhallo\\s+{name}\\b|\\bhi\\s+{name}\\b|\\bhey\\s+{name}\\b|\\bdanke\\s+{name}\\b|\\bservus\\s+{name}\\b|\\blieber\\s+{name}\\b|\\bliebe\\s+{name}\\b|\\bsehr\\s+geehrter\\s+{name}\\b|\\bsehr\\s+geehrte\\s+{name}\\b", + "project_verb_patterns": [ + "\\bbaue\\s+{name}\\b", + "\\bgebaut\\s+{name}\\b", + "\\bstarte\\s+{name}\\b", + "\\bgestartet\\s+{name}\\b", + "\\bdeploye\\s+{name}\\b", + "\\binstalliert\\s+{name}\\b", + "\\bdie\\s+{name}\\s+architektur\\b", + "\\bdie\\s+{name}\\s+pipeline\\b", + "\\bdas\\s+{name}\\s+system\\b", + "\\bdas\\s+{name}\\s+repository\\b", + "\\b{name}\\s+v\\d+\\b", + "\\b{name}\\.py\\b", + "\\bimport\\s+{name}\\b", + "\\bpip\\s+install\\s+{name}\\b" + ], + "stopwords": [ + "der", "die", "das", "ein", "eine", "eines", "einer", "einem", "einen", + "den", "dem", "des", "und", "oder", "aber", "denn", "weil", "wenn", "als", + "ob", "auch", "noch", "schon", "sehr", "viel", "nur", "nicht", "mehr", + "kann", "wird", "hat", "ist", "sind", "war", "waren", "sein", "haben", + "wurde", "worden", "werden", "mit", "von", "zu", "für", "auf", "in", + "an", "um", "über", "nach", "durch", "bei", "aus", "seit", "vor", "zwischen", + "ich", "du", "er", "sie", "es", "wir", "ihr", "mich", "dich", "mir", "dir", + "uns", "euch", "mein", "dein", "unser", "euer", "ihre", "seine", + "wer", "was", "wann", "wo", "wie", "warum", "welcher", "welche", "welches", + "so", "dann", "jetzt", "heute", "gestern", "morgen", "hier", "dort", "da", + "immer", "nie", "manchmal", "oft", "selten", "bald", "spät", + "ja", "nein", "vielleicht", "gut", "schlecht", "besser", "bitte", "danke", + "hallo", "hi", "hey", "tschüss", + "tag", "tage", "woche", "monat", "jahr", "jahre", "zeit", "welt", "leben", + "mensch", "menschen", "leute", "person", "ding", "dinge", "sache", "sachen", + "teil", "art", "weise", "stelle", "platz", "ort", "zimmer", "haus", "land", + "grund", "frage", "antwort", "fakt", "sinn", "idee", "punkt", "fall", "aspekt", + "beispiel", "version", "nummer", "zahl", "name", "namen", "system", "modell", + "sprache", "technologie", "gesellschaft", "kultur", "geschichte", + "wissenschaft", "zukunft", "erinnerung", "gedächtnis", + "datei", "ordner", "pfad", "schlüssel", "wert", "fehler", "warnung", + "ergebnis", "eingabe", "ausgabe", "quelle", "ziel", "daten", "elemente", + "montag", "dienstag", "mittwoch", "donnerstag", "freitag", "samstag", "sonntag", + "januar", "februar", "märz", "april", "mai", "juni", "juli", "august", + "september", "oktober", "november", "dezember" + ] } } From e17f219be879ab84b2a0ac1d08466657c3473a29 Mon Sep 17 00:00:00 2001 From: mvalentsev Date: Sat, 18 Apr 2026 21:54:39 +0500 Subject: [PATCH 2/4] feat(i18n): add entity detection to Spanish locale --- mempalace/i18n/es.json | 84 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/mempalace/i18n/es.json b/mempalace/i18n/es.json index aa30e1b..dd490aa 100644 --- a/mempalace/i18n/es.json +++ b/mempalace/i18n/es.json @@ -40,5 +40,89 @@ "stop_words": "el la los las un una unos unas de del al en con por para su sus mi mis tu tus es son está están fue ser estar haber sido como pero más muy también todo todos toda todas este esta estos estas ese esa esos esas que quien cual donde cuando porque aunque sin", "quote_pattern": "\"([^\"]{10,200})\"|«([^»]{10,200})»", "action_pattern": "(?:construido|corregido|escrito|añadido|enviado|medido|probado|revisado|creado|eliminado|actualizado|configurado|desplegado|migrado)\\s+[\\wá-ú\\s]{3,30}" + }, + "entity": { + "candidate_pattern": "[A-ZÁÉÍÓÚÑÜ][a-záéíóúñü]{1,19}", + "multi_word_pattern": "[A-ZÁÉÍÓÚÑÜ][a-záéíóúñü]+(?:\\s+[A-ZÁÉÍÓÚÑÜ][a-záéíóúñü]+)+", + "person_verb_patterns": [ + "\\b{name}\\s+dijo\\b", + "\\b{name}\\s+preguntó\\b", + "\\b{name}\\s+respondió\\b", + "\\b{name}\\s+contó\\b", + "\\b{name}\\s+rió\\b", + "\\b{name}\\s+sonrió\\b", + "\\b{name}\\s+lloró\\b", + "\\b{name}\\s+sintió\\b", + "\\b{name}\\s+piensa\\b", + "\\b{name}\\s+quiere\\b", + "\\b{name}\\s+ama\\b", + "\\b{name}\\s+odia\\b", + "\\b{name}\\s+sabe\\b", + "\\b{name}\\s+decidió\\b", + "\\b{name}\\s+escribió\\b" + ], + "pronoun_patterns": [ + "\\bél\\b", + "\\bella\\b", + "\\bellos\\b", + "\\bellas\\b", + "\\blo\\b", + "\\bla\\b", + "\\ble\\b", + "\\bles\\b", + "\\bse\\b" + ], + "dialogue_patterns": [ + "^>\\s*{name}[:\\s]", + "^{name}:\\s", + "^\\[{name}\\]", + "\"{name}\\s+dijo" + ], + "direct_address_pattern": "\\bhola\\s+{name}\\b|\\bhey\\s+{name}\\b|\\bhi\\s+{name}\\b|\\bgracias\\s+{name}\\b|\\bquerido\\s+{name}\\b|\\bquerida\\s+{name}\\b|\\bestimado\\s+{name}\\b|\\bestimada\\s+{name}\\b|\\bdon\\s+{name}\\b|\\bdoña\\s+{name}\\b|\\bseñor\\s+{name}\\b|\\bseñora\\s+{name}\\b", + "project_verb_patterns": [ + "\\bconstruyo\\s+{name}\\b", + "\\bconstruí\\s+{name}\\b", + "\\barmé\\s+{name}\\b", + "\\blancé\\s+{name}\\b", + "\\bdesplegué\\s+{name}\\b", + "\\binstalé\\s+{name}\\b", + "\\bla\\s+arquitectura\\s+{name}\\b", + "\\bel\\s+pipeline\\s+{name}\\b", + "\\bel\\s+sistema\\s+{name}\\b", + "\\bel\\s+proyecto\\s+{name}\\b", + "\\bel\\s+repositorio\\s+{name}\\b", + "\\b{name}\\s+v\\d+\\b", + "\\b{name}\\.py\\b", + "\\bimport\\s+{name}\\b", + "\\bpip\\s+install\\s+{name}\\b" + ], + "stopwords": [ + "el", "la", "los", "las", "un", "una", "unos", "unas", + "de", "del", "al", "a", "en", "con", "sin", "por", "para", "sobre", + "entre", "hasta", "desde", "hacia", "contra", "según", "tras", + "y", "o", "u", "ni", "pero", "sino", "aunque", "porque", "pues", + "que", "quien", "quienes", "cual", "cuales", "cuyo", "cuya", + "donde", "cuando", "como", "cuanto", "cuanta", + "yo", "tú", "él", "ella", "nosotros", "vosotros", "ellos", "ellas", + "me", "te", "se", "nos", "os", "lo", "la", "le", "los", "las", "les", + "mi", "mis", "tu", "tus", "su", "sus", "nuestro", "nuestra", "vuestro", + "este", "esta", "estos", "estas", "ese", "esa", "esos", "esas", + "aquel", "aquella", "aquellos", "aquellas", "esto", "eso", "aquello", + "ser", "estar", "haber", "tener", "hacer", "poder", "querer", "saber", + "es", "son", "fue", "fueron", "era", "eran", "está", "están", "estaba", + "he", "ha", "hemos", "han", "había", "hay", + "muy", "mucho", "mucha", "muchos", "muchas", "poco", "poca", "pocos", "pocas", + "más", "menos", "tan", "tanto", "también", "tampoco", + "sí", "no", "quizás", "tal", "vez", + "aquí", "allí", "allá", "ahí", "acá", + "hoy", "ayer", "mañana", "ahora", "antes", "después", "luego", "entonces", + "siempre", "nunca", "jamás", "todavía", "aún", "ya", + "bien", "mal", "mejor", "peor", "bueno", "buena", "malo", "mala", + "gracias", "hola", "adiós", "por favor", "perdón", + "día", "días", "semana", "mes", "año", "años", "tiempo", "vez", "veces", + "cosa", "cosas", "persona", "gente", "mundo", "vida", "casa", "lugar", + "forma", "manera", "parte", "caso", "punto", "idea", "hecho", "razón", + "nombre", "número", "versión", "sistema", "modelo" + ] } } From 118cbe40bd4bfef9f9fe3fdf3bdb0aa33101e977 Mon Sep 17 00:00:00 2001 From: mvalentsev Date: Sat, 18 Apr 2026 21:56:45 +0500 Subject: [PATCH 3/4] feat(i18n): add entity detection to French locale --- mempalace/i18n/fr.json | 82 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/mempalace/i18n/fr.json b/mempalace/i18n/fr.json index 2e3d0b9..86df08c 100644 --- a/mempalace/i18n/fr.json +++ b/mempalace/i18n/fr.json @@ -40,5 +40,87 @@ "stop_words": "le la les un une des de du au aux en et ou mais donc or ni car que qui ce cette ces son sa ses mon ma mes ton ta tes leur leurs nous vous ils elles on ne pas plus très bien aussi avec pour dans sur par est sont fait être avoir été comme tout tous toute toutes", "quote_pattern": "«\\s*([^»]{10,200})\\s*»|\"([^\"]{10,200})\"", "action_pattern": "(?:construit|corrigé|écrit|ajouté|poussé|mesuré|testé|révisé|créé|supprimé|mis à jour|configuré|déployé|migré)\\s+[\\wà-ÿ\\s]{3,30}" + }, + "entity": { + "candidate_pattern": "[A-ZÀÂÄÇÉÈÊËÎÏÔÖÙÛÜŸÆŒ][a-zàâäçéèêëîïôöùûüÿæœ]{1,19}", + "multi_word_pattern": "[A-ZÀÂÄÇÉÈÊËÎÏÔÖÙÛÜŸÆŒ][a-zàâäçéèêëîïôöùûüÿæœ]+(?:\\s+[A-ZÀÂÄÇÉÈÊËÎÏÔÖÙÛÜŸÆŒ][a-zàâäçéèêëîïôöùûüÿæœ]+)+", + "person_verb_patterns": [ + "\\b{name}\\s+a\\s+dit\\b", + "\\b{name}\\s+a\\s+demandé\\b", + "\\b{name}\\s+a\\s+répondu\\b", + "\\b{name}\\s+a\\s+raconté\\b", + "\\b{name}\\s+a\\s+ri\\b", + "\\b{name}\\s+a\\s+souri\\b", + "\\b{name}\\s+a\\s+pleuré\\b", + "\\b{name}\\s+a\\s+senti\\b", + "\\b{name}\\s+pense\\b", + "\\b{name}\\s+veut\\b", + "\\b{name}\\s+aime\\b", + "\\b{name}\\s+déteste\\b", + "\\b{name}\\s+sait\\b", + "\\b{name}\\s+a\\s+décidé\\b", + "\\b{name}\\s+a\\s+écrit\\b" + ], + "pronoun_patterns": [ + "\\bil\\b", + "\\belle\\b", + "\\blui\\b", + "\\bils\\b", + "\\belles\\b", + "\\bleur\\b", + "\\bleurs\\b", + "\\beux\\b", + "\\bse\\b" + ], + "dialogue_patterns": [ + "^>\\s*{name}[:\\s]", + "^{name}:\\s", + "^\\[{name}\\]", + "\"{name}\\s+a\\s+dit" + ], + "direct_address_pattern": "\\bbonjour\\s+{name}\\b|\\bsalut\\s+{name}\\b|\\bmerci\\s+{name}\\b|\\bcher\\s+{name}\\b|\\bchère\\s+{name}\\b|\\bmonsieur\\s+{name}\\b|\\bmadame\\s+{name}\\b|\\bhey\\s+{name}\\b|\\bhi\\s+{name}\\b", + "project_verb_patterns": [ + "\\bconstruit\\s+{name}\\b", + "\\blancé\\s+{name}\\b", + "\\bdéployé\\s+{name}\\b", + "\\binstallé\\s+{name}\\b", + "\\bl'architecture\\s+{name}\\b", + "\\ble\\s+pipeline\\s+{name}\\b", + "\\ble\\s+système\\s+{name}\\b", + "\\ble\\s+projet\\s+{name}\\b", + "\\ble\\s+dépôt\\s+{name}\\b", + "\\b{name}\\s+v\\d+\\b", + "\\b{name}\\.py\\b", + "\\bimport\\s+{name}\\b", + "\\bpip\\s+install\\s+{name}\\b" + ], + "stopwords": [ + "le", "la", "les", "un", "une", "des", "du", "de", "au", "aux", + "en", "dans", "sur", "sous", "avec", "sans", "pour", "par", "vers", + "chez", "entre", "depuis", "pendant", "avant", "après", "jusqu", + "et", "ou", "mais", "donc", "or", "ni", "car", "que", "qui", + "dont", "où", "quand", "comment", "pourquoi", "combien", "lequel", + "ce", "cet", "cette", "ces", "celui", "celle", "ceux", "celles", + "mon", "ma", "mes", "ton", "ta", "tes", "son", "sa", "ses", + "notre", "nos", "votre", "vos", "leur", "leurs", + "je", "tu", "il", "elle", "on", "nous", "vous", "ils", "elles", + "me", "te", "se", "lui", "eux", + "être", "avoir", "faire", "dire", "aller", "voir", "savoir", "pouvoir", + "est", "sont", "était", "étaient", "fut", "furent", "sera", "seront", + "ai", "as", "a", "avons", "avez", "ont", "avait", "avaient", + "très", "bien", "mal", "peu", "beaucoup", "trop", "assez", "aussi", + "plus", "moins", "tant", "si", "tellement", + "oui", "non", "peut-être", "vraiment", + "ici", "là", "là-bas", "partout", "ailleurs", + "aujourd'hui", "hier", "demain", "maintenant", "alors", "ensuite", + "toujours", "jamais", "souvent", "parfois", "déjà", "encore", + "bon", "bonne", "mauvais", "mauvaise", "meilleur", "pire", + "merci", "bonjour", "salut", "au revoir", + "jour", "jours", "semaine", "mois", "année", "temps", "fois", + "chose", "choses", "personne", "gens", "monde", "vie", "maison", + "endroit", "lieu", "partie", "façon", "manière", "sorte", "type", + "cas", "point", "idée", "fait", "raison", "nom", "nombre", + "version", "système", "modèle", "question", "réponse" + ] } } From 5189e0d652c6e83efb3875b058e108258a7c5221 Mon Sep 17 00:00:00 2001 From: mvalentsev Date: Sat, 18 Apr 2026 21:58:11 +0500 Subject: [PATCH 4/4] test(i18n): add entity section smoke tests and schema invariants --- tests/test_i18n.py | 73 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/tests/test_i18n.py b/tests/test_i18n.py index b91e352..9897f1e 100644 --- a/tests/test_i18n.py +++ b/tests/test_i18n.py @@ -85,3 +85,76 @@ def test_from_config_defaults_to_english(tmp_path): d = Dialect.from_config(str(config_path)) assert d.lang == "en", f"Expected 'en', got '{d.lang}' -- state leak from prior load_lang" + + +def test_de_entity_section_loads(): + """German entity section loads all pattern lists non-empty.""" + from mempalace.i18n import get_entity_patterns + + p = get_entity_patterns(("de",)) + assert p["candidate_patterns"], "de: empty candidate_patterns" + assert p["multi_word_patterns"], "de: empty multi_word_patterns" + assert p["person_verb_patterns"], "de: empty person_verb_patterns" + assert p["pronoun_patterns"], "de: empty pronoun_patterns" + assert p["dialogue_patterns"], "de: empty dialogue_patterns" + assert p["direct_address_patterns"], "de: empty direct_address_patterns" + assert p["project_verb_patterns"], "de: empty project_verb_patterns" + assert len(p["stopwords"]) > 50, f"de: stopwords too short ({len(p['stopwords'])})" + + +def test_es_entity_section_loads(): + """Spanish entity section loads all pattern lists non-empty.""" + from mempalace.i18n import get_entity_patterns + + p = get_entity_patterns(("es",)) + assert p["candidate_patterns"], "es: empty candidate_patterns" + assert p["multi_word_patterns"], "es: empty multi_word_patterns" + assert p["person_verb_patterns"], "es: empty person_verb_patterns" + assert p["pronoun_patterns"], "es: empty pronoun_patterns" + assert p["dialogue_patterns"], "es: empty dialogue_patterns" + assert p["direct_address_patterns"], "es: empty direct_address_patterns" + assert p["project_verb_patterns"], "es: empty project_verb_patterns" + assert len(p["stopwords"]) > 50, f"es: stopwords too short ({len(p['stopwords'])})" + + +def test_fr_entity_section_loads(): + """French entity section loads all pattern lists non-empty.""" + from mempalace.i18n import get_entity_patterns + + p = get_entity_patterns(("fr",)) + assert p["candidate_patterns"], "fr: empty candidate_patterns" + assert p["multi_word_patterns"], "fr: empty multi_word_patterns" + assert p["person_verb_patterns"], "fr: empty person_verb_patterns" + assert p["pronoun_patterns"], "fr: empty pronoun_patterns" + assert p["dialogue_patterns"], "fr: empty dialogue_patterns" + assert p["direct_address_patterns"], "fr: empty direct_address_patterns" + assert p["project_verb_patterns"], "fr: empty project_verb_patterns" + assert len(p["stopwords"]) > 50, f"fr: stopwords too short ({len(p['stopwords'])})" + + +def test_direct_address_key_is_singular_string_for_all_locales(): + """Schema invariant: any locale declaring direct-address uses the singular + ``direct_address_pattern`` (str), never the plural ``direct_address_patterns`` (list). + + The loader at ``mempalace/i18n/__init__.py:209-210`` only reads the singular key; + the plural form is the output schema of the merged dict, not the input schema. + Declaring the plural form in a locale file silently drops every direct-address + pattern in that locale after load. + """ + from mempalace.i18n import _load_entity_section, available_languages + + for lang in available_languages(): + section = _load_entity_section(lang) + if not section: + continue + assert "direct_address_patterns" not in section, ( + f"{lang}: declares plural 'direct_address_patterns' (list); " + f"loader only reads singular 'direct_address_pattern' (str). " + f"Collapse the list into one `|`-alternation string and rename the key." + ) + if "direct_address_pattern" in section: + val = section["direct_address_pattern"] + assert isinstance( + val, str + ), f"{lang}: 'direct_address_pattern' must be str, got {type(val).__name__}" + assert val, f"{lang}: 'direct_address_pattern' is empty"