Merge pull request #1001 from mvalentsev/feat/i18n-de-es-fr-entity
feat(i18n): add entity detection to German, Spanish, and French locales
This commit is contained in:
@@ -40,5 +40,87 @@
|
|||||||
"stop_words": "der die das ein eine eines einer einem einen den dem des und oder aber denn weil wenn als ob auch noch schon sehr viel nur nicht mehr kann wird hat ist sind war waren sein haben wurde mit von zu für auf in an um über nach durch",
|
"stop_words": "der die das ein eine eines einer einem einen den dem des und oder aber denn weil wenn als ob auch noch schon sehr viel nur nicht mehr kann wird hat ist sind war waren sein haben wurde mit von zu für auf in an um über nach durch",
|
||||||
"quote_pattern": "\\u201E([^\\u201C]{10,200})\\u201C|\"([^\"]{10,200})\"",
|
"quote_pattern": "\\u201E([^\\u201C]{10,200})\\u201C|\"([^\"]{10,200})\"",
|
||||||
"action_pattern": "(?:gebaut|behoben|geschrieben|hinzugefügt|gepusht|gemessen|getestet|überprüft|erstellt|gelöscht|aktualisiert|konfiguriert|bereitgestellt|migriert)\\s+[\\wÄÖÜäöüß\\s]{3,30}"
|
"action_pattern": "(?:gebaut|behoben|geschrieben|hinzugefügt|gepusht|gemessen|getestet|überprüft|erstellt|gelöscht|aktualisiert|konfiguriert|bereitgestellt|migriert)\\s+[\\wÄÖÜäöüß\\s]{3,30}"
|
||||||
|
},
|
||||||
|
"entity": {
|
||||||
|
"candidate_pattern": "[A-ZÄÖÜ][a-zäöüß]{1,19}",
|
||||||
|
"multi_word_pattern": "[A-ZÄÖÜ][a-zäöüß]+(?:\\s+[A-ZÄÖÜ][a-zäöüß]+)+",
|
||||||
|
"person_verb_patterns": [
|
||||||
|
"\\b{name}\\s+sagte\\b",
|
||||||
|
"\\b{name}\\s+fragte\\b",
|
||||||
|
"\\b{name}\\s+antwortete\\b",
|
||||||
|
"\\b{name}\\s+erzählte\\b",
|
||||||
|
"\\b{name}\\s+lachte\\b",
|
||||||
|
"\\b{name}\\s+lächelte\\b",
|
||||||
|
"\\b{name}\\s+weinte\\b",
|
||||||
|
"\\b{name}\\s+fühlte\\b",
|
||||||
|
"\\b{name}\\s+denkt\\b",
|
||||||
|
"\\b{name}\\s+will\\b",
|
||||||
|
"\\b{name}\\s+liebt\\b",
|
||||||
|
"\\b{name}\\s+hasst\\b",
|
||||||
|
"\\b{name}\\s+weiß\\b",
|
||||||
|
"\\b{name}\\s+entschied\\b",
|
||||||
|
"\\b{name}\\s+schrieb\\b"
|
||||||
|
],
|
||||||
|
"pronoun_patterns": [
|
||||||
|
"\\ber\\b",
|
||||||
|
"\\bsie\\b",
|
||||||
|
"\\bes\\b",
|
||||||
|
"\\bihn\\b",
|
||||||
|
"\\bihm\\b",
|
||||||
|
"\\bihr\\b",
|
||||||
|
"\\bsein\\b",
|
||||||
|
"\\bihre\\b",
|
||||||
|
"\\bihnen\\b"
|
||||||
|
],
|
||||||
|
"dialogue_patterns": [
|
||||||
|
"^>\\s*{name}[:\\s]",
|
||||||
|
"^{name}:\\s",
|
||||||
|
"^\\[{name}\\]",
|
||||||
|
"\"{name}\\s+sagte"
|
||||||
|
],
|
||||||
|
"direct_address_pattern": "\\bhallo\\s+{name}\\b|\\bhi\\s+{name}\\b|\\bhey\\s+{name}\\b|\\bdanke\\s+{name}\\b|\\bservus\\s+{name}\\b|\\blieber\\s+{name}\\b|\\bliebe\\s+{name}\\b|\\bsehr\\s+geehrter\\s+{name}\\b|\\bsehr\\s+geehrte\\s+{name}\\b",
|
||||||
|
"project_verb_patterns": [
|
||||||
|
"\\bbaue\\s+{name}\\b",
|
||||||
|
"\\bgebaut\\s+{name}\\b",
|
||||||
|
"\\bstarte\\s+{name}\\b",
|
||||||
|
"\\bgestartet\\s+{name}\\b",
|
||||||
|
"\\bdeploye\\s+{name}\\b",
|
||||||
|
"\\binstalliert\\s+{name}\\b",
|
||||||
|
"\\bdie\\s+{name}\\s+architektur\\b",
|
||||||
|
"\\bdie\\s+{name}\\s+pipeline\\b",
|
||||||
|
"\\bdas\\s+{name}\\s+system\\b",
|
||||||
|
"\\bdas\\s+{name}\\s+repository\\b",
|
||||||
|
"\\b{name}\\s+v\\d+\\b",
|
||||||
|
"\\b{name}\\.py\\b",
|
||||||
|
"\\bimport\\s+{name}\\b",
|
||||||
|
"\\bpip\\s+install\\s+{name}\\b"
|
||||||
|
],
|
||||||
|
"stopwords": [
|
||||||
|
"der", "die", "das", "ein", "eine", "eines", "einer", "einem", "einen",
|
||||||
|
"den", "dem", "des", "und", "oder", "aber", "denn", "weil", "wenn", "als",
|
||||||
|
"ob", "auch", "noch", "schon", "sehr", "viel", "nur", "nicht", "mehr",
|
||||||
|
"kann", "wird", "hat", "ist", "sind", "war", "waren", "sein", "haben",
|
||||||
|
"wurde", "worden", "werden", "mit", "von", "zu", "für", "auf", "in",
|
||||||
|
"an", "um", "über", "nach", "durch", "bei", "aus", "seit", "vor", "zwischen",
|
||||||
|
"ich", "du", "er", "sie", "es", "wir", "ihr", "mich", "dich", "mir", "dir",
|
||||||
|
"uns", "euch", "mein", "dein", "unser", "euer", "ihre", "seine",
|
||||||
|
"wer", "was", "wann", "wo", "wie", "warum", "welcher", "welche", "welches",
|
||||||
|
"so", "dann", "jetzt", "heute", "gestern", "morgen", "hier", "dort", "da",
|
||||||
|
"immer", "nie", "manchmal", "oft", "selten", "bald", "spät",
|
||||||
|
"ja", "nein", "vielleicht", "gut", "schlecht", "besser", "bitte", "danke",
|
||||||
|
"hallo", "hi", "hey", "tschüss",
|
||||||
|
"tag", "tage", "woche", "monat", "jahr", "jahre", "zeit", "welt", "leben",
|
||||||
|
"mensch", "menschen", "leute", "person", "ding", "dinge", "sache", "sachen",
|
||||||
|
"teil", "art", "weise", "stelle", "platz", "ort", "zimmer", "haus", "land",
|
||||||
|
"grund", "frage", "antwort", "fakt", "sinn", "idee", "punkt", "fall", "aspekt",
|
||||||
|
"beispiel", "version", "nummer", "zahl", "name", "namen", "system", "modell",
|
||||||
|
"sprache", "technologie", "gesellschaft", "kultur", "geschichte",
|
||||||
|
"wissenschaft", "zukunft", "erinnerung", "gedächtnis",
|
||||||
|
"datei", "ordner", "pfad", "schlüssel", "wert", "fehler", "warnung",
|
||||||
|
"ergebnis", "eingabe", "ausgabe", "quelle", "ziel", "daten", "elemente",
|
||||||
|
"montag", "dienstag", "mittwoch", "donnerstag", "freitag", "samstag", "sonntag",
|
||||||
|
"januar", "februar", "märz", "april", "mai", "juni", "juli", "august",
|
||||||
|
"september", "oktober", "november", "dezember"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,5 +40,89 @@
|
|||||||
"stop_words": "el la los las un una unos unas de del al en con por para su sus mi mis tu tus es son está están fue ser estar haber sido como pero más muy también todo todos toda todas este esta estos estas ese esa esos esas que quien cual donde cuando porque aunque sin",
|
"stop_words": "el la los las un una unos unas de del al en con por para su sus mi mis tu tus es son está están fue ser estar haber sido como pero más muy también todo todos toda todas este esta estos estas ese esa esos esas que quien cual donde cuando porque aunque sin",
|
||||||
"quote_pattern": "\"([^\"]{10,200})\"|«([^»]{10,200})»",
|
"quote_pattern": "\"([^\"]{10,200})\"|«([^»]{10,200})»",
|
||||||
"action_pattern": "(?:construido|corregido|escrito|añadido|enviado|medido|probado|revisado|creado|eliminado|actualizado|configurado|desplegado|migrado)\\s+[\\wá-ú\\s]{3,30}"
|
"action_pattern": "(?:construido|corregido|escrito|añadido|enviado|medido|probado|revisado|creado|eliminado|actualizado|configurado|desplegado|migrado)\\s+[\\wá-ú\\s]{3,30}"
|
||||||
|
},
|
||||||
|
"entity": {
|
||||||
|
"candidate_pattern": "[A-ZÁÉÍÓÚÑÜ][a-záéíóúñü]{1,19}",
|
||||||
|
"multi_word_pattern": "[A-ZÁÉÍÓÚÑÜ][a-záéíóúñü]+(?:\\s+[A-ZÁÉÍÓÚÑÜ][a-záéíóúñü]+)+",
|
||||||
|
"person_verb_patterns": [
|
||||||
|
"\\b{name}\\s+dijo\\b",
|
||||||
|
"\\b{name}\\s+preguntó\\b",
|
||||||
|
"\\b{name}\\s+respondió\\b",
|
||||||
|
"\\b{name}\\s+contó\\b",
|
||||||
|
"\\b{name}\\s+rió\\b",
|
||||||
|
"\\b{name}\\s+sonrió\\b",
|
||||||
|
"\\b{name}\\s+lloró\\b",
|
||||||
|
"\\b{name}\\s+sintió\\b",
|
||||||
|
"\\b{name}\\s+piensa\\b",
|
||||||
|
"\\b{name}\\s+quiere\\b",
|
||||||
|
"\\b{name}\\s+ama\\b",
|
||||||
|
"\\b{name}\\s+odia\\b",
|
||||||
|
"\\b{name}\\s+sabe\\b",
|
||||||
|
"\\b{name}\\s+decidió\\b",
|
||||||
|
"\\b{name}\\s+escribió\\b"
|
||||||
|
],
|
||||||
|
"pronoun_patterns": [
|
||||||
|
"\\bél\\b",
|
||||||
|
"\\bella\\b",
|
||||||
|
"\\bellos\\b",
|
||||||
|
"\\bellas\\b",
|
||||||
|
"\\blo\\b",
|
||||||
|
"\\bla\\b",
|
||||||
|
"\\ble\\b",
|
||||||
|
"\\bles\\b",
|
||||||
|
"\\bse\\b"
|
||||||
|
],
|
||||||
|
"dialogue_patterns": [
|
||||||
|
"^>\\s*{name}[:\\s]",
|
||||||
|
"^{name}:\\s",
|
||||||
|
"^\\[{name}\\]",
|
||||||
|
"\"{name}\\s+dijo"
|
||||||
|
],
|
||||||
|
"direct_address_pattern": "\\bhola\\s+{name}\\b|\\bhey\\s+{name}\\b|\\bhi\\s+{name}\\b|\\bgracias\\s+{name}\\b|\\bquerido\\s+{name}\\b|\\bquerida\\s+{name}\\b|\\bestimado\\s+{name}\\b|\\bestimada\\s+{name}\\b|\\bdon\\s+{name}\\b|\\bdoña\\s+{name}\\b|\\bseñor\\s+{name}\\b|\\bseñora\\s+{name}\\b",
|
||||||
|
"project_verb_patterns": [
|
||||||
|
"\\bconstruyo\\s+{name}\\b",
|
||||||
|
"\\bconstruí\\s+{name}\\b",
|
||||||
|
"\\barmé\\s+{name}\\b",
|
||||||
|
"\\blancé\\s+{name}\\b",
|
||||||
|
"\\bdesplegué\\s+{name}\\b",
|
||||||
|
"\\binstalé\\s+{name}\\b",
|
||||||
|
"\\bla\\s+arquitectura\\s+{name}\\b",
|
||||||
|
"\\bel\\s+pipeline\\s+{name}\\b",
|
||||||
|
"\\bel\\s+sistema\\s+{name}\\b",
|
||||||
|
"\\bel\\s+proyecto\\s+{name}\\b",
|
||||||
|
"\\bel\\s+repositorio\\s+{name}\\b",
|
||||||
|
"\\b{name}\\s+v\\d+\\b",
|
||||||
|
"\\b{name}\\.py\\b",
|
||||||
|
"\\bimport\\s+{name}\\b",
|
||||||
|
"\\bpip\\s+install\\s+{name}\\b"
|
||||||
|
],
|
||||||
|
"stopwords": [
|
||||||
|
"el", "la", "los", "las", "un", "una", "unos", "unas",
|
||||||
|
"de", "del", "al", "a", "en", "con", "sin", "por", "para", "sobre",
|
||||||
|
"entre", "hasta", "desde", "hacia", "contra", "según", "tras",
|
||||||
|
"y", "o", "u", "ni", "pero", "sino", "aunque", "porque", "pues",
|
||||||
|
"que", "quien", "quienes", "cual", "cuales", "cuyo", "cuya",
|
||||||
|
"donde", "cuando", "como", "cuanto", "cuanta",
|
||||||
|
"yo", "tú", "él", "ella", "nosotros", "vosotros", "ellos", "ellas",
|
||||||
|
"me", "te", "se", "nos", "os", "lo", "la", "le", "los", "las", "les",
|
||||||
|
"mi", "mis", "tu", "tus", "su", "sus", "nuestro", "nuestra", "vuestro",
|
||||||
|
"este", "esta", "estos", "estas", "ese", "esa", "esos", "esas",
|
||||||
|
"aquel", "aquella", "aquellos", "aquellas", "esto", "eso", "aquello",
|
||||||
|
"ser", "estar", "haber", "tener", "hacer", "poder", "querer", "saber",
|
||||||
|
"es", "son", "fue", "fueron", "era", "eran", "está", "están", "estaba",
|
||||||
|
"he", "ha", "hemos", "han", "había", "hay",
|
||||||
|
"muy", "mucho", "mucha", "muchos", "muchas", "poco", "poca", "pocos", "pocas",
|
||||||
|
"más", "menos", "tan", "tanto", "también", "tampoco",
|
||||||
|
"sí", "no", "quizás", "tal", "vez",
|
||||||
|
"aquí", "allí", "allá", "ahí", "acá",
|
||||||
|
"hoy", "ayer", "mañana", "ahora", "antes", "después", "luego", "entonces",
|
||||||
|
"siempre", "nunca", "jamás", "todavía", "aún", "ya",
|
||||||
|
"bien", "mal", "mejor", "peor", "bueno", "buena", "malo", "mala",
|
||||||
|
"gracias", "hola", "adiós", "por favor", "perdón",
|
||||||
|
"día", "días", "semana", "mes", "año", "años", "tiempo", "vez", "veces",
|
||||||
|
"cosa", "cosas", "persona", "gente", "mundo", "vida", "casa", "lugar",
|
||||||
|
"forma", "manera", "parte", "caso", "punto", "idea", "hecho", "razón",
|
||||||
|
"nombre", "número", "versión", "sistema", "modelo"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,5 +40,87 @@
|
|||||||
"stop_words": "le la les un une des de du au aux en et ou mais donc or ni car que qui ce cette ces son sa ses mon ma mes ton ta tes leur leurs nous vous ils elles on ne pas plus très bien aussi avec pour dans sur par est sont fait être avoir été comme tout tous toute toutes",
|
"stop_words": "le la les un une des de du au aux en et ou mais donc or ni car que qui ce cette ces son sa ses mon ma mes ton ta tes leur leurs nous vous ils elles on ne pas plus très bien aussi avec pour dans sur par est sont fait être avoir été comme tout tous toute toutes",
|
||||||
"quote_pattern": "«\\s*([^»]{10,200})\\s*»|\"([^\"]{10,200})\"",
|
"quote_pattern": "«\\s*([^»]{10,200})\\s*»|\"([^\"]{10,200})\"",
|
||||||
"action_pattern": "(?:construit|corrigé|écrit|ajouté|poussé|mesuré|testé|révisé|créé|supprimé|mis à jour|configuré|déployé|migré)\\s+[\\wà-ÿ\\s]{3,30}"
|
"action_pattern": "(?:construit|corrigé|écrit|ajouté|poussé|mesuré|testé|révisé|créé|supprimé|mis à jour|configuré|déployé|migré)\\s+[\\wà-ÿ\\s]{3,30}"
|
||||||
|
},
|
||||||
|
"entity": {
|
||||||
|
"candidate_pattern": "[A-ZÀÂÄÇÉÈÊËÎÏÔÖÙÛÜŸÆŒ][a-zàâäçéèêëîïôöùûüÿæœ]{1,19}",
|
||||||
|
"multi_word_pattern": "[A-ZÀÂÄÇÉÈÊËÎÏÔÖÙÛÜŸÆŒ][a-zàâäçéèêëîïôöùûüÿæœ]+(?:\\s+[A-ZÀÂÄÇÉÈÊËÎÏÔÖÙÛÜŸÆŒ][a-zàâäçéèêëîïôöùûüÿæœ]+)+",
|
||||||
|
"person_verb_patterns": [
|
||||||
|
"\\b{name}\\s+a\\s+dit\\b",
|
||||||
|
"\\b{name}\\s+a\\s+demandé\\b",
|
||||||
|
"\\b{name}\\s+a\\s+répondu\\b",
|
||||||
|
"\\b{name}\\s+a\\s+raconté\\b",
|
||||||
|
"\\b{name}\\s+a\\s+ri\\b",
|
||||||
|
"\\b{name}\\s+a\\s+souri\\b",
|
||||||
|
"\\b{name}\\s+a\\s+pleuré\\b",
|
||||||
|
"\\b{name}\\s+a\\s+senti\\b",
|
||||||
|
"\\b{name}\\s+pense\\b",
|
||||||
|
"\\b{name}\\s+veut\\b",
|
||||||
|
"\\b{name}\\s+aime\\b",
|
||||||
|
"\\b{name}\\s+déteste\\b",
|
||||||
|
"\\b{name}\\s+sait\\b",
|
||||||
|
"\\b{name}\\s+a\\s+décidé\\b",
|
||||||
|
"\\b{name}\\s+a\\s+écrit\\b"
|
||||||
|
],
|
||||||
|
"pronoun_patterns": [
|
||||||
|
"\\bil\\b",
|
||||||
|
"\\belle\\b",
|
||||||
|
"\\blui\\b",
|
||||||
|
"\\bils\\b",
|
||||||
|
"\\belles\\b",
|
||||||
|
"\\bleur\\b",
|
||||||
|
"\\bleurs\\b",
|
||||||
|
"\\beux\\b",
|
||||||
|
"\\bse\\b"
|
||||||
|
],
|
||||||
|
"dialogue_patterns": [
|
||||||
|
"^>\\s*{name}[:\\s]",
|
||||||
|
"^{name}:\\s",
|
||||||
|
"^\\[{name}\\]",
|
||||||
|
"\"{name}\\s+a\\s+dit"
|
||||||
|
],
|
||||||
|
"direct_address_pattern": "\\bbonjour\\s+{name}\\b|\\bsalut\\s+{name}\\b|\\bmerci\\s+{name}\\b|\\bcher\\s+{name}\\b|\\bchère\\s+{name}\\b|\\bmonsieur\\s+{name}\\b|\\bmadame\\s+{name}\\b|\\bhey\\s+{name}\\b|\\bhi\\s+{name}\\b",
|
||||||
|
"project_verb_patterns": [
|
||||||
|
"\\bconstruit\\s+{name}\\b",
|
||||||
|
"\\blancé\\s+{name}\\b",
|
||||||
|
"\\bdéployé\\s+{name}\\b",
|
||||||
|
"\\binstallé\\s+{name}\\b",
|
||||||
|
"\\bl'architecture\\s+{name}\\b",
|
||||||
|
"\\ble\\s+pipeline\\s+{name}\\b",
|
||||||
|
"\\ble\\s+système\\s+{name}\\b",
|
||||||
|
"\\ble\\s+projet\\s+{name}\\b",
|
||||||
|
"\\ble\\s+dépôt\\s+{name}\\b",
|
||||||
|
"\\b{name}\\s+v\\d+\\b",
|
||||||
|
"\\b{name}\\.py\\b",
|
||||||
|
"\\bimport\\s+{name}\\b",
|
||||||
|
"\\bpip\\s+install\\s+{name}\\b"
|
||||||
|
],
|
||||||
|
"stopwords": [
|
||||||
|
"le", "la", "les", "un", "une", "des", "du", "de", "au", "aux",
|
||||||
|
"en", "dans", "sur", "sous", "avec", "sans", "pour", "par", "vers",
|
||||||
|
"chez", "entre", "depuis", "pendant", "avant", "après", "jusqu",
|
||||||
|
"et", "ou", "mais", "donc", "or", "ni", "car", "que", "qui",
|
||||||
|
"dont", "où", "quand", "comment", "pourquoi", "combien", "lequel",
|
||||||
|
"ce", "cet", "cette", "ces", "celui", "celle", "ceux", "celles",
|
||||||
|
"mon", "ma", "mes", "ton", "ta", "tes", "son", "sa", "ses",
|
||||||
|
"notre", "nos", "votre", "vos", "leur", "leurs",
|
||||||
|
"je", "tu", "il", "elle", "on", "nous", "vous", "ils", "elles",
|
||||||
|
"me", "te", "se", "lui", "eux",
|
||||||
|
"être", "avoir", "faire", "dire", "aller", "voir", "savoir", "pouvoir",
|
||||||
|
"est", "sont", "était", "étaient", "fut", "furent", "sera", "seront",
|
||||||
|
"ai", "as", "a", "avons", "avez", "ont", "avait", "avaient",
|
||||||
|
"très", "bien", "mal", "peu", "beaucoup", "trop", "assez", "aussi",
|
||||||
|
"plus", "moins", "tant", "si", "tellement",
|
||||||
|
"oui", "non", "peut-être", "vraiment",
|
||||||
|
"ici", "là", "là-bas", "partout", "ailleurs",
|
||||||
|
"aujourd'hui", "hier", "demain", "maintenant", "alors", "ensuite",
|
||||||
|
"toujours", "jamais", "souvent", "parfois", "déjà", "encore",
|
||||||
|
"bon", "bonne", "mauvais", "mauvaise", "meilleur", "pire",
|
||||||
|
"merci", "bonjour", "salut", "au revoir",
|
||||||
|
"jour", "jours", "semaine", "mois", "année", "temps", "fois",
|
||||||
|
"chose", "choses", "personne", "gens", "monde", "vie", "maison",
|
||||||
|
"endroit", "lieu", "partie", "façon", "manière", "sorte", "type",
|
||||||
|
"cas", "point", "idée", "fait", "raison", "nom", "nombre",
|
||||||
|
"version", "système", "modèle", "question", "réponse"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -85,3 +85,76 @@ def test_from_config_defaults_to_english(tmp_path):
|
|||||||
|
|
||||||
d = Dialect.from_config(str(config_path))
|
d = Dialect.from_config(str(config_path))
|
||||||
assert d.lang == "en", f"Expected 'en', got '{d.lang}' -- state leak from prior load_lang"
|
assert d.lang == "en", f"Expected 'en', got '{d.lang}' -- state leak from prior load_lang"
|
||||||
|
|
||||||
|
|
||||||
|
def test_de_entity_section_loads():
    """Verify the German locale exposes a fully populated entity section.

    Every pattern list returned for ``("de",)`` must be truthy, and the
    stopword collection must hold more than 50 entries.
    """
    from mempalace.i18n import get_entity_patterns

    patterns = get_entity_patterns(("de",))

    # Checked in a fixed order so the first missing list is the one reported.
    required_keys = (
        "candidate_patterns",
        "multi_word_patterns",
        "person_verb_patterns",
        "pronoun_patterns",
        "dialogue_patterns",
        "direct_address_patterns",
        "project_verb_patterns",
    )
    for key in required_keys:
        assert patterns[key], f"de: empty {key}"

    stopword_count = len(patterns["stopwords"])
    assert stopword_count > 50, f"de: stopwords too short ({stopword_count})"
|
||||||
|
|
||||||
|
|
||||||
|
def test_es_entity_section_loads():
    """Verify the Spanish locale exposes a fully populated entity section.

    Every pattern list returned for ``("es",)`` must be truthy, and the
    stopword collection must hold more than 50 entries.
    """
    from mempalace.i18n import get_entity_patterns

    patterns = get_entity_patterns(("es",))

    # Checked in a fixed order so the first missing list is the one reported.
    required_keys = (
        "candidate_patterns",
        "multi_word_patterns",
        "person_verb_patterns",
        "pronoun_patterns",
        "dialogue_patterns",
        "direct_address_patterns",
        "project_verb_patterns",
    )
    for key in required_keys:
        assert patterns[key], f"es: empty {key}"

    stopword_count = len(patterns["stopwords"])
    assert stopword_count > 50, f"es: stopwords too short ({stopword_count})"
|
||||||
|
|
||||||
|
|
||||||
|
def test_fr_entity_section_loads():
    """Verify the French locale exposes a fully populated entity section.

    Every pattern list returned for ``("fr",)`` must be truthy, and the
    stopword collection must hold more than 50 entries.
    """
    from mempalace.i18n import get_entity_patterns

    patterns = get_entity_patterns(("fr",))

    # Checked in a fixed order so the first missing list is the one reported.
    required_keys = (
        "candidate_patterns",
        "multi_word_patterns",
        "person_verb_patterns",
        "pronoun_patterns",
        "dialogue_patterns",
        "direct_address_patterns",
        "project_verb_patterns",
    )
    for key in required_keys:
        assert patterns[key], f"fr: empty {key}"

    stopword_count = len(patterns["stopwords"])
    assert stopword_count > 50, f"fr: stopwords too short ({stopword_count})"
|
||||||
|
|
||||||
|
|
||||||
|
def test_direct_address_key_is_singular_string_for_all_locales():
    """Guard the locale-file schema for direct-address patterns.

    A locale that declares direct-address matching must use the singular
    key ``direct_address_pattern`` holding a non-empty ``str``. The plural
    ``direct_address_patterns`` (list) is the shape of the merged output
    dict, not something a locale file may declare.

    Rationale carried over from the original note: the loader in
    ``mempalace/i18n/__init__.py`` (lines 209-210 when this test was
    written) reads only the singular key, so a locale using the plural
    form would silently lose all of its direct-address patterns on load.
    """
    from mempalace.i18n import _load_entity_section, available_languages

    singular_key = "direct_address_pattern"
    plural_key = "direct_address_patterns"

    for lang in available_languages():
        entity = _load_entity_section(lang)
        if entity:
            assert plural_key not in entity, (
                f"{lang}: declares plural 'direct_address_patterns' (list); "
                f"loader only reads singular 'direct_address_pattern' (str). "
                f"Collapse the list into one `|`-alternation string and rename the key."
            )

            # Locales without any direct-address entry are allowed.
            if singular_key not in entity:
                continue

            pattern = entity[singular_key]
            assert isinstance(
                pattern, str
            ), f"{lang}: 'direct_address_pattern' must be str, got {type(pattern).__name__}"
            assert pattern, f"{lang}: 'direct_address_pattern' is empty"
|
||||||
|
|||||||
Reference in New Issue
Block a user