683e940f70
zh-TW and zh-CN previously had no `entity` section. Calling
`detect_entities(..., languages=("zh-TW",))` silently fell back to
English patterns (i18n/__init__.py:231-233), so no Chinese names
were ever extracted — Chinese-speaking users got zero people or
projects detected from their own notes.
This adds entity sections for both locales:
- `candidate_pattern`: common-surname-prefixed CJK n-grams (~100
surnames covering >95% of Taiwanese / PRC names), length capped
at {1,2} trailing chars so greedy matches don't swallow the
trailing verb character (e.g. 朱宜振說).
- `boundary_chars`: `\u4E00-\u9FFF` so the i18n loader's
script-aware wrap (introduced in #932) fires `\b` at CJK↔non-CJK
transitions. This is the same mechanism used for Devanagari,
applied to the CJK range.
- `person_verb_patterns`: Chinese verbs attach directly to the
name with no whitespace, so patterns are written as `{name}說`,
`{name}問`, `{name}決定` — no `\b` or `\s+` separators.
- `dialogue_patterns`: full-width colon `:`, Chinese quotes
「」『』, plus the standard Latin forms.
- `pronoun_patterns`: 他 / 她 / 它 / 他們 / 她們 / 您 / 咱.
- `stopwords`: ~140 common particles, pronouns, time expressions,
question words, conjunctions, UI nouns, and politeness forms.
**Known limitation** (explicitly covered by a test): CJK scripts
have no word delimiters, so a name flanked by CJK on both sides
with no punctuation or whitespace break is not extracted. This
is a fundamental limit of regex-based CJK entity detection —
resolving it would require a dictionary tokeniser. Realistic
Chinese technical writing contains enough non-CJK neighbours
(bullet lines, inline English, full-width punctuation, newlines)
that 3+ occurrences normally produce matches. Verified against a
realistic zh-TW PKM note: 朱宜振 extracted 11x from 8 sentences
with 0.99 person-classification confidence.
**Follow-ups** (separate PRs): same pattern for `ja` and `ko`,
both of which currently share the silent fallback-to-English bug.
Tests: 7 new tests in `tests/test_entity_detector.py`:
- `test_zh_tw_candidate_extraction_at_boundaries`
- `test_zh_tw_person_classification`
- `test_zh_tw_stopwords_filter_common_particles`
- `test_zh_tw_falls_back_to_english_for_non_cjk_names`
- `test_zh_cn_candidate_extraction`
- `test_zh_cn_and_zh_tw_union_covers_both_variants`
- `test_zh_tw_known_limitation_inline_name_no_boundary`
Full suite: 957 passed, 0 failed.
133 lines
5.1 KiB
JSON
133 lines
5.1 KiB
JSON
{
|
||
"lang": "zh-CN",
|
||
"label": "简体中文",
|
||
"terms": {
|
||
"palace": "宫殿",
|
||
"wing": "翼",
|
||
"hall": "走廊",
|
||
"closet": "柜子",
|
||
"drawer": "抽屉",
|
||
"mine": "挖掘",
|
||
"search": "搜索",
|
||
"status": "状态",
|
||
"init": "初始化",
|
||
"repair": "修复",
|
||
"migrate": "迁移",
|
||
"entity": "实体",
|
||
"topic": "主题"
|
||
},
|
||
"cli": {
|
||
"mine_start": "正在挖掘 {path}...",
|
||
"mine_complete": "完成。创建了 {closets} 个柜子、{drawers} 个抽屉。",
|
||
"mine_skip": "已挖掘。使用 --force 重新执行。",
|
||
"search_no_results": "未找到结果: {query}",
|
||
"search_results": "找到 {count} 个结果:",
|
||
"status_palace": "宫殿: {path}",
|
||
"status_wings": "{count} 个翼",
|
||
"status_closets": "{count} 个柜子",
|
||
"status_drawers": "{count} 个抽屉",
|
||
"init_complete": "宫殿已初始化于 {path}",
|
||
"init_exists": "{path} 中已存在宫殿",
|
||
"repair_complete": "修复完成。已修正 {fixed} 个问题。",
|
||
"migrate_complete": "迁移完成。",
|
||
"no_palace": "未找到宫殿。请运行: mempalace init <目录>"
|
||
},
|
||
"aaak": {
|
||
"instruction": "用中文压缩。概念之间用管道符(|),词语之间用连字符(-)。省略虚词和连接词。保留专有名词和数字的准确性。"
|
||
},
|
||
"regex": {
|
||
"topic_pattern": "[\\u4E00-\\u9FFF]{2,}|[A-Za-z][A-Za-z0-9_]{2,}",
|
||
"stop_words": "的 了 在 是 我 有 和 就 不 人 都 一 一个 上 也 很 到 说 要 去 你 会 着 没有 看 好 自己 这 那 她 他 它 们 但是 因为 所以 如果 虽然 然后 或者 而且",
|
||
"quote_pattern": "\\u201C([^\\u201D]{10,100})\\u201D|\"([^\"]{10,200})\"",
|
||
"action_pattern": "(构建|修复|添加|删除|确认|创建|实现|修理|编写|测试|验证|更新|配置|启动|停止)(?:了|完成|成功)"
|
||
},
|
||
"entity": {
|
||
"boundary_chars": "\\u4E00-\\u9FFF",
|
||
"candidate_pattern": "[王李张刘陈杨赵黄周吴徐孙朱胡郭何高林罗郑梁谢宋唐许韩冯邓曹彭曾萧田董袁潘于蒋蔡余杜叶程苏魏吕丁任沈姚卢姜崔钟谭陆汪范金石廖贾夏韦方白邹孟熊秦邱江尹薛阎段雷侯龙史陶黎贺顾毛郝龚邵万钱严武戴莫孔向汤温庞殷章葛管甘卞冉蓝习][\\u4E00-\\u9FFF]{1,2}",
|
||
"person_verb_patterns": [
|
||
"{name}说",
|
||
"{name}问",
|
||
"{name}答",
|
||
"{name}表示",
|
||
"{name}回答",
|
||
"{name}提出",
|
||
"{name}决定",
|
||
"{name}认为",
|
||
"{name}指出",
|
||
"{name}解释",
|
||
"{name}告诉",
|
||
"{name}写道",
|
||
"{name}想",
|
||
"{name}觉得",
|
||
"{name}知道",
|
||
"{name}喜欢",
|
||
"{name}讨厌",
|
||
"{name}确认",
|
||
"{name}提醒",
|
||
"{name}分享",
|
||
"{name}建议",
|
||
"{name}同意",
|
||
"{name}反对"
|
||
],
|
||
"pronoun_patterns": [
|
||
"他们",
|
||
"她们",
|
||
"他",
|
||
"她",
|
||
"它",
|
||
"您",
|
||
"咱"
|
||
],
|
||
"dialogue_patterns": [
|
||
"^>\\s*{name}[::\\s]",
|
||
"^{name}[::]\\s?",
|
||
"^\\[{name}\\]",
|
||
"\u201C{name}[\u201D::]",
|
||
"「{name}[」::]"
|
||
],
|
||
"direct_address_pattern": "嘿\\s*{name}|喂\\s*{name}|谢谢\\s*{name}|感谢\\s*{name}|哈喽\\s*{name}|亲爱的\\s*{name}",
|
||
"project_verb_patterns": [
|
||
"建立{name}",
|
||
"打造{name}",
|
||
"部署{name}",
|
||
"启动{name}",
|
||
"发布{name}",
|
||
"上线{name}",
|
||
"开发{name}",
|
||
"维护{name}",
|
||
"{name}系统",
|
||
"{name}平台",
|
||
"{name}项目",
|
||
"{name}架构",
|
||
"{name}管线",
|
||
"{name}v\\d+",
|
||
"\\bimport\\s+{name}\\b",
|
||
"\\bpip\\s+install\\s+{name}\\b"
|
||
],
|
||
"stopwords": [
|
||
"的", "了", "着", "过", "得", "地", "吗", "吧", "呢", "啊", "喔", "耶",
|
||
"我", "你", "妳", "他", "她", "它", "您", "咱",
|
||
"我们", "你们", "妳们", "他们", "她们", "它们", "咱们",
|
||
"自己", "大家", "有人", "没人",
|
||
"今天", "明天", "昨天", "前天", "后天", "今年", "明年", "去年",
|
||
"早上", "下午", "晚上", "中午", "凌晨",
|
||
"现在", "刚才", "刚刚", "等等", "等下", "待会",
|
||
"最近", "以前", "之前", "之后", "以后", "后来",
|
||
"什么", "为什么", "怎么", "怎样", "哪里", "哪个",
|
||
"这个", "那个", "这里", "那里", "这些", "那些", "这样", "那样",
|
||
"但是", "可是", "然后", "所以", "因为", "如果", "虽然",
|
||
"而且", "或者", "或是", "还是", "不过", "只是", "不只",
|
||
"既然", "不然", "否则", "此外", "另外",
|
||
"很", "非常", "相当", "真的", "确实", "当然", "其实",
|
||
"已经", "正在", "即将", "将要", "刚好", "恰好",
|
||
"可能", "也许", "或许", "大概", "应该", "必须", "一定",
|
||
"完成", "执行", "进行", "开始", "结束", "继续", "停止", "完毕",
|
||
"没有", "有点", "有些", "一些", "许多", "很多",
|
||
"问题", "答案", "原因", "结果", "情况", "状况",
|
||
"主要", "重要", "基本", "简单", "复杂", "特别",
|
||
"谢谢", "感谢", "对不起", "不好意思", "请问",
|
||
"欢迎", "再见", "你好", "您好", "哈喽", "拜拜"
|
||
]
|
||
}
|
||
}
|