fix: stop remove_leading_symbols from stripping a leading [ (#34832)

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
wangxiaolei 2026-04-09 16:22:09 +08:00 committed by GitHub
parent 03750b76ac
commit d042cbc62e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 65 additions and 2 deletions

View File

@ -19,5 +19,18 @@ def remove_leading_symbols(text: str) -> str:
# Match Unicode ranges for punctuation and symbols
# FIXME: this pattern is a confusing quick fix for #11868; refactor it later.
pattern = r'^[\[\]\u2000-\u2025\u2027-\u206F\u2E00-\u2E7F\u3000-\u300F\u3011-\u303F"#$%&\'()*+,./:;<=>?@^_`~]+'
pattern = re.compile(
r"""
^
(?:
[\u2000-\u2025] # General Punctuation: spaces, quotes, dashes
| [\u2027-\u206F] # General Punctuation: ellipsis, underscores, etc.
| [\u2E00-\u2E7F] # Supplemental Punctuation: medieval, ancient marks
| [\u3000-\u300F] # CJK Punctuation: 、。〃「」『》』 (excludes 【】)
| [\u3012-\u303F] # CJK Punctuation: 〖〗〔〕〘〙〚〛〜 etc.
| ["#$%&'()*+,./:;<=>?@^_`~] # ASCII punctuation (excludes []【】)
)+
""",
re.VERBOSE,
)
return re.sub(pattern, "", text)

View File

@ -19,7 +19,57 @@ from core.tools.utils.text_processing_utils import remove_leading_symbols
("[Google](https://google.com) is a search engine", "[Google](https://google.com) is a search engine"),
("[Example](http://example.com) some text", "[Example](http://example.com) some text"),
# Leading symbols before a markdown link are removed, but the opening bracket [ is preserved
("@[Test](https://example.com)", "Test](https://example.com)"),
("@[Test](https://example.com)", "[Test](https://example.com)"),
("~~标题~~", "标题~~"),
('""quoted', "quoted"),
("''test", "test"),
("##话题", "话题"),
("$$价格", "价格"),
("%%百分比", "百分比"),
("&&与逻辑", "与逻辑"),
("((括号))", "括号))"),
("**强调**", "强调**"),
("++自增", "自增"),
(",,逗号", "逗号"),
("..省略", "省略"),
("//注释", "注释"),
("::范围", "范围"),
(";;分号", "分号"),
("<<左移", "左移"),
("==等于", "等于"),
(">>右移", "右移"),
("??疑问", "疑问"),
("@@提及", "提及"),
("^^上标", "上标"),
("__下划线", "下划线"),
("``代码", "代码"),
("~~删除线", "删除线"),
(" 全角空格开头", "全角空格开头"),
("、顿号开头", "顿号开头"),
("。句号开头", "句号开头"),
("「引号」测试", "引号」测试"),
("『书名号』", "书名号』"),
("【保留】测试", "【保留】测试"),
("〖括号〗测试", "括号〗测试"),
("〔括号〕测试", "括号〕测试"),
("~~【保留】~~", "【保留】~~"),
('"[公告]"', '[公告]"'),
("[公告] 更新", "[公告] 更新"),
("【通知】重要", "【通知】重要"),
("[[嵌套]]", "[[嵌套]]"),
("【【嵌套】】", "【【嵌套】】"),
("[【混合】]", "[【混合】]"),
("normal text", "normal text"),
("123数字", "123数字"),
("中文开头", "中文开头"),
("alpha", "alpha"),
("~", ""),
("", ""),
("[", "["),
("~~~", ""),
("【【【", "【【【"),
("\t制表符", "\t制表符"),
("\n换行", "\n换行"),
],
)
def test_remove_leading_symbols(input_text, expected_output):