From d042cbc62e24833026c0d1e8840d7ec99addb65b Mon Sep 17 00:00:00 2001 From: wangxiaolei Date: Thu, 9 Apr 2026 16:22:09 +0800 Subject: [PATCH] fix: fix remove_leading_symbols remove [ (#34832) Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- api/core/tools/utils/text_processing_utils.py | 15 +++++- .../unit_tests/utils/test_text_processing.py | 52 ++++++++++++++++++- 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/api/core/tools/utils/text_processing_utils.py b/api/core/tools/utils/text_processing_utils.py index 4bfaa5e49b..1dd0605f28 100644 --- a/api/core/tools/utils/text_processing_utils.py +++ b/api/core/tools/utils/text_processing_utils.py @@ -19,5 +19,18 @@ def remove_leading_symbols(text: str) -> str: # Match Unicode ranges for punctuation and symbols # FIXME this pattern is confused quick fix for #11868 maybe refactor it later - pattern = r'^[\[\]\u2000-\u2025\u2027-\u206F\u2E00-\u2E7F\u3000-\u300F\u3011-\u303F"#$%&\'()*+,./:;<=>?@^_`~]+' + pattern = re.compile( + r""" + ^ + (?: + [\u2000-\u2025] # General Punctuation: spaces, quotes, dashes + | [\u2027-\u206F] # General Punctuation: ellipsis, underscores, etc. + | [\u2E00-\u2E7F] # Supplemental Punctuation: medieval, ancient marks + | [\u3000-\u300F] # CJK Punctuation: 、。〃「」『》』 (excludes 【】) + | [\u3012-\u303F] # CJK Punctuation: 〖〗〔〕〘〙〚〛〜 etc. + | ["#$%&'()*+,./:;<=>?@^_`~] # ASCII punctuation (excludes []【】) + )+ + """, + re.VERBOSE, + ) return re.sub(pattern, "", text) diff --git a/api/tests/unit_tests/utils/test_text_processing.py b/api/tests/unit_tests/utils/test_text_processing.py index bf61162a66..5f6ccbcdff 100644 --- a/api/tests/unit_tests/utils/test_text_processing.py +++ b/api/tests/unit_tests/utils/test_text_processing.py @@ -19,7 +19,57 @@ from core.tools.utils.text_processing_utils import remove_leading_symbols ("[Google](https://google.com) is a search engine", "[Google](https://google.com) is a search engine"), ("[Example](http://example.com) some text", "[Example](http://example.com) some text"), # Leading symbols before markdown link are removed, including the opening bracket [ - ("@[Test](https://example.com)", "Test](https://example.com)"), + ("@[Test](https://example.com)", "[Test](https://example.com)"), + ("~~标题~~", "标题~~"), + ('""quoted', "quoted"), + ("''test", "test"), + ("##话题", "话题"), + ("$$价格", "价格"), + ("%%百分比", "百分比"), + ("&&与逻辑", "与逻辑"), + ("((括号))", "括号))"), + ("**强调**", "强调**"), + ("++自增", "自增"), + (",,逗号", "逗号"), + ("..省略", "省略"), + ("//注释", "注释"), + ("::范围", "范围"), + (";;分号", "分号"), + ("<<左移", "左移"), + ("==等于", "等于"), + (">>右移", "右移"), + ("??疑问", "疑问"), + ("@@提及", "提及"), + ("^^上标", "上标"), + ("__下划线", "下划线"), + ("``代码", "代码"), + ("~~删除线", "删除线"), + (" 全角空格开头", "全角空格开头"), + ("、顿号开头", "顿号开头"), + ("。句号开头", "句号开头"), + ("「引号」测试", "引号」测试"), + ("『书名号』", "书名号』"), + ("【保留】测试", "【保留】测试"), + ("〖括号〗测试", "括号〗测试"), + ("〔括号〕测试", "括号〕测试"), + ("~~【保留】~~", "【保留】~~"), + ('"[公告]"', '[公告]"'), + ("[公告] 更新", "[公告] 更新"), + ("【通知】重要", "【通知】重要"), + ("[[嵌套]]", "[[嵌套]]"), + ("【【嵌套】】", "【【嵌套】】"), + ("[【混合】]", "[【混合】]"), + ("normal text", "normal text"), + ("123数字", "123数字"), + ("中文开头", "中文开头"), + ("alpha", "alpha"), + ("~", ""), + ("【", "【"), + ("[", "["), + ("~~~", ""), + ("【【【", "【【【"), + ("\t制表符", "\t制表符"), + ("\n换行", "\n换行"), ], ) def test_remove_leading_symbols(input_text, expected_output):