mirror of
https://github.com/langgenius/dify.git
synced 2026-04-15 18:06:36 +08:00
fix: fix remove_leading_symbols remove [ (#34832)
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
03750b76ac
commit
d042cbc62e
@ -19,5 +19,18 @@ def remove_leading_symbols(text: str) -> str:
|
||||
|
||||
# Match Unicode ranges for punctuation and symbols
|
||||
# FIXME: this pattern is a confusing quick fix for #11868; consider refactoring it later
|
||||
pattern = r'^[\[\]\u2000-\u2025\u2027-\u206F\u2E00-\u2E7F\u3000-\u300F\u3011-\u303F"#$%&\'()*+,./:;<=>?@^_`~]+'
|
||||
pattern = re.compile(
|
||||
r"""
|
||||
^
|
||||
(?:
|
||||
[\u2000-\u2025] # General Punctuation: spaces, quotes, dashes
|
||||
| [\u2027-\u206F] # General Punctuation: per mille, primes, angle quotes, etc. (U+2026 ellipsis is in neither range)
|
||||
| [\u2E00-\u2E7F] # Supplemental Punctuation: medieval, ancient marks
|
||||
| [\u3000-\u300F] # CJK Punctuation: 、。〃「」『》』 (excludes 【】)
|
||||
| [\u3012-\u303F] # CJK Punctuation: 〖〗〔〕〘〙〚〛〜 etc.
|
||||
| ["#$%&'()*+,./:;<=>?@^_`~] # ASCII punctuation (excludes []【】)
|
||||
)+
|
||||
""",
|
||||
re.VERBOSE,
|
||||
)
|
||||
return re.sub(pattern, "", text)
|
||||
|
||||
@ -19,7 +19,57 @@ from core.tools.utils.text_processing_utils import remove_leading_symbols
|
||||
("[Google](https://google.com) is a search engine", "[Google](https://google.com) is a search engine"),
|
||||
("[Example](http://example.com) some text", "[Example](http://example.com) some text"),
|
||||
# Leading symbols before a markdown link are removed, but the opening bracket [ is preserved
|
||||
("@[Test](https://example.com)", "Test](https://example.com)"),
|
||||
("@[Test](https://example.com)", "[Test](https://example.com)"),
|
||||
("~~标题~~", "标题~~"),
|
||||
('""quoted', "quoted"),
|
||||
("''test", "test"),
|
||||
("##话题", "话题"),
|
||||
("$$价格", "价格"),
|
||||
("%%百分比", "百分比"),
|
||||
("&&与逻辑", "与逻辑"),
|
||||
("((括号))", "括号))"),
|
||||
("**强调**", "强调**"),
|
||||
("++自增", "自增"),
|
||||
(",,逗号", "逗号"),
|
||||
("..省略", "省略"),
|
||||
("//注释", "注释"),
|
||||
("::范围", "范围"),
|
||||
(";;分号", "分号"),
|
||||
("<<左移", "左移"),
|
||||
("==等于", "等于"),
|
||||
(">>右移", "右移"),
|
||||
("??疑问", "疑问"),
|
||||
("@@提及", "提及"),
|
||||
("^^上标", "上标"),
|
||||
("__下划线", "下划线"),
|
||||
("``代码", "代码"),
|
||||
("~~删除线", "删除线"),
|
||||
(" 全角空格开头", "全角空格开头"),
|
||||
("、顿号开头", "顿号开头"),
|
||||
("。句号开头", "句号开头"),
|
||||
("「引号」测试", "引号」测试"),
|
||||
("『书名号』", "书名号』"),
|
||||
("【保留】测试", "【保留】测试"),
|
||||
("〖括号〗测试", "括号〗测试"),
|
||||
("〔括号〕测试", "括号〕测试"),
|
||||
("~~【保留】~~", "【保留】~~"),
|
||||
('"[公告]"', '[公告]"'),
|
||||
("[公告] 更新", "[公告] 更新"),
|
||||
("【通知】重要", "【通知】重要"),
|
||||
("[[嵌套]]", "[[嵌套]]"),
|
||||
("【【嵌套】】", "【【嵌套】】"),
|
||||
("[【混合】]", "[【混合】]"),
|
||||
("normal text", "normal text"),
|
||||
("123数字", "123数字"),
|
||||
("中文开头", "中文开头"),
|
||||
("alpha", "alpha"),
|
||||
("~", ""),
|
||||
("【", "【"),
|
||||
("[", "["),
|
||||
("~~~", ""),
|
||||
("【【【", "【【【"),
|
||||
("\t制表符", "\t制表符"),
|
||||
("\n换行", "\n换行"),
|
||||
],
|
||||
)
|
||||
def test_remove_leading_symbols(input_text, expected_output):
|
||||
|
||||
Loading…
Reference in New Issue
Block a user