From 3e7f8bad567c86b3cb1129ac9f7c1c9bfad08df7 Mon Sep 17 00:00:00 2001 From: Jin <3130104027@stmail.ujs.edu.cn> Date: Sat, 21 Jun 2025 23:10:00 +0800 Subject: [PATCH] fix: markdown_extractor lost chunks if it starts without a header(#21308) (#21309) --- api/core/rag/extractor/markdown_extractor.py | 15 +++++-------- .../rag/extractor/test_markdown_extractor.py | 22 +++++++++++++++++++ 2 files changed, 27 insertions(+), 10 deletions(-) create mode 100644 api/tests/unit_tests/core/rag/extractor/test_markdown_extractor.py diff --git a/api/core/rag/extractor/markdown_extractor.py b/api/core/rag/extractor/markdown_extractor.py index 849852ac23..c97765b1dc 100644 --- a/api/core/rag/extractor/markdown_extractor.py +++ b/api/core/rag/extractor/markdown_extractor.py @@ -68,22 +68,17 @@ class MarkdownExtractor(BaseExtractor): continue header_match = re.match(r"^#+\s", line) if header_match: - if current_header is not None: - markdown_tups.append((current_header, current_text)) - + markdown_tups.append((current_header, current_text)) current_header = line current_text = "" else: current_text += line + "\n" markdown_tups.append((current_header, current_text)) - if current_header is not None: - # pass linting, assert keys are defined - markdown_tups = [ - (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) for key, value in markdown_tups - ] - else: - markdown_tups = [(key, re.sub("\n", "", value)) for key, value in markdown_tups] + markdown_tups = [ + (re.sub(r"#", "", cast(str, key)).strip() if key else None, re.sub(r"<.*?>", "", value)) + for key, value in markdown_tups + ] return markdown_tups diff --git a/api/tests/unit_tests/core/rag/extractor/test_markdown_extractor.py b/api/tests/unit_tests/core/rag/extractor/test_markdown_extractor.py new file mode 100644 index 0000000000..d4cf534c56 --- /dev/null +++ b/api/tests/unit_tests/core/rag/extractor/test_markdown_extractor.py @@ -0,0 +1,22 @@ +from core.rag.extractor.markdown_extractor import 
MarkdownExtractor


def test_markdown_to_tups():
    """Verify that text appearing before the first header is not dropped.

    The sample document starts with a header-less paragraph followed by two
    headed sections, so ``markdown_to_tups`` is expected to return three
    ``(header, text)`` tuples: one whose header is ``None`` for the leading
    paragraph, then one per header with the ``#`` markers removed.
    """
    markdown = """
this is some text without header

# title 1
this is balabala text

## title 2
this is more specific text.
    """
    extractor = MarkdownExtractor(file_path="dummy_path")
    updated_output = extractor.markdown_to_tups(markdown)
    assert len(updated_output) == 3

    # Leading paragraph: there is no header line above it, so the key is None.
    key, header_value = updated_output[0]
    assert key is None
    assert header_value.strip() == "this is some text without header"

    # First headed section.
    title_1, value = updated_output[1]
    assert title_1.strip() == "title 1"
    assert value.strip() == "this is balabala text"

    # Second headed section: the final chunk is flushed after the loop ends.
    title_2, value_2 = updated_output[2]
    assert title_2.strip() == "title 2"
    assert value_2.strip() == "this is more specific text."