From 3e7f8bad567c86b3cb1129ac9f7c1c9bfad08df7 Mon Sep 17 00:00:00 2001
From: Jin <3130104027@stmail.ujs.edu.cn>
Date: Sat, 21 Jun 2025 23:10:00 +0800
Subject: [PATCH] fix: markdown_extractor lost chunks if it starts without a
 header(#21308) (#21309)

---
 api/core/rag/extractor/markdown_extractor.py  | 15 +++++--------
 .../rag/extractor/test_markdown_extractor.py  | 22 +++++++++++++++++++
 2 files changed, 27 insertions(+), 10 deletions(-)
 create mode 100644 api/tests/unit_tests/core/rag/extractor/test_markdown_extractor.py

diff --git a/api/core/rag/extractor/markdown_extractor.py b/api/core/rag/extractor/markdown_extractor.py
index 849852ac23..c97765b1dc 100644
--- a/api/core/rag/extractor/markdown_extractor.py
+++ b/api/core/rag/extractor/markdown_extractor.py
@@ -68,22 +68,17 @@ class MarkdownExtractor(BaseExtractor):
                 continue
             header_match = re.match(r"^#+\s", line)
             if header_match:
-                if current_header is not None:
-                    markdown_tups.append((current_header, current_text))
-
+                markdown_tups.append((current_header, current_text))
                 current_header = line
                 current_text = ""
             else:
                 current_text += line + "\n"
         markdown_tups.append((current_header, current_text))
 
-        if current_header is not None:
-            # pass linting, assert keys are defined
-            markdown_tups = [
-                (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) for key, value in markdown_tups
-            ]
-        else:
-            markdown_tups = [(key, re.sub("\n", "", value)) for key, value in markdown_tups]
+        markdown_tups = [
+            (re.sub(r"#", "", cast(str, key)).strip() if key else None, re.sub(r"<.*?>", "", value))
+            for key, value in markdown_tups
+        ]
 
         return markdown_tups
 
diff --git a/api/tests/unit_tests/core/rag/extractor/test_markdown_extractor.py b/api/tests/unit_tests/core/rag/extractor/test_markdown_extractor.py
new file mode 100644
index 0000000000..d4cf534c56
--- /dev/null
+++ b/api/tests/unit_tests/core/rag/extractor/test_markdown_extractor.py
@@ -0,0 +1,22 @@
+from core.rag.extractor.markdown_extractor import MarkdownExtractor
+
+
+def test_markdown_to_tups():
+    """Text before the first header must be kept as a chunk whose header is None."""
+    markdown = """
+this is some text without header
+
+# title 1
+this is balabala text
+
+## title 2
+this is more specific text.
+        """
+    updated_output = MarkdownExtractor(file_path="dummy_path").markdown_to_tups(markdown)
+    assert len(updated_output) == 3
+    key, header_value = updated_output[0]
+    assert key is None
+    assert header_value.strip() == "this is some text without header"
+    title_1, value = updated_output[1]
+    assert title_1.strip() == "title 1"
+    assert value.strip() == "this is balabala text"