Fix: Correctly handle merged cells in DOCX tables to prevent content duplication and loss (#27871)

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
2025-11-13 15:56:24 +08:00 · 2025-11-13 15:56:24 +08:00 · 81832c14ee
parent b86022c64a
commit 81832c14ee
2 changed files with 54 additions and 3 deletions
--- a/api/core/rag/extractor/word_extractor.py
+++ b/api/core/rag/extractor/word_extractor.py
@@ -152,13 +152,15 @@ class WordExtractor(BaseExtractor):
        # Initialize a row, all of which are empty by default
        row_cells = [""] * total_cols
        col_index = 0
-        for cell in row.cells:
+        while col_index < len(row.cells):
            # make sure the col_index is not out of range
-            while col_index < total_cols and row_cells[col_index] != "":
+            while col_index < len(row.cells) and row_cells[col_index] != "":
                col_index += 1
            # if col_index is out of range the loop is jumped
-            if col_index >= total_cols:
+            if col_index >= len(row.cells):
                break
+            # index by grid column: row.cells repeats a horizontally merged cell for every column it spans
+            cell = row.cells[col_index]
            cell_content = self._parse_cell(cell, image_map).strip()
            cell_colspan = cell.grid_span or 1
            for i in range(cell_colspan):
--- /dev/null
+++ b/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py
@@ -0,0 +1,49 @@
+"""Unit tests for WordExtractor._parse_row, primarily covering tables with merged cells."""
+
+from docx import Document
+
+from core.rag.extractor.word_extractor import WordExtractor
+
+
+def _generate_table_with_merged_cells():
+    doc = Document()
+
+    """
+    The table looks like this:
+    +-----+-----+-----+
+    | 1-1 & 1-2 | 1-3 |
+    +-----+-----+-----+
+    | 2-1 | 2-2 | 2-3 |
+    |  &  |-----+-----+
+    | 3-1 | 3-2 | 3-3 |
+    +-----+-----+-----+
+    """
+    table = doc.add_table(rows=3, cols=3)
+    table.style = "Table Grid"
+
+    for i in range(3):
+        for j in range(3):
+            cell = table.cell(i, j)
+            cell.text = f"{i + 1}-{j + 1}"
+
+    # Merge cells
+    cell_0_0 = table.cell(0, 0)
+    cell_0_1 = table.cell(0, 1)
+    merged_cell_1 = cell_0_0.merge(cell_0_1)
+    merged_cell_1.text = "1-1 & 1-2"
+
+    cell_1_0 = table.cell(1, 0)
+    cell_2_0 = table.cell(2, 0)
+    merged_cell_2 = cell_1_0.merge(cell_2_0)
+    merged_cell_2.text = "2-1 & 3-1"
+
+    ground_truth = [["1-1 & 1-2", "", "1-3"], ["2-1 & 3-1", "2-2", "2-3"], ["2-1 & 3-1", "3-2", "3-3"]]
+
+    return doc.tables[0], ground_truth
+
+
+def test_parse_row():
+    table, gt = _generate_table_with_merged_cells()
+    extractor = object.__new__(WordExtractor)
+    for idx, row in enumerate(table.rows):
+        assert extractor._parse_row(row, {}, 3) == gt[idx]