Fix: Correctly handle merged cells in DOCX tables to prevent content duplication and loss (#27871)

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
This commit is contained in:
李龙飞 2025-11-13 15:56:24 +08:00 committed by GitHub
parent b86022c64a
commit 81832c14ee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 54 additions and 3 deletions

View File

@ -152,13 +152,15 @@ class WordExtractor(BaseExtractor):
# Initialize a row, all of which are empty by default
row_cells = [""] * total_cols
col_index = 0
for cell in row.cells:
while col_index < len(row.cells):
# make sure the col_index is not out of range
while col_index < total_cols and row_cells[col_index] != "":
while col_index < len(row.cells) and row_cells[col_index] != "":
col_index += 1
# if col_index is out of range the loop is jumped
if col_index >= total_cols:
if col_index >= len(row.cells):
break
# get the correct cell
cell = row.cells[col_index]
cell_content = self._parse_cell(cell, image_map).strip()
cell_colspan = cell.grid_span or 1
for i in range(cell_colspan):

View File

@ -0,0 +1,49 @@
"""Primarily used for testing merged cell scenarios"""
from docx import Document
from core.rag.extractor.word_extractor import WordExtractor
def _generate_table_with_merged_cells():
doc = Document()
"""
The table looks like this:
+-----+-----+-----+
| 1-1 & 1-2 | 1-3 |
+-----+-----+-----+
| 2-1 | 2-2 | 2-3 |
| & |-----+-----+
| 3-1 | 3-2 | 3-3 |
+-----+-----+-----+
"""
table = doc.add_table(rows=3, cols=3)
table.style = "Table Grid"
for i in range(3):
for j in range(3):
cell = table.cell(i, j)
cell.text = f"{i + 1}-{j + 1}"
# Merge cells
cell_0_0 = table.cell(0, 0)
cell_0_1 = table.cell(0, 1)
merged_cell_1 = cell_0_0.merge(cell_0_1)
merged_cell_1.text = "1-1 & 1-2"
cell_1_0 = table.cell(1, 0)
cell_2_0 = table.cell(2, 0)
merged_cell_2 = cell_1_0.merge(cell_2_0)
merged_cell_2.text = "2-1 & 3-1"
ground_truth = [["1-1 & 1-2", "", "1-3"], ["2-1 & 3-1", "2-2", "2-3"], ["2-1 & 3-1", "3-2", "3-3"]]
return doc.tables[0], ground_truth
def test_parse_row():
table, gt = _generate_table_with_merged_cells()
extractor = object.__new__(WordExtractor)
for idx, row in enumerate(table.rows):
assert extractor._parse_row(row, {}, 3) == gt[idx]