diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py index 1a9704688a..c7a5568866 100644 --- a/api/core/rag/extractor/word_extractor.py +++ b/api/core/rag/extractor/word_extractor.py @@ -152,13 +152,15 @@ class WordExtractor(BaseExtractor): # Initialize a row, all of which are empty by default row_cells = [""] * total_cols col_index = 0 - for cell in row.cells: + while col_index < len(row.cells): # make sure the col_index is not out of range - while col_index < total_cols and row_cells[col_index] != "": + while col_index < len(row.cells) and row_cells[col_index] != "": col_index += 1 # if col_index is out of range the loop is jumped - if col_index >= total_cols: + if col_index >= len(row.cells): break + # get the correct cell + cell = row.cells[col_index] cell_content = self._parse_cell(cell, image_map).strip() cell_colspan = cell.grid_span or 1 for i in range(cell_colspan): diff --git a/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py b/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py new file mode 100644 index 0000000000..3635e4dbf9 --- /dev/null +++ b/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py @@ -0,0 +1,49 @@ +"""Primarily used for testing merged cell scenarios""" + +from docx import Document + +from core.rag.extractor.word_extractor import WordExtractor + + +def _generate_table_with_merged_cells(): + doc = Document() + + """ + The table looks like this: + +-----+-----+-----+ + | 1-1 & 1-2 | 1-3 | + +-----+-----+-----+ + | 2-1 | 2-2 | 2-3 | + | & |-----+-----+ + | 3-1 | 3-2 | 3-3 | + +-----+-----+-----+ + """ + table = doc.add_table(rows=3, cols=3) + table.style = "Table Grid" + + for i in range(3): + for j in range(3): + cell = table.cell(i, j) + cell.text = f"{i + 1}-{j + 1}" + + # Merge cells + cell_0_0 = table.cell(0, 0) + cell_0_1 = table.cell(0, 1) + merged_cell_1 = cell_0_0.merge(cell_0_1) + merged_cell_1.text = "1-1 & 1-2" + + cell_1_0 = table.cell(1, 0) + cell_2_0 = table.cell(2, 0) + merged_cell_2 = cell_1_0.merge(cell_2_0) + merged_cell_2.text = "2-1 & 3-1" + + ground_truth = [["1-1 & 1-2", "", "1-3"], ["2-1 & 3-1", "2-2", "2-3"], ["2-1 & 3-1", "3-2", "3-3"]] + + return doc.tables[0], ground_truth + + +def test_parse_row(): + table, gt = _generate_table_with_merged_cells() + extractor = object.__new__(WordExtractor) + for idx, row in enumerate(table.rows): + assert extractor._parse_row(row, {}, 3) == gt[idx]