From 12e39365fa91cf635251f4b0606405600d95e467 Mon Sep 17 00:00:00 2001 From: Nie Ronghua <40586915+NieRonghua@users.noreply.github.com> Date: Fri, 12 Dec 2025 12:15:03 +0800 Subject: [PATCH] perf(core/rag): optimize Excel extractor performance and memory usage (#29551) Co-authored-by: 01393547 Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- api/core/rag/extractor/excel_extractor.py | 128 +++++++++++++++++----- 1 file changed, 99 insertions(+), 29 deletions(-) diff --git a/api/core/rag/extractor/excel_extractor.py b/api/core/rag/extractor/excel_extractor.py index ea9c6bd73a..875bfd1439 100644 --- a/api/core/rag/extractor/excel_extractor.py +++ b/api/core/rag/extractor/excel_extractor.py @@ -1,7 +1,7 @@ """Abstract interface for document loader implementations.""" import os -from typing import cast +from typing import TypedDict import pandas as pd from openpyxl import load_workbook @@ -10,6 +10,12 @@ from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document +class Candidate(TypedDict): + idx: int + count: int + map: dict[int, str] + + class ExcelExtractor(BaseExtractor): """Load Excel files. @@ -30,32 +36,38 @@ class ExcelExtractor(BaseExtractor): file_extension = os.path.splitext(self._file_path)[-1].lower() if file_extension == ".xlsx": - wb = load_workbook(self._file_path, data_only=True) - for sheet_name in wb.sheetnames: - sheet = wb[sheet_name] - data = sheet.values - cols = next(data, None) - if cols is None: - continue - df = pd.DataFrame(data, columns=cols) - - df.dropna(how="all", inplace=True) - - for index, row in df.iterrows(): - page_content = [] - for col_index, (k, v) in enumerate(row.items()): - if pd.notna(v): - cell = sheet.cell( - row=cast(int, index) + 2, column=col_index + 1 - ) # +2 to account for header and 1-based index - if cell.hyperlink: - value = f"[{v}]({cell.hyperlink.target})" - page_content.append(f'"{k}":"{value}"') - else: - page_content.append(f'"{k}":"{v}"') - documents.append( - Document(page_content=";".join(page_content), metadata={"source": self._file_path}) - ) + wb = load_workbook(self._file_path, read_only=True, data_only=True) + try: + for sheet_name in wb.sheetnames: + sheet = wb[sheet_name] + header_row_idx, column_map, max_col_idx = self._find_header_and_columns(sheet) + if not column_map: + continue + start_row = header_row_idx + 1 + for row in sheet.iter_rows(min_row=start_row, max_col=max_col_idx, values_only=False): + if all(cell.value is None for cell in row): + continue + page_content = [] + for col_idx, cell in enumerate(row): + value = cell.value + if col_idx in column_map: + col_name = column_map[col_idx] + if hasattr(cell, "hyperlink") and cell.hyperlink: + target = getattr(cell.hyperlink, "target", None) + if target: + value = f"[{value}]({target})" + if value is None: + value = "" + elif not isinstance(value, str): + value = str(value) + value = value.strip().replace('"', '\\"') + page_content.append(f'"{col_name}":"{value}"') + if page_content: + documents.append( + Document(page_content=";".join(page_content), metadata={"source": self._file_path}) + ) + finally: + wb.close() elif file_extension == ".xls": excel_file = pd.ExcelFile(self._file_path, engine="xlrd") @@ -63,9 +75,9 @@ class ExcelExtractor(BaseExtractor): df = excel_file.parse(sheet_name=excel_sheet_name) df.dropna(how="all", inplace=True) - for _, row in df.iterrows(): + for _, series_row in df.iterrows(): page_content = [] - for k, v in row.items(): + for k, v in series_row.items(): if pd.notna(v): page_content.append(f'"{k}":"{v}"') documents.append( @@ -75,3 +87,61 @@ class ExcelExtractor(BaseExtractor): raise ValueError(f"Unsupported file extension: {file_extension}") return documents + + def _find_header_and_columns(self, sheet, scan_rows=10) -> tuple[int, dict[int, str], int]: + """ + Scan first N rows to find the most likely header row. + Returns: + header_row_idx: 1-based index of the header row + column_map: Dict mapping 0-based column index to column name + max_col_idx: 1-based index of the last valid column (for iter_rows boundary) + """ + # Store potential candidates: (row_index, non_empty_count, column_map) + candidates: list[Candidate] = [] + + # Limit scan to avoid performance issues on huge files + # We iterate manually to control the read scope + for current_row_idx, row in enumerate(sheet.iter_rows(min_row=1, max_row=scan_rows, values_only=True), start=1): + # Filter out empty cells and build a temp map for this row + # col_idx is 0-based + row_map = {} + for col_idx, cell_value in enumerate(row): + if cell_value is not None and str(cell_value).strip(): + row_map[col_idx] = str(cell_value).strip().replace('"', '\\"') + + if not row_map: + continue + + non_empty_count = len(row_map) + + # Header selection heuristic (implemented): + # - Prefer the first row with at least 2 non-empty columns. + # - Fallback: choose the row with the most non-empty columns + # (tie-breaker: smaller row index). + candidates.append({"idx": current_row_idx, "count": non_empty_count, "map": row_map}) + + if not candidates: + return 0, {}, 0 + + # Choose the best candidate header row. + + best_candidate: Candidate | None = None + + # Strategy: prefer the first row with >= 2 non-empty columns; otherwise fallback. + + for cand in candidates: + if cand["count"] >= 2: + best_candidate = cand + break + + # Fallback: if no row has >= 2 columns, or all have 1, just take the one with max columns + if not best_candidate: + # Sort by count desc, then index asc + candidates.sort(key=lambda x: (-x["count"], x["idx"])) + best_candidate = candidates[0] + + # Determine max_col_idx (1-based for openpyxl) + # It is the index of the last valid column in our map + 1 + max_col_idx = max(best_candidate["map"].keys()) + 1 + + return best_candidate["idx"], best_candidate["map"], max_col_idx