perf(core/rag): optimize Excel extractor performance and memory usage (#29551)

Co-authored-by: 01393547 <nieronghua@sf-express.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
2025-12-12 12:15:03 +08:00 · 2025-12-12 12:15:03 +08:00 · 12e39365fa
parent d48300d08c
commit 12e39365fa
1 changed files with 99 additions and 29 deletions
--- a/api/core/rag/extractor/excel_extractor.py
+++ b/api/core/rag/extractor/excel_extractor.py
@ -1,7 +1,7 @@
 """Abstract interface for document loader implementations."""

 import os
-from typing import cast
+from typing import TypedDict

 import pandas as pd
 from openpyxl import load_workbook
@ -10,6 +10,12 @@ from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document


+class Candidate(TypedDict):
+    idx: int
+    count: int
+    map: dict[int, str]
+
+
 class ExcelExtractor(BaseExtractor):
    """Load Excel files.

@ -30,32 +36,38 @@ class ExcelExtractor(BaseExtractor):
        file_extension = os.path.splitext(self._file_path)[-1].lower()

        if file_extension == ".xlsx":
-            wb = load_workbook(self._file_path, data_only=True)
-            for sheet_name in wb.sheetnames:
-                sheet = wb[sheet_name]
-                data = sheet.values
-                cols = next(data, None)
-                if cols is None:
-                    continue
-                df = pd.DataFrame(data, columns=cols)
-
-                df.dropna(how="all", inplace=True)
-
-                for index, row in df.iterrows():
-                    page_content = []
-                    for col_index, (k, v) in enumerate(row.items()):
-                        if pd.notna(v):
-                            cell = sheet.cell(
-                                row=cast(int, index) + 2, column=col_index + 1
-                            )  # +2 to account for header and 1-based index
-                            if cell.hyperlink:
-                                value = f"[{v}]({cell.hyperlink.target})"
-                                page_content.append(f'"{k}":"{value}"')
-                            else:
-                                page_content.append(f'"{k}":"{v}"')
-                    documents.append(
-                        Document(page_content=";".join(page_content), metadata={"source": self._file_path})
-                    )
+            wb = load_workbook(self._file_path, read_only=True, data_only=True)
+            try:
+                for sheet_name in wb.sheetnames:
+                    sheet = wb[sheet_name]
+                    header_row_idx, column_map, max_col_idx = self._find_header_and_columns(sheet)
+                    if not column_map:
+                        continue
+                    start_row = header_row_idx + 1
+                    for row in sheet.iter_rows(min_row=start_row, max_col=max_col_idx, values_only=False):
+                        if all(cell.value is None for cell in row):
+                            continue
+                        page_content = []
+                        for col_idx, cell in enumerate(row):
+                            value = cell.value
+                            if col_idx in column_map:
+                                col_name = column_map[col_idx]
+                                if hasattr(cell, "hyperlink") and cell.hyperlink:
+                                    target = getattr(cell.hyperlink, "target", None)
+                                    if target:
+                                        value = f"[{value}]({target})"
+                                if value is None:
+                                    value = ""
+                                elif not isinstance(value, str):
+                                    value = str(value)
+                                value = value.strip().replace('"', '\\"')
+                                page_content.append(f'"{col_name}":"{value}"')
+                        if page_content:
+                            documents.append(
+                                Document(page_content=";".join(page_content), metadata={"source": self._file_path})
+                            )
+            finally:
+                wb.close()

        elif file_extension == ".xls":
            excel_file = pd.ExcelFile(self._file_path, engine="xlrd")
@ -63,9 +75,9 @@ class ExcelExtractor(BaseExtractor):
                df = excel_file.parse(sheet_name=excel_sheet_name)
                df.dropna(how="all", inplace=True)

-                for _, row in df.iterrows():
+                for _, series_row in df.iterrows():
                    page_content = []
-                    for k, v in row.items():
+                    for k, v in series_row.items():
                        if pd.notna(v):
                            page_content.append(f'"{k}":"{v}"')
                    documents.append(
@ -75,3 +87,61 @@ class ExcelExtractor(BaseExtractor):
            raise ValueError(f"Unsupported file extension: {file_extension}")

        return documents
+
+    def _find_header_and_columns(self, sheet, scan_rows=10) -> tuple[int, dict[int, str], int]:
+        """
+        Scan first N rows to find the most likely header row.
+        Returns:
+            header_row_idx: 1-based index of the header row
+            column_map: Dict mapping 0-based column index to column name
+            max_col_idx: 1-based index of the last valid column (for iter_rows boundary)
+        """
+        # Store potential candidates: (row_index, non_empty_count, column_map)
+        candidates: list[Candidate] = []
+
+        # Limit scan to avoid performance issues on huge files
+        # We iterate manually to control the read scope
+        for current_row_idx, row in enumerate(sheet.iter_rows(min_row=1, max_row=scan_rows, values_only=True), start=1):
+            # Filter out empty cells and build a temp map for this row
+            # col_idx is 0-based
+            row_map = {}
+            for col_idx, cell_value in enumerate(row):
+                if cell_value is not None and str(cell_value).strip():
+                    row_map[col_idx] = str(cell_value).strip().replace('"', '\\"')
+
+            if not row_map:
+                continue
+
+            non_empty_count = len(row_map)
+
+            # Header selection heuristic (implemented):
+            # - Prefer the first row with at least 2 non-empty columns.
+            # - Fallback: choose the row with the most non-empty columns
+            #   (tie-breaker: smaller row index).
+            candidates.append({"idx": current_row_idx, "count": non_empty_count, "map": row_map})
+
+        if not candidates:
+            return 0, {}, 0
+
+        # Choose the best candidate header row.
+
+        best_candidate: Candidate | None = None
+
+        # Strategy: prefer the first row with >= 2 non-empty columns; otherwise fallback.
+
+        for cand in candidates:
+            if cand["count"] >= 2:
+                best_candidate = cand
+                break
+
+        # Fallback: if no row has >= 2 columns, or all have 1, just take the one with max columns
+        if not best_candidate:
+            # Sort by count desc, then index asc
+            candidates.sort(key=lambda x: (-x["count"], x["idx"]))
+            best_candidate = candidates[0]
+
+        # Determine max_col_idx (1-based for openpyxl)
+        # It is the index of the last valid column in our map + 1
+        max_col_idx = max(best_candidate["map"].keys()) + 1
+
+        return best_candidate["idx"], best_candidate["map"], max_col_idx