From 8f3fd9a728ef43aec8103548888f1744678a72d6 Mon Sep 17 00:00:00 2001 From: wangxiaolei Date: Mon, 15 Dec 2025 11:40:26 +0800 Subject: [PATCH] perf: commit once (#29590) --- api/core/rag/extractor/word_extractor.py | 10 +-- .../core/rag/extractor/test_word_extractor.py | 85 +++++++++++++++++++ 2 files changed, 89 insertions(+), 6 deletions(-) diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py index 438932cfd6..044b118635 100644 --- a/api/core/rag/extractor/word_extractor.py +++ b/api/core/rag/extractor/word_extractor.py @@ -84,7 +84,7 @@ class WordExtractor(BaseExtractor): image_count = 0 image_map = {} - for rId, rel in doc.part.rels.items(): + for r_id, rel in doc.part.rels.items(): if "image" in rel.target_ref: image_count += 1 if rel.is_external: @@ -121,9 +121,8 @@ class WordExtractor(BaseExtractor): used_at=naive_utc_now(), ) db.session.add(upload_file) - db.session.commit() - # Use rId as key for external images since target_part is undefined - image_map[rId] = f"![image]({dify_config.FILES_URL}/files/{upload_file.id}/file-preview)" + # Use r_id as key for external images since target_part is undefined + image_map[r_id] = f"![image]({dify_config.FILES_URL}/files/{upload_file.id}/file-preview)" else: image_ext = rel.target_ref.split(".")[-1] if image_ext is None: @@ -151,12 +150,11 @@ class WordExtractor(BaseExtractor): used_at=naive_utc_now(), ) db.session.add(upload_file) - db.session.commit() # Use target_part as key for internal images image_map[rel.target_part] = ( f"![image]({dify_config.FILES_URL}/files/{upload_file.id}/file-preview)" ) - + db.session.commit() return image_map def _table_to_markdown(self, table, image_map): diff --git a/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py b/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py index 3635e4dbf9..fd0b0e2e44 100644 --- a/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py +++ b/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py @@ -1,7 +1,10 @@ """Primarily used for testing merged cell scenarios""" +from types import SimpleNamespace + from docx import Document +import core.rag.extractor.word_extractor as we from core.rag.extractor.word_extractor import WordExtractor @@ -47,3 +50,85 @@ def test_parse_row(): extractor = object.__new__(WordExtractor) for idx, row in enumerate(table.rows): assert extractor._parse_row(row, {}, 3) == gt[idx] + + +def test_extract_images_from_docx(monkeypatch): + external_bytes = b"ext-bytes" + internal_bytes = b"int-bytes" + + # Patch storage.save to capture writes + saves: list[tuple[str, bytes]] = [] + + def save(key: str, data: bytes): + saves.append((key, data)) + + monkeypatch.setattr(we, "storage", SimpleNamespace(save=save)) + + # Patch db.session to record adds/commit + class DummySession: + def __init__(self): + self.added = [] + self.committed = False + + def add(self, obj): + self.added.append(obj) + + def commit(self): + self.committed = True + + db_stub = SimpleNamespace(session=DummySession()) + monkeypatch.setattr(we, "db", db_stub) + + # Patch config values used for URL composition and storage type + monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False) + monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False) + + # Patch UploadFile to avoid real DB models + class FakeUploadFile: + _i = 0 + + def __init__(self, **kwargs): # kwargs match the real signature fields + type(self)._i += 1 + self.id = f"u{self._i}" + + monkeypatch.setattr(we, "UploadFile", FakeUploadFile) + + # Patch external image fetcher + def fake_get(url: str): + assert url == "https://example.com/image.png" + return SimpleNamespace(status_code=200, headers={"Content-Type": "image/png"}, content=external_bytes) + + monkeypatch.setattr(we, "ssrf_proxy", SimpleNamespace(get=fake_get)) + + # A hashable internal part object with a blob attribute + class HashablePart: + def __init__(self, blob: bytes): + self.blob = blob + + def __hash__(self) -> int: # ensure it can be used as a dict key like real docx parts + return id(self) + + # Build a minimal doc object with both external and internal image rels + internal_part = HashablePart(blob=internal_bytes) + rel_ext = SimpleNamespace(is_external=True, target_ref="https://example.com/image.png") + rel_int = SimpleNamespace(is_external=False, target_ref="word/media/image1.png", target_part=internal_part) + doc = SimpleNamespace(part=SimpleNamespace(rels={"rId1": rel_ext, "rId2": rel_int})) + + extractor = object.__new__(WordExtractor) + extractor.tenant_id = "t1" + extractor.user_id = "u1" + + image_map = extractor._extract_images_from_docx(doc) + + # Returned map should contain entries for external (keyed by rId) and internal (keyed by target_part) + assert set(image_map.keys()) == {"rId1", internal_part} + assert all(v.startswith("![image](") and v.endswith("/file-preview)") for v in image_map.values()) + + # Storage should receive both payloads + payloads = {data for _, data in saves} + assert external_bytes in payloads + assert internal_bytes in payloads + + # DB interactions should be recorded + assert len(db_stub.session.added) == 2 + assert db_stub.session.committed is True