mirror of https://github.com/langgenius/dify.git
perf: commit once (#29590)
This commit is contained in:
parent
1a18012f98
commit
8f3fd9a728
|
|
@ -84,7 +84,7 @@ class WordExtractor(BaseExtractor):
|
||||||
image_count = 0
|
image_count = 0
|
||||||
image_map = {}
|
image_map = {}
|
||||||
|
|
||||||
for rId, rel in doc.part.rels.items():
|
for r_id, rel in doc.part.rels.items():
|
||||||
if "image" in rel.target_ref:
|
if "image" in rel.target_ref:
|
||||||
image_count += 1
|
image_count += 1
|
||||||
if rel.is_external:
|
if rel.is_external:
|
||||||
|
|
@ -121,9 +121,8 @@ class WordExtractor(BaseExtractor):
|
||||||
used_at=naive_utc_now(),
|
used_at=naive_utc_now(),
|
||||||
)
|
)
|
||||||
db.session.add(upload_file)
|
db.session.add(upload_file)
|
||||||
db.session.commit()
|
# Use r_id as key for external images since target_part is undefined
|
||||||
# Use rId as key for external images since target_part is undefined
|
image_map[r_id] = f""
|
||||||
image_map[rId] = f""
|
|
||||||
else:
|
else:
|
||||||
image_ext = rel.target_ref.split(".")[-1]
|
image_ext = rel.target_ref.split(".")[-1]
|
||||||
if image_ext is None:
|
if image_ext is None:
|
||||||
|
|
@ -151,12 +150,11 @@ class WordExtractor(BaseExtractor):
|
||||||
used_at=naive_utc_now(),
|
used_at=naive_utc_now(),
|
||||||
)
|
)
|
||||||
db.session.add(upload_file)
|
db.session.add(upload_file)
|
||||||
db.session.commit()
|
|
||||||
# Use target_part as key for internal images
|
# Use target_part as key for internal images
|
||||||
image_map[rel.target_part] = (
|
image_map[rel.target_part] = (
|
||||||
f""
|
f""
|
||||||
)
|
)
|
||||||
|
db.session.commit()
|
||||||
return image_map
|
return image_map
|
||||||
|
|
||||||
def _table_to_markdown(self, table, image_map):
|
def _table_to_markdown(self, table, image_map):
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,10 @@
|
||||||
"""Primarily used for testing merged cell scenarios"""
|
"""Primarily used for testing merged cell scenarios"""
|
||||||
|
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
from docx import Document
|
from docx import Document
|
||||||
|
|
||||||
|
import core.rag.extractor.word_extractor as we
|
||||||
from core.rag.extractor.word_extractor import WordExtractor
|
from core.rag.extractor.word_extractor import WordExtractor
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -47,3 +50,85 @@ def test_parse_row():
|
||||||
extractor = object.__new__(WordExtractor)
|
extractor = object.__new__(WordExtractor)
|
||||||
for idx, row in enumerate(table.rows):
|
for idx, row in enumerate(table.rows):
|
||||||
assert extractor._parse_row(row, {}, 3) == gt[idx]
|
assert extractor._parse_row(row, {}, 3) == gt[idx]
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_images_from_docx(monkeypatch):
|
||||||
|
external_bytes = b"ext-bytes"
|
||||||
|
internal_bytes = b"int-bytes"
|
||||||
|
|
||||||
|
# Patch storage.save to capture writes
|
||||||
|
saves: list[tuple[str, bytes]] = []
|
||||||
|
|
||||||
|
def save(key: str, data: bytes):
|
||||||
|
saves.append((key, data))
|
||||||
|
|
||||||
|
monkeypatch.setattr(we, "storage", SimpleNamespace(save=save))
|
||||||
|
|
||||||
|
# Patch db.session to record adds/commit
|
||||||
|
class DummySession:
|
||||||
|
def __init__(self):
|
||||||
|
self.added = []
|
||||||
|
self.committed = False
|
||||||
|
|
||||||
|
def add(self, obj):
|
||||||
|
self.added.append(obj)
|
||||||
|
|
||||||
|
def commit(self):
|
||||||
|
self.committed = True
|
||||||
|
|
||||||
|
db_stub = SimpleNamespace(session=DummySession())
|
||||||
|
monkeypatch.setattr(we, "db", db_stub)
|
||||||
|
|
||||||
|
# Patch config values used for URL composition and storage type
|
||||||
|
monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False)
|
||||||
|
monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False)
|
||||||
|
|
||||||
|
# Patch UploadFile to avoid real DB models
|
||||||
|
class FakeUploadFile:
|
||||||
|
_i = 0
|
||||||
|
|
||||||
|
def __init__(self, **kwargs): # kwargs match the real signature fields
|
||||||
|
type(self)._i += 1
|
||||||
|
self.id = f"u{self._i}"
|
||||||
|
|
||||||
|
monkeypatch.setattr(we, "UploadFile", FakeUploadFile)
|
||||||
|
|
||||||
|
# Patch external image fetcher
|
||||||
|
def fake_get(url: str):
|
||||||
|
assert url == "https://example.com/image.png"
|
||||||
|
return SimpleNamespace(status_code=200, headers={"Content-Type": "image/png"}, content=external_bytes)
|
||||||
|
|
||||||
|
monkeypatch.setattr(we, "ssrf_proxy", SimpleNamespace(get=fake_get))
|
||||||
|
|
||||||
|
# A hashable internal part object with a blob attribute
|
||||||
|
class HashablePart:
|
||||||
|
def __init__(self, blob: bytes):
|
||||||
|
self.blob = blob
|
||||||
|
|
||||||
|
def __hash__(self) -> int: # ensure it can be used as a dict key like real docx parts
|
||||||
|
return id(self)
|
||||||
|
|
||||||
|
# Build a minimal doc object with both external and internal image rels
|
||||||
|
internal_part = HashablePart(blob=internal_bytes)
|
||||||
|
rel_ext = SimpleNamespace(is_external=True, target_ref="https://example.com/image.png")
|
||||||
|
rel_int = SimpleNamespace(is_external=False, target_ref="word/media/image1.png", target_part=internal_part)
|
||||||
|
doc = SimpleNamespace(part=SimpleNamespace(rels={"rId1": rel_ext, "rId2": rel_int}))
|
||||||
|
|
||||||
|
extractor = object.__new__(WordExtractor)
|
||||||
|
extractor.tenant_id = "t1"
|
||||||
|
extractor.user_id = "u1"
|
||||||
|
|
||||||
|
image_map = extractor._extract_images_from_docx(doc)
|
||||||
|
|
||||||
|
# Returned map should contain entries for external (keyed by rId) and internal (keyed by target_part)
|
||||||
|
assert set(image_map.keys()) == {"rId1", internal_part}
|
||||||
|
assert all(v.startswith(" and v.endswith("/file-preview)") for v in image_map.values())
|
||||||
|
|
||||||
|
# Storage should receive both payloads
|
||||||
|
payloads = {data for _, data in saves}
|
||||||
|
assert external_bytes in payloads
|
||||||
|
assert internal_bytes in payloads
|
||||||
|
|
||||||
|
# DB interactions should be recorded
|
||||||
|
assert len(db_stub.session.added) == 2
|
||||||
|
assert db_stub.session.committed is True
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue