fix: docx extractor external image failed (#29558)

This commit is contained in:
Jyong 2025-12-12 13:41:51 +08:00 committed by GitHub
parent ac40309850
commit db42f467c8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 89 additions and 41 deletions

View File

@ -84,22 +84,46 @@ class WordExtractor(BaseExtractor):
image_count = 0
image_map = {}
for rel in doc.part.rels.values():
for rId, rel in doc.part.rels.items():
if "image" in rel.target_ref:
image_count += 1
if rel.is_external:
url = rel.target_ref
response = ssrf_proxy.get(url)
if not self._is_valid_url(url):
continue
try:
response = ssrf_proxy.get(url)
except Exception as e:
logger.warning("Failed to download image from URL: %s: %s", url, str(e))
continue
if response.status_code == 200:
image_ext = mimetypes.guess_extension(response.headers["Content-Type"])
image_ext = mimetypes.guess_extension(response.headers.get("Content-Type", ""))
if image_ext is None:
continue
file_uuid = str(uuid.uuid4())
file_key = "image_files/" + self.tenant_id + "/" + file_uuid + "." + image_ext
file_key = "image_files/" + self.tenant_id + "/" + file_uuid + image_ext
mime_type, _ = mimetypes.guess_type(file_key)
storage.save(file_key, response.content)
else:
continue
# save file to db
upload_file = UploadFile(
tenant_id=self.tenant_id,
storage_type=dify_config.STORAGE_TYPE,
key=file_key,
name=file_key,
size=0,
extension=str(image_ext),
mime_type=mime_type or "",
created_by=self.user_id,
created_by_role=CreatorUserRole.ACCOUNT,
created_at=naive_utc_now(),
used=True,
used_by=self.user_id,
used_at=naive_utc_now(),
)
db.session.add(upload_file)
db.session.commit()
# Use rId as key for external images since target_part is undefined
image_map[rId] = f"![image]({dify_config.FILES_URL}/files/{upload_file.id}/file-preview)"
else:
image_ext = rel.target_ref.split(".")[-1]
if image_ext is None:
@ -110,26 +134,28 @@ class WordExtractor(BaseExtractor):
mime_type, _ = mimetypes.guess_type(file_key)
storage.save(file_key, rel.target_part.blob)
# save file to db
upload_file = UploadFile(
tenant_id=self.tenant_id,
storage_type=dify_config.STORAGE_TYPE,
key=file_key,
name=file_key,
size=0,
extension=str(image_ext),
mime_type=mime_type or "",
created_by=self.user_id,
created_by_role=CreatorUserRole.ACCOUNT,
created_at=naive_utc_now(),
used=True,
used_by=self.user_id,
used_at=naive_utc_now(),
)
db.session.add(upload_file)
db.session.commit()
image_map[rel.target_part] = f"![image]({dify_config.FILES_URL}/files/{upload_file.id}/file-preview)"
# save file to db
upload_file = UploadFile(
tenant_id=self.tenant_id,
storage_type=dify_config.STORAGE_TYPE,
key=file_key,
name=file_key,
size=0,
extension=str(image_ext),
mime_type=mime_type or "",
created_by=self.user_id,
created_by_role=CreatorUserRole.ACCOUNT,
created_at=naive_utc_now(),
used=True,
used_by=self.user_id,
used_at=naive_utc_now(),
)
db.session.add(upload_file)
db.session.commit()
# Use target_part as key for internal images
image_map[rel.target_part] = (
f"![image]({dify_config.FILES_URL}/files/{upload_file.id}/file-preview)"
)
return image_map
@ -186,11 +212,17 @@ class WordExtractor(BaseExtractor):
image_id = blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")
if not image_id:
continue
image_part = paragraph.part.rels[image_id].target_part
if image_part in image_map:
image_link = image_map[image_part]
paragraph_content.append(image_link)
rel = paragraph.part.rels.get(image_id)
if rel is None:
continue
# For external images, use image_id as key; for internal, use target_part
if rel.is_external:
if image_id in image_map:
paragraph_content.append(image_map[image_id])
else:
image_part = rel.target_part
if image_part in image_map:
paragraph_content.append(image_map[image_part])
else:
paragraph_content.append(run.text)
return "".join(paragraph_content).strip()
@ -227,6 +259,18 @@ class WordExtractor(BaseExtractor):
def parse_paragraph(paragraph):
paragraph_content = []
def append_image_link(image_id, has_drawing):
"""Helper to append image link from image_map based on relationship type."""
rel = doc.part.rels[image_id]
if rel.is_external:
if image_id in image_map and not has_drawing:
paragraph_content.append(image_map[image_id])
else:
image_part = rel.target_part
if image_part in image_map and not has_drawing:
paragraph_content.append(image_map[image_part])
for run in paragraph.runs:
if hasattr(run.element, "tag") and isinstance(run.element.tag, str) and run.element.tag.endswith("r"):
# Process drawing type images
@ -243,10 +287,18 @@ class WordExtractor(BaseExtractor):
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
)
if embed_id:
image_part = doc.part.related_parts.get(embed_id)
if image_part in image_map:
has_drawing = True
paragraph_content.append(image_map[image_part])
rel = doc.part.rels.get(embed_id)
if rel is not None and rel.is_external:
# External image: use embed_id as key
if embed_id in image_map:
has_drawing = True
paragraph_content.append(image_map[embed_id])
else:
# Internal image: use target_part as key
image_part = doc.part.related_parts.get(embed_id)
if image_part in image_map:
has_drawing = True
paragraph_content.append(image_map[image_part])
# Process pict type images
shape_elements = run.element.findall(
".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pict"
@ -261,9 +313,7 @@ class WordExtractor(BaseExtractor):
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
)
if image_id and image_id in doc.part.rels:
image_part = doc.part.rels[image_id].target_part
if image_part in image_map and not has_drawing:
paragraph_content.append(image_map[image_part])
append_image_link(image_id, has_drawing)
# Find imagedata element in VML
image_data = shape.find(".//{urn:schemas-microsoft-com:vml}imagedata")
if image_data is not None:
@ -271,9 +321,7 @@ class WordExtractor(BaseExtractor):
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
)
if image_id and image_id in doc.part.rels:
image_part = doc.part.rels[image_id].target_part
if image_part in image_map and not has_drawing:
paragraph_content.append(image_map[image_part])
append_image_link(image_id, has_drawing)
if run.text.strip():
paragraph_content.append(run.text.strip())
return "".join(paragraph_content) if paragraph_content else ""