From 114a34e0086dd59c46325ffb21adc4de49d36dab Mon Sep 17 00:00:00 2001 From: Zhiqiang Yang Date: Tue, 6 Jan 2026 03:24:26 +0000 Subject: [PATCH] fix: correct docx hyperlink extraction (#30360) --- api/core/rag/extractor/word_extractor.py | 131 +++++++++++++----- .../core/rag/extractor/test_word_extractor.py | 111 +++++++++++++++ 2 files changed, 207 insertions(+), 35 deletions(-) diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py index f67f613e9d..511f5a698d 100644 --- a/api/core/rag/extractor/word_extractor.py +++ b/api/core/rag/extractor/word_extractor.py @@ -7,10 +7,11 @@ import re import tempfile import uuid from urllib.parse import urlparse -from xml.etree import ElementTree import httpx from docx import Document as DocxDocument +from docx.oxml.ns import qn +from docx.text.run import Run from configs import dify_config from core.helper import ssrf_proxy @@ -229,44 +230,20 @@ class WordExtractor(BaseExtractor): image_map = self._extract_images_from_docx(doc) - hyperlinks_url = None - url_pattern = re.compile(r"http://[^\s+]+//|https://[^\s+]+") - for para in doc.paragraphs: - for run in para.runs: - if run.text and hyperlinks_url: - result = f" [{run.text}]({hyperlinks_url}) " - run.text = result - hyperlinks_url = None - if "HYPERLINK" in run.element.xml: - try: - xml = ElementTree.XML(run.element.xml) - x_child = [c for c in xml.iter() if c is not None] - for x in x_child: - if x is None: - continue - if x.tag.endswith("instrText"): - if x.text is None: - continue - for i in url_pattern.findall(x.text): - hyperlinks_url = str(i) - except Exception: - logger.exception("Failed to parse HYPERLINK xml") - def parse_paragraph(paragraph): - paragraph_content = [] - - def append_image_link(image_id, has_drawing): + def append_image_link(image_id, has_drawing, target_buffer): """Helper to append image link from image_map based on relationship type.""" rel = doc.part.rels[image_id] if rel.is_external: if image_id in image_map and not has_drawing: - paragraph_content.append(image_map[image_id]) + target_buffer.append(image_map[image_id]) else: image_part = rel.target_part if image_part in image_map and not has_drawing: - paragraph_content.append(image_map[image_part]) + target_buffer.append(image_map[image_part]) - for run in paragraph.runs: + def process_run(run, target_buffer): + # Helper to extract text and embedded images from a run element and append them to target_buffer if hasattr(run.element, "tag") and isinstance(run.element.tag, str) and run.element.tag.endswith("r"): # Process drawing type images drawing_elements = run.element.findall( @@ -287,13 +264,13 @@ class WordExtractor(BaseExtractor): # External image: use embed_id as key if embed_id in image_map: has_drawing = True - paragraph_content.append(image_map[embed_id]) + target_buffer.append(image_map[embed_id]) else: # Internal image: use target_part as key image_part = doc.part.related_parts.get(embed_id) if image_part in image_map: has_drawing = True - paragraph_content.append(image_map[image_part]) + target_buffer.append(image_map[image_part]) # Process pict type images shape_elements = run.element.findall( ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pict" @@ -308,7 +285,7 @@ class WordExtractor(BaseExtractor): "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id" ) if image_id and image_id in doc.part.rels: - append_image_link(image_id, has_drawing) + append_image_link(image_id, has_drawing, target_buffer) # Find imagedata element in VML image_data = shape.find(".//{urn:schemas-microsoft-com:vml}imagedata") if image_data is not None: @@ -316,9 +293,93 @@ class WordExtractor(BaseExtractor): "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id" ) if image_id and image_id in doc.part.rels: - append_image_link(image_id, has_drawing) + append_image_link(image_id, has_drawing, target_buffer) if run.text.strip(): - paragraph_content.append(run.text.strip()) + target_buffer.append(run.text.strip()) + + def process_hyperlink(hyperlink_elem, target_buffer): + # Helper to extract text from a hyperlink element and append it to target_buffer + r_id = hyperlink_elem.get(qn("r:id")) + + # Extract text from runs inside the hyperlink + link_text_parts = [] + for run_elem in hyperlink_elem.findall(qn("w:r")): + run = Run(run_elem, paragraph) + # Hyperlink text may be split across multiple runs (e.g., with different formatting), + # so collect all run texts first + if run.text: + link_text_parts.append(run.text) + + link_text = "".join(link_text_parts).strip() + + # Resolve URL + if r_id: + try: + rel = doc.part.rels.get(r_id) + if rel and rel.is_external: + link_text = f"[{link_text or rel.target_ref}]({rel.target_ref})" + except Exception: + logger.exception("Failed to resolve URL for hyperlink with r:id: %s", r_id) + + if link_text: + target_buffer.append(link_text) + + paragraph_content = [] + # State for legacy HYPERLINK fields + hyperlink_field_url = None + hyperlink_field_text_parts: list = [] + is_collecting_field_text = False + # Iterate through paragraph elements in document order + for child in paragraph._element: + tag = child.tag + if tag == qn("w:r"): + # Regular run + run = Run(child, paragraph) + + # Check for fldChar (begin/end/separate) and instrText for legacy hyperlinks + fld_chars = child.findall(qn("w:fldChar")) + instr_texts = child.findall(qn("w:instrText")) + + # Handle Fields + if fld_chars or instr_texts: + # Process instrText to find HYPERLINK "url" + for instr in instr_texts: + if instr.text and "HYPERLINK" in instr.text: + # Quick regex to extract URL + match = re.search(r'HYPERLINK\s+"([^"]+)"', instr.text, re.IGNORECASE) + if match: + hyperlink_field_url = match.group(1) + + # Process fldChar + for fld_char in fld_chars: + fld_char_type = fld_char.get(qn("w:fldCharType")) + if fld_char_type == "begin": + # Start of a field: reset legacy link state + hyperlink_field_url = None + hyperlink_field_text_parts = [] + is_collecting_field_text = False + elif fld_char_type == "separate": + # Separator: if we found a URL, start collecting visible text + if hyperlink_field_url: + is_collecting_field_text = True + elif fld_char_type == "end": + # End of field + if is_collecting_field_text and hyperlink_field_url: + # Create markdown link and append to main content + display_text = "".join(hyperlink_field_text_parts).strip() + if display_text: + link_md = f"[{display_text}]({hyperlink_field_url})" + paragraph_content.append(link_md) + # Reset state + hyperlink_field_url = None + hyperlink_field_text_parts = [] + is_collecting_field_text = False + + # Decide where to append content + target_buffer = hyperlink_field_text_parts if is_collecting_field_text else paragraph_content + process_run(run, target_buffer) + elif tag == qn("w:hyperlink"): + process_hyperlink(child, paragraph_content) return "".join(paragraph_content) if paragraph_content else "" paragraphs = doc.paragraphs.copy() diff --git a/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py b/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py index 3203aab8c3..f9e59a5f05 100644 --- a/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py +++ b/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py @@ -1,8 +1,12 @@ """Primarily used for testing merged cell scenarios""" +import os +import tempfile from types import SimpleNamespace from docx import Document +from docx.oxml import OxmlElement +from docx.oxml.ns import qn import core.rag.extractor.word_extractor as we from core.rag.extractor.word_extractor import WordExtractor @@ -165,3 +169,110 @@ def test_extract_images_from_docx_uses_internal_files_url(): dify_config.FILES_URL = original_files_url if original_internal_files_url is not None: dify_config.INTERNAL_FILES_URL = original_internal_files_url + + +def test_extract_hyperlinks(monkeypatch): + # Mock db and storage to avoid issues during image extraction (even if no images are present) + monkeypatch.setattr(we, "storage", SimpleNamespace(save=lambda k, d: None)) + db_stub = SimpleNamespace(session=SimpleNamespace(add=lambda o: None, commit=lambda: None)) + monkeypatch.setattr(we, "db", db_stub) + monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False) + monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False) + + doc = Document() + p = doc.add_paragraph("Visit ") + + # Adding a hyperlink manually + r_id = "rId99" + hyperlink = OxmlElement("w:hyperlink") + hyperlink.set(qn("r:id"), r_id) + + new_run = OxmlElement("w:r") + t = OxmlElement("w:t") + t.text = "Dify" + new_run.append(t) + hyperlink.append(new_run) + p._p.append(hyperlink) + + # Add relationship to the part + doc.part.rels.add_relationship( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink", + "https://dify.ai", + r_id, + is_external=True, + ) + + with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp: + doc.save(tmp.name) + tmp_path = tmp.name + + try: + extractor = WordExtractor(tmp_path, "tenant_id", "user_id") + docs = extractor.extract() + # Verify modern hyperlink extraction + assert "Visit[Dify](https://dify.ai)" in docs[0].page_content + finally: + if os.path.exists(tmp_path): + os.remove(tmp_path) + + +def test_extract_legacy_hyperlinks(monkeypatch): + # Mock db and storage + monkeypatch.setattr(we, "storage", SimpleNamespace(save=lambda k, d: None)) + db_stub = SimpleNamespace(session=SimpleNamespace(add=lambda o: None, commit=lambda: None)) + monkeypatch.setattr(we, "db", db_stub) + monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False) + monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False) + + doc = Document() + p = doc.add_paragraph() + + # Construct a legacy HYPERLINK field: + # 1. w:fldChar (begin) + # 2. w:instrText (HYPERLINK "http://example.com") + # 3. w:fldChar (separate) + # 4. w:r (visible text "Example") + # 5. w:fldChar (end) + + run1 = OxmlElement("w:r") + fldCharBegin = OxmlElement("w:fldChar") + fldCharBegin.set(qn("w:fldCharType"), "begin") + run1.append(fldCharBegin) + p._p.append(run1) + + run2 = OxmlElement("w:r") + instrText = OxmlElement("w:instrText") + instrText.text = ' HYPERLINK "http://example.com" ' + run2.append(instrText) + p._p.append(run2) + + run3 = OxmlElement("w:r") + fldCharSep = OxmlElement("w:fldChar") + fldCharSep.set(qn("w:fldCharType"), "separate") + run3.append(fldCharSep) + p._p.append(run3) + + run4 = OxmlElement("w:r") + t4 = OxmlElement("w:t") + t4.text = "Example" + run4.append(t4) + p._p.append(run4) + + run5 = OxmlElement("w:r") + fldCharEnd = OxmlElement("w:fldChar") + fldCharEnd.set(qn("w:fldCharType"), "end") + run5.append(fldCharEnd) + p._p.append(run5) + + with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp: + doc.save(tmp.name) + tmp_path = tmp.name + + try: + extractor = WordExtractor(tmp_path, "tenant_id", "user_id") + docs = extractor.extract() + # Verify legacy hyperlink extraction + assert "[Example](http://example.com)" in docs[0].page_content + finally: + if os.path.exists(tmp_path): + os.remove(tmp_path)