From fc91a7a38b9c23160e51373072c1674a8c083580 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yanli=20=E7=9B=90=E7=B2=92?= Date: Mon, 9 Feb 2026 20:14:52 +0800 Subject: [PATCH] Fix docx segment image URLs --- api/core/rag/extractor/pdf_extractor.py | 3 +- api/core/rag/extractor/word_extractor.py | 5 ++- api/models/dataset.py | 4 +-- .../core/rag/extractor/test_pdf_extractor.py | 4 +-- .../core/rag/extractor/test_word_extractor.py | 36 ++----------------- .../unit_tests/models/test_dataset_models.py | 29 +++++++++++++++ 6 files changed, 38 insertions(+), 43 deletions(-) diff --git a/api/core/rag/extractor/pdf_extractor.py b/api/core/rag/extractor/pdf_extractor.py index 6aabcac704..90b5ec001c 100644 --- a/api/core/rag/extractor/pdf_extractor.py +++ b/api/core/rag/extractor/pdf_extractor.py @@ -114,7 +114,6 @@ class PdfExtractor(BaseExtractor): """ image_content = [] upload_files = [] - base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL try: image_objects = page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,)) @@ -164,7 +163,7 @@ class PdfExtractor(BaseExtractor): used_at=naive_utc_now(), ) upload_files.append(upload_file) - image_content.append(f"![image]({base_url}/files/{upload_file.id}/file-preview)") + image_content.append(f"![image](/files/{upload_file.id}/file-preview)") except Exception as e: logger.warning("Failed to extract image from PDF: %s", e) continue diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py index 1ddbfc5864..9df0ea07ff 100644 --- a/api/core/rag/extractor/word_extractor.py +++ b/api/core/rag/extractor/word_extractor.py @@ -87,7 +87,6 @@ class WordExtractor(BaseExtractor): def _extract_images_from_docx(self, doc): image_count = 0 image_map = {} - base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL for r_id, rel in doc.part.rels.items(): if "image" in rel.target_ref: @@ -126,7 +125,7 @@ class WordExtractor(BaseExtractor): used_at=naive_utc_now(), ) db.session.add(upload_file) - image_map[r_id] = f"![image]({base_url}/files/{upload_file.id}/file-preview)" + image_map[r_id] = f"![image](/files/{upload_file.id}/file-preview)" else: image_ext = rel.target_ref.split(".")[-1] if image_ext is None: @@ -154,7 +153,7 @@ class WordExtractor(BaseExtractor): used_at=naive_utc_now(), ) db.session.add(upload_file) - image_map[rel.target_part] = f"![image]({base_url}/files/{upload_file.id}/file-preview)" + image_map[rel.target_part] = f"![image](/files/{upload_file.id}/file-preview)" db.session.commit() return image_map diff --git a/api/models/dataset.py b/api/models/dataset.py index e7da2961bc..02b27c4161 100644 --- a/api/models/dataset.py +++ b/api/models/dataset.py @@ -809,7 +809,7 @@ class DocumentSegment(Base): text = self.content # For data before v0.10.0 - pattern = r"/files/([a-f0-9\-]+)/image-preview(?:\?.*?)?" + pattern = r"(?:https?://[^\s\)\"\']+)?/files/([a-f0-9\-]+)/image-preview(?:\?[^\s\)\"\']*)?" matches = re.finditer(pattern, text) for match in matches: upload_file_id = match.group(1) @@ -826,7 +826,7 @@ class DocumentSegment(Base): signed_urls.append((match.start(), match.end(), signed_url)) # For data after v0.10.0 - pattern = r"/files/([a-f0-9\-]+)/file-preview(?:\?.*?)?" + pattern = r"(?:https?://[^\s\)\"\']+)?/files/([a-f0-9\-]+)/file-preview(?:\?[^\s\)\"\']*)?" matches = re.finditer(pattern, text) for match in matches: upload_file_id = match.group(1) diff --git a/api/tests/unit_tests/core/rag/extractor/test_pdf_extractor.py b/api/tests/unit_tests/core/rag/extractor/test_pdf_extractor.py index 3167a9a301..66aceec87c 100644 --- a/api/tests/unit_tests/core/rag/extractor/test_pdf_extractor.py +++ b/api/tests/unit_tests/core/rag/extractor/test_pdf_extractor.py @@ -87,7 +87,7 @@ def test_extract_images_formats(mock_dependencies, monkeypatch, image_bytes, exp mock_raw.FPDF_PAGEOBJ_IMAGE = 1 result = extractor._extract_images(mock_page) - assert f"![image](http://files.local/files/{file_id}/file-preview)" in result + assert f"![image](/files/{file_id}/file-preview)" in result assert len(saves) == 1 assert saves[0][1] == image_bytes assert len(db_stub.session.added) == 1 @@ -180,7 +180,7 @@ def test_extract_images_failures(mock_dependencies): result = extractor._extract_images(mock_page) # Should have one success - assert "![image](http://files.local/files/test_file_id/file-preview)" in result + assert "![image](/files/test_file_id/file-preview)" in result assert len(saves) == 1 assert saves[0][1] == jpeg_bytes assert db_stub.session.committed is True diff --git a/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py b/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py index 0792ada194..a46c3bb19c 100644 --- a/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py +++ b/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py @@ -123,6 +123,7 @@ def test_extract_images_from_docx(monkeypatch): # Patch config values used for URL composition and storage type monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False) + monkeypatch.setattr(we.dify_config, "INTERNAL_FILES_URL", "http://internal.docker:5001", raising=False) monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False) # Patch UploadFile to avoid real DB models @@ -164,7 +165,7 @@ def test_extract_images_from_docx(monkeypatch): # Returned map should contain entries for external (keyed by rId) and internal (keyed by target_part) assert set(image_map.keys()) == {"rId1", internal_part} - assert all(v.startswith("![image](") and v.endswith("/file-preview)") for v in image_map.values()) + assert all(v.startswith("![image](/files/") and v.endswith("/file-preview)") for v in image_map.values()) # Storage should receive both payloads payloads = {data for _, data in saves} @@ -176,39 +177,6 @@ def test_extract_images_from_docx(monkeypatch): assert db_stub.session.committed is True -def test_extract_images_from_docx_uses_internal_files_url(): - """Test that INTERNAL_FILES_URL takes precedence over FILES_URL for plugin access.""" - # Test the URL generation logic directly - from configs import dify_config - - # Mock the configuration values - original_files_url = getattr(dify_config, "FILES_URL", None) - original_internal_files_url = getattr(dify_config, "INTERNAL_FILES_URL", None) - - try: - # Set both URLs - INTERNAL should take precedence - dify_config.FILES_URL = "http://external.example.com" - dify_config.INTERNAL_FILES_URL = "http://internal.docker:5001" - - # Test the URL generation logic (same as in word_extractor.py) - upload_file_id = "test_file_id" - - # This is the pattern we fixed in the word extractor - base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL - generated_url = f"{base_url}/files/{upload_file_id}/file-preview" - - # Verify that INTERNAL_FILES_URL is used instead of FILES_URL - assert "http://internal.docker:5001" in generated_url, f"Expected internal URL, got: {generated_url}" - assert "http://external.example.com" not in generated_url, f"Should not use external URL, got: {generated_url}" - - finally: - # Restore original values - if original_files_url is not None: - dify_config.FILES_URL = original_files_url - if original_internal_files_url is not None: - dify_config.INTERNAL_FILES_URL = original_internal_files_url - - def test_extract_hyperlinks(monkeypatch): # Mock db and storage to avoid issues during image extraction (even if no images are present) monkeypatch.setattr(we, "storage", SimpleNamespace(save=lambda k, d: None)) diff --git a/api/tests/unit_tests/models/test_dataset_models.py b/api/tests/unit_tests/models/test_dataset_models.py index 2322c556e2..e6ec9fd1b8 100644 --- a/api/tests/unit_tests/models/test_dataset_models.py +++ b/api/tests/unit_tests/models/test_dataset_models.py @@ -547,6 +547,35 @@ class TestDocumentSegmentIndexing: assert segment.index_node_hash == index_node_hash assert segment.keywords == keywords + def test_document_segment_sign_content_strips_absolute_files_host(self): + """Test that sign_content strips scheme/host from absolute /files URLs and returns a signed relative URL.""" + # Arrange + upload_file_id = "1602650a-4fe4-423c-85a2-af76c083e3c4" + segment = DocumentSegment( + tenant_id=str(uuid4()), + dataset_id=str(uuid4()), + document_id=str(uuid4()), + position=1, + content=f"![image](http://internal.docker:5001/files/{upload_file_id}/file-preview)", + word_count=1, + tokens=1, + created_by=str(uuid4()), + ) + + import models.dataset as dataset_module + + # Act + with patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True), patch( + "models.dataset.time.time", return_value=1700000000 + ), patch("models.dataset.os.urandom", return_value=b"\x00" * 16): + signed = segment.get_sign_content() + + # Assert + assert "internal.docker:5001" not in signed + assert f"/files/{upload_file_id}/file-preview?timestamp=" in signed + assert "&nonce=" in signed + assert "&sign=" in signed + def test_document_segment_with_answer_field(self): """Test creating a document segment with answer field for QA model.""" # Arrange