From 269bf883c2a400565e5146fc6ade6b8385905869 Mon Sep 17 00:00:00 2001 From: FFXN Date: Tue, 3 Mar 2026 14:31:51 +0800 Subject: [PATCH 01/15] fix: Add the validation of doc_form in the Document-related service APIs. --- api/controllers/console/datasets/datasets.py | 8 ++++++++ api/controllers/service_api/dataset/document.py | 16 +++++++++++++++- api/models/dataset.py | 1 + .../knowledge_entities/knowledge_entities.py | 15 ++++++++++++++- 4 files changed, 38 insertions(+), 2 deletions(-) diff --git a/api/controllers/console/datasets/datasets.py b/api/controllers/console/datasets/datasets.py index a06b872846..b1cde105a9 100644 --- a/api/controllers/console/datasets/datasets.py +++ b/api/controllers/console/datasets/datasets.py @@ -119,6 +119,14 @@ def _validate_indexing_technique(value: str | None) -> str | None: return value +def _validate_doc_form(value: str | None) -> str | None: + if value is None: + return value + if value not in Dataset.DOC_FORM_LIST: + raise ValueError("Invalid doc_form.") + return value + + class DatasetCreatePayload(BaseModel): name: str = Field(..., min_length=1, max_length=40) description: str = Field("", max_length=400) diff --git a/api/controllers/service_api/dataset/document.py b/api/controllers/service_api/dataset/document.py index 0aeb4a2d36..dc8da025d4 100644 --- a/api/controllers/service_api/dataset/document.py +++ b/api/controllers/service_api/dataset/document.py @@ -4,7 +4,7 @@ from uuid import UUID from flask import request from flask_restx import marshal -from pydantic import BaseModel, Field, model_validator +from pydantic import BaseModel, Field, field_validator, model_validator from sqlalchemy import desc, select from werkzeug.exceptions import Forbidden, NotFound @@ -60,6 +60,13 @@ class DocumentTextCreatePayload(BaseModel): embedding_model: str | None = None embedding_model_provider: str | None = None + @field_validator("doc_form") + @classmethod + def validate_doc_form(cls, value: str) -> str: + if value not in Dataset.DOC_FORM_LIST: + raise ValueError("Invalid doc_form.") + return value + DEFAULT_REF_TEMPLATE_SWAGGER_2_0 = "#/definitions/{model}" @@ -72,6 +79,13 @@ class DocumentTextUpdate(BaseModel): doc_language: str = "English" retrieval_model: RetrievalModel | None = None + @field_validator("doc_form") + @classmethod + def validate_doc_form(cls, value: str) -> str: + if value not in Dataset.DOC_FORM_LIST: + raise ValueError("Invalid doc_form.") + return value + @model_validator(mode="after") def check_text_and_name(self) -> Self: if self.text is not None and self.name is None: diff --git a/api/models/dataset.py b/api/models/dataset.py index e7da2961bc..b7b803f4ea 100644 --- a/api/models/dataset.py +++ b/api/models/dataset.py @@ -51,6 +51,7 @@ class Dataset(Base): INDEXING_TECHNIQUE_LIST = ["high_quality", "economy", None] PROVIDER_LIST = ["vendor", "external", None] + DOC_FORM_LIST = ["text_model", "qa_model", "hierarchical_model"] id: Mapped[str] = mapped_column(StringUUID, default=lambda: str(uuid4())) tenant_id: Mapped[str] = mapped_column(StringUUID) diff --git a/api/services/entities/knowledge_entities/knowledge_entities.py b/api/services/entities/knowledge_entities/knowledge_entities.py index 8dc5b93501..66309f0e59 100644 --- a/api/services/entities/knowledge_entities/knowledge_entities.py +++ b/api/services/entities/knowledge_entities/knowledge_entities.py @@ -1,8 +1,9 @@ from enum import StrEnum from typing import Literal -from pydantic import BaseModel +from pydantic import BaseModel, field_validator +from 
core.rag.index_processor.constant.index_type import IndexStructureType
 from core.rag.retrieval.retrieval_methods import RetrievalMethod
 
@@ -127,6 +128,18 @@ class KnowledgeConfig(BaseModel):
     name: str | None = None
     is_multimodal: bool = False
 
+    @field_validator("doc_form")
+    @classmethod
+    def validate_doc_form(cls, value: str) -> str:
+        valid_forms = [
+            IndexStructureType.PARAGRAPH_INDEX,
+            IndexStructureType.QA_INDEX,
+            IndexStructureType.PARENT_CHILD_INDEX,
+        ]
+        if value not in valid_forms:
+            raise ValueError("Invalid doc_form.")
+        return value
+
 
 class SegmentCreateArgs(BaseModel):
     content: str | None = None

From 08b28b4029b85445d11f0740b917032ddd35f77e Mon Sep 17 00:00:00 2001
From: FFXN
Date: Tue, 3 Mar 2026 14:59:51 +0800
Subject: [PATCH 02/15] fix: Add the validation of doc_form in the
 Document-related service APIs.

---
 api/controllers/console/datasets/datasets.py | 8 ++++++++
 api/models/dataset.py                        | 3 ++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/api/controllers/console/datasets/datasets.py b/api/controllers/console/datasets/datasets.py
index b1cde105a9..87676ea635 100644
--- a/api/controllers/console/datasets/datasets.py
+++ b/api/controllers/console/datasets/datasets.py
@@ -187,6 +187,14 @@ class IndexingEstimatePayload(BaseModel):
             raise ValueError("indexing_technique is required.")
         return result
 
+    @field_validator("doc_form")
+    @classmethod
+    def validate_doc_form(cls, value: str | None) -> str:
+        result = _validate_doc_form(value)
+        if result is None:
+            return "text_model"
+        return result
+
 
 class ConsoleDatasetListQuery(BaseModel):
     page: int = Field(default=1, description="Page number")

diff --git a/api/models/dataset.py b/api/models/dataset.py
index b7b803f4ea..4ef39fcde1 100644
--- a/api/models/dataset.py
+++ b/api/models/dataset.py
@@ -19,6 +19,7 @@ from sqlalchemy.orm import Mapped, Session, mapped_column
 
 from configs import dify_config
 from core.rag.index_processor.constant.built_in_field import BuiltInField, MetadataDataSource
+from core.rag.index_processor.constant.index_type import IndexStructureType
 from core.rag.index_processor.constant.query_type import QueryType
 from core.rag.retrieval.retrieval_methods import RetrievalMethod
 from core.tools.signature import sign_upload_file
@@ -51,7 +52,7 @@ class Dataset(Base):
 
     INDEXING_TECHNIQUE_LIST = ["high_quality", "economy", None]
     PROVIDER_LIST = ["vendor", "external", None]
-    DOC_FORM_LIST = ["text_model", "qa_model", "hierarchical_model"]
+    DOC_FORM_LIST = [member.value for member in IndexStructureType]
 
     id: Mapped[str] = mapped_column(StringUUID, default=lambda: str(uuid4()))
     tenant_id: Mapped[str] = mapped_column(StringUUID)

From 1b32e70dc5eb9f479185c2291dba3147a2c92d98 Mon Sep 17 00:00:00 2001
From: FFXN
Date: Wed, 18 Mar 2026 10:51:07 +0800
Subject: [PATCH 03/15] fix: Return a corresponding error message when the
 pipeline template detail cannot be obtained from an upstream service
 (remote template service or database).
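Previously, a non-200 response from the remote template service made the
retrieval layer return None, so the console API answered with an empty 200
body and the failure was invisible to the caller. The retrieval layer now
raises, the service layer logs and degrades to None, and the controller maps
None to an explicit 404. A minimal sketch of that failure path, using
illustrative stand-in names (fetch_detail / get_detail_or_none are not the
production symbols):

    import logging

    import httpx

    logger = logging.getLogger(__name__)


    def fetch_detail(url: str) -> dict:
        # Raise on any non-200 status so the caller can log the upstream context.
        response = httpx.get(url, timeout=httpx.Timeout(10.0, connect=3.0))
        if response.status_code != 200:
            raise ValueError(f"fetch pipeline template detail failed, status_code: {response.status_code}")
        data: dict = response.json()
        return data


    def get_detail_or_none(url: str) -> dict | None:
        # Service layer: degrade to None, but keep a log trail instead of
        # swallowing the upstream failure silently.
        try:
            return fetch_detail(url)
        except Exception:
            logger.exception("fetch pipeline template detail failed, url: %s", url)
            return None


    # Controller layer: map None to an explicit 404 payload.
    if get_detail_or_none("https://example.com/pipeline-templates/template-id") is None:
        print(({"error": "Pipeline template not found from upstream service."}, 404))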
--- .../console/datasets/rag_pipeline/rag_pipeline.py | 2 ++ .../pipeline_template/remote/remote_retrieval.py | 12 ++++++++---- api/services/rag_pipeline/rag_pipeline.py | 10 +++++++++- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/api/controllers/console/datasets/rag_pipeline/rag_pipeline.py b/api/controllers/console/datasets/rag_pipeline/rag_pipeline.py index 6e0cd31b8d..b4b42ed594 100644 --- a/api/controllers/console/datasets/rag_pipeline/rag_pipeline.py +++ b/api/controllers/console/datasets/rag_pipeline/rag_pipeline.py @@ -46,6 +46,8 @@ class PipelineTemplateDetailApi(Resource): type = request.args.get("type", default="built-in", type=str) rag_pipeline_service = RagPipelineService() pipeline_template = rag_pipeline_service.get_pipeline_template_detail(template_id, type) + if not pipeline_template: + return {"error": "Pipeline template not found from upstream service."}, 404 return pipeline_template, 200 diff --git a/api/services/rag_pipeline/pipeline_template/remote/remote_retrieval.py b/api/services/rag_pipeline/pipeline_template/remote/remote_retrieval.py index 571ca6c7a6..5820a51ac3 100644 --- a/api/services/rag_pipeline/pipeline_template/remote/remote_retrieval.py +++ b/api/services/rag_pipeline/pipeline_template/remote/remote_retrieval.py @@ -35,17 +35,21 @@ class RemotePipelineTemplateRetrieval(PipelineTemplateRetrievalBase): return PipelineTemplateType.REMOTE @classmethod - def fetch_pipeline_template_detail_from_dify_official(cls, template_id: str) -> dict | None: + def fetch_pipeline_template_detail_from_dify_official(cls, template_id: str) -> dict: """ Fetch pipeline template detail from dify official. - :param template_id: Pipeline ID - :return: + + :param template_id: Pipeline template ID + :return: Template detail dict + :raises ValueError: When upstream returns a non-200 status code """ domain = dify_config.HOSTED_FETCH_PIPELINE_TEMPLATES_REMOTE_DOMAIN url = f"{domain}/pipeline-templates/{template_id}" response = httpx.get(url, timeout=httpx.Timeout(10.0, connect=3.0)) if response.status_code != 200: - return None + raise ValueError( + f"fetch pipeline template detail failed, status_code: {response.status_code}, response: {response.text}" + ) data: dict = response.json() return data diff --git a/api/services/rag_pipeline/rag_pipeline.py b/api/services/rag_pipeline/rag_pipeline.py index 2118043a98..015a9aee04 100644 --- a/api/services/rag_pipeline/rag_pipeline.py +++ b/api/services/rag_pipeline/rag_pipeline.py @@ -117,13 +117,21 @@ class RagPipelineService: def get_pipeline_template_detail(cls, template_id: str, type: str = "built-in") -> dict | None: """ Get pipeline template detail. 
+ :param template_id: template id - :return: + :param type: template type, "built-in" or "customized" + :return: template detail dict, or None if not found """ if type == "built-in": mode = dify_config.HOSTED_FETCH_PIPELINE_TEMPLATES_MODE retrieval_instance = PipelineTemplateRetrievalFactory.get_pipeline_template_factory(mode)() built_in_result: dict | None = retrieval_instance.get_pipeline_template_detail(template_id) + if not built_in_result: + logger.warning( + "pipeline template not found after all retrieval attempts, template_id: %s, mode: %s", + template_id, + mode, + ) return built_in_result else: mode = "customized" From 6c2decfbfb508094ecf420668b2ed11d42da51ae Mon Sep 17 00:00:00 2001 From: FFXN <31929997+FFXN@users.noreply.github.com> Date: Wed, 18 Mar 2026 11:00:47 +0800 Subject: [PATCH 04/15] Apply suggestion from @gemini-code-assist[bot] Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../rag_pipeline/pipeline_template/remote/remote_retrieval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/services/rag_pipeline/pipeline_template/remote/remote_retrieval.py b/api/services/rag_pipeline/pipeline_template/remote/remote_retrieval.py index 5820a51ac3..c5d36eb720 100644 --- a/api/services/rag_pipeline/pipeline_template/remote/remote_retrieval.py +++ b/api/services/rag_pipeline/pipeline_template/remote/remote_retrieval.py @@ -48,7 +48,7 @@ class RemotePipelineTemplateRetrieval(PipelineTemplateRetrievalBase): response = httpx.get(url, timeout=httpx.Timeout(10.0, connect=3.0)) if response.status_code != 200: raise ValueError( - f"fetch pipeline template detail failed, status_code: {response.status_code}, response: {response.text}" + f"fetch pipeline template detail failed, status_code: {response.status_code}, response: {response.text[:1000]}" ) data: dict = response.json() return data From e7cbfb89d6a246070fef7f03d4477e032d0f46b9 Mon Sep 17 00:00:00 2001 From: FFXN <31929997+FFXN@users.noreply.github.com> Date: Wed, 18 Mar 2026 11:05:39 +0800 Subject: [PATCH 05/15] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- api/services/rag_pipeline/rag_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/services/rag_pipeline/rag_pipeline.py b/api/services/rag_pipeline/rag_pipeline.py index 00d550f2c9..17b0393edb 100644 --- a/api/services/rag_pipeline/rag_pipeline.py +++ b/api/services/rag_pipeline/rag_pipeline.py @@ -128,7 +128,7 @@ class RagPipelineService: built_in_result: dict | None = retrieval_instance.get_pipeline_template_detail(template_id) if not built_in_result: logger.warning( - "pipeline template not found after all retrieval attempts, template_id: %s, mode: %s", + "pipeline template retrieval returned empty result, template_id: %s, mode: %s", template_id, mode, ) From a6e03c6735adf377ac9cb42fcb497a651fa79bde Mon Sep 17 00:00:00 2001 From: FFXN <31929997+FFXN@users.noreply.github.com> Date: Wed, 18 Mar 2026 11:07:00 +0800 Subject: [PATCH 06/15] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- api/controllers/console/datasets/rag_pipeline/rag_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/controllers/console/datasets/rag_pipeline/rag_pipeline.py b/api/controllers/console/datasets/rag_pipeline/rag_pipeline.py index b4b42ed594..4f31093cfe 100644 --- 
a/api/controllers/console/datasets/rag_pipeline/rag_pipeline.py
+++ b/api/controllers/console/datasets/rag_pipeline/rag_pipeline.py
@@ -46,7 +46,7 @@ class PipelineTemplateDetailApi(Resource):
         type = request.args.get("type", default="built-in", type=str)
         rag_pipeline_service = RagPipelineService()
         pipeline_template = rag_pipeline_service.get_pipeline_template_detail(template_id, type)
-        if not pipeline_template:
+        if pipeline_template is None:
             return {"error": "Pipeline template not found from upstream service."}, 404
         return pipeline_template, 200
 

From 2f0f97aa6629e9f8240626c88d7c0490b95fd7df Mon Sep 17 00:00:00 2001
From: FFXN <31929997+FFXN@users.noreply.github.com>
Date: Wed, 18 Mar 2026 11:09:28 +0800
Subject: [PATCH 07/15] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 api/services/rag_pipeline/rag_pipeline.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api/services/rag_pipeline/rag_pipeline.py b/api/services/rag_pipeline/rag_pipeline.py
index 17b0393edb..f3aedafac9 100644
--- a/api/services/rag_pipeline/rag_pipeline.py
+++ b/api/services/rag_pipeline/rag_pipeline.py
@@ -126,7 +126,7 @@ class RagPipelineService:
             mode = dify_config.HOSTED_FETCH_PIPELINE_TEMPLATES_MODE
             retrieval_instance = PipelineTemplateRetrievalFactory.get_pipeline_template_factory(mode)()
             built_in_result: dict | None = retrieval_instance.get_pipeline_template_detail(template_id)
-            if not built_in_result:
+            if built_in_result is None:
                 logger.warning(
                     "pipeline template retrieval returned empty result, template_id: %s, mode: %s",
                     template_id,

From b85af2ec47ffe621e5f1c751b87e73baaf9d2424 Mon Sep 17 00:00:00 2001
From: FFXN
Date: Wed, 18 Mar 2026 11:20:50 +0800
Subject: [PATCH 08/15] fix: Return a corresponding error message when the
 pipeline template detail cannot be obtained from an upstream service
 (remote template service or database).
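This revision tightens the typing on get_pipeline_template_detail, wraps the
raised error message to satisfy line-length limits, and adds controller tests
for the 404 path (both built-in and customized template types). The upstream
response body embedded in the message stays capped at 1000 characters, as
introduced earlier in this series; a small self-contained illustration of why
that cap matters (plain Python, not the production code):

    # An upstream error page can be arbitrarily large; embed only a bounded
    # prefix of it in the exception message so logs stay readable.
    upstream_body = "<html>" + "x" * 1_000_000 + "</html>"
    message = f"fetch pipeline template detail failed, response: {upstream_body[:1000]}"
    assert len(message) <= 1100  # bounded no matter how large the body is
    print(len(message))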
--- .../remote/remote_retrieval.py | 7 +++- .../rag_pipeline/test_rag_pipeline.py | 38 +++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/api/services/rag_pipeline/pipeline_template/remote/remote_retrieval.py b/api/services/rag_pipeline/pipeline_template/remote/remote_retrieval.py index c5d36eb720..c5775d9a37 100644 --- a/api/services/rag_pipeline/pipeline_template/remote/remote_retrieval.py +++ b/api/services/rag_pipeline/pipeline_template/remote/remote_retrieval.py @@ -15,7 +15,8 @@ class RemotePipelineTemplateRetrieval(PipelineTemplateRetrievalBase): Retrieval recommended app from dify official """ - def get_pipeline_template_detail(self, template_id: str): + def get_pipeline_template_detail(self, template_id: str) -> dict | None: + result: dict | None try: result = self.fetch_pipeline_template_detail_from_dify_official(template_id) except Exception as e: @@ -48,7 +49,9 @@ class RemotePipelineTemplateRetrieval(PipelineTemplateRetrievalBase): response = httpx.get(url, timeout=httpx.Timeout(10.0, connect=3.0)) if response.status_code != 200: raise ValueError( - f"fetch pipeline template detail failed, status_code: {response.status_code}, response: {response.text[:1000]}" + f"fetch pipeline template detail failed," + + f" status_code: {response.status_code}," + + f" response: {response.text[:1000]}" ) data: dict = response.json() return data diff --git a/api/tests/unit_tests/controllers/console/datasets/rag_pipeline/test_rag_pipeline.py b/api/tests/unit_tests/controllers/console/datasets/rag_pipeline/test_rag_pipeline.py index 3b8679f4ec..ebbb34e069 100644 --- a/api/tests/unit_tests/controllers/console/datasets/rag_pipeline/test_rag_pipeline.py +++ b/api/tests/unit_tests/controllers/console/datasets/rag_pipeline/test_rag_pipeline.py @@ -59,6 +59,44 @@ class TestPipelineTemplateDetailApi: assert status == 200 assert response == template + def test_get_returns_404_when_template_not_found(self, app): + api = PipelineTemplateDetailApi() + method = unwrap(api.get) + + service = MagicMock() + service.get_pipeline_template_detail.return_value = None + + with ( + app.test_request_context("/?type=built-in"), + patch( + "controllers.console.datasets.rag_pipeline.rag_pipeline.RagPipelineService", + return_value=service, + ), + ): + response, status = method(api, "non-existent-id") + + assert status == 404 + assert "error" in response + + def test_get_returns_404_for_customized_type_not_found(self, app): + api = PipelineTemplateDetailApi() + method = unwrap(api.get) + + service = MagicMock() + service.get_pipeline_template_detail.return_value = None + + with ( + app.test_request_context("/?type=customized"), + patch( + "controllers.console.datasets.rag_pipeline.rag_pipeline.RagPipelineService", + return_value=service, + ), + ): + response, status = method(api, "non-existent-id") + + assert status == 404 + assert "error" in response + class TestCustomizedPipelineTemplateApi: def test_patch_success(self, app): From 917d362a5856b01cb178e446934295bcefc06c84 Mon Sep 17 00:00:00 2001 From: FFXN Date: Thu, 19 Mar 2026 18:08:00 +0800 Subject: [PATCH 09/15] fix: Querying document list based on hit_count caused slow SQL. 
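Sorting the document list by hit_count built a subquery that summed
DocumentSegment.hit_count over every segment in the table before joining, so
the aggregate could not be narrowed by the dataset_id index. Restricting the
subquery to the current dataset bounds the scan. A minimal sketch of the
fixed query shape, with an illustrative Core table standing in for the ORM
model (plain SQLAlchemy, so it compiles without a database):

    import sqlalchemy as sa

    metadata = sa.MetaData()
    # Illustrative stand-in for the DocumentSegment model.
    document_segments = sa.Table(
        "document_segments",
        metadata,
        sa.Column("id", sa.String, primary_key=True),
        sa.Column("dataset_id", sa.String),
        sa.Column("document_id", sa.String),
        sa.Column("hit_count", sa.Integer),
    )

    dataset_id = "00000000-0000-0000-0000-000000000000"
    sub_query = (
        sa.select(
            document_segments.c.document_id,
            sa.func.sum(document_segments.c.hit_count).label("total_hit_count"),
        )
        # The added predicate: aggregate only the current dataset's segments.
        .where(document_segments.c.dataset_id == dataset_id)
        .group_by(document_segments.c.document_id)
        .subquery()
    )
    print(sa.select(sub_query.c.document_id, sub_query.c.total_hit_count))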
--- api/controllers/console/datasets/datasets_document.py | 1 + 1 file changed, 1 insertion(+) diff --git a/api/controllers/console/datasets/datasets_document.py b/api/controllers/console/datasets/datasets_document.py index 0c441553be..bc90c4ffbd 100644 --- a/api/controllers/console/datasets/datasets_document.py +++ b/api/controllers/console/datasets/datasets_document.py @@ -298,6 +298,7 @@ class DatasetDocumentListApi(Resource): if sort == "hit_count": sub_query = ( sa.select(DocumentSegment.document_id, sa.func.sum(DocumentSegment.hit_count).label("total_hit_count")) + .where(DocumentSegment.dataset_id == str(dataset_id)) .group_by(DocumentSegment.document_id) .subquery() ) From 7fd549fd39cd5955982b8682cbdc5f81f71cbfa1 Mon Sep 17 00:00:00 2001 From: FFXN Date: Mon, 13 Apr 2026 18:44:53 +0800 Subject: [PATCH 10/15] fix: Compatibility issues with the summary index feature when using the weaviate vector database. --- .../vdb/weaviate/weaviate_vector.py | 52 ++++++++++++------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py index 25b65b82a9..417f06a4cf 100644 --- a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py +++ b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py @@ -20,7 +20,7 @@ from pydantic import BaseModel, model_validator from weaviate.classes.data import DataObject from weaviate.classes.init import Auth from weaviate.classes.query import Filter, MetadataQuery -from weaviate.exceptions import UnexpectedStatusCodeError +from weaviate.exceptions import UnexpectedStatusCodeError, WeaviateQueryError from configs import dify_config from core.rag.datasource.vdb.field import Field @@ -230,6 +230,8 @@ class WeaviateVector(BaseVector): wc.Property(name="doc_id", data_type=wc.DataType.TEXT), wc.Property(name="doc_type", data_type=wc.DataType.TEXT), wc.Property(name="chunk_index", data_type=wc.DataType.INT), + wc.Property(name="is_summary", data_type=wc.DataType.BOOL), + wc.Property(name="original_chunk_id", data_type=wc.DataType.TEXT), ], vector_config=wc.Configure.Vectors.self_provided(), ) @@ -262,6 +264,10 @@ class WeaviateVector(BaseVector): to_add.append(wc.Property(name="doc_type", data_type=wc.DataType.TEXT)) if "chunk_index" not in existing: to_add.append(wc.Property(name="chunk_index", data_type=wc.DataType.INT)) + if "is_summary" not in existing: + to_add.append(wc.Property(name="is_summary", data_type=wc.DataType.BOOL)) + if "original_chunk_id" not in existing: + to_add.append(wc.Property(name="original_chunk_id", data_type=wc.DataType.TEXT)) for prop in to_add: try: @@ -400,15 +406,20 @@ class WeaviateVector(BaseVector): top_k = int(kwargs.get("top_k", 4)) score_threshold = float(kwargs.get("score_threshold") or 0.0) - res = col.query.near_vector( - near_vector=query_vector, - limit=top_k, - return_properties=props, - return_metadata=MetadataQuery(distance=True), - include_vector=False, - filters=where, - target_vector="default", - ) + query_kwargs = { + "near_vector": query_vector, + "limit": top_k, + "return_properties": props, + "return_metadata": MetadataQuery(distance=True), + "include_vector": False, + "filters": where, + "target_vector": "default", + } + try: + res = col.query.near_vector(**query_kwargs) + except WeaviateQueryError: + self._ensure_properties() + res = col.query.near_vector(**query_kwargs) docs: list[Document] = [] for obj in res.objects: @@ -446,14 +457,19 @@ class WeaviateVector(BaseVector): top_k = 
int(kwargs.get("top_k", 4)) - res = col.query.bm25( - query=query, - query_properties=[Field.TEXT_KEY.value], - limit=top_k, - return_properties=props, - include_vector=True, - filters=where, - ) + query_kwargs = { + "query": query, + "query_properties": [Field.TEXT_KEY.value], + "limit": top_k, + "return_properties": props, + "include_vector": True, + "filters": where, + } + try: + res = col.query.bm25(**query_kwargs) + except WeaviateQueryError: + self._ensure_properties() + res = col.query.bm25(**query_kwargs) docs: list[Document] = [] for obj in res.objects: From e62a67c719a1e4eb2a7ed6e809c181a84be221b2 Mon Sep 17 00:00:00 2001 From: FFXN Date: Tue, 28 Apr 2026 16:22:27 +0800 Subject: [PATCH 11/15] fix: hit-testing response failed because of Pydantic check. --- .../console/datasets/hit_testing_base.py | 49 +++++++++++++++++- .../console/datasets/test_hit_testing_base.py | 36 +++++++++++++ .../service_api/dataset/test_hit_testing.py | 51 +++++++++++++++++++ 3 files changed, 135 insertions(+), 1 deletion(-) diff --git a/api/controllers/console/datasets/hit_testing_base.py b/api/controllers/console/datasets/hit_testing_base.py index 699fa599c8..71ab1513ed 100644 --- a/api/controllers/console/datasets/hit_testing_base.py +++ b/api/controllers/console/datasets/hit_testing_base.py @@ -38,6 +38,48 @@ class HitTestingPayload(BaseModel): class DatasetsHitTestingBase: + @staticmethod + def _normalize_hit_testing_query(query: Any) -> str: + """Return the user-visible query string from legacy and current response shapes.""" + if isinstance(query, str): + return query + + if isinstance(query, dict): + content = query.get("content") + if isinstance(content, str): + return content + + raise ValueError("Invalid hit testing query response") + + @staticmethod + def _normalize_hit_testing_records(records: Any) -> list[dict[str, Any]]: + """Coerce nullable collection fields into lists before response validation.""" + if not isinstance(records, list): + return [] + + normalized_records: list[dict[str, Any]] = [] + for record in records: + if not isinstance(record, dict): + continue + + normalized_record = dict(record) + segment = normalized_record.get("segment") + if isinstance(segment, dict): + normalized_segment = dict(segment) + if normalized_segment.get("keywords") is None: + normalized_segment["keywords"] = [] + normalized_record["segment"] = normalized_segment + + if normalized_record.get("child_chunks") is None: + normalized_record["child_chunks"] = [] + + if normalized_record.get("files") is None: + normalized_record["files"] = [] + + normalized_records.append(normalized_record) + + return normalized_records + @staticmethod def get_and_validate_dataset(dataset_id: str): assert isinstance(current_user, Account) @@ -75,7 +117,12 @@ class DatasetsHitTestingBase: attachment_ids=args.get("attachment_ids"), limit=10, ) - return {"query": response["query"], "records": marshal(response["records"], hit_testing_record_fields)} + return { + "query": DatasetsHitTestingBase._normalize_hit_testing_query(response.get("query")), + "records": DatasetsHitTestingBase._normalize_hit_testing_records( + marshal(response.get("records", []), hit_testing_record_fields) + ), + } except services.errors.index.IndexNotInitializedError: raise DatasetNotInitializedError() except ProviderTokenNotInitError as ex: diff --git a/api/tests/unit_tests/controllers/console/datasets/test_hit_testing_base.py b/api/tests/unit_tests/controllers/console/datasets/test_hit_testing_base.py index e4acd91b76..d29b34beb2 100644 --- 
a/api/tests/unit_tests/controllers/console/datasets/test_hit_testing_base.py +++ b/api/tests/unit_tests/controllers/console/datasets/test_hit_testing_base.py @@ -134,6 +134,42 @@ class TestPerformHitTesting: assert result["query"] == "hello" assert result["records"] == [] + def test_success_normalizes_legacy_query_and_nullable_list_fields(self, dataset): + response = { + "query": {"content": "hello"}, + "records": [ + { + "segment": {"id": "segment-1", "keywords": None}, + "child_chunks": None, + "files": None, + "score": 0.8, + } + ], + } + + with ( + patch.object( + HitTestingService, + "retrieve", + return_value=response, + ), + patch( + "controllers.console.datasets.hit_testing_base.marshal", + return_value=response["records"], + ), + ): + result = DatasetsHitTestingBase.perform_hit_testing(dataset, {"query": "hello"}) + + assert result["query"] == "hello" + assert result["records"] == [ + { + "segment": {"id": "segment-1", "keywords": []}, + "child_chunks": [], + "files": [], + "score": 0.8, + } + ] + def test_index_not_initialized(self, dataset): with patch.object( HitTestingService, diff --git a/api/tests/unit_tests/controllers/service_api/dataset/test_hit_testing.py b/api/tests/unit_tests/controllers/service_api/dataset/test_hit_testing.py index 95c2f5cf92..9be8e56f56 100644 --- a/api/tests/unit_tests/controllers/service_api/dataset/test_hit_testing.py +++ b/api/tests/unit_tests/controllers/service_api/dataset/test_hit_testing.py @@ -171,6 +171,57 @@ class TestHitTestingApiPost: assert passed_retrieval_model["search_method"] == "semantic_search" assert passed_retrieval_model["top_k"] == 10 + @patch("controllers.service_api.dataset.hit_testing.service_api_ns") + @patch("controllers.console.datasets.hit_testing_base.marshal") + @patch("controllers.console.datasets.hit_testing_base.HitTestingService") + @patch("controllers.console.datasets.hit_testing_base.DatasetService") + @patch("controllers.console.datasets.hit_testing_base.current_user", new_callable=lambda: Mock(spec=Account)) + def test_post_normalizes_legacy_query_and_nullable_list_fields( + self, + mock_current_user, + mock_dataset_svc, + mock_hit_svc, + mock_marshal, + mock_ns, + app, + ): + """Test service API normalizes legacy query shape and nullable list fields.""" + dataset_id = str(uuid.uuid4()) + tenant_id = str(uuid.uuid4()) + + mock_dataset = Mock() + mock_dataset.id = dataset_id + + mock_dataset_svc.get_dataset.return_value = mock_dataset + mock_dataset_svc.check_dataset_permission.return_value = None + + mock_hit_svc.retrieve.return_value = {"query": {"content": "legacy query"}, "records": ["placeholder"]} + mock_hit_svc.hit_testing_args_check.return_value = None + mock_marshal.return_value = [ + { + "segment": {"id": "segment-1", "keywords": None}, + "child_chunks": None, + "files": None, + "score": 0.9, + } + ] + + mock_ns.payload = {"query": "legacy query"} + + with app.test_request_context(): + api = HitTestingApi() + response = HitTestingApi.post.__wrapped__(api, tenant_id, dataset_id) + + assert response["query"] == "legacy query" + assert response["records"] == [ + { + "segment": {"id": "segment-1", "keywords": []}, + "child_chunks": [], + "files": [], + "score": 0.9, + } + ] + @patch("controllers.service_api.dataset.hit_testing.service_api_ns") @patch("controllers.console.datasets.hit_testing_base.DatasetService") @patch("controllers.console.datasets.hit_testing_base.current_user", new_callable=lambda: Mock(spec=Account)) From 8a23126f295408f13c44b627ff18ba52b79eead6 Mon Sep 17 00:00:00 2001 From: FFXN 
Date: Fri, 8 May 2026 13:55:05 +0800 Subject: [PATCH 12/15] fix: Image rendering in the knowledge base failed. --- api/core/rag/extractor/pdf_extractor.py | 2 +- api/core/rag/extractor/word_extractor.py | 2 +- api/core/tools/signature.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/api/core/rag/extractor/pdf_extractor.py b/api/core/rag/extractor/pdf_extractor.py index 02f0efc908..25f6fe3e2a 100644 --- a/api/core/rag/extractor/pdf_extractor.py +++ b/api/core/rag/extractor/pdf_extractor.py @@ -115,7 +115,7 @@ class PdfExtractor(BaseExtractor): """ image_content = [] upload_files = [] - base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL + base_url = dify_config.FILES_URL try: image_objects = page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,)) diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py index 0330a43b28..60f8906181 100644 --- a/api/core/rag/extractor/word_extractor.py +++ b/api/core/rag/extractor/word_extractor.py @@ -110,7 +110,7 @@ class WordExtractor(BaseExtractor): def _extract_images_from_docx(self, doc): image_count = 0 image_map = {} - base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL + base_url = dify_config.FILES_URL for r_id, rel in doc.part.rels.items(): if "image" in rel.target_ref: diff --git a/api/core/tools/signature.py b/api/core/tools/signature.py index 1807226924..e8281fd9d4 100644 --- a/api/core/tools/signature.py +++ b/api/core/tools/signature.py @@ -26,12 +26,12 @@ def sign_tool_file(tool_file_id: str, extension: str, for_external: bool = True) return f"{file_preview_url}?timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}" -def sign_upload_file(upload_file_id: str, extension: str) -> str: +def sign_upload_file(upload_file_id: str, extension: str, for_external: bool = True) -> str: """ sign file to get a temporary url for plugin access """ # Use internal URL for plugin/tool file access in Docker environments - base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL + base_url = dify_config.FILES_URL if for_external else (dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL) file_preview_url = f"{base_url}/files/{upload_file_id}/image-preview" timestamp = str(int(time.time())) From 9e137e12ab74b6ddaf2f7fba91c1d66bf5a4bfb2 Mon Sep 17 00:00:00 2001 From: FFXN Date: Fri, 8 May 2026 14:16:25 +0800 Subject: [PATCH 13/15] fix: Image rendering in the knowledge base failed. --- api/core/tools/signature.py | 7 +++++-- .../unit_tests/core/tools/test_signature.py | 20 ++++++++++++++++++- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/api/core/tools/signature.py b/api/core/tools/signature.py index e8281fd9d4..8f00dde259 100644 --- a/api/core/tools/signature.py +++ b/api/core/tools/signature.py @@ -28,9 +28,12 @@ def sign_tool_file(tool_file_id: str, extension: str, for_external: bool = True) def sign_upload_file(upload_file_id: str, extension: str, for_external: bool = True) -> str: """ - sign file to get a temporary url for plugin access + Sign an upload file to get a temporary image preview URL. + + External URLs are the default because uploaded-file previews are returned to + user-facing retrieval responses. Internal URLs remain available for callers + that need to access files inside the deployment network. 
""" - # Use internal URL for plugin/tool file access in Docker environments base_url = dify_config.FILES_URL if for_external else (dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL) file_preview_url = f"{base_url}/files/{upload_file_id}/image-preview" diff --git a/api/tests/unit_tests/core/tools/test_signature.py b/api/tests/unit_tests/core/tools/test_signature.py index 353988d7a6..7f76bdf541 100644 --- a/api/tests/unit_tests/core/tools/test_signature.py +++ b/api/tests/unit_tests/core/tools/test_signature.py @@ -89,7 +89,7 @@ def test_verify_tool_file_signature_rejects_expired_signature(monkeypatch: pytes assert verify_tool_file_signature("tool-file-id", timestamp, nonce, sign) is False -def test_sign_upload_file_prefers_internal_url(monkeypatch: pytest.MonkeyPatch) -> None: +def test_sign_upload_file_for_external_uses_files_url(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr("core.tools.signature.time.time", lambda: 1700000000) monkeypatch.setattr("core.tools.signature.os.urandom", lambda _: b"\x03" * 16) monkeypatch.setattr("core.tools.signature.dify_config.SECRET_KEY", "unit-secret") @@ -100,6 +100,24 @@ def test_sign_upload_file_prefers_internal_url(monkeypatch: pytest.MonkeyPatch) parsed = urlparse(url) query = parse_qs(parsed.query) + assert parsed.netloc == "files.example.com" + assert parsed.path == "/files/upload-id/image-preview" + assert query["timestamp"][0] + assert query["nonce"][0] + assert query["sign"][0] + + +def test_sign_upload_file_for_internal_prefers_internal_url(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr("core.tools.signature.time.time", lambda: 1700000000) + monkeypatch.setattr("core.tools.signature.os.urandom", lambda _: b"\x08" * 16) + monkeypatch.setattr("core.tools.signature.dify_config.SECRET_KEY", "unit-secret") + monkeypatch.setattr("core.tools.signature.dify_config.FILES_URL", "https://files.example.com") + monkeypatch.setattr("core.tools.signature.dify_config.INTERNAL_FILES_URL", "https://internal.example.com") + + url = sign_upload_file("upload-id", ".png", for_external=False) + parsed = urlparse(url) + query = parse_qs(parsed.query) + assert parsed.netloc == "internal.example.com" assert parsed.path == "/files/upload-id/image-preview" assert query["timestamp"][0] From ea719903883c772dc5da84af7d72673df6cc99a7 Mon Sep 17 00:00:00 2001 From: FFXN Date: Sat, 9 May 2026 10:09:54 +0800 Subject: [PATCH 14/15] fix: Using CONSOLE_API_URL to generate an image preview URL causes the image preview to fail on the chunks details page of the knowledge base. --- api/models/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/models/dataset.py b/api/models/dataset.py index a00e9f7640..6ed9387fc2 100644 --- a/api/models/dataset.py +++ b/api/models/dataset.py @@ -1020,7 +1020,7 @@ class DocumentSegment(Base): encoded_sign = base64.urlsafe_b64encode(sign).decode() params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}" - reference_url = dify_config.CONSOLE_API_URL or "" + reference_url = dify_config.FILES_URL or dify_config.CONSOLE_API_URL or "" base_url = f"{reference_url}/files/{upload_file_id}/image-preview" source_url = f"{base_url}?{params}" attachment_list.append( From d7f99d64588c854aa47aad5386ca49b19064900e Mon Sep 17 00:00:00 2001 From: FFXN Date: Sat, 9 May 2026 10:55:59 +0800 Subject: [PATCH 15/15] fix: Using CONSOLE_API_URL to generate an image preview URL causes the image preview to fail on the chunks details page of the knowledge base. 
--- .../unit_tests/models/test_dataset_models.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/api/tests/unit_tests/models/test_dataset_models.py b/api/tests/unit_tests/models/test_dataset_models.py index 51d95c4239..3f14ebe8bf 100644 --- a/api/tests/unit_tests/models/test_dataset_models.py +++ b/api/tests/unit_tests/models/test_dataset_models.py @@ -12,7 +12,9 @@ This test suite covers: import json import pickle from datetime import UTC, datetime +from types import SimpleNamespace from unittest.mock import Mock, patch +from urllib.parse import parse_qs, urlparse from uuid import uuid4 from core.rag.index_processor.constant.index_type import IndexTechniqueType @@ -676,6 +678,51 @@ class TestDocumentSegmentIndexing: # Assert assert segment.hit_count == 5 + def test_document_segment_attachments_prefers_files_url_for_source_url(self, monkeypatch): + """Test attachment source URLs use FILES_URL before falling back to CONSOLE_API_URL.""" + # Arrange + segment = DocumentSegment( + tenant_id="tenant-1", + dataset_id="dataset-1", + document_id="document-1", + position=1, + content="Test", + word_count=1, + tokens=2, + created_by="user-1", + ) + segment.id = "segment-1" + attachment = SimpleNamespace( + id="upload-1", + name="image.png", + size=128, + extension="png", + mime_type="image/png", + ) + + monkeypatch.setattr("models.dataset.time.time", lambda: 1700000000) + monkeypatch.setattr("models.dataset.os.urandom", lambda _: b"\x01" * 16) + monkeypatch.setattr("models.dataset.dify_config.SECRET_KEY", "unit-secret") + monkeypatch.setattr("models.dataset.dify_config.FILES_URL", "https://files.example.com") + monkeypatch.setattr("models.dataset.dify_config.CONSOLE_API_URL", "https://console.example.com") + + with patch("models.dataset.db") as mock_db: + mock_db.session.execute.return_value.all.return_value = [(Mock(), attachment)] + + # Act + attachments = segment.attachments + + # Assert + assert len(attachments) == 1 + source_url = attachments[0]["source_url"] + parsed = urlparse(source_url) + query = parse_qs(parsed.query) + assert parsed.netloc == "files.example.com" + assert parsed.path == "/files/upload-1/image-preview" + assert query["timestamp"] == ["1700000000"] + assert query["nonce"] == ["01010101010101010101010101010101"] + assert query["sign"][0] + def test_document_segment_error_tracking(self): """Test document segment error tracking.""" # Arrange