From 40d35304ea3de92bedbeac190f4ab0d6895fa37e Mon Sep 17 00:00:00 2001 From: Bowen Liang Date: Thu, 9 Oct 2025 10:21:56 +0800 Subject: [PATCH] fix: check allowed file extensions in rag transform pipeline and use set type instead of list for performance in file extensions (#26593) --- api/constants/__init__.py | 44 +++++++++++++------ api/libs/collection_utils.py | 14 ++++++ .../rag_pipeline_transform_service.py | 3 +- 3 files changed, 45 insertions(+), 16 deletions(-) create mode 100644 api/libs/collection_utils.py diff --git a/api/constants/__init__.py b/api/constants/__init__.py index fe8f4f8785..9141fbea95 100644 --- a/api/constants/__init__.py +++ b/api/constants/__init__.py @@ -1,4 +1,5 @@ from configs import dify_config +from libs.collection_utils import convert_to_lower_and_upper_set HIDDEN_VALUE = "[__HIDDEN__]" UNKNOWN_VALUE = "[__UNKNOWN__]" @@ -6,24 +7,39 @@ UUID_NIL = "00000000-0000-0000-0000-000000000000" DEFAULT_FILE_NUMBER_LIMITS = 3 -IMAGE_EXTENSIONS = ["jpg", "jpeg", "png", "webp", "gif", "svg"] -IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS]) +IMAGE_EXTENSIONS = convert_to_lower_and_upper_set({"jpg", "jpeg", "png", "webp", "gif", "svg"}) -VIDEO_EXTENSIONS = ["mp4", "mov", "mpeg", "webm"] -VIDEO_EXTENSIONS.extend([ext.upper() for ext in VIDEO_EXTENSIONS]) +VIDEO_EXTENSIONS = convert_to_lower_and_upper_set({"mp4", "mov", "mpeg", "webm"}) -AUDIO_EXTENSIONS = ["mp3", "m4a", "wav", "amr", "mpga"] -AUDIO_EXTENSIONS.extend([ext.upper() for ext in AUDIO_EXTENSIONS]) +AUDIO_EXTENSIONS = convert_to_lower_and_upper_set({"mp3", "m4a", "wav", "amr", "mpga"}) - -_doc_extensions: list[str] +_doc_extensions: set[str] if dify_config.ETL_TYPE == "Unstructured": - _doc_extensions = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls", "vtt", "properties"] - _doc_extensions.extend(("doc", "docx", "csv", "eml", "msg", "pptx", "xml", "epub")) + _doc_extensions = { + "txt", + "markdown", + "md", + "mdx", + "pdf", + "html", + "htm", + "xlsx", + "xls", + "vtt", + "properties", + "doc", + "docx", + "csv", + "eml", + "msg", + "pptx", + "xml", + "epub", + } if dify_config.UNSTRUCTURED_API_URL: - _doc_extensions.append("ppt") + _doc_extensions.add("ppt") else: - _doc_extensions = [ + _doc_extensions = { "txt", "markdown", "md", @@ -37,5 +53,5 @@ else: "csv", "vtt", "properties", - ] -DOCUMENT_EXTENSIONS = _doc_extensions + [ext.upper() for ext in _doc_extensions] + } +DOCUMENT_EXTENSIONS: set[str] = convert_to_lower_and_upper_set(_doc_extensions) diff --git a/api/libs/collection_utils.py b/api/libs/collection_utils.py new file mode 100644 index 0000000000..f97308ca44 --- /dev/null +++ b/api/libs/collection_utils.py @@ -0,0 +1,14 @@ +def convert_to_lower_and_upper_set(inputs: list[str] | set[str]) -> set[str]: + """ + Convert a list or set of strings to a set containing both lower and upper case versions of each string. + + Args: + inputs (list[str] | set[str]): A list or set of strings to be converted. + + Returns: + set[str]: A set containing both lower and upper case versions of each string. + """ + if not inputs: + return set() + else: + return {case for s in inputs if s for case in (s.lower(), s.upper())} diff --git a/api/services/rag_pipeline/rag_pipeline_transform_service.py b/api/services/rag_pipeline/rag_pipeline_transform_service.py index db9508824b..3d5a85b57f 100644 --- a/api/services/rag_pipeline/rag_pipeline_transform_service.py +++ b/api/services/rag_pipeline/rag_pipeline_transform_service.py @@ -149,8 +149,7 @@ class RagPipelineTransformService: file_extensions = node.get("data", {}).get("fileExtensions", []) if not file_extensions: return node - file_extensions = [file_extension.lower() for file_extension in file_extensions] - node["data"]["fileExtensions"] = DOCUMENT_EXTENSIONS + node["data"]["fileExtensions"] = [ext.lower() for ext in file_extensions if ext in DOCUMENT_EXTENSIONS] return node def _deal_knowledge_index(