From 40d35304ea3de92bedbeac190f4ab0d6895fa37e Mon Sep 17 00:00:00 2001
From: Bowen Liang <liang.bowen.123@qq.com>
Date: Thu, 9 Oct 2025 10:21:56 +0800
Subject: [PATCH] fix: check allowed file extensions in rag transform pipeline
 and use set type instead of list for performance in file extensions  (#26593)

---
 api/constants/__init__.py                     | 44 +++++++++++++------
 api/libs/collection_utils.py                  | 14 ++++++
 .../rag_pipeline_transform_service.py         |  3 +-
 3 files changed, 45 insertions(+), 16 deletions(-)
 create mode 100644 api/libs/collection_utils.py

diff --git a/api/constants/__init__.py b/api/constants/__init__.py
index fe8f4f8785..9141fbea95 100644
--- a/api/constants/__init__.py
+++ b/api/constants/__init__.py
@@ -1,4 +1,5 @@
 from configs import dify_config
+from libs.collection_utils import convert_to_lower_and_upper_set
 
 HIDDEN_VALUE = "[__HIDDEN__]"
 UNKNOWN_VALUE = "[__UNKNOWN__]"
@@ -6,24 +7,39 @@ UUID_NIL = "00000000-0000-0000-0000-000000000000"
 
 DEFAULT_FILE_NUMBER_LIMITS = 3
 
-IMAGE_EXTENSIONS = ["jpg", "jpeg", "png", "webp", "gif", "svg"]
-IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS])
+IMAGE_EXTENSIONS = convert_to_lower_and_upper_set({"jpg", "jpeg", "png", "webp", "gif", "svg"})
 
-VIDEO_EXTENSIONS = ["mp4", "mov", "mpeg", "webm"]
-VIDEO_EXTENSIONS.extend([ext.upper() for ext in VIDEO_EXTENSIONS])
+VIDEO_EXTENSIONS = convert_to_lower_and_upper_set({"mp4", "mov", "mpeg", "webm"})
 
-AUDIO_EXTENSIONS = ["mp3", "m4a", "wav", "amr", "mpga"]
-AUDIO_EXTENSIONS.extend([ext.upper() for ext in AUDIO_EXTENSIONS])
+AUDIO_EXTENSIONS = convert_to_lower_and_upper_set({"mp3", "m4a", "wav", "amr", "mpga"})
 
-
-_doc_extensions: list[str]
+_doc_extensions: set[str]
 if dify_config.ETL_TYPE == "Unstructured":
-    _doc_extensions = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls", "vtt", "properties"]
-    _doc_extensions.extend(("doc", "docx", "csv", "eml", "msg", "pptx", "xml", "epub"))
+    _doc_extensions = {
+        "txt",
+        "markdown",
+        "md",
+        "mdx",
+        "pdf",
+        "html",
+        "htm",
+        "xlsx",
+        "xls",
+        "vtt",
+        "properties",
+        "doc",
+        "docx",
+        "csv",
+        "eml",
+        "msg",
+        "pptx",
+        "xml",
+        "epub",
+    }
     if dify_config.UNSTRUCTURED_API_URL:
-        _doc_extensions.append("ppt")
+        _doc_extensions.add("ppt")
 else:
-    _doc_extensions = [
+    _doc_extensions = {
         "txt",
         "markdown",
         "md",
@@ -37,5 +53,5 @@ else:
         "csv",
         "vtt",
         "properties",
-    ]
-DOCUMENT_EXTENSIONS = _doc_extensions + [ext.upper() for ext in _doc_extensions]
+    }
+DOCUMENT_EXTENSIONS: set[str] = convert_to_lower_and_upper_set(_doc_extensions)
diff --git a/api/libs/collection_utils.py b/api/libs/collection_utils.py
new file mode 100644
index 0000000000..f97308ca44
--- /dev/null
+++ b/api/libs/collection_utils.py
@@ -0,0 +1,14 @@
+def convert_to_lower_and_upper_set(inputs: list[str] | set[str]) -> set[str]:
+    """
+    Convert a list or set of strings to a set containing both lower and upper case versions of each string.
+
+    Args:
+        inputs (list[str] | set[str]): A list or set of strings to be converted.
+
+    Returns:
+        set[str]: A set containing both lower and upper case versions of each string.
+    """
+    if not inputs:
+        return set()
+    else:
+        return {case for s in inputs if s for case in (s.lower(), s.upper())}
diff --git a/api/services/rag_pipeline/rag_pipeline_transform_service.py b/api/services/rag_pipeline/rag_pipeline_transform_service.py
index db9508824b..3d5a85b57f 100644
--- a/api/services/rag_pipeline/rag_pipeline_transform_service.py
+++ b/api/services/rag_pipeline/rag_pipeline_transform_service.py
@@ -149,8 +149,7 @@ class RagPipelineTransformService:
         file_extensions = node.get("data", {}).get("fileExtensions", [])
         if not file_extensions:
             return node
-        file_extensions = [file_extension.lower() for file_extension in file_extensions]
-        node["data"]["fileExtensions"] = DOCUMENT_EXTENSIONS
+        node["data"]["fileExtensions"] = [ext.lower() for ext in file_extensions if ext in DOCUMENT_EXTENSIONS]
         return node
 
     def _deal_knowledge_index(