fix: check allowed file extensions in rag transform pipeline and use set type instead of list for performance in file extensions (#26593)

This commit is contained in:
Bowen Liang 2025-10-09 10:21:56 +08:00 committed by GitHub
parent 89821d66bb
commit 40d35304ea
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 45 additions and 16 deletions

View File

@ -1,4 +1,5 @@
from configs import dify_config from configs import dify_config
from libs.collection_utils import convert_to_lower_and_upper_set
HIDDEN_VALUE = "[__HIDDEN__]" HIDDEN_VALUE = "[__HIDDEN__]"
UNKNOWN_VALUE = "[__UNKNOWN__]" UNKNOWN_VALUE = "[__UNKNOWN__]"
@ -6,24 +7,39 @@ UUID_NIL = "00000000-0000-0000-0000-000000000000"
DEFAULT_FILE_NUMBER_LIMITS = 3 DEFAULT_FILE_NUMBER_LIMITS = 3
IMAGE_EXTENSIONS = ["jpg", "jpeg", "png", "webp", "gif", "svg"] IMAGE_EXTENSIONS = convert_to_lower_and_upper_set({"jpg", "jpeg", "png", "webp", "gif", "svg"})
IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS])
VIDEO_EXTENSIONS = ["mp4", "mov", "mpeg", "webm"] VIDEO_EXTENSIONS = convert_to_lower_and_upper_set({"mp4", "mov", "mpeg", "webm"})
VIDEO_EXTENSIONS.extend([ext.upper() for ext in VIDEO_EXTENSIONS])
AUDIO_EXTENSIONS = ["mp3", "m4a", "wav", "amr", "mpga"] AUDIO_EXTENSIONS = convert_to_lower_and_upper_set({"mp3", "m4a", "wav", "amr", "mpga"})
AUDIO_EXTENSIONS.extend([ext.upper() for ext in AUDIO_EXTENSIONS])
_doc_extensions: set[str]
_doc_extensions: list[str]
if dify_config.ETL_TYPE == "Unstructured": if dify_config.ETL_TYPE == "Unstructured":
_doc_extensions = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls", "vtt", "properties"] _doc_extensions = {
_doc_extensions.extend(("doc", "docx", "csv", "eml", "msg", "pptx", "xml", "epub")) "txt",
"markdown",
"md",
"mdx",
"pdf",
"html",
"htm",
"xlsx",
"xls",
"vtt",
"properties",
"doc",
"docx",
"csv",
"eml",
"msg",
"pptx",
"xml",
"epub",
}
if dify_config.UNSTRUCTURED_API_URL: if dify_config.UNSTRUCTURED_API_URL:
_doc_extensions.append("ppt") _doc_extensions.add("ppt")
else: else:
_doc_extensions = [ _doc_extensions = {
"txt", "txt",
"markdown", "markdown",
"md", "md",
@ -37,5 +53,5 @@ else:
"csv", "csv",
"vtt", "vtt",
"properties", "properties",
] }
DOCUMENT_EXTENSIONS = _doc_extensions + [ext.upper() for ext in _doc_extensions] DOCUMENT_EXTENSIONS: set[str] = convert_to_lower_and_upper_set(_doc_extensions)

View File

@ -0,0 +1,14 @@
def convert_to_lower_and_upper_set(inputs: list[str] | set[str]) -> set[str]:
"""
Convert a list or set of strings to a set containing both lower and upper case versions of each string.
Args:
inputs (list[str] | set[str]): A list or set of strings to be converted.
Returns:
set[str]: A set containing both lower and upper case versions of each string.
"""
if not inputs:
return set()
else:
return {case for s in inputs if s for case in (s.lower(), s.upper())}

View File

@ -149,8 +149,7 @@ class RagPipelineTransformService:
file_extensions = node.get("data", {}).get("fileExtensions", []) file_extensions = node.get("data", {}).get("fileExtensions", [])
if not file_extensions: if not file_extensions:
return node return node
file_extensions = [file_extension.lower() for file_extension in file_extensions] node["data"]["fileExtensions"] = [ext.lower() for ext in file_extensions if ext in DOCUMENT_EXTENSIONS]
node["data"]["fileExtensions"] = DOCUMENT_EXTENSIONS
return node return node
def _deal_knowledge_index( def _deal_knowledge_index(