mirror of https://github.com/langgenius/dify.git
fix: check allowed file extensions in rag transform pipeline and use set type instead of list for performance in file extensions (#26593)
This commit is contained in:
parent
89821d66bb
commit
40d35304ea
|
|
@ -1,4 +1,5 @@
|
|||
from configs import dify_config
|
||||
from libs.collection_utils import convert_to_lower_and_upper_set
|
||||
|
||||
HIDDEN_VALUE = "[__HIDDEN__]"
|
||||
UNKNOWN_VALUE = "[__UNKNOWN__]"
|
||||
|
|
@ -6,24 +7,39 @@ UUID_NIL = "00000000-0000-0000-0000-000000000000"
|
|||
|
||||
DEFAULT_FILE_NUMBER_LIMITS = 3
|
||||
|
||||
IMAGE_EXTENSIONS = ["jpg", "jpeg", "png", "webp", "gif", "svg"]
|
||||
IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS])
|
||||
IMAGE_EXTENSIONS = convert_to_lower_and_upper_set({"jpg", "jpeg", "png", "webp", "gif", "svg"})
|
||||
|
||||
VIDEO_EXTENSIONS = ["mp4", "mov", "mpeg", "webm"]
|
||||
VIDEO_EXTENSIONS.extend([ext.upper() for ext in VIDEO_EXTENSIONS])
|
||||
VIDEO_EXTENSIONS = convert_to_lower_and_upper_set({"mp4", "mov", "mpeg", "webm"})
|
||||
|
||||
AUDIO_EXTENSIONS = ["mp3", "m4a", "wav", "amr", "mpga"]
|
||||
AUDIO_EXTENSIONS.extend([ext.upper() for ext in AUDIO_EXTENSIONS])
|
||||
AUDIO_EXTENSIONS = convert_to_lower_and_upper_set({"mp3", "m4a", "wav", "amr", "mpga"})
|
||||
|
||||
|
||||
_doc_extensions: list[str]
|
||||
_doc_extensions: set[str]
|
||||
if dify_config.ETL_TYPE == "Unstructured":
|
||||
_doc_extensions = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls", "vtt", "properties"]
|
||||
_doc_extensions.extend(("doc", "docx", "csv", "eml", "msg", "pptx", "xml", "epub"))
|
||||
_doc_extensions = {
|
||||
"txt",
|
||||
"markdown",
|
||||
"md",
|
||||
"mdx",
|
||||
"pdf",
|
||||
"html",
|
||||
"htm",
|
||||
"xlsx",
|
||||
"xls",
|
||||
"vtt",
|
||||
"properties",
|
||||
"doc",
|
||||
"docx",
|
||||
"csv",
|
||||
"eml",
|
||||
"msg",
|
||||
"pptx",
|
||||
"xml",
|
||||
"epub",
|
||||
}
|
||||
if dify_config.UNSTRUCTURED_API_URL:
|
||||
_doc_extensions.append("ppt")
|
||||
_doc_extensions.add("ppt")
|
||||
else:
|
||||
_doc_extensions = [
|
||||
_doc_extensions = {
|
||||
"txt",
|
||||
"markdown",
|
||||
"md",
|
||||
|
|
@ -37,5 +53,5 @@ else:
|
|||
"csv",
|
||||
"vtt",
|
||||
"properties",
|
||||
]
|
||||
DOCUMENT_EXTENSIONS = _doc_extensions + [ext.upper() for ext in _doc_extensions]
|
||||
}
|
||||
DOCUMENT_EXTENSIONS: set[str] = convert_to_lower_and_upper_set(_doc_extensions)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,14 @@
|
|||
def convert_to_lower_and_upper_set(inputs: list[str] | set[str]) -> set[str]:
|
||||
"""
|
||||
Convert a list or set of strings to a set containing both lower and upper case versions of each string.
|
||||
|
||||
Args:
|
||||
inputs (list[str] | set[str]): A list or set of strings to be converted.
|
||||
|
||||
Returns:
|
||||
set[str]: A set containing both lower and upper case versions of each string.
|
||||
"""
|
||||
if not inputs:
|
||||
return set()
|
||||
else:
|
||||
return {case for s in inputs if s for case in (s.lower(), s.upper())}
|
||||
|
|
@ -149,8 +149,7 @@ class RagPipelineTransformService:
|
|||
file_extensions = node.get("data", {}).get("fileExtensions", [])
|
||||
if not file_extensions:
|
||||
return node
|
||||
file_extensions = [file_extension.lower() for file_extension in file_extensions]
|
||||
node["data"]["fileExtensions"] = DOCUMENT_EXTENSIONS
|
||||
node["data"]["fileExtensions"] = [ext.lower() for ext in file_extensions if ext in DOCUMENT_EXTENSIONS]
|
||||
return node
|
||||
|
||||
def _deal_knowledge_index(
|
||||
|
|
|
|||
Loading…
Reference in New Issue