mirror of https://github.com/langgenius/dify.git
fix: check allowed file extensions in rag transform pipeline and use set type instead of list for performance in file extensions (#26593)
This commit is contained in:
parent
89821d66bb
commit
40d35304ea
|
|
@ -1,4 +1,5 @@
|
||||||
from configs import dify_config
|
from configs import dify_config
|
||||||
|
from libs.collection_utils import convert_to_lower_and_upper_set
|
||||||
|
|
||||||
HIDDEN_VALUE = "[__HIDDEN__]"
|
HIDDEN_VALUE = "[__HIDDEN__]"
|
||||||
UNKNOWN_VALUE = "[__UNKNOWN__]"
|
UNKNOWN_VALUE = "[__UNKNOWN__]"
|
||||||
|
|
@ -6,24 +7,39 @@ UUID_NIL = "00000000-0000-0000-0000-000000000000"
|
||||||
|
|
||||||
DEFAULT_FILE_NUMBER_LIMITS = 3
|
DEFAULT_FILE_NUMBER_LIMITS = 3
|
||||||
|
|
||||||
IMAGE_EXTENSIONS = ["jpg", "jpeg", "png", "webp", "gif", "svg"]
|
IMAGE_EXTENSIONS = convert_to_lower_and_upper_set({"jpg", "jpeg", "png", "webp", "gif", "svg"})
|
||||||
IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS])
|
|
||||||
|
|
||||||
VIDEO_EXTENSIONS = ["mp4", "mov", "mpeg", "webm"]
|
VIDEO_EXTENSIONS = convert_to_lower_and_upper_set({"mp4", "mov", "mpeg", "webm"})
|
||||||
VIDEO_EXTENSIONS.extend([ext.upper() for ext in VIDEO_EXTENSIONS])
|
|
||||||
|
|
||||||
AUDIO_EXTENSIONS = ["mp3", "m4a", "wav", "amr", "mpga"]
|
AUDIO_EXTENSIONS = convert_to_lower_and_upper_set({"mp3", "m4a", "wav", "amr", "mpga"})
|
||||||
AUDIO_EXTENSIONS.extend([ext.upper() for ext in AUDIO_EXTENSIONS])
|
|
||||||
|
|
||||||
|
_doc_extensions: set[str]
|
||||||
_doc_extensions: list[str]
|
|
||||||
if dify_config.ETL_TYPE == "Unstructured":
|
if dify_config.ETL_TYPE == "Unstructured":
|
||||||
_doc_extensions = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls", "vtt", "properties"]
|
_doc_extensions = {
|
||||||
_doc_extensions.extend(("doc", "docx", "csv", "eml", "msg", "pptx", "xml", "epub"))
|
"txt",
|
||||||
|
"markdown",
|
||||||
|
"md",
|
||||||
|
"mdx",
|
||||||
|
"pdf",
|
||||||
|
"html",
|
||||||
|
"htm",
|
||||||
|
"xlsx",
|
||||||
|
"xls",
|
||||||
|
"vtt",
|
||||||
|
"properties",
|
||||||
|
"doc",
|
||||||
|
"docx",
|
||||||
|
"csv",
|
||||||
|
"eml",
|
||||||
|
"msg",
|
||||||
|
"pptx",
|
||||||
|
"xml",
|
||||||
|
"epub",
|
||||||
|
}
|
||||||
if dify_config.UNSTRUCTURED_API_URL:
|
if dify_config.UNSTRUCTURED_API_URL:
|
||||||
_doc_extensions.append("ppt")
|
_doc_extensions.add("ppt")
|
||||||
else:
|
else:
|
||||||
_doc_extensions = [
|
_doc_extensions = {
|
||||||
"txt",
|
"txt",
|
||||||
"markdown",
|
"markdown",
|
||||||
"md",
|
"md",
|
||||||
|
|
@ -37,5 +53,5 @@ else:
|
||||||
"csv",
|
"csv",
|
||||||
"vtt",
|
"vtt",
|
||||||
"properties",
|
"properties",
|
||||||
]
|
}
|
||||||
DOCUMENT_EXTENSIONS = _doc_extensions + [ext.upper() for ext in _doc_extensions]
|
DOCUMENT_EXTENSIONS: set[str] = convert_to_lower_and_upper_set(_doc_extensions)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,14 @@
|
||||||
|
def convert_to_lower_and_upper_set(inputs: list[str] | set[str]) -> set[str]:
|
||||||
|
"""
|
||||||
|
Convert a list or set of strings to a set containing both lower and upper case versions of each string.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs (list[str] | set[str]): A list or set of strings to be converted.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
set[str]: A set containing both lower and upper case versions of each string.
|
||||||
|
"""
|
||||||
|
if not inputs:
|
||||||
|
return set()
|
||||||
|
else:
|
||||||
|
return {case for s in inputs if s for case in (s.lower(), s.upper())}
|
||||||
|
|
@ -149,8 +149,7 @@ class RagPipelineTransformService:
|
||||||
file_extensions = node.get("data", {}).get("fileExtensions", [])
|
file_extensions = node.get("data", {}).get("fileExtensions", [])
|
||||||
if not file_extensions:
|
if not file_extensions:
|
||||||
return node
|
return node
|
||||||
file_extensions = [file_extension.lower() for file_extension in file_extensions]
|
node["data"]["fileExtensions"] = [ext.lower() for ext in file_extensions if ext in DOCUMENT_EXTENSIONS]
|
||||||
node["data"]["fileExtensions"] = DOCUMENT_EXTENSIONS
|
|
||||||
return node
|
return node
|
||||||
|
|
||||||
def _deal_knowledge_index(
|
def _deal_knowledge_index(
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue