diff --git a/api/core/file/file_manager.py b/api/core/file/file_manager.py
index b69d7a74c0..8a7b35ccf4 100644
--- a/api/core/file/file_manager.py
+++ b/api/core/file/file_manager.py
@@ -1,4 +1,6 @@
 import base64
+from pathlib import Path
+import tempfile
 
 from configs import dify_config
 from core.file import file_repository
@@ -109,6 +111,49 @@ def _download_file_content(path: str, /):
     return data
 
 
+def download_to_target_path(f: File, temp_dir: str, /) -> str:
+    """
+    Download a file from storage into ``temp_dir`` and return the local path.
+
+    The local file gets a unique name that keeps the suffix of the storage key.
+
+    Args:
+        f (File): The file to download; its transfer method must be TOOL_FILE or LOCAL_FILE.
+        temp_dir (str): An existing directory that receives the downloaded file.
+    Returns:
+        str: The path of the downloaded file inside ``temp_dir``.
+    Raises:
+        ValueError: If the transfer method is neither TOOL_FILE nor LOCAL_FILE.
+    """
+    if f.transfer_method == FileTransferMethod.TOOL_FILE:
+        tool_file = file_repository.get_tool_file(session=db.session(), file=f)
+        storage_key = tool_file.file_key
+    elif f.transfer_method == FileTransferMethod.LOCAL_FILE:
+        upload_file = file_repository.get_upload_file(session=db.session(), file=f)
+        storage_key = upload_file.key
+    else:
+        raise ValueError(f"Unsupported transfer method: {f.transfer_method}")
+
+    # Reserve the name through tempfile's public API; the private
+    # tempfile._get_candidate_names() helper is undocumented and may change.
+    suffix = Path(storage_key).suffix
+    with tempfile.NamedTemporaryFile(dir=temp_dir, suffix=suffix, delete=False) as tmp:
+        target_path = tmp.name
+    _download_file_to_target_path(storage_key, target_path)
+    return target_path
+
+
+def _download_file_to_target_path(path: str, target_path: str, /):
+    """
+    Download a file from storage to a local path.
+
+    Args:
+        path (str): The path to the file in storage.
+        target_path (str): The path to the target file.
+    """
+    storage.download(path, target_path)
+
+
 def _get_encoded_string(f: File, /):
     match f.transfer_method:
         case FileTransferMethod.REMOTE_URL:
diff --git a/api/core/tools/provider/builtin/dify_extractor/__init__.py b/api/core/tools/provider/builtin/dify_extractor/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/api/core/tools/provider/builtin/dify_extractor/_assets/icon.png b/api/core/tools/provider/builtin/dify_extractor/_assets/icon.png
deleted file mode 100644
index 5155a73059..0000000000
Binary files a/api/core/tools/provider/builtin/dify_extractor/_assets/icon.png and /dev/null differ
diff --git a/api/core/tools/provider/builtin/dify_extractor/dalle.py b/api/core/tools/provider/builtin/dify_extractor/dalle.py
deleted file mode 100644
index 5bd16e49e8..0000000000
--- a/api/core/tools/provider/builtin/dify_extractor/dalle.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from typing import Any
-
-from core.tools.errors import ToolProviderCredentialValidationError
-from core.tools.provider.builtin.dalle.tools.dalle2 import DallE2Tool
-from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
-
-
-class DALLEProvider(BuiltinToolProviderController):
-    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
-        try:
-            DallE2Tool().fork_tool_runtime(
-                runtime={
-                    "credentials": credentials,
-                }
-            ).invoke(
-                user_id="",
-                tool_parameters={"prompt": "cute girl, blue eyes, white hair, anime style", "size": "small", "n": 1},
-            )
-        except Exception as e:
-            raise ToolProviderCredentialValidationError(str(e))
diff --git a/api/core/tools/provider/builtin/dify_extractor/dalle.yaml b/api/core/tools/provider/builtin/dify_extractor/dalle.yaml
deleted file mode 100644
index 37cf93c28a..0000000000
--- a/api/core/tools/provider/builtin/dify_extractor/dalle.yaml
+++ /dev/null
@@ -1,61 +0,0 @@
-identity:
-  author: Dify
-  name: dalle
-  label:
-    en_US: DALL-E
-    zh_Hans: DALL-E 绘画
-    pt_BR: DALL-E
-  description:
-    en_US: DALL-E art
-    zh_Hans: DALL-E 绘画
-    pt_BR: DALL-E art
-  icon: icon.png
-  tags:
-    - image
-    - productivity
-credentials_for_provider:
-  openai_api_key:
-    type: secret-input
-    required: true
-    label:
-      en_US: OpenAI API key
-      zh_Hans: OpenAI API key
-      pt_BR: OpenAI API key
-    help:
-      en_US: Please input your OpenAI API key
-      zh_Hans: 请输入你的 OpenAI API key
-      pt_BR: Please input your OpenAI API key
-    placeholder:
-      en_US: Please input your OpenAI API key
-      zh_Hans: 请输入你的 OpenAI API key
-      pt_BR: Please input your OpenAI API key
-  openai_organization_id:
-    type: text-input
-    required: false
-    label:
-      en_US: OpenAI organization ID
-      zh_Hans: OpenAI organization ID
-      pt_BR: OpenAI organization ID
-    help:
-      en_US: Please input your OpenAI organization ID
-      zh_Hans: 请输入你的 OpenAI organization ID
-      pt_BR: Please input your OpenAI organization ID
-    placeholder:
-      en_US: Please input your OpenAI organization ID
-      zh_Hans: 请输入你的 OpenAI organization ID
-      pt_BR: Please input your OpenAI organization ID
-  openai_base_url:
-    type: text-input
-    required: false
-    label:
-      en_US: OpenAI base URL
-      zh_Hans: OpenAI base URL
-      pt_BR: OpenAI base URL
-    help:
-      en_US: Please input your OpenAI base URL
-      zh_Hans: 请输入你的 OpenAI base URL
-      pt_BR: Please input your OpenAI base URL
-    placeholder:
-      en_US: Please input your OpenAI base URL
-      zh_Hans: 请输入你的 OpenAI base URL
-      pt_BR: Please input your OpenAI base URL
diff --git a/api/core/tools/provider/builtin/dify_extractor/tools/file_extractor.py b/api/core/tools/provider/builtin/dify_extractor/tools/file_extractor.py
deleted file mode 100644
index c4b77d0aca..0000000000
--- a/api/core/tools/provider/builtin/dify_extractor/tools/file_extractor.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from base64 import b64decode
-from typing import Any, Union
-
-from openai import OpenAI
-from yarl import URL
-from core.file.enums import FileType
-
-from core.file.file_manager import download
-from core.tools.entities.tool_entities import ToolInvokeMessage
-from core.tools.errors import ToolParameterValidationError
-from core.tools.tool.builtin_tool import BuiltinTool
-
-
-class FileExtractorTool(BuiltinTool):
-    def _invoke(
-        self,
-        user_id: str,
-        tool_parameters: dict[str, Any],
-    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
-        """
-        invoke tools
-        """
-        # image file for workflow mode
-        file = tool_parameters.get("file")
-        if file and file.type != FileType.DOCUMENT:
-            raise ToolParameterValidationError("Not a valid document")
-
-        if file:
-            file_binary = download(file)
-        else:
-            raise ToolParameterValidationError("Please provide either file")
-        return result
diff --git a/api/core/tools/provider/builtin/file_extractor/_assets/icon.png b/api/core/tools/provider/builtin/file_extractor/_assets/icon.png
new file mode 100644
index 0000000000..4bb8d8c1e5
Binary files /dev/null and b/api/core/tools/provider/builtin/file_extractor/_assets/icon.png differ
diff --git a/api/core/tools/provider/builtin/file_extractor/file_extractor.py b/api/core/tools/provider/builtin/file_extractor/file_extractor.py
new file mode 100644
index 0000000000..50a7aedd57
--- /dev/null
+++ b/api/core/tools/provider/builtin/file_extractor/file_extractor.py
@@ -0,0 +1,11 @@
+from typing import Any
+
+from core.tools.errors import ToolProviderCredentialValidationError
+from core.tools.provider.builtin.dalle.tools.dalle2 import DallE2Tool
+from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
+
+
+class FileExtractorProvider(BuiltinToolProviderController):
+    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
+        # The provider declares no credentials, so there is nothing to validate.
+        pass
diff --git a/api/core/tools/provider/builtin/file_extractor/file_extractor.yaml b/api/core/tools/provider/builtin/file_extractor/file_extractor.yaml
new file mode 100644
index 0000000000..fa197a1255
--- /dev/null
+++ b/api/core/tools/provider/builtin/file_extractor/file_extractor.yaml
@@ -0,0 +1,15 @@
+identity:
+  author: Jyong
+  name: file_extractor
+  label:
+    en_US: File Extractor
+    zh_Hans: 文件提取
+    pt_BR: File Extractor
+  description:
+    en_US: Extract text from file
+    zh_Hans: 从文件中提取文本
+    pt_BR: Extract text from file
+  icon: icon.png
+  tags:
+    - utilities
+    - productivity
diff --git a/api/core/tools/provider/builtin/file_extractor/tools/file_extractor.py b/api/core/tools/provider/builtin/file_extractor/tools/file_extractor.py
new file mode 100644
index 0000000000..3582dc2ffd
--- /dev/null
+++ b/api/core/tools/provider/builtin/file_extractor/tools/file_extractor.py
@@ -0,0 +1,52 @@
+from base64 import b64decode
+import json
+import tempfile
+from typing import Any, Union
+
+from openai import OpenAI
+from yarl import URL
+from core.file.enums import FileType
+
+from core.file.file_manager import download_to_target_path
+from core.rag.extractor.text_extractor import TextExtractor
+from core.rag.splitter.fixed_text_splitter import FixedRecursiveCharacterTextSplitter
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.errors import ToolParameterValidationError
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
+class FileExtractorTool(BuiltinTool):
+    def _invoke(
+        self,
+        user_id: str,
+        tool_parameters: dict[str, Any],
+    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
+        """
+        Extract a document file and return a JSON message with the JSON-encoded list of chunk texts.
+
+        Reads "text_file", "max_token" (default 500) and "separator"
+        (default: a blank line) from tool_parameters.
+        """
+        # document file for workflow mode
+        file = tool_parameters.get("text_file")
+        if file and file.type != FileType.DOCUMENT:
+            raise ToolParameterValidationError("Not a valid document")
+
+        if file:
+            # the download lives only as long as temp_dir, so extract inside the block
+            with tempfile.TemporaryDirectory() as temp_dir:
+                file_path = download_to_target_path(file, temp_dir)
+                extractor = TextExtractor(file_path, autodetect_encoding=True)
+                documents = extractor.extract()
+                character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder(
+                    chunk_size=tool_parameters.get("max_token", 500),
+                    chunk_overlap=0,
+                    fixed_separator=tool_parameters.get("separator", "\n\n"),
+                    separators=["\n\n", "。", ". ", " ", ""],
+                    embedding_model_instance=None,
+                )
+                chunks = character_splitter.split_documents(documents)
+                return self.create_json_message(json.dumps([chunk.page_content for chunk in chunks]))
+
+        else:
+            raise ToolParameterValidationError("Please provide either file")
diff --git a/api/core/tools/provider/builtin/dify_extractor/tools/file_extractor.yaml b/api/core/tools/provider/builtin/file_extractor/tools/file_extractor.yaml
similarity index 98%
rename from api/core/tools/provider/builtin/dify_extractor/tools/file_extractor.yaml
rename to api/core/tools/provider/builtin/file_extractor/tools/file_extractor.yaml
index 15f2273f04..5937fbc873 100644
--- a/api/core/tools/provider/builtin/dify_extractor/tools/file_extractor.yaml
+++ b/api/core/tools/provider/builtin/file_extractor/tools/file_extractor.yaml
@@ -24,7 +24,7 @@ parameters:
       zh_Hans: 要提取的 text 文档。
     llm_description: you should not input this parameter. just input the image_id.
     form: llm
-  - name: separators
+  - name: separator
     type: string
     required: false
     label: