mirror of
https://github.com/langgenius/dify.git
synced 2026-05-13 08:57:28 +08:00
text extractor tool
This commit is contained in:
parent
67b1190535
commit
bc7cc06572
@ -1,4 +1,6 @@
|
|||||||
import base64
|
import base64
|
||||||
|
from pathlib import Path
|
||||||
|
import tempfile
|
||||||
|
|
||||||
from configs import dify_config
|
from configs import dify_config
|
||||||
from core.file import file_repository
|
from core.file import file_repository
|
||||||
@ -109,6 +111,38 @@ def _download_file_content(path: str, /):
|
|||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def download_to_target_path(f: File, temp_dir: str, /):
    """
    Download a stored file into ``temp_dir`` and return the local path.

    The storage key is taken from the tool-file or upload-file record that
    matches ``f.transfer_method``. The local copy gets a random name that
    keeps the suffix of the storage key.

    Args:
        f (File): The file to download.
        temp_dir (str): Directory that receives the downloaded copy.

    Returns:
        str: Path of the downloaded file inside ``temp_dir``.

    Raises:
        ValueError: If the transfer method is neither TOOL_FILE nor LOCAL_FILE.
    """
    # Function-scope import: uuid is not among this module's top-level imports.
    from uuid import uuid4

    if f.transfer_method == FileTransferMethod.TOOL_FILE:
        tool_file = file_repository.get_tool_file(session=db.session(), file=f)
        storage_key = tool_file.file_key
    elif f.transfer_method == FileTransferMethod.LOCAL_FILE:
        upload_file = file_repository.get_upload_file(session=db.session(), file=f)
        storage_key = upload_file.key
    else:
        raise ValueError(f"Unsupported transfer method: {f.transfer_method}")

    # uuid4 replaces tempfile._get_candidate_names(), which is a private
    # CPython helper and not part of the documented tempfile API.
    file_name = f"{uuid4().hex}{Path(storage_key).suffix}"
    target_path = str(Path(temp_dir) / file_name)
    _download_file_to_target_path(storage_key, target_path)
    return target_path
|
||||||
|
|
||||||
|
|
||||||
|
def _download_file_to_target_path(path: str, target_path: str, /) -> None:
    """
    Download a file from storage to ``target_path``.

    This is a thin wrapper around ``storage.download``; it returns nothing
    and performs no validation of its own.

    Args:
        path (str): The path to the file in storage.
        target_path (str): The path to the target file.
    """
    storage.download(path, target_path)
|
||||||
|
|
||||||
|
|
||||||
def _get_encoded_string(f: File, /):
|
def _get_encoded_string(f: File, /):
|
||||||
match f.transfer_method:
|
match f.transfer_method:
|
||||||
case FileTransferMethod.REMOTE_URL:
|
case FileTransferMethod.REMOTE_URL:
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 153 KiB |
@ -1,20 +0,0 @@
|
|||||||
from typing import Any
|
|
||||||
|
|
||||||
from core.tools.errors import ToolProviderCredentialValidationError
|
|
||||||
from core.tools.provider.builtin.dalle.tools.dalle2 import DallE2Tool
|
|
||||||
from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
|
|
||||||
|
|
||||||
|
|
||||||
class DALLEProvider(BuiltinToolProviderController):
    """Built-in tool provider for DALL-E."""

    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
        """
        Check the credentials by running one DallE2Tool invocation with them.

        Args:
            credentials (dict[str, Any]): Credentials handed to the tool runtime.

        Raises:
            ToolProviderCredentialValidationError: If the invocation raises; the
                message is the string form of the original exception.
        """
        probe_parameters = {"prompt": "cute girl, blue eyes, white hair, anime style", "size": "small", "n": 1}
        try:
            tool = DallE2Tool().fork_tool_runtime(runtime={"credentials": credentials})
            tool.invoke(user_id="", tool_parameters=probe_parameters)
        except Exception as e:
            raise ToolProviderCredentialValidationError(str(e))
|
|
||||||
@ -1,61 +0,0 @@
|
|||||||
identity:
|
|
||||||
author: Dify
|
|
||||||
name: dalle
|
|
||||||
label:
|
|
||||||
en_US: DALL-E
|
|
||||||
zh_Hans: DALL-E 绘画
|
|
||||||
pt_BR: DALL-E
|
|
||||||
description:
|
|
||||||
en_US: DALL-E art
|
|
||||||
zh_Hans: DALL-E 绘画
|
|
||||||
pt_BR: DALL-E art
|
|
||||||
icon: icon.png
|
|
||||||
tags:
|
|
||||||
- image
|
|
||||||
- productivity
|
|
||||||
credentials_for_provider:
|
|
||||||
openai_api_key:
|
|
||||||
type: secret-input
|
|
||||||
required: true
|
|
||||||
label:
|
|
||||||
en_US: OpenAI API key
|
|
||||||
zh_Hans: OpenAI API key
|
|
||||||
pt_BR: OpenAI API key
|
|
||||||
help:
|
|
||||||
en_US: Please input your OpenAI API key
|
|
||||||
zh_Hans: 请输入你的 OpenAI API key
|
|
||||||
pt_BR: Please input your OpenAI API key
|
|
||||||
placeholder:
|
|
||||||
en_US: Please input your OpenAI API key
|
|
||||||
zh_Hans: 请输入你的 OpenAI API key
|
|
||||||
pt_BR: Please input your OpenAI API key
|
|
||||||
openai_organization_id:
|
|
||||||
type: text-input
|
|
||||||
required: false
|
|
||||||
label:
|
|
||||||
en_US: OpenAI organization ID
|
|
||||||
zh_Hans: OpenAI organization ID
|
|
||||||
pt_BR: OpenAI organization ID
|
|
||||||
help:
|
|
||||||
en_US: Please input your OpenAI organization ID
|
|
||||||
zh_Hans: 请输入你的 OpenAI organization ID
|
|
||||||
pt_BR: Please input your OpenAI organization ID
|
|
||||||
placeholder:
|
|
||||||
en_US: Please input your OpenAI organization ID
|
|
||||||
zh_Hans: 请输入你的 OpenAI organization ID
|
|
||||||
pt_BR: Please input your OpenAI organization ID
|
|
||||||
openai_base_url:
|
|
||||||
type: text-input
|
|
||||||
required: false
|
|
||||||
label:
|
|
||||||
en_US: OpenAI base URL
|
|
||||||
zh_Hans: OpenAI base URL
|
|
||||||
pt_BR: OpenAI base URL
|
|
||||||
help:
|
|
||||||
en_US: Please input your OpenAI base URL
|
|
||||||
zh_Hans: 请输入你的 OpenAI base URL
|
|
||||||
pt_BR: Please input your OpenAI base URL
|
|
||||||
placeholder:
|
|
||||||
en_US: Please input your OpenAI base URL
|
|
||||||
zh_Hans: 请输入你的 OpenAI base URL
|
|
||||||
pt_BR: Please input your OpenAI base URL
|
|
||||||
@ -1,32 +0,0 @@
|
|||||||
from base64 import b64decode
|
|
||||||
from typing import Any, Union
|
|
||||||
|
|
||||||
from openai import OpenAI
|
|
||||||
from yarl import URL
|
|
||||||
from core.file.enums import FileType
|
|
||||||
|
|
||||||
from core.file.file_manager import download
|
|
||||||
from core.tools.entities.tool_entities import ToolInvokeMessage
|
|
||||||
from core.tools.errors import ToolParameterValidationError
|
|
||||||
from core.tools.tool.builtin_tool import BuiltinTool
|
|
||||||
|
|
||||||
|
|
||||||
class FileExtractorTool(BuiltinTool):
    def _invoke(
        self,
        user_id: str,
        tool_parameters: dict[str, Any],
    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
        """
        Validate the "file" tool parameter and download its content.

        Raises:
            ToolParameterValidationError: If no file is given, or the file's
                type is not FileType.DOCUMENT.

        NOTE(review): `result` is never assigned in this block, so the final
        `return result` raises NameError; the downloaded `file_binary` is
        never used.
        """
        # The file to process, read from the "file" tool parameter.
        file = tool_parameters.get("file")
        # Only document-type files are accepted.
        if file and file.type != FileType.DOCUMENT:
            raise ToolParameterValidationError("Not a valid document")

        if file:
            file_binary = download(file)
        else:
            raise ToolParameterValidationError("Please provide either file")
        # NOTE(review): `result` is undefined here — confirm the intended return value.
        return result
|
|
||||||
BIN
api/core/tools/provider/builtin/file_extractor/_assets/icon.png
Normal file
BIN
api/core/tools/provider/builtin/file_extractor/_assets/icon.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 4.3 KiB |
@ -0,0 +1,10 @@
|
|||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from core.tools.errors import ToolProviderCredentialValidationError
|
||||||
|
from core.tools.provider.builtin.dalle.tools.dalle2 import DallE2Tool
|
||||||
|
from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
|
||||||
|
|
||||||
|
|
||||||
|
class FileExtractorProvider(BuiltinToolProviderController):
    """Built-in tool provider for the file extractor."""

    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
        """Accept any credentials; this method performs no validation."""
        return None
|
||||||
@ -0,0 +1,15 @@
|
|||||||
|
identity:
|
||||||
|
author: Jyong
|
||||||
|
name: file_extractor
|
||||||
|
label:
|
||||||
|
en_US: File Extractor
|
||||||
|
zh_Hans: 文件提取
|
||||||
|
pt_BR: File Extractor
|
||||||
|
description:
|
||||||
|
en_US: Extract text from file
|
||||||
|
zh_Hans: 从文件中提取文本
|
||||||
|
pt_BR: Extract text from file
|
||||||
|
icon: icon.png
|
||||||
|
tags:
|
||||||
|
- utilities
|
||||||
|
- productivity
|
||||||
@ -0,0 +1,47 @@
|
|||||||
|
from base64 import b64decode
|
||||||
|
import tempfile
|
||||||
|
from typing import Any, Union
|
||||||
|
|
||||||
|
from openai import OpenAI
|
||||||
|
from yarl import URL
|
||||||
|
from core.file.enums import FileType
|
||||||
|
|
||||||
|
from core.file.file_manager import download_to_target_path
|
||||||
|
from core.rag.extractor.text_extractor import TextExtractor
|
||||||
|
from core.rag.splitter.fixed_text_splitter import FixedRecursiveCharacterTextSplitter
|
||||||
|
from core.tools.entities.tool_entities import ToolInvokeMessage
|
||||||
|
from core.tools.errors import ToolParameterValidationError
|
||||||
|
from core.tools.tool.builtin_tool import BuiltinTool
|
||||||
|
|
||||||
|
|
||||||
|
class FileExtractorTool(BuiltinTool):
    """Built-in tool that extracts the text of a document and splits it into chunks."""

    def _invoke(
        self,
        user_id: str,
        tool_parameters: dict[str, Any],
    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
        """
        Extract the text of the ``text_file`` parameter and return it in chunks.

        Args:
            user_id (str): Id of the invoking user; not used by this method.
            tool_parameters (dict[str, Any]): Reads ``text_file`` (required),
                ``max_token`` (chunk size, defaults to 500) and ``separator``
                (fixed separator, defaults to two newlines).

        Returns:
            The message built by ``create_json_message`` from the JSON-encoded
            list of chunk texts.

        Raises:
            ToolParameterValidationError: If no file is given, or the file's
                type is not FileType.DOCUMENT.
        """
        # Function-scope import: this module never imports json at the top,
        # so the json.dumps call below would otherwise raise NameError.
        import json

        file = tool_parameters.get("text_file")
        if not file:
            raise ToolParameterValidationError("Please provide either file")
        if file.type != FileType.DOCUMENT:
            raise ToolParameterValidationError("Not a valid document")

        # The download lives in a temporary directory that is removed when the
        # block exits, so all work on the file stays inside the `with`.
        with tempfile.TemporaryDirectory() as temp_dir:
            file_path = download_to_target_path(file, temp_dir)
            extractor = TextExtractor(file_path, autodetect_encoding=True)
            documents = extractor.extract()
            character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder(
                chunk_size=tool_parameters.get("max_token", 500),
                chunk_overlap=0,
                fixed_separator=tool_parameters.get("separator", "\n\n"),
                separators=["\n\n", "。", ". ", " ", ""],
                embedding_model_instance=None,
            )
            chunks = character_splitter.split_documents(documents)
            # NOTE(review): create_json_message receives a JSON string here —
            # confirm it accepts a str rather than a dict.
            return self.create_json_message(json.dumps([chunk.page_content for chunk in chunks]))
|
||||||
@ -24,7 +24,7 @@ parameters:
|
|||||||
zh_Hans: 要提取的 text 文档。
|
zh_Hans: 要提取的 text 文档。
|
||||||
llm_description: you should not input this parameter. just input the image_id.
|
llm_description: you should not input this parameter. just input the image_id.
|
||||||
form: llm
|
form: llm
|
||||||
- name: separators
|
- name: separator
|
||||||
type: string
|
type: string
|
||||||
required: false
|
required: false
|
||||||
label:
|
label:
|
||||||
Loading…
Reference in New Issue
Block a user