From 9277156b6c87626b8479da695c469ab2871ebe53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hu=E1=BB=B3nh=20Gia=20B=C3=B4i?= Date: Fri, 6 Dec 2024 17:55:59 +0700 Subject: [PATCH] fix(document_extractor): pptx file type and missing metadata_filename UnstructuredIO (#11364) Co-authored-by: Julian Huynh --- .../workflow/nodes/document_extractor/node.py | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py index d490a2eb03..59afe7ac87 100644 --- a/api/core/workflow/nodes/document_extractor/node.py +++ b/api/core/workflow/nodes/document_extractor/node.py @@ -1,6 +1,8 @@ import csv import io import json +import os +import tempfile import docx import pandas as pd @@ -264,14 +266,20 @@ def _extract_text_from_ppt(file_content: bytes) -> str: def _extract_text_from_pptx(file_content: bytes) -> str: try: - with io.BytesIO(file_content) as file: - if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY: - elements = partition_via_api( - file=file, - api_url=dify_config.UNSTRUCTURED_API_URL, - api_key=dify_config.UNSTRUCTURED_API_KEY, - ) - else: + if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY: + with tempfile.NamedTemporaryFile(suffix=".pptx", delete=False) as temp_file: + temp_file.write(file_content) + temp_file.flush() + with open(temp_file.name, "rb") as file: + elements = partition_via_api( + file=file, + metadata_filename=temp_file.name, + api_url=dify_config.UNSTRUCTURED_API_URL, + api_key=dify_config.UNSTRUCTURED_API_KEY, + ) + os.unlink(temp_file.name) + else: + with io.BytesIO(file_content) as file: elements = partition_pptx(file=file) return "\n".join([getattr(element, "text", "") for element in elements]) except Exception as e: