From 3c0adfb48aa636c5533c281551b0d3f2fd714f3b Mon Sep 17 00:00:00 2001
From: jyong <718720800@qq.com>
Date: Fri, 29 Aug 2025 16:27:22 +0800
Subject: [PATCH] fix chunk format

---
 api/core/file/models.py                                    |  3 +-
 .../rag/index_processor/index_processor_base.py            |  4 +-
 .../index_processor/processor/paragraph_index_processor.py | 62 ++++++++++---------
 .../processor/parent_child_index_processor.py              |  5 +-
 .../rag/index_processor/processor/qa_index_processor.py    |  4 +-
 .../workflow/nodes/knowledge_index/knowledge_index_node.py |  2 +-
 6 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/api/core/file/models.py b/api/core/file/models.py
index f61334e7bc..59bbb68cf2 100644
--- a/api/core/file/models.py
+++ b/api/core/file/models.py
@@ -115,11 +115,10 @@ class File(BaseModel):
             if self.related_id is None:
                 raise ValueError("Missing file related_id")
             return helpers.get_signed_file_url(upload_file_id=self.related_id)
-        elif self.transfer_method == FileTransferMethod.TOOL_FILE:
+        elif self.transfer_method == FileTransferMethod.TOOL_FILE or self.transfer_method == FileTransferMethod.DATASOURCE_FILE:
             assert self.related_id is not None
             assert self.extension is not None
             return sign_tool_file(tool_file_id=self.related_id, extension=self.extension)
-
     def to_plugin_parameter(self) -> dict[str, Any]:
         return {
             "dify_model_identity": FILE_MODEL_IDENTITY,
diff --git a/api/core/rag/index_processor/index_processor_base.py b/api/core/rag/index_processor/index_processor_base.py
index 379191d7f0..ffe97e9330 100644
--- a/api/core/rag/index_processor/index_processor_base.py
+++ b/api/core/rag/index_processor/index_processor_base.py
@@ -38,11 +38,11 @@ class BaseIndexProcessor(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
+    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
         raise NotImplementedError
 
     @abstractmethod
-    def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
+    def format_preview(self, chunks: Any) -> Mapping[str, Any]:
         raise NotImplementedError
 
     @abstractmethod
diff --git a/api/core/rag/index_processor/processor/paragraph_index_processor.py b/api/core/rag/index_processor/processor/paragraph_index_processor.py
index 6a114b6bb2..cdd5898e8d 100644
--- a/api/core/rag/index_processor/processor/paragraph_index_processor.py
+++ b/api/core/rag/index_processor/processor/paragraph_index_processor.py
@@ -131,33 +131,37 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
             docs.append(doc)
         return docs
 
-    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
-        paragraph = GeneralStructureChunk(**chunks)
-        documents = []
-        for content in paragraph.general_chunks:
-            metadata = {
-                "dataset_id": dataset.id,
-                "document_id": document.id,
-                "doc_id": str(uuid.uuid4()),
-                "doc_hash": helper.generate_text_hash(content),
-            }
-            doc = Document(page_content=content, metadata=metadata)
-            documents.append(doc)
-        if documents:
-            # save node to document segment
-            doc_store = DatasetDocumentStore(dataset=dataset, user_id=document.created_by, document_id=document.id)
-            # add document segments
-            doc_store.add_documents(docs=documents, save_child=False)
-        if dataset.indexing_technique == "high_quality":
-            vector = Vector(dataset)
-            vector.create(documents)
-        elif dataset.indexing_technique == "economy":
-            keyword = Keyword(dataset)
-            keyword.add_texts(documents)
+    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
+        if isinstance(chunks, list):
+            documents = []
+            for content in chunks:
+                metadata = {
+                    "dataset_id": dataset.id,
+                    "document_id": document.id,
+                    "doc_id": str(uuid.uuid4()),
+                    "doc_hash": helper.generate_text_hash(content),
+                }
+                doc = Document(page_content=content, metadata=metadata)
+                documents.append(doc)
+            if documents:
+                # save node to document segment
+                doc_store = DatasetDocumentStore(dataset=dataset, user_id=document.created_by, document_id=document.id)
+                # add document segments
+                doc_store.add_documents(docs=documents, save_child=False)
+            if dataset.indexing_technique == "high_quality":
+                vector = Vector(dataset)
+                vector.create(documents)
+            elif dataset.indexing_technique == "economy":
+                keyword = Keyword(dataset)
+                keyword.add_texts(documents)
+        else:
+            raise ValueError("Chunks is not a list")
 
-    def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
-        paragraph = GeneralStructureChunk(**chunks)
-        preview = []
-        for content in paragraph.general_chunks:
-            preview.append({"content": content})
-        return {"preview": preview, "total_segments": len(paragraph.general_chunks)}
+    def format_preview(self, chunks: Any) -> Mapping[str, Any]:
+        if isinstance(chunks, list):
+            preview = []
+            for content in chunks:
+                preview.append({"content": content})
+            return {"preview": preview, "total_segments": len(chunks)}
+        else:
+            raise ValueError("Chunks is not a list")
diff --git a/api/core/rag/index_processor/processor/parent_child_index_processor.py b/api/core/rag/index_processor/processor/parent_child_index_processor.py
index 09d3e3bad6..5653ca9344 100644
--- a/api/core/rag/index_processor/processor/parent_child_index_processor.py
+++ b/api/core/rag/index_processor/processor/parent_child_index_processor.py
@@ -207,7 +207,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
                 child_nodes.append(child_document)
         return child_nodes
 
-    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
+    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
         parent_childs = ParentChildStructureChunk(**chunks)
         documents = []
         for parent_child in parent_childs.parent_child_chunks:
@@ -257,7 +257,8 @@
             vector = Vector(dataset)
             vector.create(all_child_documents)
 
-    def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
+    def format_preview(self, chunks: Any) -> Mapping[str, Any]:
+        parent_childs = ParentChildStructureChunk(**chunks)
         preview = []
         for parent_child in parent_childs.parent_child_chunks:
             child_chunks = [{"content": child} for child in parent_child.child_contents]
diff --git a/api/core/rag/index_processor/processor/qa_index_processor.py b/api/core/rag/index_processor/processor/qa_index_processor.py
index e2554cc998..8257b85110 100644
--- a/api/core/rag/index_processor/processor/qa_index_processor.py
+++ b/api/core/rag/index_processor/processor/qa_index_processor.py
@@ -166,7 +166,7 @@ class QAIndexProcessor(BaseIndexProcessor):
             docs.append(doc)
         return docs
 
-    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
+    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
         qa_chunks = QAStructureChunk(**chunks)
         documents = []
         for qa_chunk in qa_chunks.qa_chunks:
@@ -189,7 +189,7 @@
         else:
             raise ValueError("Indexing technique must be high quality.")
 
-    def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
+    def format_preview(self, chunks: Any) -> Mapping[str, Any]:
         qa_chunks = QAStructureChunk(**chunks)
         preview = []
         for qa_chunk in qa_chunks.qa_chunks:
diff --git a/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py b/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py
index 2efcb69445..83a2b8c53f 100644
--- a/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py
+++ b/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py
@@ -174,7 +174,7 @@ class KnowledgeIndexNode(Node):
             "display_status": document.indexing_status,
         }
 
-    def _get_preview_output(self, chunk_structure: str, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
+    def _get_preview_output(self, chunk_structure: str, chunks: Any) -> Mapping[str, Any]:
         index_processor = IndexProcessorFactory(chunk_structure).init_index_processor()
         return index_processor.format_preview(chunks)
 