fix chunk format

This commit is contained in:
jyong 2025-08-29 16:27:22 +08:00
parent b3dbf9fe94
commit 3c0adfb48a
6 changed files with 42 additions and 38 deletions

View File

@@ -115,11 +115,10 @@ class File(BaseModel):
if self.related_id is None:
raise ValueError("Missing file related_id")
return helpers.get_signed_file_url(upload_file_id=self.related_id)
elif self.transfer_method == FileTransferMethod.TOOL_FILE:
elif self.transfer_method == FileTransferMethod.TOOL_FILE or self.transfer_method == FileTransferMethod.DATASOURCE_FILE:
assert self.related_id is not None
assert self.extension is not None
return sign_tool_file(tool_file_id=self.related_id, extension=self.extension)
def to_plugin_parameter(self) -> dict[str, Any]:
return {
"dify_model_identity": FILE_MODEL_IDENTITY,

View File

@@ -38,11 +38,11 @@ class BaseIndexProcessor(ABC):
raise NotImplementedError
@abstractmethod
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
raise NotImplementedError
@abstractmethod
def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
def format_preview(self, chunks: Any) -> Mapping[str, Any]:
raise NotImplementedError
@abstractmethod

View File

@@ -131,33 +131,37 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
docs.append(doc)
return docs
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
paragraph = GeneralStructureChunk(**chunks)
documents = []
for content in paragraph.general_chunks:
metadata = {
"dataset_id": dataset.id,
"document_id": document.id,
"doc_id": str(uuid.uuid4()),
"doc_hash": helper.generate_text_hash(content),
}
doc = Document(page_content=content, metadata=metadata)
documents.append(doc)
if documents:
# save node to document segment
doc_store = DatasetDocumentStore(dataset=dataset, user_id=document.created_by, document_id=document.id)
# add document segments
doc_store.add_documents(docs=documents, save_child=False)
if dataset.indexing_technique == "high_quality":
vector = Vector(dataset)
vector.create(documents)
elif dataset.indexing_technique == "economy":
keyword = Keyword(dataset)
keyword.add_texts(documents)
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
if isinstance(chunks, list):
documents = []
for content in chunks:
metadata = {
"dataset_id": dataset.id,
"document_id": document.id,
"doc_id": str(uuid.uuid4()),
"doc_hash": helper.generate_text_hash(content),
}
doc = Document(page_content=content, metadata=metadata)
documents.append(doc)
if documents:
# save node to document segment
doc_store = DatasetDocumentStore(dataset=dataset, user_id=document.created_by, document_id=document.id)
# add document segments
doc_store.add_documents(docs=documents, save_child=False)
if dataset.indexing_technique == "high_quality":
vector = Vector(dataset)
vector.create(documents)
elif dataset.indexing_technique == "economy":
keyword = Keyword(dataset)
keyword.add_texts(documents)
else:
raise ValueError("Chunks is not a list")
def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
paragraph = GeneralStructureChunk(**chunks)
preview = []
for content in paragraph.general_chunks:
preview.append({"content": content})
return {"preview": preview, "total_segments": len(paragraph.general_chunks)}
def format_preview(self, chunks: Any) -> Mapping[str, Any]:
if isinstance(chunks, list):
preview = []
for content in chunks:
preview.append({"content": content})
return {"preview": preview, "total_segments": len(chunks)}
else:
raise ValueError("Chunks is not a list")

View File

@@ -207,7 +207,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
child_nodes.append(child_document)
return child_nodes
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
parent_childs = ParentChildStructureChunk(**chunks)
documents = []
for parent_child in parent_childs.parent_child_chunks:
@@ -257,7 +257,8 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
vector = Vector(dataset)
vector.create(all_child_documents)
def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
def format_preview(self, chunks: Any) -> Mapping[str, Any]:
parent_childs = ParentChildStructureChunk(**chunks)
preview = []
for parent_child in parent_childs.parent_child_chunks:

View File

@@ -166,7 +166,7 @@ class QAIndexProcessor(BaseIndexProcessor):
docs.append(doc)
return docs
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
qa_chunks = QAStructureChunk(**chunks)
documents = []
for qa_chunk in qa_chunks.qa_chunks:
@@ -189,7 +189,7 @@ class QAIndexProcessor(BaseIndexProcessor):
else:
raise ValueError("Indexing technique must be high quality.")
def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
def format_preview(self, chunks: Any) -> Mapping[str, Any]:
qa_chunks = QAStructureChunk(**chunks)
preview = []
for qa_chunk in qa_chunks.qa_chunks:

View File

@@ -174,7 +174,7 @@ class KnowledgeIndexNode(Node):
"display_status": document.indexing_status,
}
def _get_preview_output(self, chunk_structure: str, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
def _get_preview_output(self, chunk_structure: str, chunks: Any) -> Mapping[str, Any]:
index_processor = IndexProcessorFactory(chunk_structure).init_index_processor()
return index_processor.format_preview(chunks)