From 3c0adfb48aa636c5533c281551b0d3f2fd714f3b Mon Sep 17 00:00:00 2001
From: jyong <718720800@qq.com>
Date: Fri, 29 Aug 2025 16:27:22 +0800
Subject: [PATCH] fix chunk format

---
 api/core/file/models.py                                    |  3 +-
 .../rag/index_processor/index_processor_base.py            |  4 +-
 .../index_processor/processor/paragraph_index_processor.py | 62 ++++++++++---------
 .../processor/parent_child_index_processor.py              |  5 +-
 .../rag/index_processor/processor/qa_index_processor.py    |  4 +-
 .../workflow/nodes/knowledge_index/knowledge_index_node.py |  2 +-
 6 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/api/core/file/models.py b/api/core/file/models.py
index f61334e7bc..59bbb68cf2 100644
--- a/api/core/file/models.py
+++ b/api/core/file/models.py
@@ -115,11 +115,10 @@ class File(BaseModel):
             if self.related_id is None:
                 raise ValueError("Missing file related_id")
             return helpers.get_signed_file_url(upload_file_id=self.related_id)
-        elif self.transfer_method == FileTransferMethod.TOOL_FILE:
+        elif self.transfer_method == FileTransferMethod.TOOL_FILE or self.transfer_method == FileTransferMethod.DATASOURCE_FILE:
             assert self.related_id is not None
             assert self.extension is not None
             return sign_tool_file(tool_file_id=self.related_id, extension=self.extension)
-
     def to_plugin_parameter(self) -> dict[str, Any]:
         return {
             "dify_model_identity": FILE_MODEL_IDENTITY,
diff --git a/api/core/rag/index_processor/index_processor_base.py b/api/core/rag/index_processor/index_processor_base.py
index 379191d7f0..ffe97e9330 100644
--- a/api/core/rag/index_processor/index_processor_base.py
+++ b/api/core/rag/index_processor/index_processor_base.py
@@ -38,11 +38,11 @@ class BaseIndexProcessor(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
+    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
         raise NotImplementedError
 
     @abstractmethod
-    def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
+    def format_preview(self, chunks: Any) -> Mapping[str, Any]:
         raise NotImplementedError
 
     @abstractmethod
diff --git a/api/core/rag/index_processor/processor/paragraph_index_processor.py b/api/core/rag/index_processor/processor/paragraph_index_processor.py
index 6a114b6bb2..cdd5898e8d 100644
--- a/api/core/rag/index_processor/processor/paragraph_index_processor.py
+++ b/api/core/rag/index_processor/processor/paragraph_index_processor.py
@@ -131,33 +131,37 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
             docs.append(doc)
         return docs
 
-    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
-        paragraph = GeneralStructureChunk(**chunks)
-        documents = []
-        for content in paragraph.general_chunks:
-            metadata = {
-                "dataset_id": dataset.id,
-                "document_id": document.id,
-                "doc_id": str(uuid.uuid4()),
-                "doc_hash": helper.generate_text_hash(content),
-            }
-            doc = Document(page_content=content, metadata=metadata)
-            documents.append(doc)
-        if documents:
-            # save node to document segment
-            doc_store = DatasetDocumentStore(dataset=dataset, user_id=document.created_by, document_id=document.id)
-            # add document segments
-            doc_store.add_documents(docs=documents, save_child=False)
-        if dataset.indexing_technique == "high_quality":
-            vector = Vector(dataset)
-            vector.create(documents)
-        elif dataset.indexing_technique == "economy":
-            keyword = Keyword(dataset)
-            keyword.add_texts(documents)
+    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
+        if isinstance(chunks, list):
+            documents = []
+            for content in chunks:
+                metadata = {
+                    "dataset_id": dataset.id,
+                    "document_id": document.id,
+                    "doc_id": str(uuid.uuid4()),
+                    "doc_hash": helper.generate_text_hash(content),
+                }
+                doc = Document(page_content=content, metadata=metadata)
+                documents.append(doc)
+            if documents:
+                # save node to document segment
+                doc_store = DatasetDocumentStore(dataset=dataset, user_id=document.created_by, document_id=document.id)
+                # add document segments
+                doc_store.add_documents(docs=documents, save_child=False)
+            if dataset.indexing_technique == "high_quality":
+                vector = Vector(dataset)
+                vector.create(documents)
+            elif dataset.indexing_technique == "economy":
+                keyword = Keyword(dataset)
+                keyword.add_texts(documents)
+        else:
+            raise ValueError("Chunks is not a list")
 
-    def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
-        paragraph = GeneralStructureChunk(**chunks)
-        preview = []
-        for content in paragraph.general_chunks:
-            preview.append({"content": content})
-        return {"preview": preview, "total_segments": len(paragraph.general_chunks)}
+    def format_preview(self, chunks: Any) -> Mapping[str, Any]:
+        if isinstance(chunks, list):
+            preview = []
+            for content in chunks:
+                preview.append({"content": content})
+            return {"preview": preview, "total_segments": len(chunks)}
+        else:
+            raise ValueError("Chunks is not a list")
diff --git a/api/core/rag/index_processor/processor/parent_child_index_processor.py b/api/core/rag/index_processor/processor/parent_child_index_processor.py
index 09d3e3bad6..5653ca9344 100644
--- a/api/core/rag/index_processor/processor/parent_child_index_processor.py
+++ b/api/core/rag/index_processor/processor/parent_child_index_processor.py
@@ -207,7 +207,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
                 child_nodes.append(child_document)
         return child_nodes
 
-    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
+    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
         parent_childs = ParentChildStructureChunk(**chunks)
         documents = []
         for parent_child in parent_childs.parent_child_chunks:
@@ -257,7 +257,8 @@
             vector = Vector(dataset)
             vector.create(all_child_documents)
 
-    def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
+    def format_preview(self, chunks: Any) -> Mapping[str, Any]:
+        parent_childs = ParentChildStructureChunk(**chunks)
         preview = []
         for parent_child in parent_childs.parent_child_chunks:
             child_chunks = [{"content": child} for child in parent_child.child_contents]
diff --git a/api/core/rag/index_processor/processor/qa_index_processor.py b/api/core/rag/index_processor/processor/qa_index_processor.py
index e2554cc998..8257b85110 100644
--- a/api/core/rag/index_processor/processor/qa_index_processor.py
+++ b/api/core/rag/index_processor/processor/qa_index_processor.py
@@ -166,7 +166,7 @@ class QAIndexProcessor(BaseIndexProcessor):
             docs.append(doc)
         return docs
 
-    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
+    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
         qa_chunks = QAStructureChunk(**chunks)
         documents = []
         for qa_chunk in qa_chunks.qa_chunks:
@@ -189,7 +189,7 @@
         else:
             raise ValueError("Indexing technique must be high quality.")
 
-    def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
+    def format_preview(self, chunks: Any) -> Mapping[str, Any]:
         qa_chunks = QAStructureChunk(**chunks)
         preview = []
         for qa_chunk in qa_chunks.qa_chunks:
diff --git a/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py b/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py
index 2efcb69445..83a2b8c53f 100644
--- a/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py
+++ b/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py
@@ -174,7 +174,7 @@ class KnowledgeIndexNode(Node):
             "display_status": document.indexing_status,
         }
 
-    def _get_preview_output(self, chunk_structure: str, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
+    def _get_preview_output(self, chunk_structure: str, chunks: Any) -> Mapping[str, Any]:
         index_processor = IndexProcessorFactory(chunk_structure).init_index_processor()
         return index_processor.format_preview(chunks)
 