From c2afb84884e84011e1014e7d8206438066e3ae5a Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Fri, 29 Aug 2025 17:10:18 +0800 Subject: [PATCH] fix chunk format --- .../processor/paragraph_index_processor.py | 6 +++++- .../processor/parent_child_index_processor.py | 8 +++++++- .../rag/index_processor/processor/qa_index_processor.py | 7 ++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/api/core/rag/index_processor/processor/paragraph_index_processor.py b/api/core/rag/index_processor/processor/paragraph_index_processor.py index cdd5898e8d..30b28217dd 100644 --- a/api/core/rag/index_processor/processor/paragraph_index_processor.py +++ b/api/core/rag/index_processor/processor/paragraph_index_processor.py @@ -11,6 +11,7 @@ from core.rag.datasource.vdb.vector_factory import Vector from core.rag.docstore.dataset_docstore import DatasetDocumentStore from core.rag.extractor.entity.extract_setting import ExtractSetting from core.rag.extractor.extract_processor import ExtractProcessor +from core.rag.index_processor.constant.index_type import IndexType from core.rag.index_processor.index_processor_base import BaseIndexProcessor from core.rag.models.document import Document, GeneralStructureChunk from core.tools.utils.text_processing_utils import remove_leading_symbols @@ -162,6 +163,9 @@ class ParagraphIndexProcessor(BaseIndexProcessor): preview = [] for content in chunks: preview.append({"content": content}) - return {"preview": preview, "total_segments": len(chunks)} + return {"chunk_structure": IndexType.PARAGRAPH_INDEX, + "preview": preview, + "total_segments": len(chunks) + } else: raise ValueError("Chunks is not a list") diff --git a/api/core/rag/index_processor/processor/parent_child_index_processor.py b/api/core/rag/index_processor/processor/parent_child_index_processor.py index 5653ca9344..5013046bf5 100644 --- a/api/core/rag/index_processor/processor/parent_child_index_processor.py +++ b/api/core/rag/index_processor/processor/parent_child_index_processor.py @@ -13,6 +13,7 @@ from core.rag.datasource.vdb.vector_factory import Vector from core.rag.docstore.dataset_docstore import DatasetDocumentStore from core.rag.extractor.entity.extract_setting import ExtractSetting from core.rag.extractor.extract_processor import ExtractProcessor +from core.rag.index_processor.constant.index_type import IndexType from core.rag.index_processor.index_processor_base import BaseIndexProcessor from core.rag.models.document import ChildDocument, Document, ParentChildStructureChunk from extensions.ext_database import db @@ -263,4 +264,9 @@ class ParentChildIndexProcessor(BaseIndexProcessor): preview = [] for parent_child in parent_childs.parent_child_chunks: preview.append({"content": parent_child.parent_content, "child_chunks": parent_child.child_contents}) - return {"preview": preview, "total_segments": len(parent_childs.parent_child_chunks)} + return { + "chunk_structure": IndexType.PARENT_CHILD_INDEX, + "parent_mode": parent_childs.parent_mode, + "preview": preview, + "total_segments": len(parent_childs.parent_child_chunks), + } diff --git a/api/core/rag/index_processor/processor/qa_index_processor.py b/api/core/rag/index_processor/processor/qa_index_processor.py index 8257b85110..df223f07c1 100644 --- a/api/core/rag/index_processor/processor/qa_index_processor.py +++ b/api/core/rag/index_processor/processor/qa_index_processor.py @@ -18,6 +18,7 @@ from core.rag.datasource.vdb.vector_factory import Vector from core.rag.docstore.dataset_docstore import DatasetDocumentStore from core.rag.extractor.entity.extract_setting import ExtractSetting from core.rag.extractor.extract_processor import ExtractProcessor +from core.rag.index_processor.constant.index_type import IndexType from core.rag.index_processor.index_processor_base import BaseIndexProcessor from core.rag.models.document import Document, QAStructureChunk from core.tools.utils.text_processing_utils import remove_leading_symbols @@ -194,7 +195,11 @@ class QAIndexProcessor(BaseIndexProcessor): preview = [] for qa_chunk in qa_chunks.qa_chunks: preview.append({"question": qa_chunk.question, "answer": qa_chunk.answer}) - return {"qa_preview": preview, "total_segments": len(qa_chunks.qa_chunks)} + return { + "chunk_structure": IndexType.QA_INDEX, + "qa_preview": preview, + "total_segments": len(qa_chunks.qa_chunks), + } def _format_qa_document(self, flask_app: Flask, tenant_id: str, document_node, all_qa_documents, document_language): format_documents = []