fix chunk format

This commit is contained in:
jyong 2025-08-29 17:10:18 +08:00
parent 3c0adfb48a
commit c2afb84884
3 changed files with 18 additions and 3 deletions

View File

@@ -11,6 +11,7 @@ from core.rag.datasource.vdb.vector_factory import Vector
from core.rag.docstore.dataset_docstore import DatasetDocumentStore
from core.rag.extractor.entity.extract_setting import ExtractSetting
from core.rag.extractor.extract_processor import ExtractProcessor
from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_base import BaseIndexProcessor
from core.rag.models.document import Document, GeneralStructureChunk
from core.tools.utils.text_processing_utils import remove_leading_symbols
@@ -162,6 +163,9 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
preview = []
for content in chunks:
preview.append({"content": content})
return {"preview": preview, "total_segments": len(chunks)}
return {"chunk_structure": IndexType.PARAGRAPH_INDEX,
"preview": preview,
"total_segments": len(chunks)
}
else:
raise ValueError("Chunks is not a list")

View File

@@ -13,6 +13,7 @@ from core.rag.datasource.vdb.vector_factory import Vector
from core.rag.docstore.dataset_docstore import DatasetDocumentStore
from core.rag.extractor.entity.extract_setting import ExtractSetting
from core.rag.extractor.extract_processor import ExtractProcessor
from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_base import BaseIndexProcessor
from core.rag.models.document import ChildDocument, Document, ParentChildStructureChunk
from extensions.ext_database import db
@@ -263,4 +264,9 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
preview = []
for parent_child in parent_childs.parent_child_chunks:
preview.append({"content": parent_child.parent_content, "child_chunks": parent_child.child_contents})
return {"preview": preview, "total_segments": len(parent_childs.parent_child_chunks)}
return {
"chunk_structure": IndexType.PARENT_CHILD_INDEX,
"parent_mode": parent_childs.parent_mode,
"preview": preview,
"total_segments": len(parent_childs.parent_child_chunks),
}

View File

@@ -18,6 +18,7 @@ from core.rag.datasource.vdb.vector_factory import Vector
from core.rag.docstore.dataset_docstore import DatasetDocumentStore
from core.rag.extractor.entity.extract_setting import ExtractSetting
from core.rag.extractor.extract_processor import ExtractProcessor
from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_base import BaseIndexProcessor
from core.rag.models.document import Document, QAStructureChunk
from core.tools.utils.text_processing_utils import remove_leading_symbols
@@ -194,7 +195,11 @@ class QAIndexProcessor(BaseIndexProcessor):
preview = []
for qa_chunk in qa_chunks.qa_chunks:
preview.append({"question": qa_chunk.question, "answer": qa_chunk.answer})
return {"qa_preview": preview, "total_segments": len(qa_chunks.qa_chunks)}
return {
"chunk_structure": IndexType.QA_INDEX,
"qa_preview": preview,
"total_segments": len(qa_chunks.qa_chunks),
}
def _format_qa_document(self, flask_app: Flask, tenant_id: str, document_node, all_qa_documents, document_language):
format_documents = []