From 6f67a34349749f9d6269a3600e34371c2f0195a2 Mon Sep 17 00:00:00 2001
From: jyong <718720800@qq.com>
Date: Wed, 18 Jun 2025 14:37:18 +0800
Subject: [PATCH] r2 qa index

---
 .../processor/qa_index_processor.py           | 34 ++++++++++++++++---
 api/core/rag/models/document.py               | 14 ++++++++
 2 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/api/core/rag/index_processor/processor/qa_index_processor.py b/api/core/rag/index_processor/processor/qa_index_processor.py
index 5fed36c9b0..8b1bc181d5 100644
--- a/api/core/rag/index_processor/processor/qa_index_processor.py
+++ b/api/core/rag/index_processor/processor/qa_index_processor.py
@@ -15,13 +15,15 @@ from core.llm_generator.llm_generator import LLMGenerator
 from core.rag.cleaner.clean_processor import CleanProcessor
 from core.rag.datasource.retrieval_service import RetrievalService
 from core.rag.datasource.vdb.vector_factory import Vector
+from core.rag.docstore.dataset_docstore import DatasetDocumentStore
 from core.rag.extractor.entity.extract_setting import ExtractSetting
 from core.rag.extractor.extract_processor import ExtractProcessor
 from core.rag.index_processor.index_processor_base import BaseIndexProcessor
-from core.rag.models.document import Document
+from core.rag.models.document import Document, QAStructureChunk
 from core.tools.utils.text_processing_utils import remove_leading_symbols
 from libs import helper
 from models.dataset import Dataset
+from models.dataset import Document as DatasetDocument
 from services.entities.knowledge_entities.knowledge_entities import Rule
 
 
@@ -162,11 +164,35 @@ class QAIndexProcessor(BaseIndexProcessor):
             docs.append(doc)
         return docs
 
-    def index(self, dataset: Dataset, document: Document, chunks: Mapping[str, Any]):
-        pass
+    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
+        qa_chunks = QAStructureChunk(**chunks)
+        documents = []
+        for qa_chunk in qa_chunks.qa_chunks:
+            metadata = {
+                "dataset_id": dataset.id,
+                "document_id": document.id,
+                "doc_id": str(uuid.uuid4()),
+                "doc_hash": helper.generate_text_hash(qa_chunk.question),
+                "answer": qa_chunk.answer,
+            }
+            doc = Document(page_content=qa_chunk.question, metadata=metadata)
+            documents.append(doc)
+        if documents:
+            # save node to document segment
+            doc_store = DatasetDocumentStore(dataset=dataset, user_id=document.created_by, document_id=document.id)
+            doc_store.add_documents(docs=documents, save_child=False)
+            if dataset.indexing_technique == "high_quality":
+                vector = Vector(dataset)
+                vector.create(documents)
+            else:
+                raise ValueError("Indexing technique must be high quality.")
 
     def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
-        return {"preview": chunks}
+        qa_chunks = QAStructureChunk(**chunks)
+        preview = []
+        for qa_chunk in qa_chunks.qa_chunks:
+            preview.append({"question": qa_chunk.question, "answer": qa_chunk.answer})
+        return {"qa_preview": preview, "total_segments": len(qa_chunks.qa_chunks)}
 
     def _format_qa_document(self, flask_app: Flask, tenant_id: str, document_node, all_qa_documents, document_language):
         format_documents = []
diff --git a/api/core/rag/models/document.py b/api/core/rag/models/document.py
index 97d53123b6..3f82bda2c6 100644
--- a/api/core/rag/models/document.py
+++ b/api/core/rag/models/document.py
@@ -60,6 +60,20 @@ class ParentChildStructureChunk(BaseModel):
     parent_child_chunks: list[ParentChildChunk]
 
 
+class QAChunk(BaseModel):
+    """
+    QA Chunk.
+    """
+
+    question: str
+    answer: str
+
+class QAStructureChunk(BaseModel):
+    """
+    QAStructureChunk.
+    """
+    qa_chunks: list[QAChunk]
+
 class BaseDocumentTransformer(ABC):
     """Abstract base class for document transformation systems.