transform document

jyong 2025-08-05 18:16:24 +08:00
parent 6faa4b107b
commit 2f163bad8f
2 changed files with 23 additions and 3 deletions


@@ -1,5 +1,6 @@
 """Paragraph index processor."""
+import json
 import uuid
 from collections.abc import Mapping
 from typing import Any, Optional
@@ -16,7 +17,7 @@ from core.rag.index_processor.index_processor_base import BaseIndexProcessor
 from core.rag.models.document import ChildDocument, Document, ParentChildStructureChunk
 from extensions.ext_database import db
 from libs import helper
-from models.dataset import ChildChunk, Dataset, DocumentSegment
+from models.dataset import ChildChunk, Dataset, DatasetProcessRule, DocumentSegment
 from models.dataset import Document as DatasetDocument
 from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule
@@ -228,13 +229,31 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
             doc = Document(page_content=parent_child.parent_content, metadata=metadata, children=child_documents)
             documents.append(doc)
         if documents:
+            # update document parent mode
+            dataset_process_rule = DatasetProcessRule(
+                dataset_id=dataset.id,
+                mode="hierarchical",
+                rules=json.dumps({
+                    "parent_mode": parent_childs.parent_mode,
+                }),
+                created_by=document.created_by,
+            )
+            db.session.add(dataset_process_rule)
+            db.session.flush()
+            document.dataset_process_rule_id = dataset_process_rule.id
+            db.session.commit()
             # save node to document segment
             doc_store = DatasetDocumentStore(dataset=dataset, user_id=document.created_by, document_id=document.id)
             # add document segments
             doc_store.add_documents(docs=documents, save_child=True)
             if dataset.indexing_technique == "high_quality":
-                vector = Vector(dataset)
-                vector.create(documents)
+                all_child_documents = []
+                for doc in documents:
+                    if doc.children:
+                        all_child_documents.extend(doc.children)
+                if all_child_documents:
+                    vector = Vector(dataset)
+                    vector.create(all_child_documents)
 
     def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
         parent_childs = ParentChildStructureChunk(**chunks)
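
Two things happen in this hunk. First, the chosen parent_mode is persisted as a DatasetProcessRule row and linked back to the document; the db.session.flush() is what populates dataset_process_rule.id for a freshly added row before the commit. Second, only the child chunks are embedded: the parent content stays in the document store, and Vector.create() now receives the flattened children instead of the parent documents. A minimal sketch of that collection step, using a hypothetical Doc dataclass in place of Dify's Document and ChildDocument models:

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class Doc:
    # Hypothetical stand-in for Dify's Document/ChildDocument models.
    page_content: str
    metadata: dict = field(default_factory=dict)
    children: Optional[list["Doc"]] = None

def collect_child_documents(documents: list[Doc]) -> list[Doc]:
    # Flatten the child chunks; only these go to the vector index.
    all_child_documents: list[Doc] = []
    for doc in documents:
        if doc.children:
            all_child_documents.extend(doc.children)
    return all_child_documents

parent = Doc(
    page_content="Full parent paragraph, kept in the document store.",
    metadata={"doc_id": "parent-1"},
    children=[
        Doc("child chunk one", {"doc_id": "child-1"}),
        Doc("child chunk two", {"doc_id": "child-2"}),
    ],
)
assert [c.metadata["doc_id"] for c in collect_child_documents([parent])] == ["child-1", "child-2"]

At query time this is the usual parent-child retrieval pattern: match against the small child chunks, then return the larger parent for context.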


@@ -58,6 +58,7 @@ class ParentChildStructureChunk(BaseModel):
     """
     parent_child_chunks: list[ParentChildChunk]
+    parent_mode: str = "paragraph"
 
 class QAChunk(BaseModel):
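
The model change is a single new field with a default, so payloads that omit parent_mode still validate. A sketch with a simplified stand-in for ParentChildStructureChunk (the real model's parent_child_chunks holds ParentChildChunk objects, not plain dicts):

import json
from pydantic import BaseModel

class ParentChildStructureChunkSketch(BaseModel):
    # Simplified stand-in for the real model.
    parent_child_chunks: list[dict] = []
    parent_mode: str = "paragraph"  # new field; the default keeps old payloads valid

chunks = ParentChildStructureChunkSketch()
print(chunks.parent_mode)  # "paragraph"

# The processor above serializes this value into DatasetProcessRule.rules:
print(json.dumps({"parent_mode": chunks.parent_mode}))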