mirror of
https://github.com/langgenius/dify.git
synced 2026-04-27 02:36:29 +08:00
transform document
This commit is contained in:
parent
6faa4b107b
commit
2f163bad8f
@ -1,5 +1,6 @@
|
|||||||
"""Paragraph index processor."""
|
"""Paragraph index processor."""
|
||||||
|
|
||||||
|
import json
|
||||||
import uuid
|
import uuid
|
||||||
from collections.abc import Mapping
|
from collections.abc import Mapping
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
@ -16,7 +17,7 @@ from core.rag.index_processor.index_processor_base import BaseIndexProcessor
|
|||||||
from core.rag.models.document import ChildDocument, Document, ParentChildStructureChunk
|
from core.rag.models.document import ChildDocument, Document, ParentChildStructureChunk
|
||||||
from extensions.ext_database import db
|
from extensions.ext_database import db
|
||||||
from libs import helper
|
from libs import helper
|
||||||
from models.dataset import ChildChunk, Dataset, DocumentSegment
|
from models.dataset import ChildChunk, Dataset, DatasetProcessRule, DocumentSegment
|
||||||
from models.dataset import Document as DatasetDocument
|
from models.dataset import Document as DatasetDocument
|
||||||
from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule
|
from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule
|
||||||
|
|
||||||
@ -228,13 +229,31 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
|
|||||||
doc = Document(page_content=parent_child.parent_content, metadata=metadata, children=child_documents)
|
doc = Document(page_content=parent_child.parent_content, metadata=metadata, children=child_documents)
|
||||||
documents.append(doc)
|
documents.append(doc)
|
||||||
if documents:
|
if documents:
|
||||||
|
# update document parent mode
|
||||||
|
dataset_process_rule = DatasetProcessRule(
|
||||||
|
dataset_id=dataset.id,
|
||||||
|
mode="hierarchical",
|
||||||
|
rules=json.dumps({
|
||||||
|
"parent_mode": parent_childs.parent_mode,
|
||||||
|
}),
|
||||||
|
created_by=document.created_by,
|
||||||
|
)
|
||||||
|
db.session.add(dataset_process_rule)
|
||||||
|
db.session.flush()
|
||||||
|
document.dataset_process_rule_id = dataset_process_rule.id
|
||||||
|
db.session.commit()
|
||||||
# save node to document segment
|
# save node to document segment
|
||||||
doc_store = DatasetDocumentStore(dataset=dataset, user_id=document.created_by, document_id=document.id)
|
doc_store = DatasetDocumentStore(dataset=dataset, user_id=document.created_by, document_id=document.id)
|
||||||
# add document segments
|
# add document segments
|
||||||
doc_store.add_documents(docs=documents, save_child=True)
|
doc_store.add_documents(docs=documents, save_child=True)
|
||||||
if dataset.indexing_technique == "high_quality":
|
if dataset.indexing_technique == "high_quality":
|
||||||
vector = Vector(dataset)
|
all_child_documents = []
|
||||||
vector.create(documents)
|
for doc in documents:
|
||||||
|
if doc.children:
|
||||||
|
all_child_documents.extend(doc.children)
|
||||||
|
if all_child_documents:
|
||||||
|
vector = Vector(dataset)
|
||||||
|
vector.create(all_child_documents)
|
||||||
|
|
||||||
def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
|
def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
|
||||||
parent_childs = ParentChildStructureChunk(**chunks)
|
parent_childs = ParentChildStructureChunk(**chunks)
|
||||||
|
|||||||
@ -58,6 +58,7 @@ class ParentChildStructureChunk(BaseModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
parent_child_chunks: list[ParentChildChunk]
|
parent_child_chunks: list[ParentChildChunk]
|
||||||
|
parent_mode: str = "paragraph"
|
||||||
|
|
||||||
|
|
||||||
class QAChunk(BaseModel):
|
class QAChunk(BaseModel):
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user