mirror of
https://github.com/langgenius/dify.git
synced 2026-04-07 16:57:59 +08:00
fix chunk format
This commit is contained in:
parent
b3dbf9fe94
commit
3c0adfb48a
@ -115,11 +115,10 @@ class File(BaseModel):
|
||||
if self.related_id is None:
|
||||
raise ValueError("Missing file related_id")
|
||||
return helpers.get_signed_file_url(upload_file_id=self.related_id)
|
||||
elif self.transfer_method == FileTransferMethod.TOOL_FILE:
|
||||
elif self.transfer_method == FileTransferMethod.TOOL_FILE or self.transfer_method == FileTransferMethod.DATASOURCE_FILE:
|
||||
assert self.related_id is not None
|
||||
assert self.extension is not None
|
||||
return sign_tool_file(tool_file_id=self.related_id, extension=self.extension)
|
||||
|
||||
def to_plugin_parameter(self) -> dict[str, Any]:
|
||||
return {
|
||||
"dify_model_identity": FILE_MODEL_IDENTITY,
|
||||
|
||||
@ -38,11 +38,11 @@ class BaseIndexProcessor(ABC):
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
|
||||
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
    """Index ``chunks`` for ``document`` in ``dataset``.

    The base class provides no implementation; concrete index processors
    supply their own and this stub always raises.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()
|
||||
|
||||
@abstractmethod
|
||||
def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
|
||||
def format_preview(self, chunks: Any) -> Mapping[str, Any]:
    """Build a preview mapping for ``chunks``.

    The base class provides no implementation; concrete index processors
    supply their own and this stub always raises.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()
|
||||
|
||||
@abstractmethod
|
||||
|
||||
@ -131,33 +131,37 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
|
||||
docs.append(doc)
|
||||
return docs
|
||||
|
||||
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
|
||||
paragraph = GeneralStructureChunk(**chunks)
|
||||
documents = []
|
||||
for content in paragraph.general_chunks:
|
||||
metadata = {
|
||||
"dataset_id": dataset.id,
|
||||
"document_id": document.id,
|
||||
"doc_id": str(uuid.uuid4()),
|
||||
"doc_hash": helper.generate_text_hash(content),
|
||||
}
|
||||
doc = Document(page_content=content, metadata=metadata)
|
||||
documents.append(doc)
|
||||
if documents:
|
||||
# save node to document segment
|
||||
doc_store = DatasetDocumentStore(dataset=dataset, user_id=document.created_by, document_id=document.id)
|
||||
# add document segments
|
||||
doc_store.add_documents(docs=documents, save_child=False)
|
||||
if dataset.indexing_technique == "high_quality":
|
||||
vector = Vector(dataset)
|
||||
vector.create(documents)
|
||||
elif dataset.indexing_technique == "economy":
|
||||
keyword = Keyword(dataset)
|
||||
keyword.add_texts(documents)
|
||||
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
    """Store a list of text chunks as segments of ``document`` and index them.

    Every chunk is wrapped in a ``Document`` whose metadata holds the dataset
    id, the document id, a freshly generated UUID as ``doc_id`` and a
    ``doc_hash`` computed from the chunk text. When at least one document was
    built, the segments are saved through ``DatasetDocumentStore`` and then
    handed to ``Vector`` (``"high_quality"``) or ``Keyword`` (``"economy"``)
    depending on ``dataset.indexing_technique``.

    Raises:
        ValueError: if ``chunks`` is not a ``list``.
    """
    if not isinstance(chunks, list):
        raise ValueError("Chunks is not a list")

    segment_docs = [
        Document(
            page_content=text,
            metadata={
                "dataset_id": dataset.id,
                "document_id": document.id,
                "doc_id": str(uuid.uuid4()),
                "doc_hash": helper.generate_text_hash(text),
            },
        )
        for text in chunks
    ]
    if not segment_docs:
        # Nothing to persist or index for an empty chunk list.
        return

    # Save the segments first, then feed the same documents to the index.
    store = DatasetDocumentStore(dataset=dataset, user_id=document.created_by, document_id=document.id)
    store.add_documents(docs=segment_docs, save_child=False)

    if dataset.indexing_technique == "high_quality":
        Vector(dataset).create(segment_docs)
    elif dataset.indexing_technique == "economy":
        Keyword(dataset).add_texts(segment_docs)
|
||||
|
||||
def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
|
||||
paragraph = GeneralStructureChunk(**chunks)
|
||||
preview = []
|
||||
for content in paragraph.general_chunks:
|
||||
preview.append({"content": content})
|
||||
return {"preview": preview, "total_segments": len(paragraph.general_chunks)}
|
||||
def format_preview(self, chunks: Any) -> Mapping[str, Any]:
    """Return a preview of ``chunks`` without storing anything.

    The result maps ``"preview"`` to one ``{"content": chunk}`` entry per
    chunk, in input order, and ``"total_segments"`` to the number of chunks.

    Raises:
        ValueError: if ``chunks`` is not a ``list``.
    """
    if not isinstance(chunks, list):
        raise ValueError("Chunks is not a list")

    return {
        "preview": [{"content": text} for text in chunks],
        "total_segments": len(chunks),
    }
|
||||
|
||||
@ -207,7 +207,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
|
||||
child_nodes.append(child_document)
|
||||
return child_nodes
|
||||
|
||||
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
|
||||
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
|
||||
parent_childs = ParentChildStructureChunk(**chunks)
|
||||
documents = []
|
||||
for parent_child in parent_childs.parent_child_chunks:
|
||||
@ -257,7 +257,8 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
|
||||
vector = Vector(dataset)
|
||||
vector.create(all_child_documents)
|
||||
|
||||
def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
|
||||
def format_preview(self, chunks: Any) -> Mapping[str, Any]:
|
||||
|
||||
parent_childs = ParentChildStructureChunk(**chunks)
|
||||
preview = []
|
||||
for parent_child in parent_childs.parent_child_chunks:
|
||||
|
||||
@ -166,7 +166,7 @@ class QAIndexProcessor(BaseIndexProcessor):
|
||||
docs.append(doc)
|
||||
return docs
|
||||
|
||||
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
|
||||
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
|
||||
qa_chunks = QAStructureChunk(**chunks)
|
||||
documents = []
|
||||
for qa_chunk in qa_chunks.qa_chunks:
|
||||
@ -189,7 +189,7 @@ class QAIndexProcessor(BaseIndexProcessor):
|
||||
else:
|
||||
raise ValueError("Indexing technique must be high quality.")
|
||||
|
||||
def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
|
||||
def format_preview(self, chunks: Any) -> Mapping[str, Any]:
|
||||
qa_chunks = QAStructureChunk(**chunks)
|
||||
preview = []
|
||||
for qa_chunk in qa_chunks.qa_chunks:
|
||||
|
||||
@ -174,7 +174,7 @@ class KnowledgeIndexNode(Node):
|
||||
"display_status": document.indexing_status,
|
||||
}
|
||||
|
||||
def _get_preview_output(self, chunk_structure: str, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
|
||||
def _get_preview_output(self, chunk_structure: str, chunks: Any) -> Mapping[str, Any]:
    """Format ``chunks`` for preview using the processor built for ``chunk_structure``.

    An ``IndexProcessorFactory`` is created from ``chunk_structure``; the
    processor it initialises produces the returned preview mapping.
    """
    factory = IndexProcessorFactory(chunk_structure)
    return factory.init_index_processor().format_preview(chunks)
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user