fix chunk format

This commit is contained in:
jyong 2025-08-29 16:27:22 +08:00
parent b3dbf9fe94
commit 3c0adfb48a
6 changed files with 42 additions and 38 deletions

View File

@@ -115,11 +115,10 @@ class File(BaseModel):
if self.related_id is None:
raise ValueError("Missing file related_id")
return helpers.get_signed_file_url(upload_file_id=self.related_id)
elif self.transfer_method == FileTransferMethod.TOOL_FILE:
elif self.transfer_method == FileTransferMethod.TOOL_FILE or self.transfer_method == FileTransferMethod.DATASOURCE_FILE:
assert self.related_id is not None
assert self.extension is not None
return sign_tool_file(tool_file_id=self.related_id, extension=self.extension)
def to_plugin_parameter(self) -> dict[str, Any]:
return {
"dify_model_identity": FILE_MODEL_IDENTITY,

View File

@@ -38,11 +38,11 @@ class BaseIndexProcessor(ABC):
raise NotImplementedError
@abstractmethod
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
raise NotImplementedError
@abstractmethod
def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
def format_preview(self, chunks: Any) -> Mapping[str, Any]:
raise NotImplementedError
@abstractmethod

View File

@@ -131,33 +131,37 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
docs.append(doc)
return docs
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
paragraph = GeneralStructureChunk(**chunks)
documents = []
for content in paragraph.general_chunks:
metadata = {
"dataset_id": dataset.id,
"document_id": document.id,
"doc_id": str(uuid.uuid4()),
"doc_hash": helper.generate_text_hash(content),
}
doc = Document(page_content=content, metadata=metadata)
documents.append(doc)
if documents:
# save node to document segment
doc_store = DatasetDocumentStore(dataset=dataset, user_id=document.created_by, document_id=document.id)
# add document segments
doc_store.add_documents(docs=documents, save_child=False)
if dataset.indexing_technique == "high_quality":
vector = Vector(dataset)
vector.create(documents)
elif dataset.indexing_technique == "economy":
keyword = Keyword(dataset)
keyword.add_texts(documents)
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
if isinstance(chunks, list):
documents = []
for content in chunks:
metadata = {
"dataset_id": dataset.id,
"document_id": document.id,
"doc_id": str(uuid.uuid4()),
"doc_hash": helper.generate_text_hash(content),
}
doc = Document(page_content=content, metadata=metadata)
documents.append(doc)
if documents:
# save node to document segment
doc_store = DatasetDocumentStore(dataset=dataset, user_id=document.created_by, document_id=document.id)
# add document segments
doc_store.add_documents(docs=documents, save_child=False)
if dataset.indexing_technique == "high_quality":
vector = Vector(dataset)
vector.create(documents)
elif dataset.indexing_technique == "economy":
keyword = Keyword(dataset)
keyword.add_texts(documents)
else:
raise ValueError("Chunks is not a list")
def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
paragraph = GeneralStructureChunk(**chunks)
preview = []
for content in paragraph.general_chunks:
preview.append({"content": content})
return {"preview": preview, "total_segments": len(paragraph.general_chunks)}
def format_preview(self, chunks: Any) -> Mapping[str, Any]:
if isinstance(chunks, list):
preview = []
for content in chunks:
preview.append({"content": content})
return {"preview": preview, "total_segments": len(chunks)}
else:
raise ValueError("Chunks is not a list")

View File

@@ -207,7 +207,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
child_nodes.append(child_document)
return child_nodes
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
parent_childs = ParentChildStructureChunk(**chunks)
documents = []
for parent_child in parent_childs.parent_child_chunks:
@@ -257,7 +257,8 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
vector = Vector(dataset)
vector.create(all_child_documents)
def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
def format_preview(self, chunks: Any) -> Mapping[str, Any]:
parent_childs = ParentChildStructureChunk(**chunks)
preview = []
for parent_child in parent_childs.parent_child_chunks:

View File

@@ -166,7 +166,7 @@ class QAIndexProcessor(BaseIndexProcessor):
docs.append(doc)
return docs
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Mapping[str, Any]):
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
qa_chunks = QAStructureChunk(**chunks)
documents = []
for qa_chunk in qa_chunks.qa_chunks:
@@ -189,7 +189,7 @@ class QAIndexProcessor(BaseIndexProcessor):
else:
raise ValueError("Indexing technique must be high quality.")
def format_preview(self, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
def format_preview(self, chunks: Any) -> Mapping[str, Any]:
qa_chunks = QAStructureChunk(**chunks)
preview = []
for qa_chunk in qa_chunks.qa_chunks:

View File

@@ -174,7 +174,7 @@ class KnowledgeIndexNode(Node):
"display_status": document.indexing_status,
}
def _get_preview_output(self, chunk_structure: str, chunks: Mapping[str, Any]) -> Mapping[str, Any]:
def _get_preview_output(self, chunk_structure: str, chunks: Any) -> Mapping[str, Any]:
index_processor = IndexProcessorFactory(chunk_structure).init_index_processor()
return index_processor.format_preview(chunks)