"""Service API endpoints for dataset document management. The canonical Service API paths use hyphenated route segments. Legacy underscore aliases remain registered for backward compatibility, but they must stay marked deprecated in generated API docs so clients migrate toward the canonical paths. """ import json from collections.abc import Mapping from contextlib import ExitStack from copy import deepcopy from typing import Annotated, Any, Literal, Self, override from uuid import UUID from flask import request, send_file from pydantic import BaseModel, Field, GetJsonSchemaHandler, WithJsonSchema, field_validator, model_validator from sqlalchemy import desc, func, select from werkzeug.exceptions import Forbidden, NotFound import services from controllers.common.controller_schemas import DocumentBatchDownloadZipPayload from controllers.common.errors import ( FilenameNotExistsError, FileTooLargeError, NoFileUploadedError, TooManyFilesError, UnsupportedFileTypeError, ) from controllers.common.fields import BinaryFileResponse, UrlResponse from controllers.common.schema import ( query_params_from_model, register_enum_models, register_response_schema_models, register_schema_models, ) from controllers.service_api import service_api_ns from controllers.service_api.app.error import ProviderNotInitializeError from controllers.service_api.dataset.error import ( ArchivedDocumentImmutableError, DocumentIndexingError, InvalidMetadataError, ) from controllers.service_api.schema import binary_response from controllers.service_api.wraps import ( DatasetApiResource, cloud_edition_billing_rate_limit_check, cloud_edition_billing_resource_check, ) from core.errors.error import ProviderTokenNotInitError from core.rag.entities import PreProcessingRule, Rule, Segmentation from core.rag.retrieval.retrieval_methods import RetrievalMethod from extensions.ext_database import db from fields.base import ResponseModel from fields.document_fields import ( DocumentListResponse, 
DocumentMetadataResponse, DocumentResponse, DocumentStatusListResponse, ) from libs.helper import dump_response from libs.login import current_user from models.dataset import Dataset, Document, DocumentSegment from models.enums import SegmentStatus from services.dataset_service import DatasetService, DocumentService from services.entities.knowledge_entities.knowledge_entities import ( DocForm, IndexingTechnique, KnowledgeConfig, ProcessRule, RetrievalModel, ) from services.file_service import FileService from services.summary_index_service import SummaryIndexService class DocumentTextCreatePayload(BaseModel): name: str = Field(description="Document name.") text: str = Field(description="Document text content.") process_rule: ProcessRule | None = Field(default=None, description="Processing rules for chunking.") original_document_id: str | None = Field(default=None, description="Original document ID for replacement.") doc_form: DocForm = Field( default="text_model", description=( "`text_model` for standard text chunking, `hierarchical_model` for parent-child chunk structure, " "`qa_model` for question-answer pair extraction." ), ) doc_language: str = Field(default="English", description="Language of the document for processing optimization.") indexing_technique: IndexingTechnique = Field( default=None, description=( "`high_quality` uses embedding models for precise search; `economy` uses keyword-based indexing. " "Required when adding the first document to a knowledge base; subsequent documents inherit the " "knowledge base's indexing technique if omitted." ), ) retrieval_model: RetrievalModel | None = Field( default=None, description="Retrieval model configuration. Controls how chunks are searched and ranked.", ) embedding_model: str | None = Field( default=None, description=( "Embedding model name. Use the `model` field from " "[Get Available Models](/api-reference/models/get-available-models) with `model_type=text-embedding`." 
), ) embedding_model_provider: str | None = Field( default=None, description=( "Embedding model provider. Use the `provider` field from " "[Get Available Models](/api-reference/models/get-available-models) with `model_type=text-embedding`." ), ) @field_validator("doc_form") @classmethod def validate_doc_form(cls, value: str) -> str: if value not in Dataset.DOC_FORM_LIST: raise ValueError("Invalid doc_form.") return value class DocumentTextUpdate(BaseModel): name: str | None = Field(default=None, description="Document name. Required when `text` is provided.") text: str | None = Field(default=None, description="Document text content.") process_rule: ProcessRule | None = Field(default=None, description="Processing rules for chunking.") doc_form: DocForm = Field( default="text_model", description=( "`text_model` for standard text chunking, `hierarchical_model` for parent-child chunk structure, " "`qa_model` for question-answer pair extraction." ), ) doc_language: str = Field(default="English", description="Language of the document for processing optimization.") retrieval_model: RetrievalModel | None = Field( default=None, description="Retrieval model configuration. 
Controls how chunks are searched and ranked.", ) @field_validator("doc_form") @classmethod def validate_doc_form(cls, value: str) -> str: if value not in Dataset.DOC_FORM_LIST: raise ValueError("Invalid doc_form.") return value @classmethod @override def __get_pydantic_json_schema__(cls, core_schema: Any, handler: GetJsonSchemaHandler) -> dict[str, Any]: schema = handler.resolve_ref_schema(handler(core_schema)) properties = schema.get("properties") if not isinstance(properties, dict): return schema text_branch_properties = deepcopy(properties) text_branch_properties["text"] = _non_null_property_schema(properties.get("text")) text_branch_properties["name"] = _non_null_property_schema(properties.get("name")) no_text_branch_properties = deepcopy(properties) no_text_branch_properties["text"] = {"description": "Document text content.", "type": "null"} return { **schema, "anyOf": [ { "properties": text_branch_properties, "required": ["name", "text"], "type": "object", }, { "properties": no_text_branch_properties, "type": "object", }, ], } @model_validator(mode="after") def check_text_and_name(self) -> Self: if self.text is not None and self.name is None: raise ValueError("name is required when text is provided") return self def _non_null_property_schema(property_schema: object) -> dict[str, Any]: if not isinstance(property_schema, dict): return {} any_of = property_schema.get("anyOf") if isinstance(any_of, list): non_null_candidates = [ candidate for candidate in any_of if isinstance(candidate, dict) and candidate.get("type") != "null" ] if len(non_null_candidates) == 1: return { **{key: value for key, value in property_schema.items() if key != "anyOf"}, **deepcopy(non_null_candidates[0]), } return deepcopy(property_schema) DocumentDisplayStatus = Annotated[ str | None, WithJsonSchema( { "anyOf": [ { "enum": ["queuing", "indexing", "paused", "error", "available", "disabled", "archived"], "type": "string", }, {"type": "null"}, ] } ), ] class DocumentListQuery(BaseModel): 
page: int = Field(default=1, description="Page number to retrieve.") limit: int = Field(default=20, description="Number of items per page. Server caps at `100`.") keyword: str | None = Field(default=None, description="Search keyword to filter by document name.") status: DocumentDisplayStatus = Field(default=None, description="Filter by display status.") class DocumentGetQuery(BaseModel): metadata: Literal["all", "only", "without"] = Field( default="all", description=( "`all` returns all fields including metadata. `only` returns only `id`, `doc_type`, and " "`doc_metadata`. `without` returns all fields except `doc_metadata`." ), ) DOCUMENT_CREATE_BY_FILE_PARAMS = { "dataset_id": "Knowledge base ID.", "file": { "in": "formData", "type": "file", "required": True, "description": "Document file to upload.", }, "data": { "in": "formData", "type": "string", "required": False, "description": ( "JSON string containing configuration. Accepts the same fields as " "[Create Document by Text](/api-reference/documents/create-document-by-text) (`indexing_technique`, " "`doc_form`, `doc_language`, `process_rule`, `retrieval_model`, `embedding_model`, " "`embedding_model_provider`) except `name` and `text`." ), }, } DOCUMENT_UPDATE_BY_FILE_PARAMS = { "dataset_id": "Knowledge base ID.", "document_id": "Document ID.", "file": { "in": "formData", "type": "file", "required": False, "description": "Replacement document file to upload.", }, "data": { "in": "formData", "type": "string", "required": False, "description": ( "JSON string containing document update settings such as `doc_form`, `doc_language`, `process_rule`, " "`retrieval_model`, `embedding_model`, and `embedding_model_provider`. `name` and `text` are not used " "for file updates." 
), }, } class DocumentAndBatchResponse(ResponseModel): document: DocumentResponse batch: str class DocumentDetailResponse(ResponseModel): id: str position: int | None = None data_source_type: str | None = None data_source_info: dict[str, Any] | None = Field(default=None) dataset_process_rule_id: str | None = None dataset_process_rule: dict[str, Any] | None = Field(default=None) document_process_rule: dict[str, Any] | None = Field(default=None) name: str | None = None created_from: str | None = None created_by: str | None = None created_at: int | None = None tokens: int | None = None indexing_status: str | None = None completed_at: int | None = None updated_at: int | None = None indexing_latency: float | None = None error: str | None = None enabled: bool | None = None disabled_at: int | None = None disabled_by: str | None = None archived: bool | None = None doc_type: str | None = None doc_metadata: list[DocumentMetadataResponse] | None = None segment_count: int | None = None average_segment_length: float | None = None hit_count: int | None = None display_status: str | None = None doc_form: str | None = None doc_language: str | None = None summary_index_status: str | None = None need_summary: bool | None = None register_enum_models(service_api_ns, RetrievalMethod) register_schema_models( service_api_ns, ProcessRule, RetrievalModel, DocumentTextCreatePayload, DocumentTextUpdate, DocumentListQuery, DocumentGetQuery, DocumentBatchDownloadZipPayload, Rule, PreProcessingRule, Segmentation, ) register_response_schema_models( service_api_ns, BinaryFileResponse, UrlResponse, DocumentResponse, DocumentAndBatchResponse, DocumentDetailResponse, DocumentListResponse, DocumentStatusListResponse, ) def _create_document_by_text(tenant_id: str, dataset_id: UUID) -> tuple[Mapping[str, object], int]: """Create a document from text for both canonical and legacy routes.""" payload = DocumentTextCreatePayload.model_validate(service_api_ns.payload or {}) args = 
payload.model_dump(exclude_none=True) dataset_id_str = str(dataset_id) tenant_id_str = str(tenant_id) dataset = db.session.scalar( select(Dataset).where(Dataset.tenant_id == tenant_id_str, Dataset.id == dataset_id_str).limit(1) ) if not dataset: raise ValueError("Dataset does not exist.") if not dataset.indexing_technique and not args.get("indexing_technique"): raise ValueError("indexing_technique is required.") embedding_model_provider = payload.embedding_model_provider embedding_model = payload.embedding_model if embedding_model_provider and embedding_model: DatasetService.check_embedding_model_setting(tenant_id_str, embedding_model_provider, embedding_model) retrieval_model = payload.retrieval_model if ( retrieval_model and retrieval_model.reranking_model and retrieval_model.reranking_model.reranking_provider_name and retrieval_model.reranking_model.reranking_model_name ): DatasetService.check_reranking_model_setting( tenant_id_str, retrieval_model.reranking_model.reranking_provider_name, retrieval_model.reranking_model.reranking_model_name, ) if not current_user: raise ValueError("current_user is required") upload_file = FileService(db.engine).upload_text( text=payload.text, text_name=payload.name, user_id=current_user.id, tenant_id=tenant_id_str ) data_source = { "type": "upload_file", "info_list": {"data_source_type": "upload_file", "file_info_list": {"file_ids": [upload_file.id]}}, } args["data_source"] = data_source knowledge_config = KnowledgeConfig.model_validate(args) DocumentService.document_create_args_validate(knowledge_config) if not current_user: raise ValueError("current_user is required") try: documents, batch = DocumentService.save_document_with_dataset_id( dataset=dataset, knowledge_config=knowledge_config, account=current_user, dataset_process_rule=dataset.latest_process_rule if "process_rule" not in args else None, created_from="api", ) except ProviderTokenNotInitError as ex: raise ProviderNotInitializeError(ex.description) document = 
documents[0] return dump_response(DocumentAndBatchResponse, {"document": document, "batch": batch}), 200 def _update_document_by_text(tenant_id: str, dataset_id: UUID, document_id: UUID) -> tuple[Mapping[str, object], int]: """Update a document from text for both canonical and legacy routes.""" payload = DocumentTextUpdate.model_validate(service_api_ns.payload or {}) dataset = db.session.scalar( select(Dataset).where(Dataset.tenant_id == tenant_id, Dataset.id == str(dataset_id)).limit(1) ) args = payload.model_dump(exclude_none=True) if not dataset: raise ValueError("Dataset does not exist.") retrieval_model = payload.retrieval_model if ( retrieval_model and retrieval_model.reranking_model and retrieval_model.reranking_model.reranking_provider_name and retrieval_model.reranking_model.reranking_model_name ): DatasetService.check_reranking_model_setting( tenant_id, retrieval_model.reranking_model.reranking_provider_name, retrieval_model.reranking_model.reranking_model_name, ) # indexing_technique is already set in dataset since this is an update args["indexing_technique"] = dataset.indexing_technique if args.get("text"): text = args.get("text") name = args.get("name") if not current_user: raise ValueError("current_user is required") upload_file = FileService(db.engine).upload_text( text=str(text), text_name=str(name), user_id=current_user.id, tenant_id=tenant_id ) data_source = { "type": "upload_file", "info_list": {"data_source_type": "upload_file", "file_info_list": {"file_ids": [upload_file.id]}}, } args["data_source"] = data_source args["original_document_id"] = str(document_id) knowledge_config = KnowledgeConfig.model_validate(args) DocumentService.document_create_args_validate(knowledge_config) try: documents, batch = DocumentService.save_document_with_dataset_id( dataset=dataset, knowledge_config=knowledge_config, account=current_user, dataset_process_rule=dataset.latest_process_rule if "process_rule" not in args else None, created_from="api", ) except 
ProviderTokenNotInitError as ex: raise ProviderNotInitializeError(ex.description) document = documents[0] return dump_response(DocumentAndBatchResponse, {"document": document, "batch": batch}), 200 @service_api_ns.route("/datasets//document/create-by-text") class DocumentAddByTextApi(DatasetApiResource): """Resource for the canonical text document creation route.""" @service_api_ns.doc( summary="Create Document by Text", description=( "Create a document from raw text content. The document is processed asynchronously — use the " "returned `batch` ID with [Get Document Indexing Status](/api-reference/documents/" "get-document-indexing-status) to track progress." ), tags=["Documents"], responses={ 200: "Document created successfully.", 400: ( "- `provider_not_initialize` : No valid model provider credentials found. Please go to " "Settings -> Model Provider to complete your provider credentials.\n" "- `invalid_param` : Knowledge base does not exist. / indexing_technique is required. / " "Invalid doc_form (must be `text_model`, `hierarchical_model`, or `qa_model`)." 
), }, ) @service_api_ns.expect(service_api_ns.models[DocumentTextCreatePayload.__name__]) @service_api_ns.doc("create_document_by_text") @service_api_ns.doc(description="Create a new document by providing text content") @service_api_ns.doc(params={"dataset_id": "Knowledge base ID."}) @service_api_ns.doc( responses={ 200: "Document created successfully", 401: "Unauthorized - invalid API token", 400: "Bad request - invalid parameters", } ) @service_api_ns.response( 200, "Document created successfully", service_api_ns.models[DocumentAndBatchResponse.__name__] ) @cloud_edition_billing_resource_check("vector_space", "dataset") @cloud_edition_billing_resource_check("documents", "dataset") @cloud_edition_billing_rate_limit_check("knowledge", "dataset") def post(self, tenant_id: str, dataset_id: UUID): """Create document by text.""" return _create_document_by_text(tenant_id=tenant_id, dataset_id=dataset_id) @service_api_ns.route("/datasets//document/create_by_text") class DeprecatedDocumentAddByTextApi(DatasetApiResource): """Deprecated resource alias for text document creation.""" @service_api_ns.expect(service_api_ns.models[DocumentTextCreatePayload.__name__]) @service_api_ns.doc("create_document_by_text_deprecated") @service_api_ns.doc(deprecated=True) @service_api_ns.doc( description=( "Deprecated legacy alias for creating a new document by providing text content. " "Use /datasets/{dataset_id}/document/create-by-text instead." 
) ) @service_api_ns.doc(params={"dataset_id": "Knowledge base ID."}) @service_api_ns.doc( responses={ 200: "Document created successfully", 401: "Unauthorized - invalid API token", 400: "Bad request - invalid parameters", } ) @service_api_ns.response( 200, "Document created successfully", service_api_ns.models[DocumentAndBatchResponse.__name__] ) @cloud_edition_billing_resource_check("vector_space", "dataset") @cloud_edition_billing_resource_check("documents", "dataset") @cloud_edition_billing_rate_limit_check("knowledge", "dataset") def post(self, tenant_id: str, dataset_id: UUID): """Create document by text through the deprecated underscore alias.""" return _create_document_by_text(tenant_id=tenant_id, dataset_id=dataset_id) @service_api_ns.route("/datasets//documents//update-by-text") class DocumentUpdateByTextApi(DatasetApiResource): """Resource for the canonical text document update route.""" @service_api_ns.doc( summary="Update Document by Text", description=( "Update an existing document's text content, name, or processing configuration. Re-triggers " "indexing if content changes — use the returned `batch` ID with [Get Document Indexing " "Status](/api-reference/documents/get-document-indexing-status) to track progress." ), tags=["Documents"], responses={ 200: "Document updated successfully.", 400: ( "- `provider_not_initialize` : No valid model provider credentials found. Please go to " "Settings -> Model Provider to complete your provider credentials.\n" "- `invalid_param` : Knowledge base does not exist, name is required when text is " "provided, or invalid doc_form (must be `text_model`, `hierarchical_model`, or " "`qa_model`)." 
), }, ) @service_api_ns.expect(service_api_ns.models[DocumentTextUpdate.__name__]) @service_api_ns.doc("update_document_by_text") @service_api_ns.doc(description="Update an existing document by providing text content") @service_api_ns.doc(params={"dataset_id": "Knowledge base ID.", "document_id": "Document ID."}) @service_api_ns.doc( responses={ 200: "Document updated successfully", 401: "Unauthorized - invalid API token", 404: "Document not found", } ) @service_api_ns.response( 200, "Document updated successfully", service_api_ns.models[DocumentAndBatchResponse.__name__] ) @cloud_edition_billing_resource_check("vector_space", "dataset") @cloud_edition_billing_rate_limit_check("knowledge", "dataset") def post(self, tenant_id: str, dataset_id: UUID, document_id: UUID): """Update document by text.""" return _update_document_by_text(tenant_id=tenant_id, dataset_id=dataset_id, document_id=document_id) @service_api_ns.route("/datasets//documents//update_by_text") class DeprecatedDocumentUpdateByTextApi(DatasetApiResource): """Deprecated resource alias for text document updates.""" @service_api_ns.expect(service_api_ns.models[DocumentTextUpdate.__name__]) @service_api_ns.doc("update_document_by_text_deprecated") @service_api_ns.doc(deprecated=True) @service_api_ns.doc( description=( "Deprecated legacy alias for updating an existing document by providing text content. " "Use /datasets/{dataset_id}/documents/{document_id}/update-by-text instead." 
) ) @service_api_ns.doc(params={"dataset_id": "Knowledge base ID.", "document_id": "Document ID."}) @service_api_ns.doc( responses={ 200: "Document updated successfully", 401: "Unauthorized - invalid API token", 404: "Document not found", } ) @service_api_ns.response( 200, "Document updated successfully", service_api_ns.models[DocumentAndBatchResponse.__name__] ) @cloud_edition_billing_resource_check("vector_space", "dataset") @cloud_edition_billing_rate_limit_check("knowledge", "dataset") def post(self, tenant_id: str, dataset_id: UUID, document_id: UUID): """Update document by text through the deprecated underscore alias.""" return _update_document_by_text(tenant_id=tenant_id, dataset_id=dataset_id, document_id=document_id) @service_api_ns.route( "/datasets//document/create_by_file", doc={ "post": { "deprecated": True, "description": ( "Deprecated legacy alias for creating a new document by uploading a file. " "Use /datasets/{dataset_id}/document/create-by-file instead." ), } }, ) @service_api_ns.route("/datasets//document/create-by-file") class DocumentAddByFileApi(DatasetApiResource): """Resource for documents.""" @service_api_ns.doc( summary="Create Document by File", description=( "Create a document by uploading a file. Supports common document formats (PDF, TXT, DOCX, " "etc.). Processing is asynchronous — use the returned `batch` ID with [Get Document " "Indexing Status](/api-reference/documents/get-document-indexing-status) to track progress." ), tags=["Documents"], responses={ 200: "Document created successfully.", 400: ( "- `no_file_uploaded` : Please upload your file.\n" "- `too_many_files` : Only one file is allowed.\n" "- `filename_not_exists_error` : The specified filename does not exist.\n" "- `provider_not_initialize` : No valid model provider credentials found. 
Please go to " "Settings -> Model Provider to complete your provider credentials.\n" "- `invalid_param` : Knowledge base does not exist, external datasets not supported, " "file too large, unsupported file type, missing required fields, or invalid doc_form " "(must be `text_model`, `hierarchical_model`, or `qa_model`)." ), }, ) @service_api_ns.doc("create_document_by_file") @service_api_ns.doc(description="Create a new document by uploading a file") @service_api_ns.doc(consumes=["multipart/form-data"], params=DOCUMENT_CREATE_BY_FILE_PARAMS) @service_api_ns.doc( responses={ 200: "Document created successfully", 401: "Unauthorized - invalid API token", 400: "Bad request - invalid file or parameters", } ) @service_api_ns.response( 200, "Document created successfully", service_api_ns.models[DocumentAndBatchResponse.__name__] ) @cloud_edition_billing_resource_check("vector_space", "dataset") @cloud_edition_billing_resource_check("documents", "dataset") @cloud_edition_billing_rate_limit_check("knowledge", "dataset") def post(self, tenant_id, dataset_id: UUID): """Create document by upload file.""" dataset = db.session.scalar( select(Dataset).where(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).limit(1) ) if not dataset: raise ValueError("Dataset does not exist.") if dataset.provider == "external": raise ValueError("External datasets are not supported.") args = {} if "data" in request.form: args = json.loads(request.form["data"]) if "doc_form" not in args: args["doc_form"] = dataset.chunk_structure or "text_model" if "doc_language" not in args: args["doc_language"] = "English" # get dataset info tenant_id = str(tenant_id) indexing_technique = args.get("indexing_technique") or dataset.indexing_technique if not indexing_technique: raise ValueError("indexing_technique is required.") args["indexing_technique"] = indexing_technique if "embedding_model_provider" in args: DatasetService.check_embedding_model_setting( tenant_id, args["embedding_model_provider"], 
args["embedding_model"] ) if ( "retrieval_model" in args and args["retrieval_model"].get("reranking_model") and args["retrieval_model"].get("reranking_model").get("reranking_provider_name") ): DatasetService.check_reranking_model_setting( tenant_id, args["retrieval_model"].get("reranking_model").get("reranking_provider_name"), args["retrieval_model"].get("reranking_model").get("reranking_model_name"), ) # check file if "file" not in request.files: raise NoFileUploadedError() if len(request.files) > 1: raise TooManyFilesError() # save file info file = request.files["file"] if not file.filename: raise FilenameNotExistsError if not current_user: raise ValueError("current_user is required") upload_file = FileService(db.engine).upload_file( filename=file.filename, content=file.stream.read(), mimetype=file.mimetype, user=current_user, source="datasets", ) data_source = { "type": "upload_file", "info_list": {"data_source_type": "upload_file", "file_info_list": {"file_ids": [upload_file.id]}}, } args["data_source"] = data_source # validate args knowledge_config = KnowledgeConfig.model_validate(args) DocumentService.document_create_args_validate(knowledge_config) dataset_process_rule = dataset.latest_process_rule if "process_rule" not in args else None if not knowledge_config.original_document_id and not dataset_process_rule and not knowledge_config.process_rule: raise ValueError("process_rule is required.") try: documents, batch = DocumentService.save_document_with_dataset_id( dataset=dataset, knowledge_config=knowledge_config, account=dataset.created_by_account, dataset_process_rule=dataset_process_rule, created_from="api", ) except ProviderTokenNotInitError as ex: raise ProviderNotInitializeError(ex.description) document = documents[0] return dump_response(DocumentAndBatchResponse, {"document": document, "batch": batch}), 200 def _update_document_by_file(tenant_id: str, dataset_id: UUID, document_id: UUID) -> tuple[Mapping[str, object], int]: """Update a document from an 
uploaded file for canonical and deprecated routes.""" dataset_id_str = str(dataset_id) tenant_id_str = str(tenant_id) dataset = db.session.scalar( select(Dataset).where(Dataset.tenant_id == tenant_id_str, Dataset.id == dataset_id_str).limit(1) ) if not dataset: raise ValueError("Dataset does not exist.") if dataset.provider == "external": raise ValueError("External datasets are not supported.") args: dict[str, object] = {} if "data" in request.form: args = json.loads(request.form["data"]) if "doc_form" not in args: args["doc_form"] = dataset.chunk_structure or "text_model" if "doc_language" not in args: args["doc_language"] = "English" # indexing_technique is already set in dataset since this is an update args["indexing_technique"] = dataset.indexing_technique if "file" in request.files: # save file info file = request.files["file"] if len(request.files) > 1: raise TooManyFilesError() if not file.filename: raise FilenameNotExistsError if not current_user: raise ValueError("current_user is required") try: upload_file = FileService(db.engine).upload_file( filename=file.filename, content=file.stream.read(), mimetype=file.mimetype, user=current_user, source="datasets", ) except services.errors.file.FileTooLargeError as file_too_large_error: raise FileTooLargeError(file_too_large_error.description) except services.errors.file.UnsupportedFileTypeError: raise UnsupportedFileTypeError() data_source = { "type": "upload_file", "info_list": {"data_source_type": "upload_file", "file_info_list": {"file_ids": [upload_file.id]}}, } args["data_source"] = data_source # validate args args["original_document_id"] = str(document_id) knowledge_config = KnowledgeConfig.model_validate(args) DocumentService.document_create_args_validate(knowledge_config) try: documents, _ = DocumentService.save_document_with_dataset_id( dataset=dataset, knowledge_config=knowledge_config, account=dataset.created_by_account, dataset_process_rule=dataset.latest_process_rule if "process_rule" not in args else 
None, created_from="api", ) except ProviderTokenNotInitError as ex: raise ProviderNotInitializeError(ex.description) document = documents[0] return dump_response(DocumentAndBatchResponse, {"document": document, "batch": document.batch}), 200 @service_api_ns.route( "/datasets//documents//update_by_file", "/datasets//documents//update-by-file", ) class DeprecatedDocumentUpdateByFileApi(DatasetApiResource): """Deprecated resource aliases for file document updates.""" @service_api_ns.doc( summary="Update Document by File", description=( "Update an existing document by uploading a new file. Re-triggers indexing — use the returned " "`batch` ID with [Get Document Indexing Status](/api-reference/documents/" "get-document-indexing-status) to track progress." ), tags=["Documents"], responses={ 200: "Document updated successfully.", 400: ( "- `too_many_files` : Only one file is allowed.\n" "- `filename_not_exists_error` : The specified filename does not exist.\n" "- `provider_not_initialize` : No valid model provider credentials found. Please go to " "Settings -> Model Provider to complete your provider credentials.\n" "- `invalid_param` : Knowledge base does not exist, external datasets not supported, " "file too large, unsupported file type, or invalid doc_form (must be `text_model`, " "`hierarchical_model`, or `qa_model`)." ), }, ) @service_api_ns.doc("update_document_by_file_deprecated") @service_api_ns.doc(deprecated=True) @service_api_ns.doc( description=( "Deprecated legacy alias for updating an existing document by uploading a file. " "Use PATCH /datasets/{dataset_id}/documents/{document_id} instead." 
) ) @service_api_ns.doc(consumes=["multipart/form-data"], params=DOCUMENT_UPDATE_BY_FILE_PARAMS) @service_api_ns.doc( responses={ 200: "Document updated successfully", 401: "Unauthorized - invalid API token", 404: "Document not found", } ) @service_api_ns.response( 200, "Document updated successfully", service_api_ns.models[DocumentAndBatchResponse.__name__] ) @cloud_edition_billing_resource_check("vector_space", "dataset") @cloud_edition_billing_rate_limit_check("knowledge", "dataset") def post(self, tenant_id: str, dataset_id: UUID, document_id: UUID): """Update document by file through the deprecated file-update aliases.""" return _update_document_by_file(tenant_id=tenant_id, dataset_id=dataset_id, document_id=document_id) @service_api_ns.route("/datasets//documents") class DocumentListApi(DatasetApiResource): @service_api_ns.doc( summary="List Documents", description=( "Returns a paginated list of documents in the knowledge base. Supports filtering by keyword " "and indexing status." 
), tags=["Documents"], responses={ 200: "List of documents.", 404: "`not_found` : Knowledge base not found.", }, ) @service_api_ns.doc("list_documents") @service_api_ns.doc(description="List all documents in a dataset") @service_api_ns.doc(params={"dataset_id": "Knowledge base ID.", **query_params_from_model(DocumentListQuery)}) @service_api_ns.doc( responses={ 200: "Documents retrieved successfully", 401: "Unauthorized - invalid API token", 404: "Dataset not found", } ) @service_api_ns.response( 200, "Documents retrieved successfully", service_api_ns.models[DocumentListResponse.__name__] ) def get(self, tenant_id, dataset_id: UUID): dataset_id_str = str(dataset_id) tenant_id = str(tenant_id) query_params = DocumentListQuery.model_validate(request.args.to_dict()) dataset = db.session.scalar( select(Dataset).where(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id_str).limit(1) ) if not dataset: raise NotFound("Dataset not found.") query = select(Document).where(Document.dataset_id == dataset_id_str, Document.tenant_id == tenant_id) if query_params.status: query = DocumentService.apply_display_status_filter(query, query_params.status) if query_params.keyword: search = f"%{query_params.keyword}%" query = query.where(Document.name.like(search)) query = query.order_by(desc(Document.created_at), desc(Document.position)) paginated_documents = db.paginate( select=query, page=query_params.page, per_page=query_params.limit, max_per_page=100, error_out=False ) documents = paginated_documents.items DocumentService.enrich_documents_with_summary_index_status( documents=documents, dataset=dataset, tenant_id=tenant_id, ) response = { "data": documents, "has_more": len(documents) == query_params.limit, "limit": query_params.limit, "total": paginated_documents.total, "page": query_params.page, } return dump_response(DocumentListResponse, response) @service_api_ns.route("/datasets//documents/download-zip") class DocumentBatchDownloadZipApi(DatasetApiResource): """Download 
@service_api_ns.route("/datasets/<uuid:dataset_id>/documents/download-zip")
class DocumentBatchDownloadZipApi(DatasetApiResource):
    """Download multiple uploaded-file documents as a single ZIP archive."""

    @service_api_ns.doc(
        summary="Download Documents as ZIP",
        description=(
            "Download multiple uploaded-file documents as a single ZIP archive. Accepts up to `100` document IDs."
        ),
        tags=["Documents"],
        responses={
            200: "ZIP archive containing the requested documents.",
            403: "`forbidden` : Insufficient permissions.",
            404: "`not_found` : Document or dataset not found.",
        },
    )
    @binary_response(service_api_ns, "application/zip")
    @service_api_ns.expect(service_api_ns.models[DocumentBatchDownloadZipPayload.__name__])
    @service_api_ns.doc("download_documents_as_zip")
    @service_api_ns.doc(description="Download selected uploaded documents as a single ZIP archive")
    @service_api_ns.doc(params={"dataset_id": "Knowledge base ID."})
    @service_api_ns.doc(
        responses={
            200: "ZIP archive generated successfully",
            401: "Unauthorized - invalid API token",
            403: "Forbidden - insufficient permissions",
            404: "Document or dataset not found",
        }
    )
    @service_api_ns.response(200, "ZIP archive generated successfully")
    @cloud_edition_billing_rate_limit_check("knowledge", "dataset")
    def post(self, tenant_id, dataset_id: UUID):
        # Build a ZIP of the requested documents' upload files and send it as
        # an attachment. The request body is validated through
        # DocumentBatchDownloadZipPayload; resolving the files and the archive
        # name is delegated to DocumentService.
        payload = DocumentBatchDownloadZipPayload.model_validate(service_api_ns.payload or {})

        upload_files, download_name = DocumentService.prepare_document_batch_download_zip(
            dataset_id=str(dataset_id),
            document_ids=[str(document_id) for document_id in payload.document_ids],
            tenant_id=str(tenant_id),
            current_user=current_user,
        )

        with ExitStack() as stack:
            zip_path = stack.enter_context(FileService.build_upload_files_zip_tempfile(upload_files=upload_files))
            response = send_file(
                zip_path,
                mimetype="application/zip",
                as_attachment=True,
                download_name=download_name,
            )
            # Move the temp-file cleanup off this `with` block and onto the
            # response: the archive has to outlive this function and is only
            # released once the response is closed. If send_file raises, the
            # stack still unwinds here and cleans up immediately.
            cleanup = stack.pop_all()
            response.call_on_close(cleanup.close)

        return response
@service_api_ns.route("/datasets/<uuid:dataset_id>/documents/<string:batch>/indexing-status")
class DocumentIndexingStatusApi(DatasetApiResource):
    # Reports indexing progress for every document created in one batch.
    # Handler notes are written as `#` comments rather than docstrings so the
    # generated API documentation is not affected by them.

    @service_api_ns.doc(
        summary="Get Document Indexing Status",
        description=(
            "Check the indexing progress of documents in a batch. Returns the current processing stage "
            "and chunk completion counts for each document. Poll this endpoint until `indexing_status` "
            "reaches `completed` or `error`. The status progresses through: `waiting` → `parsing` → "
            "`cleaning` → `splitting` → `indexing` → `completed`."
        ),
        tags=["Documents"],
        responses={
            200: "Indexing status for documents in the batch.",
            404: "`not_found` : Knowledge base not found. / Documents not found.",
        },
    )
    @service_api_ns.doc("get_document_indexing_status")
    @service_api_ns.doc(description="Get indexing status for documents in a batch")
    @service_api_ns.doc(params={"dataset_id": "Knowledge base ID.", "batch": "Batch ID."})
    @service_api_ns.doc(
        responses={
            200: "Indexing status retrieved successfully",
            401: "Unauthorized - invalid API token",
            404: "Dataset or documents not found",
        }
    )
    @service_api_ns.response(
        200,
        "Indexing status retrieved successfully",
        service_api_ns.models[DocumentStatusListResponse.__name__],
    )
    def get(self, tenant_id, dataset_id: UUID, batch: str):
        # Return the indexing status and segment counts of each document in
        # `batch`. Raises NotFound when the dataset does not exist for the
        # tenant or when the batch contains no documents.
        dataset_id_str = str(dataset_id)
        tenant_id = str(tenant_id)

        dataset = db.session.scalar(
            select(Dataset).where(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id_str).limit(1)
        )
        if not dataset:
            raise NotFound("Dataset not found.")

        documents = DocumentService.get_batch_documents(dataset_id_str, batch)
        if not documents:
            raise NotFound("Documents not found.")

        # One grouped query for the whole batch instead of two COUNT queries
        # per document. COUNT(completed_at) counts only non-NULL values, i.e.
        # the segments that have finished; segments being re-segmented are
        # excluded from both figures.
        document_ids = [str(document.id) for document in documents]
        count_rows = db.session.execute(
            select(
                DocumentSegment.document_id,
                func.count(DocumentSegment.id),
                func.count(DocumentSegment.completed_at),
            )
            .where(
                DocumentSegment.document_id.in_(document_ids),
                DocumentSegment.status != SegmentStatus.RE_SEGMENT,
            )
            .group_by(DocumentSegment.document_id)
        ).all()
        segment_counts = {
            str(segment_document_id): (total, completed) for segment_document_id, total, completed in count_rows
        }

        documents_status = []
        for document in documents:
            # Documents without any counted segment have no row in the result.
            total_segments, completed_segments = segment_counts.get(str(document.id), (0, 0))
            documents_status.append(
                {
                    "id": document.id,
                    # A paused document reports "paused" regardless of the stage it stopped in.
                    "indexing_status": "paused" if document.is_paused else document.indexing_status,
                    "processing_started_at": document.processing_started_at,
                    "parsing_completed_at": document.parsing_completed_at,
                    "cleaning_completed_at": document.cleaning_completed_at,
                    "splitting_completed_at": document.splitting_completed_at,
                    "completed_at": document.completed_at,
                    "paused_at": document.paused_at,
                    "error": document.error,
                    "stopped_at": document.stopped_at,
                    "completed_segments": completed_segments,
                    "total_segments": total_segments,
                }
            )

        return dump_response(DocumentStatusListResponse, {"data": documents_status})
@service_api_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/download")
class DocumentDownloadApi(DatasetApiResource):
    """Return a signed download URL for a document's original uploaded file."""

    @service_api_ns.doc(
        summary="Download Document",
        description="Get a signed download URL for a document's original uploaded file.",
        tags=["Documents"],
        responses={
            200: "Download URL generated successfully.",
            403: "`forbidden` : No permission to access this document.",
            404: "`not_found` : Document not found.",
        },
    )
    @service_api_ns.doc("get_document_download_url")
    @service_api_ns.doc(description="Get a signed download URL for a document's original uploaded file")
    @service_api_ns.doc(params={"dataset_id": "Knowledge base ID.", "document_id": "Document ID."})
    @service_api_ns.doc(
        responses={
            200: "Download URL generated successfully",
            401: "Unauthorized - invalid API token",
            403: "Forbidden - insufficient permissions",
            404: "Document or upload file not found",
        }
    )
    @service_api_ns.response(
        200,
        "Download URL generated successfully",
        service_api_ns.models[UrlResponse.__name__],
    )
    @cloud_edition_billing_rate_limit_check("knowledge", "dataset")
    def get(self, tenant_id, dataset_id: UUID, document_id: UUID):
        # Resolve the document inside the tenant's dataset and return
        # {"url": ...} as produced by DocumentService.get_document_download_url.
        # Raises NotFound when the document is missing and Forbidden when it
        # belongs to a different tenant.
        tenant_id_str = str(tenant_id)

        dataset = self.get_dataset(str(dataset_id), tenant_id_str)
        document = DocumentService.get_document(dataset.id, str(document_id))
        if not document:
            raise NotFound("Document not found.")
        if document.tenant_id != tenant_id_str:
            raise Forbidden("No permission.")

        return {"url": DocumentService.get_document_download_url(document)}
@service_api_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
class DocumentApi(DatasetApiResource):
    # Single-document resource: read details (GET), replace content by file
    # (PATCH) and delete (DELETE). Notes on the HTTP handlers are written as
    # `#` comments so the generated API documentation is not affected by them.

    # Accepted values of the `metadata` query parameter on GET.
    METADATA_CHOICES = {"all", "only", "without"}

    @staticmethod
    def _build_document_detail(
        document,
        *,
        dataset_process_rules,
        summary_index_status,
        include_metadata: bool,
    ) -> dict[str, Any]:
        """Build the detail payload shared by the `all` and `without` metadata modes.

        Args:
            document: The document being serialized.
            dataset_process_rules: Result of ``DatasetService.get_process_rules`` for the dataset.
            summary_index_status: Summary index status, or ``None`` when not applicable.
            include_metadata: When true, ``doc_type`` and ``doc_metadata`` are included.

        Returns:
            The response dictionary for the document.
        """
        document_process_rules = document.dataset_process_rule.to_dict() if document.dataset_process_rule else {}
        detail: dict[str, Any] = {
            "id": document.id,
            "position": document.position,
            "data_source_type": document.data_source_type,
            "data_source_info": document.data_source_detail_dict,
            "dataset_process_rule_id": document.dataset_process_rule_id,
            "dataset_process_rule": dataset_process_rules,
            "document_process_rule": document_process_rules,
            "name": document.name,
            "created_from": document.created_from,
            "created_by": document.created_by,
            "created_at": int(document.created_at.timestamp()),
            "tokens": document.tokens,
            "indexing_status": document.indexing_status,
            "completed_at": int(document.completed_at.timestamp()) if document.completed_at else None,
            "updated_at": int(document.updated_at.timestamp()) if document.updated_at else None,
            "indexing_latency": document.indexing_latency,
            "error": document.error,
            "enabled": document.enabled,
            "disabled_at": int(document.disabled_at.timestamp()) if document.disabled_at else None,
            "disabled_by": document.disabled_by,
            "archived": document.archived,
        }
        if include_metadata:
            # Only read the metadata attributes when they are actually returned.
            detail["doc_type"] = document.doc_type
            detail["doc_metadata"] = document.doc_metadata_details
        detail.update(
            {
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
                "summary_index_status": summary_index_status,
                "need_summary": document.need_summary if document.need_summary is not None else False,
            }
        )
        return detail

    @service_api_ns.doc(
        summary="Get Document",
        description=(
            "Retrieve detailed information about a specific document, including its indexing status, "
            "metadata, and processing statistics."
        ),
        tags=["Documents"],
        responses={
            200: (
                "Document details. The response shape varies based on the `metadata` query parameter. When "
                "`metadata` is `only`, only `id`, `doc_type`, and `doc_metadata` are returned. When "
                "`metadata` is `without`, `doc_type` and `doc_metadata` are omitted."
            ),
            400: "`invalid_metadata` : Invalid metadata value for the specified key.",
            403: "`forbidden` : No permission.",
            404: "`not_found` : Document not found.",
        },
    )
    @service_api_ns.doc("get_document")
    @service_api_ns.doc(description="Get a specific document by ID")
    @service_api_ns.doc(params={"dataset_id": "Knowledge base ID.", "document_id": "Document ID."})
    @service_api_ns.doc(params=query_params_from_model(DocumentGetQuery))
    @service_api_ns.doc(
        responses={
            200: "Document retrieved successfully",
            401: "Unauthorized - invalid API token",
            403: "Forbidden - insufficient permissions",
            404: "Document not found",
        }
    )
    @service_api_ns.response(
        200,
        "Document retrieved successfully",
        service_api_ns.models[DocumentDetailResponse.__name__],
    )
    def get(self, tenant_id, dataset_id: UUID, document_id: UUID):
        # Return document details shaped by the `metadata` query parameter:
        #   "only"    -> id, doc_type and doc_metadata
        #   "without" -> full details minus doc_type / doc_metadata
        #   "all"     -> full details (default)
        # Raises NotFound for a missing document, Forbidden for a document of
        # another tenant, and InvalidMetadataError for an unknown `metadata`.
        dataset_id_str = str(dataset_id)
        document_id_str = str(document_id)
        tenant_id = str(tenant_id)

        dataset = self.get_dataset(dataset_id_str, tenant_id)
        document = DocumentService.get_document(dataset.id, document_id_str)
        if not document:
            raise NotFound("Document not found.")
        if document.tenant_id != tenant_id:
            raise Forbidden("No permission.")

        metadata = request.args.get("metadata", "all")
        if metadata not in self.METADATA_CHOICES:
            raise InvalidMetadataError(f"Invalid metadata value: {metadata}")

        if metadata == "only":
            # This shape carries no summary or process-rule data, so skip those lookups.
            return {"id": document.id, "doc_type": document.doc_type, "doc_metadata": document.doc_metadata_details}

        # The summary index status is looked up only when the dataset has the
        # summary index enabled and this document is flagged as needing one.
        summary_index_status = None
        has_summary_index = dataset.summary_index_setting and dataset.summary_index_setting.get("enable") is True
        if has_summary_index and document.need_summary is True:
            summary_index_status = SummaryIndexService.get_document_summary_index_status(
                document_id=document_id_str,
                dataset_id=dataset_id_str,
                tenant_id=tenant_id,
            )

        return self._build_document_detail(
            document,
            dataset_process_rules=DatasetService.get_process_rules(dataset_id_str),
            summary_index_status=summary_index_status,
            include_metadata=metadata != "without",
        )

    @service_api_ns.doc("update_document_by_file")
    @service_api_ns.doc(description="Update an existing document by uploading a file")
    @service_api_ns.doc(consumes=["multipart/form-data"], params=DOCUMENT_UPDATE_BY_FILE_PARAMS)
    @service_api_ns.doc(
        responses={
            200: "Document updated successfully",
            401: "Unauthorized - invalid API token",
            404: "Document not found",
        }
    )
    @service_api_ns.response(
        200, "Document updated successfully", service_api_ns.models[DocumentAndBatchResponse.__name__]
    )
    @cloud_edition_billing_resource_check("vector_space", "dataset")
    @cloud_edition_billing_rate_limit_check("knowledge", "dataset")
    def patch(self, tenant_id: str, dataset_id: UUID, document_id: UUID):
        """Update document by file on the canonical document resource."""
        return _update_document_by_file(tenant_id=tenant_id, dataset_id=dataset_id, document_id=document_id)

    @service_api_ns.doc(
        summary="Delete Document",
        description="Permanently delete a document and all its chunks from the knowledge base.",
        tags=["Documents"],
        responses={
            204: "Success.",
            400: "`document_indexing` : Cannot delete document during indexing.",
            403: "`archived_document_immutable` : The archived document is not editable.",
            404: "`not_found` : Document Not Exists.",
        },
    )
    @service_api_ns.doc("delete_document")
    @service_api_ns.doc(description="Delete a document")
    @service_api_ns.doc(params={"dataset_id": "Knowledge base ID.", "document_id": "Document ID."})
    @service_api_ns.doc(
        responses={
            204: "Document deleted successfully",
            401: "Unauthorized - invalid API token",
            403: "Forbidden - document is archived",
            404: "Document not found",
        }
    )
    @cloud_edition_billing_rate_limit_check("knowledge", "dataset")
    def delete(self, tenant_id, dataset_id: UUID, document_id: UUID):
        """Delete document."""
        document_id_str = str(document_id)
        dataset_id_str = str(dataset_id)
        tenant_id = str(tenant_id)

        dataset = db.session.scalar(
            select(Dataset).where(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id_str).limit(1)
        )
        if not dataset:
            # A missing dataset is a 404, as in the other handlers of this
            # module, not a generic ValueError.
            raise NotFound("Dataset not found.")

        document = DocumentService.get_document(dataset.id, document_id_str)

        # 404 if document not found
        if document is None:
            raise NotFound("Document Not Exists.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            DocumentService.delete_document(document)
        except services.errors.document.DocumentIndexingError as exc:
            # Translate the service-layer error into the API error, keeping the cause.
            raise DocumentIndexingError("Cannot delete document during indexing.") from exc

        return "", 204