diff --git a/api/controllers/service_api/dataset/document.py b/api/controllers/service_api/dataset/document.py index 6db047567f..bc28ecb6b7 100644 --- a/api/controllers/service_api/dataset/document.py +++ b/api/controllers/service_api/dataset/document.py @@ -1,4 +1,12 @@ +"""Service API endpoints for dataset document management. + +The canonical Service API paths use hyphenated route segments. Legacy underscore +aliases remain registered for backward compatibility, but they must stay marked +deprecated in generated API docs so clients migrate toward the canonical paths. +""" + import json +from collections.abc import Mapping from contextlib import ExitStack from typing import Self from uuid import UUID @@ -117,12 +125,137 @@ register_schema_models( ) -@service_api_ns.route( - "/datasets//document/create_by_text", - "/datasets//document/create-by-text", -) +def _create_document_by_text(tenant_id: str, dataset_id: UUID) -> tuple[Mapping[str, object], int]: + """Create a document from text for both canonical and legacy routes.""" + payload = DocumentTextCreatePayload.model_validate(service_api_ns.payload or {}) + args = payload.model_dump(exclude_none=True) + + dataset_id_str = str(dataset_id) + tenant_id_str = str(tenant_id) + dataset = db.session.scalar( + select(Dataset).where(Dataset.tenant_id == tenant_id_str, Dataset.id == dataset_id_str).limit(1) + ) + + if not dataset: + raise ValueError("Dataset does not exist.") + + if not dataset.indexing_technique and not args["indexing_technique"]: + raise ValueError("indexing_technique is required.") + + embedding_model_provider = payload.embedding_model_provider + embedding_model = payload.embedding_model + if embedding_model_provider and embedding_model: + DatasetService.check_embedding_model_setting(tenant_id_str, embedding_model_provider, embedding_model) + + retrieval_model = payload.retrieval_model + if ( + retrieval_model + and retrieval_model.reranking_model + and retrieval_model.reranking_model.reranking_provider_name + and retrieval_model.reranking_model.reranking_model_name + ): + DatasetService.check_reranking_model_setting( + tenant_id_str, + retrieval_model.reranking_model.reranking_provider_name, + retrieval_model.reranking_model.reranking_model_name, + ) + + if not current_user: + raise ValueError("current_user is required") + + upload_file = FileService(db.engine).upload_text( + text=payload.text, text_name=payload.name, user_id=current_user.id, tenant_id=tenant_id_str + ) + data_source = { + "type": "upload_file", + "info_list": {"data_source_type": "upload_file", "file_info_list": {"file_ids": [upload_file.id]}}, + } + args["data_source"] = data_source + knowledge_config = KnowledgeConfig.model_validate(args) + DocumentService.document_create_args_validate(knowledge_config) + + if not current_user: + raise ValueError("current_user is required") + + try: + documents, batch = DocumentService.save_document_with_dataset_id( + dataset=dataset, + knowledge_config=knowledge_config, + account=current_user, + dataset_process_rule=dataset.latest_process_rule if "process_rule" not in args else None, + created_from="api", + ) + except ProviderTokenNotInitError as ex: + raise ProviderNotInitializeError(ex.description) + document = documents[0] + + documents_and_batch_fields = {"document": marshal(document, document_fields), "batch": batch} + return documents_and_batch_fields, 200 + + +def _update_document_by_text(tenant_id: str, dataset_id: UUID, document_id: UUID) -> tuple[Mapping[str, object], int]: + """Update a document from text for both canonical and legacy routes.""" + payload = DocumentTextUpdate.model_validate(service_api_ns.payload or {}) + dataset = db.session.scalar( + select(Dataset).where(Dataset.tenant_id == tenant_id, Dataset.id == str(dataset_id)).limit(1) + ) + args = payload.model_dump(exclude_none=True) + if not dataset: + raise ValueError("Dataset does not exist.") + + retrieval_model = payload.retrieval_model + if ( + retrieval_model + and retrieval_model.reranking_model + and retrieval_model.reranking_model.reranking_provider_name + and retrieval_model.reranking_model.reranking_model_name + ): + DatasetService.check_reranking_model_setting( + tenant_id, + retrieval_model.reranking_model.reranking_provider_name, + retrieval_model.reranking_model.reranking_model_name, + ) + + # indexing_technique is already set in dataset since this is an update + args["indexing_technique"] = dataset.indexing_technique + + if args.get("text"): + text = args.get("text") + name = args.get("name") + if not current_user: + raise ValueError("current_user is required") + upload_file = FileService(db.engine).upload_text( + text=str(text), text_name=str(name), user_id=current_user.id, tenant_id=tenant_id + ) + data_source = { + "type": "upload_file", + "info_list": {"data_source_type": "upload_file", "file_info_list": {"file_ids": [upload_file.id]}}, + } + args["data_source"] = data_source + + args["original_document_id"] = str(document_id) + knowledge_config = KnowledgeConfig.model_validate(args) + DocumentService.document_create_args_validate(knowledge_config) + + try: + documents, batch = DocumentService.save_document_with_dataset_id( + dataset=dataset, + knowledge_config=knowledge_config, + account=current_user, + dataset_process_rule=dataset.latest_process_rule if "process_rule" not in args else None, + created_from="api", + ) + except ProviderTokenNotInitError as ex: + raise ProviderNotInitializeError(ex.description) + document = documents[0] + + documents_and_batch_fields = {"document": marshal(document, document_fields), "batch": batch} + return documents_and_batch_fields, 200 + + +@service_api_ns.route("/datasets//document/create-by-text") class DocumentAddByTextApi(DatasetApiResource): - """Resource for documents.""" + """Resource for the canonical text document creation route.""" @service_api_ns.expect(service_api_ns.models[DocumentTextCreatePayload.__name__]) @service_api_ns.doc("create_document_by_text") @@ -138,81 +271,43 @@ class DocumentAddByTextApi(DatasetApiResource): @cloud_edition_billing_resource_check("vector_space", "dataset") @cloud_edition_billing_resource_check("documents", "dataset") @cloud_edition_billing_rate_limit_check("knowledge", "dataset") - def post(self, tenant_id, dataset_id): + def post(self, tenant_id: str, dataset_id: UUID): """Create document by text.""" - payload = DocumentTextCreatePayload.model_validate(service_api_ns.payload or {}) - args = payload.model_dump(exclude_none=True) + return _create_document_by_text(tenant_id=tenant_id, dataset_id=dataset_id) - dataset_id = str(dataset_id) - tenant_id = str(tenant_id) - dataset = db.session.scalar( - select(Dataset).where(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).limit(1) + +@service_api_ns.route("/datasets//document/create_by_text") +class DeprecatedDocumentAddByTextApi(DatasetApiResource): + """Deprecated resource alias for text document creation.""" + + @service_api_ns.expect(service_api_ns.models[DocumentTextCreatePayload.__name__]) + @service_api_ns.doc("create_document_by_text_deprecated") + @service_api_ns.doc(deprecated=True) + @service_api_ns.doc( + description=( + "Deprecated legacy alias for creating a new document by providing text content. " + "Use /datasets/{dataset_id}/document/create-by-text instead." ) - - if not dataset: - raise ValueError("Dataset does not exist.") - - if not dataset.indexing_technique and not args["indexing_technique"]: - raise ValueError("indexing_technique is required.") - - embedding_model_provider = payload.embedding_model_provider - embedding_model = payload.embedding_model - if embedding_model_provider and embedding_model: - DatasetService.check_embedding_model_setting(tenant_id, embedding_model_provider, embedding_model) - - retrieval_model = payload.retrieval_model - if ( - retrieval_model - and retrieval_model.reranking_model - and retrieval_model.reranking_model.reranking_provider_name - and retrieval_model.reranking_model.reranking_model_name - ): - DatasetService.check_reranking_model_setting( - tenant_id, - retrieval_model.reranking_model.reranking_provider_name, - retrieval_model.reranking_model.reranking_model_name, - ) - - if not current_user: - raise ValueError("current_user is required") - - upload_file = FileService(db.engine).upload_text( - text=payload.text, text_name=payload.name, user_id=current_user.id, tenant_id=tenant_id - ) - data_source = { - "type": "upload_file", - "info_list": {"data_source_type": "upload_file", "file_info_list": {"file_ids": [upload_file.id]}}, + ) + @service_api_ns.doc(params={"dataset_id": "Dataset ID"}) + @service_api_ns.doc( + responses={ + 200: "Document created successfully", + 401: "Unauthorized - invalid API token", + 400: "Bad request - invalid parameters", } - args["data_source"] = data_source - knowledge_config = KnowledgeConfig.model_validate(args) - # validate args - DocumentService.document_create_args_validate(knowledge_config) - - if not current_user: - raise ValueError("current_user is required") - - try: - documents, batch = DocumentService.save_document_with_dataset_id( - dataset=dataset, - knowledge_config=knowledge_config, - account=current_user, - dataset_process_rule=dataset.latest_process_rule if "process_rule" not in args else None, - created_from="api", - ) - except ProviderTokenNotInitError as ex: - raise ProviderNotInitializeError(ex.description) - document = documents[0] - - documents_and_batch_fields = {"document": marshal(document, document_fields), "batch": batch} - return documents_and_batch_fields, 200 + ) + @cloud_edition_billing_resource_check("vector_space", "dataset") + @cloud_edition_billing_resource_check("documents", "dataset") + @cloud_edition_billing_rate_limit_check("knowledge", "dataset") + def post(self, tenant_id: str, dataset_id: UUID): + """Create document by text through the deprecated underscore alias.""" + return _create_document_by_text(tenant_id=tenant_id, dataset_id=dataset_id) -@service_api_ns.route( - "/datasets//documents//update_by_text", - "/datasets//documents//update-by-text", -) +@service_api_ns.route("/datasets//documents//update-by-text") class DocumentUpdateByTextApi(DatasetApiResource): - """Resource for update documents.""" + """Resource for the canonical text document update route.""" @service_api_ns.expect(service_api_ns.models[DocumentTextUpdate.__name__]) @service_api_ns.doc("update_document_by_text") @@ -229,62 +324,35 @@ class DocumentUpdateByTextApi(DatasetApiResource): @cloud_edition_billing_rate_limit_check("knowledge", "dataset") def post(self, tenant_id: str, dataset_id: UUID, document_id: UUID): """Update document by text.""" - payload = DocumentTextUpdate.model_validate(service_api_ns.payload or {}) - dataset = db.session.scalar( - select(Dataset).where(Dataset.tenant_id == tenant_id, Dataset.id == str(dataset_id)).limit(1) + return _update_document_by_text(tenant_id=tenant_id, dataset_id=dataset_id, document_id=document_id) + + +@service_api_ns.route("/datasets//documents//update_by_text") +class DeprecatedDocumentUpdateByTextApi(DatasetApiResource): + """Deprecated resource alias for text document updates.""" + + @service_api_ns.expect(service_api_ns.models[DocumentTextUpdate.__name__]) + @service_api_ns.doc("update_document_by_text_deprecated") + @service_api_ns.doc(deprecated=True) + @service_api_ns.doc( + description=( + "Deprecated legacy alias for updating an existing document by providing text content. " + "Use /datasets/{dataset_id}/documents/{document_id}/update-by-text instead." ) - args = payload.model_dump(exclude_none=True) - if not dataset: - raise ValueError("Dataset does not exist.") - - retrieval_model = payload.retrieval_model - if ( - retrieval_model - and retrieval_model.reranking_model - and retrieval_model.reranking_model.reranking_provider_name - and retrieval_model.reranking_model.reranking_model_name - ): - DatasetService.check_reranking_model_setting( - tenant_id, - retrieval_model.reranking_model.reranking_provider_name, - retrieval_model.reranking_model.reranking_model_name, - ) - - # indexing_technique is already set in dataset since this is an update - args["indexing_technique"] = dataset.indexing_technique - - if args.get("text"): - text = args.get("text") - name = args.get("name") - if not current_user: - raise ValueError("current_user is required") - upload_file = FileService(db.engine).upload_text( - text=str(text), text_name=str(name), user_id=current_user.id, tenant_id=tenant_id - ) - data_source = { - "type": "upload_file", - "info_list": {"data_source_type": "upload_file", "file_info_list": {"file_ids": [upload_file.id]}}, - } - args["data_source"] = data_source - # validate args - args["original_document_id"] = str(document_id) - knowledge_config = KnowledgeConfig.model_validate(args) - DocumentService.document_create_args_validate(knowledge_config) - - try: - documents, batch = DocumentService.save_document_with_dataset_id( - dataset=dataset, - knowledge_config=knowledge_config, - account=current_user, - dataset_process_rule=dataset.latest_process_rule if "process_rule" not in args else None, - created_from="api", - ) - except ProviderTokenNotInitError as ex: - raise ProviderNotInitializeError(ex.description) - document = documents[0] - - documents_and_batch_fields = {"document": marshal(document, document_fields), "batch": batch} - return documents_and_batch_fields, 200 + ) + @service_api_ns.doc(params={"dataset_id": "Dataset ID", "document_id": "Document ID"}) + @service_api_ns.doc( + responses={ + 200: "Document updated successfully", + 401: "Unauthorized - invalid API token", + 404: "Document not found", + } + ) + @cloud_edition_billing_resource_check("vector_space", "dataset") + @cloud_edition_billing_rate_limit_check("knowledge", "dataset") + def post(self, tenant_id: str, dataset_id: UUID, document_id: UUID): + """Update document by text through the deprecated underscore alias.""" + return _update_document_by_text(tenant_id=tenant_id, dataset_id=dataset_id, document_id=document_id) @service_api_ns.route( diff --git a/api/tests/unit_tests/controllers/service_api/dataset/test_document.py b/api/tests/unit_tests/controllers/service_api/dataset/test_document.py index 12d5e7345d..288659b192 100644 --- a/api/tests/unit_tests/controllers/service_api/dataset/test_document.py +++ b/api/tests/unit_tests/controllers/service_api/dataset/test_document.py @@ -22,6 +22,8 @@ import pytest from werkzeug.exceptions import Forbidden, NotFound from controllers.service_api.dataset.document import ( + DeprecatedDocumentAddByTextApi, + DeprecatedDocumentUpdateByTextApi, DocumentAddByFileApi, DocumentAddByTextApi, DocumentApi, @@ -1005,7 +1007,7 @@ class TestDocumentAddByTextApi: # Act with app.test_request_context( - f"/datasets/{mock_dataset.id}/document/create_by_text", + f"/datasets/{mock_dataset.id}/document/create-by-text", method="POST", json={ "name": "Test Document", @@ -1037,7 +1039,7 @@ class TestDocumentAddByTextApi: # Act & Assert with app.test_request_context( - f"/datasets/{mock_dataset.id}/document/create_by_text", + f"/datasets/{mock_dataset.id}/document/create-by-text", method="POST", json={"name": "Test Document", "text": "Content"}, headers={"Authorization": "Bearer test_token"}, @@ -1066,7 +1068,7 @@ class TestDocumentAddByTextApi: # Act & Assert with app.test_request_context( - f"/datasets/{mock_dataset.id}/document/create_by_text", + f"/datasets/{mock_dataset.id}/document/create-by-text", method="POST", json={"name": "Test Document", "text": "Content"}, headers={"Authorization": "Bearer test_token"}, @@ -1093,6 +1095,20 @@ class TestArchivedDocumentImmutableError: assert error.code == 403 +class TestDocumentTextRouteDeprecation: + """Test that legacy underscore text routes stay marked deprecated.""" + + def test_create_by_text_legacy_alias_is_deprecated(self): + """Ensure only the legacy create-by-text alias is marked deprecated.""" + assert DeprecatedDocumentAddByTextApi.post.__apidoc__["deprecated"] is True + assert DocumentAddByTextApi.post.__apidoc__.get("deprecated") is not True + + def test_update_by_text_legacy_alias_is_deprecated(self): + """Ensure only the legacy update-by-text alias is marked deprecated.""" + assert DeprecatedDocumentUpdateByTextApi.post.__apidoc__["deprecated"] is True + assert DocumentUpdateByTextApi.post.__apidoc__.get("deprecated") is not True + + # ============================================================================= # Endpoint tests for DocumentUpdateByTextApi, DocumentAddByFileApi, # DocumentUpdateByFileApi. @@ -1162,7 +1178,7 @@ class TestDocumentUpdateByTextApiPost: doc_id = str(uuid.uuid4()) with app.test_request_context( - f"/datasets/{mock_dataset.id}/documents/{doc_id}/update_by_text", + f"/datasets/{mock_dataset.id}/documents/{doc_id}/update-by-text", method="POST", json={"name": "Updated Doc", "text": "New content"}, headers={"Authorization": "Bearer test_token"}, @@ -1195,7 +1211,7 @@ class TestDocumentUpdateByTextApiPost: doc_id = str(uuid.uuid4()) with app.test_request_context( - f"/datasets/{mock_dataset.id}/documents/{doc_id}/update_by_text", + f"/datasets/{mock_dataset.id}/documents/{doc_id}/update-by-text", method="POST", json={"name": "Doc", "text": "Content"}, headers={"Authorization": "Bearer test_token"},