From 5f61ca5e6f5c93cd3e231f11bcf3d7b20ad4589a Mon Sep 17 00:00:00 2001 From: GuanMu Date: Fri, 21 Nov 2025 12:58:20 +0800 Subject: [PATCH] feat: Implement partial update for document metadata, allowing merging of new values with existing ones. (#28390) Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- .../vdb/matrixone/matrixone_vector.py | 25 ++- .../knowledge_entities/knowledge_entities.py | 1 + api/services/metadata_service.py | 19 ++- .../services/test_metadata_partial_update.py | 153 ++++++++++++++++++ .../hooks/use-batch-edit-document-metadata.ts | 1 + web/app/components/datasets/metadata/types.ts | 2 +- 6 files changed, 185 insertions(+), 16 deletions(-) create mode 100644 api/tests/unit_tests/services/test_metadata_partial_update.py diff --git a/api/core/rag/datasource/vdb/matrixone/matrixone_vector.py b/api/core/rag/datasource/vdb/matrixone/matrixone_vector.py index 6fe396dc1e..14955c8d7c 100644 --- a/api/core/rag/datasource/vdb/matrixone/matrixone_vector.py +++ b/api/core/rag/datasource/vdb/matrixone/matrixone_vector.py @@ -22,6 +22,18 @@ logger = logging.getLogger(__name__) P = ParamSpec("P") R = TypeVar("R") +T = TypeVar("T", bound="MatrixoneVector") + + +def ensure_client(func: Callable[Concatenate[T, P], R]): + @wraps(func) + def wrapper(self: T, *args: P.args, **kwargs: P.kwargs): + if self.client is None: + self.client = self._get_client(None, False) + return func(self, *args, **kwargs) + + return wrapper + class MatrixoneConfig(BaseModel): host: str = "localhost" @@ -206,19 +218,6 @@ class MatrixoneVector(BaseVector): self.client.delete() -T = TypeVar("T", bound=MatrixoneVector) - - -def ensure_client(func: Callable[Concatenate[T, P], R]): - @wraps(func) - def wrapper(self: T, *args: P.args, **kwargs: P.kwargs): - if self.client is None: - self.client = self._get_client(None, False) - return func(self, *args, **kwargs) - - return wrapper - - class MatrixoneVectorFactory(AbstractVectorFactory): def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> MatrixoneVector: if dataset.index_struct_dict: diff --git a/api/services/entities/knowledge_entities/knowledge_entities.py b/api/services/entities/knowledge_entities/knowledge_entities.py index b9a210740d..131e90e195 100644 --- a/api/services/entities/knowledge_entities/knowledge_entities.py +++ b/api/services/entities/knowledge_entities/knowledge_entities.py @@ -158,6 +158,7 @@ class MetadataDetail(BaseModel): class DocumentMetadataOperation(BaseModel): document_id: str metadata_list: list[MetadataDetail] + partial_update: bool = False class MetadataOperationData(BaseModel): diff --git a/api/services/metadata_service.py b/api/services/metadata_service.py index b369994d2d..3329ac349c 100644 --- a/api/services/metadata_service.py +++ b/api/services/metadata_service.py @@ -206,7 +206,10 @@ class MetadataService: document = DocumentService.get_document(dataset.id, operation.document_id) if document is None: raise ValueError("Document not found.") - doc_metadata = {} + if operation.partial_update: + doc_metadata = copy.deepcopy(document.doc_metadata) if document.doc_metadata else {} + else: + doc_metadata = {} for metadata_value in operation.metadata_list: doc_metadata[metadata_value.name] = metadata_value.value if dataset.built_in_field_enabled: @@ -219,9 +222,21 @@ class MetadataService: db.session.add(document) db.session.commit() # deal metadata binding - db.session.query(DatasetMetadataBinding).filter_by(document_id=operation.document_id).delete() + if not operation.partial_update: + db.session.query(DatasetMetadataBinding).filter_by(document_id=operation.document_id).delete() + current_user, current_tenant_id = current_account_with_tenant() for metadata_value in operation.metadata_list: + # check if binding already exists + if operation.partial_update: + existing_binding = ( + db.session.query(DatasetMetadataBinding) + .filter_by(document_id=operation.document_id, metadata_id=metadata_value.id) + .first() + ) + if existing_binding: + continue + dataset_metadata_binding = DatasetMetadataBinding( tenant_id=current_tenant_id, dataset_id=dataset.id, diff --git a/api/tests/unit_tests/services/test_metadata_partial_update.py b/api/tests/unit_tests/services/test_metadata_partial_update.py new file mode 100644 index 0000000000..00162c10e4 --- /dev/null +++ b/api/tests/unit_tests/services/test_metadata_partial_update.py @@ -0,0 +1,153 @@ +import unittest +from unittest.mock import MagicMock, patch + +from models.dataset import Dataset, Document +from services.entities.knowledge_entities.knowledge_entities import ( + DocumentMetadataOperation, + MetadataDetail, + MetadataOperationData, +) +from services.metadata_service import MetadataService + + +class TestMetadataPartialUpdate(unittest.TestCase): + def setUp(self): + self.dataset = MagicMock(spec=Dataset) + self.dataset.id = "dataset_id" + self.dataset.built_in_field_enabled = False + + self.document = MagicMock(spec=Document) + self.document.id = "doc_id" + self.document.doc_metadata = {"existing_key": "existing_value"} + self.document.data_source_type = "upload_file" + + @patch("services.metadata_service.db") + @patch("services.metadata_service.DocumentService") + @patch("services.metadata_service.current_account_with_tenant") + @patch("services.metadata_service.redis_client") + def test_partial_update_merges_metadata(self, mock_redis, mock_current_account, mock_document_service, mock_db): + # Setup mocks + mock_redis.get.return_value = None + mock_document_service.get_document.return_value = self.document + mock_current_account.return_value = (MagicMock(id="user_id"), "tenant_id") + + # Mock DB query for existing bindings + + # No existing binding for new key + mock_db.session.query.return_value.filter_by.return_value.first.return_value = None + + # Input data + operation = DocumentMetadataOperation( + document_id="doc_id", + metadata_list=[MetadataDetail(id="new_meta_id", name="new_key", value="new_value")], + partial_update=True, + ) + metadata_args = MetadataOperationData(operation_data=[operation]) + + # Execute + MetadataService.update_documents_metadata(self.dataset, metadata_args) + + # Verify + # 1. Check that doc_metadata contains BOTH existing and new keys + expected_metadata = {"existing_key": "existing_value", "new_key": "new_value"} + assert self.document.doc_metadata == expected_metadata + + # 2. Check that existing bindings were NOT deleted + # The delete call in the original code: db.session.query(...).filter_by(...).delete() + # In partial update, this should NOT be called. + mock_db.session.query.return_value.filter_by.return_value.delete.assert_not_called() + + @patch("services.metadata_service.db") + @patch("services.metadata_service.DocumentService") + @patch("services.metadata_service.current_account_with_tenant") + @patch("services.metadata_service.redis_client") + def test_full_update_replaces_metadata(self, mock_redis, mock_current_account, mock_document_service, mock_db): + # Setup mocks + mock_redis.get.return_value = None + mock_document_service.get_document.return_value = self.document + mock_current_account.return_value = (MagicMock(id="user_id"), "tenant_id") + + # Input data (partial_update=False by default) + operation = DocumentMetadataOperation( + document_id="doc_id", + metadata_list=[MetadataDetail(id="new_meta_id", name="new_key", value="new_value")], + partial_update=False, + ) + metadata_args = MetadataOperationData(operation_data=[operation]) + + # Execute + MetadataService.update_documents_metadata(self.dataset, metadata_args) + + # Verify + # 1. Check that doc_metadata contains ONLY the new key + expected_metadata = {"new_key": "new_value"} + assert self.document.doc_metadata == expected_metadata + + # 2. Check that existing bindings WERE deleted + # In full update (default), we expect the existing bindings to be cleared. + mock_db.session.query.return_value.filter_by.return_value.delete.assert_called() + + @patch("services.metadata_service.db") + @patch("services.metadata_service.DocumentService") + @patch("services.metadata_service.current_account_with_tenant") + @patch("services.metadata_service.redis_client") + def test_partial_update_skips_existing_binding( + self, mock_redis, mock_current_account, mock_document_service, mock_db + ): + # Setup mocks + mock_redis.get.return_value = None + mock_document_service.get_document.return_value = self.document + mock_current_account.return_value = (MagicMock(id="user_id"), "tenant_id") + + # Mock DB query to return an existing binding + # This simulates that the document ALREADY has the metadata we are trying to add + mock_existing_binding = MagicMock() + mock_db.session.query.return_value.filter_by.return_value.first.return_value = mock_existing_binding + + # Input data + operation = DocumentMetadataOperation( + document_id="doc_id", + metadata_list=[MetadataDetail(id="existing_meta_id", name="existing_key", value="existing_value")], + partial_update=True, + ) + metadata_args = MetadataOperationData(operation_data=[operation]) + + # Execute + MetadataService.update_documents_metadata(self.dataset, metadata_args) + + # Verify + # We verify that db.session.add was NOT called for DatasetMetadataBinding + # Since we can't easily check "not called with specific type" on the generic add method without complex logic, + # we can check if the number of add calls is 1 (only for the document update) instead of 2 (document + binding) + + # Expected calls: + # 1. db.session.add(document) + # 2. NO db.session.add(binding) because it exists + + # Note: In the code, db.session.add is called for document. + # Then loop over metadata_list. + # If existing_binding found, continue. + # So binding add should be skipped. + + # Let's filter the calls to add to see what was added + add_calls = mock_db.session.add.call_args_list + added_objects = [call.args[0] for call in add_calls] + + # Check that no DatasetMetadataBinding was added + from models.dataset import DatasetMetadataBinding + + has_binding_add = any( + isinstance(obj, DatasetMetadataBinding) + or (isinstance(obj, MagicMock) and getattr(obj, "__class__", None) == DatasetMetadataBinding) + for obj in added_objects + ) + + # Since we mock everything, checking isinstance might be tricky if DatasetMetadataBinding + # is not the exact class used in the service (imports match). + # But we can check the count. + # If it were added, there would be 2 calls. If skipped, 1 call. + assert mock_db.session.add.call_count == 1 + + +if __name__ == "__main__": + unittest.main() diff --git a/web/app/components/datasets/metadata/hooks/use-batch-edit-document-metadata.ts b/web/app/components/datasets/metadata/hooks/use-batch-edit-document-metadata.ts index f350fd7b8b..0f07b3de60 100644 --- a/web/app/components/datasets/metadata/hooks/use-batch-edit-document-metadata.ts +++ b/web/app/components/datasets/metadata/hooks/use-batch-edit-document-metadata.ts @@ -115,6 +115,7 @@ const useBatchEditDocumentMetadata = ({ return { document_id: documentId, metadata_list: newMetadataList, + partial_update: docIndex < 0, } }) return res diff --git a/web/app/components/datasets/metadata/types.ts b/web/app/components/datasets/metadata/types.ts index 00688e29cc..fb1728232a 100644 --- a/web/app/components/datasets/metadata/types.ts +++ b/web/app/components/datasets/metadata/types.ts @@ -25,7 +25,7 @@ export type MetadataItemInBatchEdit = MetadataItemWithValue & { isMultipleValue?: boolean } -export type MetadataBatchEditToServer = { document_id: string, metadata_list: MetadataItemWithValue[] }[] +export type MetadataBatchEditToServer = { document_id: string, metadata_list: MetadataItemWithValue[], partial_update?: boolean }[] export enum UpdateType { changeValue = 'changeValue',