From 8f79989172f690fe0a5f8e9ba7efbef7ef55281e Mon Sep 17 00:00:00 2001 From: zhangx1n Date: Wed, 25 Mar 2026 15:51:48 +0800 Subject: [PATCH] remove vdb tablestore --- api/commands/vector.py | 1 - api/configs/middleware/__init__.py | 2 - .../middleware/vdb/tablestore_config.py | 33 -- api/controllers/console/datasets/datasets.py | 1 - .../rag/datasource/vdb/tablestore/__init__.py | 0 .../vdb/tablestore/tablestore_vector.py | 413 ------------------ api/core/rag/datasource/vdb/vector_factory.py | 4 - api/core/rag/datasource/vdb/vector_type.py | 1 - api/pyproject.toml | 1 - api/pyrefly-local-excludes.txt | 1 - .../vdb/tablestore/__init__.py | 0 .../vdb/tablestore/test_tablestore.py | 100 ----- api/uv.lock | 53 --- docker/docker-compose.yaml | 5 - 14 files changed, 615 deletions(-) delete mode 100644 api/configs/middleware/vdb/tablestore_config.py delete mode 100644 api/core/rag/datasource/vdb/tablestore/__init__.py delete mode 100644 api/core/rag/datasource/vdb/tablestore/tablestore_vector.py delete mode 100644 api/tests/integration_tests/vdb/tablestore/__init__.py delete mode 100644 api/tests/integration_tests/vdb/tablestore/test_tablestore.py diff --git a/api/commands/vector.py b/api/commands/vector.py index 37add64bf7..2ea03f16a5 100644 --- a/api/commands/vector.py +++ b/api/commands/vector.py @@ -155,7 +155,6 @@ def migrate_knowledge_vector_database(): VectorType.ORACLE, VectorType.ELASTICSEARCH, VectorType.OPENGAUSS, - VectorType.TABLESTORE, VectorType.MATRIXONE, } lower_collection_vector_types = { diff --git a/api/configs/middleware/__init__.py b/api/configs/middleware/__init__.py index e280095435..5a15905ea3 100644 --- a/api/configs/middleware/__init__.py +++ b/api/configs/middleware/__init__.py @@ -38,7 +38,6 @@ from .vdb.pgvector_config import PGVectorConfig from .vdb.pgvectors_config import PGVectoRSConfig from .vdb.qdrant_config import QdrantConfig from .vdb.relyt_config import RelytConfig -from .vdb.tablestore_config import TableStoreConfig from .vdb.tencent_vector_config import TencentVectorDBConfig from .vdb.tidb_on_qdrant_config import TidbOnQdrantConfig from .vdb.tidb_vector_config import TiDBVectorConfig @@ -367,7 +366,6 @@ class MiddlewareConfig( OceanBaseVectorConfig, BaiduVectorDBConfig, OpenGaussConfig, - TableStoreConfig, DatasetQueueMonitorConfig, MatrixoneConfig, ): diff --git a/api/configs/middleware/vdb/tablestore_config.py b/api/configs/middleware/vdb/tablestore_config.py deleted file mode 100644 index 2cec384b5d..0000000000 --- a/api/configs/middleware/vdb/tablestore_config.py +++ /dev/null @@ -1,33 +0,0 @@ -from pydantic import Field -from pydantic_settings import BaseSettings - - -class TableStoreConfig(BaseSettings): - """ - Configuration settings for TableStore. - """ - - TABLESTORE_ENDPOINT: str | None = Field( - description="Endpoint address of the TableStore server (e.g. 'https://instance-name.cn-hangzhou.ots.aliyuncs.com')", - default=None, - ) - - TABLESTORE_INSTANCE_NAME: str | None = Field( - description="Instance name to access TableStore server (eg. 'instance-name')", - default=None, - ) - - TABLESTORE_ACCESS_KEY_ID: str | None = Field( - description="AccessKey id for the instance name", - default=None, - ) - - TABLESTORE_ACCESS_KEY_SECRET: str | None = Field( - description="AccessKey secret for the instance name", - default=None, - ) - - TABLESTORE_NORMALIZE_FULLTEXT_BM25_SCORE: bool = Field( - description="Whether to normalize full-text search scores to [0, 1]", - default=False, - ) diff --git a/api/controllers/console/datasets/datasets.py b/api/controllers/console/datasets/datasets.py index 053d502e47..2d5b0f1585 100644 --- a/api/controllers/console/datasets/datasets.py +++ b/api/controllers/console/datasets/datasets.py @@ -254,7 +254,6 @@ def _get_retrieval_methods_by_vector_type(vector_type: str | None, is_mock: bool VectorType.OPENGAUSS, VectorType.OCEANBASE, VectorType.SEEKDB, - VectorType.TABLESTORE, VectorType.HUAWEI_CLOUD, VectorType.TENCENT, VectorType.MATRIXONE, diff --git a/api/core/rag/datasource/vdb/tablestore/__init__.py b/api/core/rag/datasource/vdb/tablestore/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/api/core/rag/datasource/vdb/tablestore/tablestore_vector.py b/api/core/rag/datasource/vdb/tablestore/tablestore_vector.py deleted file mode 100644 index f2156afa59..0000000000 --- a/api/core/rag/datasource/vdb/tablestore/tablestore_vector.py +++ /dev/null @@ -1,413 +0,0 @@ -import json -import logging -import math -from collections.abc import Iterable -from typing import Any - -import tablestore # type: ignore -from pydantic import BaseModel, model_validator -from tablestore import BatchGetRowRequest, TableInBatchGetRowItem - -from configs import dify_config -from core.rag.datasource.vdb.field import Field -from core.rag.datasource.vdb.vector_base import BaseVector -from core.rag.datasource.vdb.vector_factory import AbstractVectorFactory -from core.rag.datasource.vdb.vector_type import VectorType -from core.rag.embedding.embedding_base import Embeddings -from core.rag.models.document import Document -from extensions.ext_redis import redis_client -from models import Dataset - -logger = logging.getLogger(__name__) - - -class TableStoreConfig(BaseModel): - access_key_id: str | None = None - access_key_secret: str | None = None - instance_name: str | None = None - endpoint: str | None = None - normalize_full_text_bm25_score: bool | None = False - - @model_validator(mode="before") - @classmethod - def validate_config(cls, values: dict): - if not values["access_key_id"]: - raise ValueError("config ACCESS_KEY_ID is required") - if not values["access_key_secret"]: - raise ValueError("config ACCESS_KEY_SECRET is required") - if not values["instance_name"]: - raise ValueError("config INSTANCE_NAME is required") - if not values["endpoint"]: - raise ValueError("config ENDPOINT is required") - return values - - -class TableStoreVector(BaseVector): - def __init__(self, collection_name: str, config: TableStoreConfig): - super().__init__(collection_name) - self._config = config - self._tablestore_client = tablestore.OTSClient( - config.endpoint, - config.access_key_id, - config.access_key_secret, - config.instance_name, - ) - self._normalize_full_text_bm25_score = config.normalize_full_text_bm25_score - self._table_name = f"{collection_name}" - self._index_name = f"{collection_name}_idx" - self._tags_field = f"{Field.METADATA_KEY}_tags" - - def create_collection(self, embeddings: list[list[float]], **kwargs): - dimension = len(embeddings[0]) - self._create_collection(dimension) - - def get_by_ids(self, ids: list[str]) -> list[Document]: - docs = [] - request = BatchGetRowRequest() - columns_to_get = [Field.METADATA_KEY, Field.CONTENT_KEY] - rows_to_get = [[("id", _id)] for _id in ids] - request.add(TableInBatchGetRowItem(self._table_name, rows_to_get, columns_to_get, None, 1)) - - result = self._tablestore_client.batch_get_row(request) - table_result = result.get_result_by_table(self._table_name) - for item in table_result: - if item.is_ok and item.row: - kv = {k: v for k, v, _ in item.row.attribute_columns} - docs.append(Document(page_content=kv[Field.CONTENT_KEY], metadata=json.loads(kv[Field.METADATA_KEY]))) - return docs - - def get_type(self) -> str: - return VectorType.TABLESTORE - - def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs): - dimension = len(embeddings[0]) - self._create_collection(dimension) - self.add_texts(documents=texts, embeddings=embeddings, **kwargs) - - def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs): - uuids = self._get_uuids(documents) - - for i in range(len(documents)): - self._write_row( - primary_key=uuids[i], - attributes={ - Field.CONTENT_KEY: documents[i].page_content, - Field.VECTOR: embeddings[i], - Field.METADATA_KEY: documents[i].metadata, - }, - ) - return uuids - - def text_exists(self, id: str) -> bool: - result = self._tablestore_client.get_row( - table_name=self._table_name, primary_key=[("id", id)], columns_to_get=["id"] - ) - assert isinstance(result, tuple | list) - # Unpack the tuple result - _, return_row, _ = result - - return return_row is not None - - def delete_by_ids(self, ids: list[str]): - if not ids: - return - for id in ids: - self._delete_row(id=id) - - def get_ids_by_metadata_field(self, key: str, value: str): - return self._search_by_metadata(key, value) - - def delete_by_metadata_field(self, key: str, value: str): - ids = self.get_ids_by_metadata_field(key, value) - self.delete_by_ids(ids) - - def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]: - top_k = kwargs.get("top_k", 4) - document_ids_filter = kwargs.get("document_ids_filter") - filtered_list = None - if document_ids_filter: - filtered_list = ["document_id=" + item for item in document_ids_filter] - score_threshold = float(kwargs.get("score_threshold") or 0.0) - return self._search_by_vector(query_vector, filtered_list, top_k, score_threshold) - - def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]: - top_k = kwargs.get("top_k", 4) - document_ids_filter = kwargs.get("document_ids_filter") - filtered_list = None - if document_ids_filter: - filtered_list = ["document_id=" + item for item in document_ids_filter] - score_threshold = float(kwargs.get("score_threshold") or 0.0) - return self._search_by_full_text(query, filtered_list, top_k, score_threshold) - - def delete(self): - self._delete_table_if_exist() - - def _create_collection(self, dimension: int): - lock_name = f"vector_indexing_lock_{self._collection_name}" - with redis_client.lock(lock_name, timeout=20): - collection_exist_cache_key = f"vector_indexing_{self._collection_name}" - if redis_client.get(collection_exist_cache_key): - logger.info("Collection %s already exists.", self._collection_name) - return - - self._create_table_if_not_exist() - self._create_search_index_if_not_exist(dimension) - redis_client.set(collection_exist_cache_key, 1, ex=3600) - - def _create_table_if_not_exist(self): - table_list = self._tablestore_client.list_table() - if self._table_name in table_list: - logger.info("Tablestore system table[%s] already exists", self._table_name) - return None - - schema_of_primary_key = [("id", "STRING")] - table_meta = tablestore.TableMeta(self._table_name, schema_of_primary_key) - table_options = tablestore.TableOptions() - reserved_throughput = tablestore.ReservedThroughput(tablestore.CapacityUnit(0, 0)) - self._tablestore_client.create_table(table_meta, table_options, reserved_throughput) - logger.info("Tablestore create table[%s] successfully.", self._table_name) - - def _create_search_index_if_not_exist(self, dimension: int): - search_index_list = self._tablestore_client.list_search_index(table_name=self._table_name) - assert isinstance(search_index_list, Iterable) - if self._index_name in [t[1] for t in search_index_list]: - logger.info("Tablestore system index[%s] already exists", self._index_name) - return None - - field_schemas = [ - tablestore.FieldSchema( - Field.CONTENT_KEY, - tablestore.FieldType.TEXT, - analyzer=tablestore.AnalyzerType.MAXWORD, - index=True, - enable_sort_and_agg=False, - store=False, - ), - tablestore.FieldSchema( - Field.VECTOR, - tablestore.FieldType.VECTOR, - vector_options=tablestore.VectorOptions( - data_type=tablestore.VectorDataType.VD_FLOAT_32, - dimension=dimension, - metric_type=tablestore.VectorMetricType.VM_COSINE, - ), - ), - tablestore.FieldSchema( - Field.METADATA_KEY, - tablestore.FieldType.KEYWORD, - index=True, - store=False, - ), - tablestore.FieldSchema( - self._tags_field, - tablestore.FieldType.KEYWORD, - index=True, - store=False, - is_array=True, - ), - ] - - index_meta = tablestore.SearchIndexMeta(field_schemas) - self._tablestore_client.create_search_index(self._table_name, self._index_name, index_meta) - logger.info("Tablestore create system index[%s] successfully.", self._index_name) - - def _delete_table_if_exist(self): - search_index_list = self._tablestore_client.list_search_index(table_name=self._table_name) - assert isinstance(search_index_list, Iterable) - for resp_tuple in search_index_list: - self._tablestore_client.delete_search_index(resp_tuple[0], resp_tuple[1]) - logger.info("Tablestore delete index[%s] successfully.", self._index_name) - - self._tablestore_client.delete_table(self._table_name) - logger.info("Tablestore delete system table[%s] successfully.", self._index_name) - - def _delete_search_index(self): - self._tablestore_client.delete_search_index(self._table_name, self._index_name) - logger.info("Tablestore delete index[%s] successfully.", self._index_name) - - def _write_row(self, primary_key: str, attributes: dict[str, Any]): - pk = [("id", primary_key)] - - tags = [] - for key, value in attributes[Field.METADATA_KEY].items(): - tags.append(str(key) + "=" + str(value)) - - attribute_columns = [ - (Field.CONTENT_KEY, attributes[Field.CONTENT_KEY]), - (Field.VECTOR, json.dumps(attributes[Field.VECTOR])), - ( - Field.METADATA_KEY, - json.dumps(attributes[Field.METADATA_KEY]), - ), - (self._tags_field, json.dumps(tags)), - ] - row = tablestore.Row(pk, attribute_columns) - self._tablestore_client.put_row(self._table_name, row) - - def _delete_row(self, id: str): - primary_key = [("id", id)] - row = tablestore.Row(primary_key) - self._tablestore_client.delete_row(self._table_name, row, None) - - def _search_by_metadata(self, key: str, value: str) -> list[str]: - query = tablestore.SearchQuery( - tablestore.TermQuery(self._tags_field, str(key) + "=" + str(value)), - limit=1000, - get_total_count=False, - ) - rows: list[str] = [] - next_token = None - while True: - if next_token is not None: - query.next_token = next_token - - search_response = self._tablestore_client.search( - table_name=self._table_name, - index_name=self._index_name, - search_query=query, - columns_to_get=tablestore.ColumnsToGet( - column_names=[Field.PRIMARY_KEY], return_type=tablestore.ColumnReturnType.SPECIFIED - ), - ) - - if search_response is not None: - rows.extend([row[0][0][1] for row in list(search_response.rows)]) - - if search_response is None or search_response.next_token == b"": - break - else: - next_token = search_response.next_token - - return rows - - def _search_by_vector( - self, query_vector: list[float], document_ids_filter: list[str] | None, top_k: int, score_threshold: float - ) -> list[Document]: - knn_vector_query = tablestore.KnnVectorQuery( - field_name=Field.VECTOR, - top_k=top_k, - float32_query_vector=query_vector, - ) - if document_ids_filter: - knn_vector_query.filter = tablestore.TermsQuery(self._tags_field, document_ids_filter) - - sort = tablestore.Sort(sorters=[tablestore.ScoreSort(sort_order=tablestore.SortOrder.DESC)]) - search_query = tablestore.SearchQuery(knn_vector_query, limit=top_k, get_total_count=False, sort=sort) - - search_response = self._tablestore_client.search( - table_name=self._table_name, - index_name=self._index_name, - search_query=search_query, - columns_to_get=tablestore.ColumnsToGet(return_type=tablestore.ColumnReturnType.ALL_FROM_INDEX), - ) - documents = [] - for search_hit in search_response.search_hits: - if search_hit.score >= score_threshold: - ots_column_map = {} - for col in search_hit.row[1]: - ots_column_map[col[0]] = col[1] - - vector_str = ots_column_map.get(Field.VECTOR) - metadata_str = ots_column_map.get(Field.METADATA_KEY) - - vector = json.loads(vector_str) if vector_str else None - metadata = json.loads(metadata_str) if metadata_str else {} - - metadata["score"] = search_hit.score - - documents.append( - Document( - page_content=ots_column_map.get(Field.CONTENT_KEY) or "", - vector=vector, - metadata=metadata, - ) - ) - documents = sorted(documents, key=lambda x: x.metadata["score"] if x.metadata else 0, reverse=True) - return documents - - @staticmethod - def _normalize_score_exp_decay(score: float, k: float = 0.15) -> float: - """ - Args: - score: BM25 search score. - k: decay factor, the larger the k, the steeper the low score end - """ - normalized_score = 1 - math.exp(-k * score) - return max(0.0, min(1.0, normalized_score)) - - def _search_by_full_text( - self, query: str, document_ids_filter: list[str] | None, top_k: int, score_threshold: float - ) -> list[Document]: - bool_query = tablestore.BoolQuery(must_queries=[], filter_queries=[], should_queries=[], must_not_queries=[]) - bool_query.must_queries.append(tablestore.MatchQuery(text=query, field_name=Field.CONTENT_KEY)) - - if document_ids_filter: - bool_query.filter_queries.append(tablestore.TermsQuery(self._tags_field, document_ids_filter)) - - search_query = tablestore.SearchQuery( - query=bool_query, - sort=tablestore.Sort(sorters=[tablestore.ScoreSort(sort_order=tablestore.SortOrder.DESC)]), - limit=top_k, - ) - search_response = self._tablestore_client.search( - table_name=self._table_name, - index_name=self._index_name, - search_query=search_query, - columns_to_get=tablestore.ColumnsToGet(return_type=tablestore.ColumnReturnType.ALL_FROM_INDEX), - ) - - documents = [] - for search_hit in search_response.search_hits: - score = None - if self._normalize_full_text_bm25_score: - score = self._normalize_score_exp_decay(search_hit.score) - - # skip when score is below threshold and use normalize score - if score and score <= score_threshold: - continue - - ots_column_map = {} - for col in search_hit.row[1]: - ots_column_map[col[0]] = col[1] - - metadata_str = ots_column_map.get(Field.METADATA_KEY) - metadata = json.loads(metadata_str) if metadata_str else {} - - vector_str = ots_column_map.get(Field.VECTOR) - vector = json.loads(vector_str) if vector_str else None - - if score: - metadata["score"] = score - - documents.append( - Document( - page_content=ots_column_map.get(Field.CONTENT_KEY) or "", - vector=vector, - metadata=metadata, - ) - ) - if self._normalize_full_text_bm25_score: - documents = sorted(documents, key=lambda x: x.metadata["score"] if x.metadata else 0, reverse=True) - return documents - - -class TableStoreVectorFactory(AbstractVectorFactory): - def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> TableStoreVector: - if dataset.index_struct_dict: - class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"] - collection_name = class_prefix - else: - dataset_id = dataset.id - collection_name = Dataset.gen_collection_name_by_id(dataset_id) - dataset.index_struct = json.dumps(self.gen_index_struct_dict(VectorType.TABLESTORE, collection_name)) - - return TableStoreVector( - collection_name=collection_name, - config=TableStoreConfig( - endpoint=dify_config.TABLESTORE_ENDPOINT, - instance_name=dify_config.TABLESTORE_INSTANCE_NAME, - access_key_id=dify_config.TABLESTORE_ACCESS_KEY_ID, - access_key_secret=dify_config.TABLESTORE_ACCESS_KEY_SECRET, - normalize_full_text_bm25_score=dify_config.TABLESTORE_NORMALIZE_FULLTEXT_BM25_SCORE, - ), - ) diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index 9839dbf98d..f63036cec9 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -167,10 +167,6 @@ class Vector: from core.rag.datasource.vdb.opengauss.opengauss import OpenGaussFactory return OpenGaussFactory - case VectorType.TABLESTORE: - from core.rag.datasource.vdb.tablestore.tablestore_vector import TableStoreVectorFactory - - return TableStoreVectorFactory case VectorType.HUAWEI_CLOUD: from core.rag.datasource.vdb.huawei.huawei_cloud_vector import HuaweiCloudVectorFactory diff --git a/api/core/rag/datasource/vdb/vector_type.py b/api/core/rag/datasource/vdb/vector_type.py index 2dfec13b71..90cd4592db 100644 --- a/api/core/rag/datasource/vdb/vector_type.py +++ b/api/core/rag/datasource/vdb/vector_type.py @@ -28,7 +28,6 @@ class VectorType(StrEnum): OCEANBASE = "oceanbase" SEEKDB = "seekdb" OPENGAUSS = "opengauss" - TABLESTORE = "tablestore" HUAWEI_CLOUD = "huawei_cloud" MATRIXONE = "matrixone" IRIS = "iris" diff --git a/api/pyproject.toml b/api/pyproject.toml index 9d940778e2..b31a002686 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -215,7 +215,6 @@ vdb = [ "pyobvector~=0.2.17", "qdrant-client==1.9.0", "intersystems-irispython>=5.1.0", - "tablestore==6.4.1", "tcvectordb~=2.0.0", "tidb-vector==0.0.15", "upstash-vector==0.8.0", diff --git a/api/pyrefly-local-excludes.txt b/api/pyrefly-local-excludes.txt index bf1a7a5716..1b26d73df4 100644 --- a/api/pyrefly-local-excludes.txt +++ b/api/pyrefly-local-excludes.txt @@ -59,7 +59,6 @@ core/rag/datasource/vdb/opensearch/opensearch_vector.py core/rag/datasource/vdb/oracle/oraclevector.py core/rag/datasource/vdb/pgvecto_rs/pgvecto_rs.py core/rag/datasource/vdb/relyt/relyt_vector.py -core/rag/datasource/vdb/tablestore/tablestore_vector.py core/rag/datasource/vdb/tencent/tencent_vector.py core/rag/datasource/vdb/tidb_on_qdrant/tidb_on_qdrant_vector.py core/rag/datasource/vdb/tidb_on_qdrant/tidb_service.py diff --git a/api/tests/integration_tests/vdb/tablestore/__init__.py b/api/tests/integration_tests/vdb/tablestore/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/api/tests/integration_tests/vdb/tablestore/test_tablestore.py b/api/tests/integration_tests/vdb/tablestore/test_tablestore.py deleted file mode 100644 index aebf3fbda1..0000000000 --- a/api/tests/integration_tests/vdb/tablestore/test_tablestore.py +++ /dev/null @@ -1,100 +0,0 @@ -import os -import uuid - -import tablestore -from _pytest.python_api import approx - -from core.rag.datasource.vdb.tablestore.tablestore_vector import ( - TableStoreConfig, - TableStoreVector, -) -from tests.integration_tests.vdb.test_vector_store import ( - AbstractVectorTest, - get_example_document, - get_example_text, - setup_mock_redis, -) - - -class TableStoreVectorTest(AbstractVectorTest): - def __init__(self, normalize_full_text_score: bool = False): - super().__init__() - self.vector = TableStoreVector( - collection_name=self.collection_name, - config=TableStoreConfig( - endpoint=os.getenv("TABLESTORE_ENDPOINT"), - instance_name=os.getenv("TABLESTORE_INSTANCE_NAME"), - access_key_id=os.getenv("TABLESTORE_ACCESS_KEY_ID"), - access_key_secret=os.getenv("TABLESTORE_ACCESS_KEY_SECRET"), - normalize_full_text_bm25_score=normalize_full_text_score, - ), - ) - - def get_ids_by_metadata_field(self): - ids = self.vector.get_ids_by_metadata_field(key="doc_id", value=self.example_doc_id) - assert ids is not None - assert len(ids) == 1 - assert ids[0] == self.example_doc_id - - def create_vector(self): - self.vector.create( - texts=[get_example_document(doc_id=self.example_doc_id)], - embeddings=[self.example_embedding], - ) - while True: - search_response = self.vector._tablestore_client.search( - table_name=self.vector._table_name, - index_name=self.vector._index_name, - search_query=tablestore.SearchQuery(query=tablestore.MatchAllQuery(), get_total_count=True, limit=0), - columns_to_get=tablestore.ColumnsToGet(return_type=tablestore.ColumnReturnType.ALL_FROM_INDEX), - ) - if search_response.total_count == 1: - break - - def search_by_vector(self): - super().search_by_vector() - docs = self.vector.search_by_vector(self.example_embedding, document_ids_filter=[self.example_doc_id]) - assert len(docs) == 1 - assert docs[0].metadata["doc_id"] == self.example_doc_id - assert docs[0].metadata["score"] > 0 - - docs = self.vector.search_by_vector(self.example_embedding, document_ids_filter=[str(uuid.uuid4())]) - assert len(docs) == 0 - - def search_by_full_text(self): - super().search_by_full_text() - docs = self.vector.search_by_full_text(get_example_text(), document_ids_filter=[self.example_doc_id]) - assert len(docs) == 1 - assert docs[0].metadata["doc_id"] == self.example_doc_id - if self.vector._config.normalize_full_text_bm25_score: - assert docs[0].metadata["score"] == approx(0.1214, abs=1e-3) - else: - assert docs[0].metadata.get("score") is None - - # return none if normalize_full_text_score=true and score_threshold > 0 - docs = self.vector.search_by_full_text( - get_example_text(), document_ids_filter=[self.example_doc_id], score_threshold=0.5 - ) - if self.vector._config.normalize_full_text_bm25_score: - assert len(docs) == 0 - else: - assert len(docs) == 1 - assert docs[0].metadata["doc_id"] == self.example_doc_id - assert docs[0].metadata.get("score") is None - - docs = self.vector.search_by_full_text(get_example_text(), document_ids_filter=[str(uuid.uuid4())]) - assert len(docs) == 0 - - def run_all_tests(self): - try: - self.vector.delete() - except Exception: - pass - - return super().run_all_tests() - - -def test_tablestore_vector(setup_mock_redis): - TableStoreVectorTest().run_all_tests() - TableStoreVectorTest(normalize_full_text_score=True).run_all_tests() - TableStoreVectorTest(normalize_full_text_score=False).run_all_tests() diff --git a/api/uv.lock b/api/uv.lock index 2e4115717a..b514b038d9 100644 --- a/api/uv.lock +++ b/api/uv.lock @@ -1155,37 +1155,6 @@ toml = [ { name = "tomli", marker = "python_full_version <= '3.11'" }, ] -[[package]] -name = "crc32c" -version = "2.8" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e3/66/7e97aa77af7cf6afbff26e3651b564fe41932599bc2d3dce0b2f73d4829a/crc32c-2.8.tar.gz", hash = "sha256:578728964e59c47c356aeeedee6220e021e124b9d3e8631d95d9a5e5f06e261c", size = 48179 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/0b/5e03b22d913698e9cc563f39b9f6bbd508606bf6b8e9122cd6bf196b87ea/crc32c-2.8-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e560a97fbb96c9897cb1d9b5076ef12fc12e2e25622530a1afd0de4240f17e1f", size = 66329 }, - { url = "https://files.pythonhosted.org/packages/6b/38/2fe0051ffe8c6a650c8b1ac0da31b8802d1dbe5fa40a84e4b6b6f5583db5/crc32c-2.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6762d276d90331a490ef7e71ffee53b9c0eb053bd75a272d786f3b08d3fe3671", size = 62988 }, - { url = "https://files.pythonhosted.org/packages/3e/30/5837a71c014be83aba1469c58820d287fc836512a0cad6b8fdd43868accd/crc32c-2.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:60670569f5ede91e39f48fb0cb4060e05b8d8704dd9e17ede930bf441b2f73ef", size = 61522 }, - { url = "https://files.pythonhosted.org/packages/ca/29/63972fc1452778e2092ae998c50cbfc2fc93e3fa9798a0278650cd6169c5/crc32c-2.8-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:711743da6ccc70b3c6718c328947b0b6f34a1fe6a6c27cc6c1d69cc226bf70e9", size = 80200 }, - { url = "https://files.pythonhosted.org/packages/cb/3a/60eb49d7bdada4122b3ffd45b0df54bdc1b8dd092cda4b069a287bdfcff4/crc32c-2.8-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5eb4094a2054774f13b26f21bf56792bb44fa1fcee6c6ad099387a43ffbfb4fa", size = 81757 }, - { url = "https://files.pythonhosted.org/packages/f5/63/6efc1b64429ef7d23bd58b75b7ac24d15df327e3ebbe9c247a0f7b1c2ed1/crc32c-2.8-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fff15bf2bd3e95780516baae935ed12be88deaa5ebe6143c53eb0d26a7bdc7b7", size = 80830 }, - { url = "https://files.pythonhosted.org/packages/e1/eb/0ae9f436f8004f1c88f7429e659a7218a3879bd11a6b18ed1257aad7e98b/crc32c-2.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4c0e11e3826668121fa53e0745635baf5e4f0ded437e8ff63ea56f38fc4f970a", size = 80095 }, - { url = "https://files.pythonhosted.org/packages/9e/81/4afc9d468977a4cd94a2eb62908553345009a7c0d30e74463a15d4b48ec3/crc32c-2.8-cp311-cp311-win32.whl", hash = "sha256:38f915336715d1f1353ab07d7d786f8a789b119e273aea106ba55355dfc9101d", size = 64886 }, - { url = "https://files.pythonhosted.org/packages/d6/e8/94e839c9f7e767bf8479046a207afd440a08f5c59b52586e1af5e64fa4a0/crc32c-2.8-cp311-cp311-win_amd64.whl", hash = "sha256:60e0a765b1caab8d31b2ea80840639253906a9351d4b861551c8c8625ea20f86", size = 66639 }, - { url = "https://files.pythonhosted.org/packages/b6/36/fd18ef23c42926b79c7003e16cb0f79043b5b179c633521343d3b499e996/crc32c-2.8-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:572ffb1b78cce3d88e8d4143e154d31044a44be42cb3f6fbbf77f1e7a941c5ab", size = 66379 }, - { url = "https://files.pythonhosted.org/packages/7f/b8/c584958e53f7798dd358f5bdb1bbfc97483134f053ee399d3eeb26cca075/crc32c-2.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:cf827b3758ee0c4aacd21ceca0e2da83681f10295c38a10bfeb105f7d98f7a68", size = 63042 }, - { url = "https://files.pythonhosted.org/packages/62/e6/6f2af0ec64a668a46c861e5bc778ea3ee42171fedfc5440f791f470fd783/crc32c-2.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:106fbd79013e06fa92bc3b51031694fcc1249811ed4364ef1554ee3dd2c7f5a2", size = 61528 }, - { url = "https://files.pythonhosted.org/packages/17/8b/4a04bd80a024f1a23978f19ae99407783e06549e361ab56e9c08bba3c1d3/crc32c-2.8-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6dde035f91ffbfe23163e68605ee5a4bb8ceebd71ed54bb1fb1d0526cdd125a2", size = 80028 }, - { url = "https://files.pythonhosted.org/packages/21/8f/01c7afdc76ac2007d0e6a98e7300b4470b170480f8188475b597d1f4b4c6/crc32c-2.8-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e41ebe7c2f0fdcd9f3a3fd206989a36b460b4d3f24816d53e5be6c7dba72c5e1", size = 81531 }, - { url = "https://files.pythonhosted.org/packages/32/2b/8f78c5a8cc66486be5f51b6f038fc347c3ba748d3ea68be17a014283c331/crc32c-2.8-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ecf66cf90266d9c15cea597d5cc86c01917cd1a238dc3c51420c7886fa750d7e", size = 80608 }, - { url = "https://files.pythonhosted.org/packages/db/86/fad1a94cdeeeb6b6e2323c87f970186e74bfd6fbfbc247bf5c88ad0873d5/crc32c-2.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:59eee5f3a69ad0793d5fa9cdc9b9d743b0cd50edf7fccc0a3988a821fef0208c", size = 79886 }, - { url = "https://files.pythonhosted.org/packages/d5/db/1a7cb6757a1e32376fa2dfce00c815ea4ee614a94f9bff8228e37420c183/crc32c-2.8-cp312-cp312-win32.whl", hash = "sha256:a73d03ce3604aa5d7a2698e9057a0eef69f529c46497b27ee1c38158e90ceb76", size = 64896 }, - { url = "https://files.pythonhosted.org/packages/bf/8e/2024de34399b2e401a37dcb54b224b56c747b0dc46de4966886827b4d370/crc32c-2.8-cp312-cp312-win_amd64.whl", hash = "sha256:56b3b7d015247962cf58186e06d18c3d75a1a63d709d3233509e1c50a2d36aa2", size = 66645 }, - { url = "https://files.pythonhosted.org/packages/a7/1d/dd926c68eb8aac8b142a1a10b8eb62d95212c1cf81775644373fe7cceac2/crc32c-2.8-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5833f4071da7ea182c514ba17d1eee8aec3c5be927d798222fbfbbd0f5eea02c", size = 62345 }, - { url = "https://files.pythonhosted.org/packages/51/be/803404e5abea2ef2c15042edca04bbb7f625044cca879e47f186b43887c2/crc32c-2.8-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:1dc4da036126ac07b39dd9d03e93e585ec615a2ad28ff12757aef7de175295a8", size = 61229 }, - { url = "https://files.pythonhosted.org/packages/fc/3a/00cc578cd27ed0b22c9be25cef2c24539d92df9fa80ebd67a3fc5419724c/crc32c-2.8-pp311-pypy311_pp73-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:15905fa78344654e241371c47e6ed2411f9eeb2b8095311c68c88eccf541e8b4", size = 64108 }, - { url = "https://files.pythonhosted.org/packages/6b/bc/0587ef99a1c7629f95dd0c9d4f3d894de383a0df85831eb16c48a6afdae4/crc32c-2.8-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c596f918688821f796434e89b431b1698396c38bf0b56de873621528fe3ecb1e", size = 64815 }, - { url = "https://files.pythonhosted.org/packages/73/42/94f2b8b92eae9064fcfb8deef2b971514065bd606231f8857ff8ae02bebd/crc32c-2.8-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8d23c4fe01b3844cb6e091044bc1cebdef7d16472e058ce12d9fadf10d2614af", size = 66659 }, -] - [[package]] name = "crcmod" version = "1.7" @@ -1534,7 +1503,6 @@ vdb = [ { name = "pymochow" }, { name = "pyobvector" }, { name = "qdrant-client" }, - { name = "tablestore" }, { name = "tcvectordb" }, { name = "tidb-vector" }, { name = "upstash-vector" }, @@ -1733,7 +1701,6 @@ vdb = [ { name = "pymochow", specifier = "==2.3.6" }, { name = "pyobvector", specifier = "~=0.2.17" }, { name = "qdrant-client", specifier = "==1.9.0" }, - { name = "tablestore", specifier = "==6.4.1" }, { name = "tcvectordb", specifier = "~=2.0.0" }, { name = "tidb-vector", specifier = "==0.0.15" }, { name = "upstash-vector", specifier = "==0.8.0" }, @@ -6231,26 +6198,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353 }, ] -[[package]] -name = "tablestore" -version = "6.4.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "aiohttp" }, - { name = "certifi" }, - { name = "crc32c" }, - { name = "flatbuffers" }, - { name = "future" }, - { name = "numpy" }, - { name = "protobuf" }, - { name = "six" }, - { name = "urllib3" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/62/00/53f8eeb0016e7ad518f92b085de8855891d10581b42f86d15d1df7a56d33/tablestore-6.4.1.tar.gz", hash = "sha256:005c6939832f2ecd403e01220b7045de45f2e53f1ffaf0c2efc435810885fffb", size = 120319 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cc/96/a132bdecb753dc9dc34124a53019da29672baaa34485c8c504895897ea96/tablestore-6.4.1-py3-none-any.whl", hash = "sha256:616898d294dfe22f0d427463c241c6788374cdb2ace9aaf85673ce2c2a18d7e0", size = 141556 }, -] - [[package]] name = "tabulate" version = "0.9.0" diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 5eeaf5d9ed..d600218c40 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -358,11 +358,6 @@ x-shared-env: &shared-api-worker-env HUAWEI_CLOUD_PASSWORD: ${HUAWEI_CLOUD_PASSWORD:-admin} UPSTASH_VECTOR_URL: ${UPSTASH_VECTOR_URL:-https://xxx-vector.upstash.io} UPSTASH_VECTOR_TOKEN: ${UPSTASH_VECTOR_TOKEN:-dify} - TABLESTORE_ENDPOINT: ${TABLESTORE_ENDPOINT:-https://instance-name.cn-hangzhou.ots.aliyuncs.com} - TABLESTORE_INSTANCE_NAME: ${TABLESTORE_INSTANCE_NAME:-instance-name} - TABLESTORE_ACCESS_KEY_ID: ${TABLESTORE_ACCESS_KEY_ID:-xxx} - TABLESTORE_ACCESS_KEY_SECRET: ${TABLESTORE_ACCESS_KEY_SECRET:-xxx} - TABLESTORE_NORMALIZE_FULLTEXT_BM25_SCORE: ${TABLESTORE_NORMALIZE_FULLTEXT_BM25_SCORE:-false} CLICKZETTA_USERNAME: ${CLICKZETTA_USERNAME:-} CLICKZETTA_PASSWORD: ${CLICKZETTA_PASSWORD:-} CLICKZETTA_INSTANCE: ${CLICKZETTA_INSTANCE:-}