From f76a3f545c8dd2c301790925ba89d02d80a87140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A2=A8=E7=BB=BF=E8=89=B2?= <48266410+lcedaw@users.noreply.github.com> Date: Tue, 25 Nov 2025 20:07:45 +0800 Subject: [PATCH] Feat/add weaviate tokenization configurable (#28159) Co-authored-by: lijiezhao Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- api/.env.example | 1 + api/configs/middleware/vdb/weaviate_config.py | 5 +++++ api/core/rag/datasource/vdb/weaviate/weaviate_vector.py | 7 ++++++- api/tests/integration_tests/.env.example | 1 + docker/.env.example | 1 + docker/docker-compose.yaml | 1 + 6 files changed, 15 insertions(+), 1 deletion(-) diff --git a/api/.env.example b/api/.env.example index ba512a668d..fbf0b12f40 100644 --- a/api/.env.example +++ b/api/.env.example @@ -176,6 +176,7 @@ WEAVIATE_ENDPOINT=http://localhost:8080 WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih WEAVIATE_GRPC_ENABLED=false WEAVIATE_BATCH_SIZE=100 +WEAVIATE_TOKENIZATION=word # OceanBase Vector configuration OCEANBASE_VECTOR_HOST=127.0.0.1 diff --git a/api/configs/middleware/vdb/weaviate_config.py b/api/configs/middleware/vdb/weaviate_config.py index aa81c870f6..6f4fccaa7f 100644 --- a/api/configs/middleware/vdb/weaviate_config.py +++ b/api/configs/middleware/vdb/weaviate_config.py @@ -31,3 +31,8 @@ class WeaviateConfig(BaseSettings): description="Number of objects to be processed in a single batch operation (default is 100)", default=100, ) + + WEAVIATE_TOKENIZATION: str | None = Field( + description="Tokenization for Weaviate (default is word)", + default="word", + ) diff --git a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py index 591de01669..2c7bc592c0 100644 --- a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py +++ b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py @@ -167,13 +167,18 @@ class WeaviateVector(BaseVector): try: if not self._client.collections.exists(self._collection_name): + tokenization = ( + wc.Tokenization(dify_config.WEAVIATE_TOKENIZATION) + if dify_config.WEAVIATE_TOKENIZATION + else wc.Tokenization.WORD + ) self._client.collections.create( name=self._collection_name, properties=[ wc.Property( name=Field.TEXT_KEY.value, data_type=wc.DataType.TEXT, - tokenization=wc.Tokenization.WORD, + tokenization=tokenization, ), wc.Property(name="document_id", data_type=wc.DataType.TEXT), wc.Property(name="doc_id", data_type=wc.DataType.TEXT), diff --git a/api/tests/integration_tests/.env.example b/api/tests/integration_tests/.env.example index e4c534f046..46d13079db 100644 --- a/api/tests/integration_tests/.env.example +++ b/api/tests/integration_tests/.env.example @@ -62,6 +62,7 @@ WEAVIATE_ENDPOINT=http://localhost:8080 WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih WEAVIATE_GRPC_ENABLED=false WEAVIATE_BATCH_SIZE=100 +WEAVIATE_TOKENIZATION=word # Upload configuration diff --git a/docker/.env.example b/docker/.env.example index 7e2e9aa26d..0bfdc6b495 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -525,6 +525,7 @@ VECTOR_INDEX_NAME_PREFIX=Vector_index WEAVIATE_ENDPOINT=http://weaviate:8080 WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih WEAVIATE_GRPC_ENDPOINT=grpc://weaviate:50051 +WEAVIATE_TOKENIZATION=word # For OceanBase metadata database configuration, available when `DB_TYPE` is `mysql` and `COMPOSE_PROFILES` includes `oceanbase`. # For OceanBase vector database configuration, available when `VECTOR_STORE` is `oceanbase` diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index d1e970719c..881111f629 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -164,6 +164,7 @@ x-shared-env: &shared-api-worker-env WEAVIATE_ENDPOINT: ${WEAVIATE_ENDPOINT:-http://weaviate:8080} WEAVIATE_API_KEY: ${WEAVIATE_API_KEY:-WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih} WEAVIATE_GRPC_ENDPOINT: ${WEAVIATE_GRPC_ENDPOINT:-grpc://weaviate:50051} + WEAVIATE_TOKENIZATION: ${WEAVIATE_TOKENIZATION:-word} OCEANBASE_VECTOR_HOST: ${OCEANBASE_VECTOR_HOST:-oceanbase} OCEANBASE_VECTOR_PORT: ${OCEANBASE_VECTOR_PORT:-2881} OCEANBASE_VECTOR_USER: ${OCEANBASE_VECTOR_USER:-root@test}