diff --git a/api/.env.example b/api/.env.example index ba512a668d..fbf0b12f40 100644 --- a/api/.env.example +++ b/api/.env.example @@ -176,6 +176,7 @@ WEAVIATE_ENDPOINT=http://localhost:8080 WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih WEAVIATE_GRPC_ENABLED=false WEAVIATE_BATCH_SIZE=100 +WEAVIATE_TOKENIZATION=word # OceanBase Vector configuration OCEANBASE_VECTOR_HOST=127.0.0.1 diff --git a/api/configs/middleware/vdb/weaviate_config.py b/api/configs/middleware/vdb/weaviate_config.py index aa81c870f6..6f4fccaa7f 100644 --- a/api/configs/middleware/vdb/weaviate_config.py +++ b/api/configs/middleware/vdb/weaviate_config.py @@ -31,3 +31,8 @@ class WeaviateConfig(BaseSettings): description="Number of objects to be processed in a single batch operation (default is 100)", default=100, ) + + WEAVIATE_TOKENIZATION: str | None = Field( + description="Tokenization for Weaviate (default is word)", + default="word", + ) diff --git a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py index 591de01669..2c7bc592c0 100644 --- a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py +++ b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py @@ -167,13 +167,18 @@ class WeaviateVector(BaseVector): try: if not self._client.collections.exists(self._collection_name): + tokenization = ( + wc.Tokenization(dify_config.WEAVIATE_TOKENIZATION) + if dify_config.WEAVIATE_TOKENIZATION + else wc.Tokenization.WORD + ) self._client.collections.create( name=self._collection_name, properties=[ wc.Property( name=Field.TEXT_KEY.value, data_type=wc.DataType.TEXT, - tokenization=wc.Tokenization.WORD, + tokenization=tokenization, ), wc.Property(name="document_id", data_type=wc.DataType.TEXT), wc.Property(name="doc_id", data_type=wc.DataType.TEXT), diff --git a/api/tests/integration_tests/.env.example b/api/tests/integration_tests/.env.example index e4c534f046..46d13079db 100644 --- a/api/tests/integration_tests/.env.example +++ b/api/tests/integration_tests/.env.example @@ -62,6 +62,7 @@ WEAVIATE_ENDPOINT=http://localhost:8080 WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih WEAVIATE_GRPC_ENABLED=false WEAVIATE_BATCH_SIZE=100 +WEAVIATE_TOKENIZATION=word # Upload configuration diff --git a/docker/.env.example b/docker/.env.example index 7e2e9aa26d..0bfdc6b495 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -525,6 +525,7 @@ VECTOR_INDEX_NAME_PREFIX=Vector_index WEAVIATE_ENDPOINT=http://weaviate:8080 WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih WEAVIATE_GRPC_ENDPOINT=grpc://weaviate:50051 +WEAVIATE_TOKENIZATION=word # For OceanBase metadata database configuration, available when `DB_TYPE` is `mysql` and `COMPOSE_PROFILES` includes `oceanbase`. # For OceanBase vector database configuration, available when `VECTOR_STORE` is `oceanbase` diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index d1e970719c..881111f629 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -164,6 +164,7 @@ x-shared-env: &shared-api-worker-env WEAVIATE_ENDPOINT: ${WEAVIATE_ENDPOINT:-http://weaviate:8080} WEAVIATE_API_KEY: ${WEAVIATE_API_KEY:-WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih} WEAVIATE_GRPC_ENDPOINT: ${WEAVIATE_GRPC_ENDPOINT:-grpc://weaviate:50051} + WEAVIATE_TOKENIZATION: ${WEAVIATE_TOKENIZATION:-word} OCEANBASE_VECTOR_HOST: ${OCEANBASE_VECTOR_HOST:-oceanbase} OCEANBASE_VECTOR_PORT: ${OCEANBASE_VECTOR_PORT:-2881} OCEANBASE_VECTOR_USER: ${OCEANBASE_VECTOR_USER:-root@test}