Feat/add weaviate tokenization configurable (#28159)

Co-authored-by: lijiezhao <lijiezhao@perfect99.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
墨绿色 2025-11-25 20:07:45 +08:00 committed by GitHub
parent b5650b579d
commit f76a3f545c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 15 additions and 1 deletions

View File

@ -176,6 +176,7 @@ WEAVIATE_ENDPOINT=http://localhost:8080
WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih
WEAVIATE_GRPC_ENABLED=false WEAVIATE_GRPC_ENABLED=false
WEAVIATE_BATCH_SIZE=100 WEAVIATE_BATCH_SIZE=100
WEAVIATE_TOKENIZATION=word
# OceanBase Vector configuration # OceanBase Vector configuration
OCEANBASE_VECTOR_HOST=127.0.0.1 OCEANBASE_VECTOR_HOST=127.0.0.1

View File

@ -31,3 +31,8 @@ class WeaviateConfig(BaseSettings):
description="Number of objects to be processed in a single batch operation (default is 100)", description="Number of objects to be processed in a single batch operation (default is 100)",
default=100, default=100,
) )
WEAVIATE_TOKENIZATION: str | None = Field(
description="Tokenization for Weaviate (default is word)",
default="word",
)

View File

@ -167,13 +167,18 @@ class WeaviateVector(BaseVector):
try: try:
if not self._client.collections.exists(self._collection_name): if not self._client.collections.exists(self._collection_name):
tokenization = (
wc.Tokenization(dify_config.WEAVIATE_TOKENIZATION)
if dify_config.WEAVIATE_TOKENIZATION
else wc.Tokenization.WORD
)
self._client.collections.create( self._client.collections.create(
name=self._collection_name, name=self._collection_name,
properties=[ properties=[
wc.Property( wc.Property(
name=Field.TEXT_KEY.value, name=Field.TEXT_KEY.value,
data_type=wc.DataType.TEXT, data_type=wc.DataType.TEXT,
tokenization=wc.Tokenization.WORD, tokenization=tokenization,
), ),
wc.Property(name="document_id", data_type=wc.DataType.TEXT), wc.Property(name="document_id", data_type=wc.DataType.TEXT),
wc.Property(name="doc_id", data_type=wc.DataType.TEXT), wc.Property(name="doc_id", data_type=wc.DataType.TEXT),

View File

@ -62,6 +62,7 @@ WEAVIATE_ENDPOINT=http://localhost:8080
WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih
WEAVIATE_GRPC_ENABLED=false WEAVIATE_GRPC_ENABLED=false
WEAVIATE_BATCH_SIZE=100 WEAVIATE_BATCH_SIZE=100
WEAVIATE_TOKENIZATION=word
# Upload configuration # Upload configuration

View File

@ -525,6 +525,7 @@ VECTOR_INDEX_NAME_PREFIX=Vector_index
WEAVIATE_ENDPOINT=http://weaviate:8080 WEAVIATE_ENDPOINT=http://weaviate:8080
WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih
WEAVIATE_GRPC_ENDPOINT=grpc://weaviate:50051 WEAVIATE_GRPC_ENDPOINT=grpc://weaviate:50051
WEAVIATE_TOKENIZATION=word
# For OceanBase metadata database configuration, available when `DB_TYPE` is `mysql` and `COMPOSE_PROFILES` includes `oceanbase`. # For OceanBase metadata database configuration, available when `DB_TYPE` is `mysql` and `COMPOSE_PROFILES` includes `oceanbase`.
# For OceanBase vector database configuration, available when `VECTOR_STORE` is `oceanbase` # For OceanBase vector database configuration, available when `VECTOR_STORE` is `oceanbase`

View File

@ -164,6 +164,7 @@ x-shared-env: &shared-api-worker-env
WEAVIATE_ENDPOINT: ${WEAVIATE_ENDPOINT:-http://weaviate:8080} WEAVIATE_ENDPOINT: ${WEAVIATE_ENDPOINT:-http://weaviate:8080}
WEAVIATE_API_KEY: ${WEAVIATE_API_KEY:-WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih} WEAVIATE_API_KEY: ${WEAVIATE_API_KEY:-WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih}
WEAVIATE_GRPC_ENDPOINT: ${WEAVIATE_GRPC_ENDPOINT:-grpc://weaviate:50051} WEAVIATE_GRPC_ENDPOINT: ${WEAVIATE_GRPC_ENDPOINT:-grpc://weaviate:50051}
WEAVIATE_TOKENIZATION: ${WEAVIATE_TOKENIZATION:-word}
OCEANBASE_VECTOR_HOST: ${OCEANBASE_VECTOR_HOST:-oceanbase} OCEANBASE_VECTOR_HOST: ${OCEANBASE_VECTOR_HOST:-oceanbase}
OCEANBASE_VECTOR_PORT: ${OCEANBASE_VECTOR_PORT:-2881} OCEANBASE_VECTOR_PORT: ${OCEANBASE_VECTOR_PORT:-2881}
OCEANBASE_VECTOR_USER: ${OCEANBASE_VECTOR_USER:-root@test} OCEANBASE_VECTOR_USER: ${OCEANBASE_VECTOR_USER:-root@test}