diff --git a/.github/workflows/expose_service_ports.sh b/.github/workflows/expose_service_ports.sh index 01772ccf9f..fa0fd2ee8c 100755 --- a/.github/workflows/expose_service_ports.sh +++ b/.github/workflows/expose_service_ports.sh @@ -1,6 +1,7 @@ #!/bin/bash yq eval '.services.weaviate.ports += ["8080:8080"]' -i docker/docker-compose.yaml +yq eval '.services.weaviate.ports += ["50051:50051"]' -i docker/docker-compose.yaml yq eval '.services.qdrant.ports += ["6333:6333"]' -i docker/docker-compose.yaml yq eval '.services.chroma.ports += ["8000:8000"]' -i docker/docker-compose.yaml yq eval '.services["milvus-standalone"].ports += ["19530:19530"]' -i docker/docker-compose.yaml diff --git a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py index 8820c0a846..4793d2bb50 100644 --- a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py +++ b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py @@ -1,9 +1,24 @@ +""" +Weaviate vector database implementation for Dify's RAG system. + +This module provides integration with Weaviate vector database for storing and retrieving +document embeddings used in retrieval-augmented generation workflows. +""" + import datetime import json +import logging +import uuid as _uuid from typing import Any +from urllib.parse import urlparse -import weaviate # type: ignore +import weaviate +import weaviate.classes.config as wc from pydantic import BaseModel, model_validator +from weaviate.classes.data import DataObject +from weaviate.classes.init import Auth +from weaviate.classes.query import Filter, MetadataQuery +from weaviate.exceptions import UnexpectedStatusCodeError from configs import dify_config from core.rag.datasource.vdb.field import Field @@ -15,265 +30,394 @@ from core.rag.models.document import Document from extensions.ext_redis import redis_client from models.dataset import Dataset +logger = logging.getLogger(__name__) + class WeaviateConfig(BaseModel): + """ + Configuration model for Weaviate connection settings. + + Attributes: + endpoint: Weaviate server endpoint URL + api_key: Optional API key for authentication + batch_size: Number of objects to batch per insert operation + """ + endpoint: str api_key: str | None = None batch_size: int = 100 @model_validator(mode="before") @classmethod - def validate_config(cls, values: dict): + def validate_config(cls, values: dict) -> dict: + """Validates that required configuration values are present.""" if not values["endpoint"]: raise ValueError("config WEAVIATE_ENDPOINT is required") return values class WeaviateVector(BaseVector): + """ + Weaviate vector database implementation for document storage and retrieval. + + Handles creation, insertion, deletion, and querying of document embeddings + in a Weaviate collection. + """ + def __init__(self, collection_name: str, config: WeaviateConfig, attributes: list): + """ + Initializes the Weaviate vector store. + + Args: + collection_name: Name of the Weaviate collection + config: Weaviate configuration settings + attributes: List of metadata attributes to store + """ super().__init__(collection_name) self._client = self._init_client(config) self._attributes = attributes - def _init_client(self, config: WeaviateConfig) -> weaviate.Client: - auth_config = weaviate.AuthApiKey(api_key=config.api_key or "") + def _init_client(self, config: WeaviateConfig) -> weaviate.WeaviateClient: + """ + Initializes and returns a connected Weaviate client. - weaviate.connect.connection.has_grpc = False # ty: ignore [unresolved-attribute] + Configures both HTTP and gRPC connections with proper authentication. + """ + p = urlparse(config.endpoint) + host = p.hostname or config.endpoint.replace("https://", "").replace("http://", "") + http_secure = p.scheme == "https" + http_port = p.port or (443 if http_secure else 80) - try: - client = weaviate.Client( - url=config.endpoint, auth_client_secret=auth_config, timeout_config=(5, 60), startup_period=None - ) - except Exception as exc: - raise ConnectionError("Vector database connection error") from exc + grpc_host = host + grpc_secure = http_secure + grpc_port = 443 if grpc_secure else 50051 - client.batch.configure( - # `batch_size` takes an `int` value to enable auto-batching - # (`None` is used for manual batching) - batch_size=config.batch_size, - # dynamically update the `batch_size` based on import speed - dynamic=True, - # `timeout_retries` takes an `int` value to retry on time outs - timeout_retries=3, + client = weaviate.connect_to_custom( + http_host=host, + http_port=http_port, + http_secure=http_secure, + grpc_host=grpc_host, + grpc_port=grpc_port, + grpc_secure=grpc_secure, + auth_credentials=Auth.api_key(config.api_key) if config.api_key else None, ) + if not client.is_ready(): + raise ConnectionError("Vector database is not ready") + return client def get_type(self) -> str: + """Returns the vector database type identifier.""" return VectorType.WEAVIATE def get_collection_name(self, dataset: Dataset) -> str: + """ + Retrieves or generates the collection name for a dataset. + + Uses existing index structure if available, otherwise generates from dataset ID. + """ if dataset.index_struct_dict: class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"] if not class_prefix.endswith("_Node"): - # original class_prefix class_prefix += "_Node" - return class_prefix dataset_id = dataset.id return Dataset.gen_collection_name_by_id(dataset_id) - def to_index_struct(self): + def to_index_struct(self) -> dict: + """Returns the index structure dictionary for persistence.""" return {"type": self.get_type(), "vector_store": {"class_prefix": self._collection_name}} def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs): - # create collection + """ + Creates a new collection and adds initial documents with embeddings. + """ self._create_collection() - # create vector self.add_texts(texts, embeddings) def _create_collection(self): + """ + Creates the Weaviate collection with required schema if it doesn't exist. + + Uses Redis locking to prevent concurrent creation attempts. + """ lock_name = f"vector_indexing_lock_{self._collection_name}" with redis_client.lock(lock_name, timeout=20): - collection_exist_cache_key = f"vector_indexing_{self._collection_name}" - if redis_client.get(collection_exist_cache_key): + cache_key = f"vector_indexing_{self._collection_name}" + if redis_client.get(cache_key): return - schema = self._default_schema(self._collection_name) - if not self._client.schema.contains(schema): - # create collection - self._client.schema.create_class(schema) - redis_client.set(collection_exist_cache_key, 1, ex=3600) + + try: + if not self._client.collections.exists(self._collection_name): + self._client.collections.create( + name=self._collection_name, + properties=[ + wc.Property( + name=Field.TEXT_KEY.value, + data_type=wc.DataType.TEXT, + tokenization=wc.Tokenization.WORD, + ), + wc.Property(name="document_id", data_type=wc.DataType.TEXT), + wc.Property(name="doc_id", data_type=wc.DataType.TEXT), + wc.Property(name="chunk_index", data_type=wc.DataType.INT), + ], + vector_config=wc.Configure.Vectors.self_provided(), + ) + + self._ensure_properties() + redis_client.set(cache_key, 1, ex=3600) + except Exception as e: + logger.exception("Error creating collection %s", self._collection_name) + raise + + def _ensure_properties(self) -> None: + """ + Ensures all required properties exist in the collection schema. + + Adds missing properties if the collection exists but lacks them. + """ + if not self._client.collections.exists(self._collection_name): + return + + col = self._client.collections.use(self._collection_name) + cfg = col.config.get() + existing = {p.name for p in (cfg.properties or [])} + + to_add = [] + if "document_id" not in existing: + to_add.append(wc.Property(name="document_id", data_type=wc.DataType.TEXT)) + if "doc_id" not in existing: + to_add.append(wc.Property(name="doc_id", data_type=wc.DataType.TEXT)) + if "chunk_index" not in existing: + to_add.append(wc.Property(name="chunk_index", data_type=wc.DataType.INT)) + + for prop in to_add: + try: + col.config.add_property(prop) + except Exception as e: + logger.warning("Could not add property %s: %s", prop.name, e) + + def _get_uuids(self, documents: list[Document]) -> list[str]: + """ + Generates deterministic UUIDs for documents based on their content. + + Uses UUID5 with URL namespace to ensure consistent IDs for identical content. + """ + URL_NAMESPACE = _uuid.UUID("6ba7b811-9dad-11d1-80b4-00c04fd430c8") + + uuids = [] + for doc in documents: + uuid_val = _uuid.uuid5(URL_NAMESPACE, doc.page_content) + uuids.append(str(uuid_val)) + + return uuids def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs): + """ + Adds documents with their embeddings to the collection. + + Batches insertions for efficiency and returns the list of inserted object IDs. + """ uuids = self._get_uuids(documents) texts = [d.page_content for d in documents] metadatas = [d.metadata for d in documents] - ids = [] + col = self._client.collections.use(self._collection_name) + objs: list[DataObject] = [] + ids_out: list[str] = [] - with self._client.batch as batch: - for i, text in enumerate(texts): - data_properties = {Field.TEXT_KEY: text} - if metadatas is not None: - # metadata maybe None - for key, val in (metadatas[i] or {}).items(): - data_properties[key] = self._json_serializable(val) + for i, text in enumerate(texts): + props: dict[str, Any] = {Field.TEXT_KEY.value: text} + meta = metadatas[i] or {} + for k, v in meta.items(): + props[k] = self._json_serializable(v) - batch.add_data_object( - data_object=data_properties, - class_name=self._collection_name, - uuid=uuids[i], - vector=embeddings[i] if embeddings else None, + candidate = uuids[i] if uuids else None + uid = candidate if (candidate and self._is_uuid(candidate)) else str(_uuid.uuid4()) + ids_out.append(uid) + + vec_payload = None + if embeddings and i < len(embeddings) and embeddings[i]: + vec_payload = {"default": embeddings[i]} + + objs.append( + DataObject( + uuid=uid, + properties=props, # type: ignore[arg-type] # mypy incorrectly infers DataObject signature + vector=vec_payload, ) - ids.append(uuids[i]) - return ids + ) - def delete_by_metadata_field(self, key: str, value: str): - # check whether the index already exists - schema = self._default_schema(self._collection_name) - if self._client.schema.contains(schema): - where_filter = {"operator": "Equal", "path": [key], "valueText": value} + batch_size = max(1, int(dify_config.WEAVIATE_BATCH_SIZE or 100)) + with col.batch.dynamic() as batch: + for obj in objs: + batch.add_object(properties=obj.properties, uuid=obj.uuid, vector=obj.vector) - self._client.batch.delete_objects(class_name=self._collection_name, where=where_filter, output="minimal") + return ids_out + + def _is_uuid(self, val: str) -> bool: + """Validates whether a string is a valid UUID format.""" + try: + _uuid.UUID(str(val)) + return True + except Exception: + return False + + def delete_by_metadata_field(self, key: str, value: str) -> None: + """Deletes all objects matching a specific metadata field value.""" + if not self._client.collections.exists(self._collection_name): + return + + col = self._client.collections.use(self._collection_name) + col.data.delete_many(where=Filter.by_property(key).equal(value)) def delete(self): - # check whether the index already exists - schema = self._default_schema(self._collection_name) - if self._client.schema.contains(schema): - self._client.schema.delete_class(self._collection_name) + """Deletes the entire collection from Weaviate.""" + if self._client.collections.exists(self._collection_name): + self._client.collections.delete(self._collection_name) def text_exists(self, id: str) -> bool: - collection_name = self._collection_name - schema = self._default_schema(self._collection_name) - - # check whether the index already exists - if not self._client.schema.contains(schema): + """Checks if a document with the given doc_id exists in the collection.""" + if not self._client.collections.exists(self._collection_name): return False - result = ( - self._client.query.get(collection_name) - .with_additional(["id"]) - .with_where( - { - "path": ["doc_id"], - "operator": "Equal", - "valueText": id, - } - ) - .with_limit(1) - .do() + + col = self._client.collections.use(self._collection_name) + res = col.query.fetch_objects( + filters=Filter.by_property("doc_id").equal(id), + limit=1, + return_properties=["doc_id"], ) - if "errors" in result: - raise ValueError(f"Error during query: {result['errors']}") + return len(res.objects) > 0 - entries = result["data"]["Get"][collection_name] - if len(entries) == 0: - return False + def delete_by_ids(self, ids: list[str]) -> None: + """ + Deletes objects by their UUID identifiers. - return True + Silently ignores 404 errors for non-existent IDs. + """ + if not self._client.collections.exists(self._collection_name): + return - def delete_by_ids(self, ids: list[str]): - # check whether the index already exists - schema = self._default_schema(self._collection_name) - if self._client.schema.contains(schema): - for uuid in ids: - try: - self._client.data_object.delete( - class_name=self._collection_name, - uuid=uuid, - ) - except weaviate.UnexpectedStatusCodeException as e: - # tolerate not found error - if e.status_code != 404: - raise e + col = self._client.collections.use(self._collection_name) + + for uid in ids: + try: + col.data.delete_by_id(uid) + except UnexpectedStatusCodeError as e: + if getattr(e, "status_code", None) != 404: + raise def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]: - """Look up similar documents by embedding vector in Weaviate.""" - collection_name = self._collection_name - properties = self._attributes - properties.append(Field.TEXT_KEY) - query_obj = self._client.query.get(collection_name, properties) + """ + Performs vector similarity search using the provided query vector. - vector = {"vector": query_vector} - document_ids_filter = kwargs.get("document_ids_filter") - if document_ids_filter: - operands = [] - for document_id_filter in document_ids_filter: - operands.append({"path": ["document_id"], "operator": "Equal", "valueText": document_id_filter}) - where_filter = {"operator": "Or", "operands": operands} - query_obj = query_obj.with_where(where_filter) - result = ( - query_obj.with_near_vector(vector) - .with_limit(kwargs.get("top_k", 4)) - .with_additional(["vector", "distance"]) - .do() + Filters by document IDs if provided and applies score threshold. + Returns documents sorted by relevance score. + """ + if not self._client.collections.exists(self._collection_name): + return [] + + col = self._client.collections.use(self._collection_name) + props = list({*self._attributes, "document_id", Field.TEXT_KEY.value}) + + where = None + doc_ids = kwargs.get("document_ids_filter") or [] + if doc_ids: + ors = [Filter.by_property("document_id").equal(x) for x in doc_ids] + where = ors[0] + for f in ors[1:]: + where = where | f + + top_k = int(kwargs.get("top_k", 4)) + score_threshold = float(kwargs.get("score_threshold") or 0.0) + + res = col.query.near_vector( + near_vector=query_vector, + limit=top_k, + return_properties=props, + return_metadata=MetadataQuery(distance=True), + include_vector=False, + filters=where, + target_vector="default", ) - if "errors" in result: - raise ValueError(f"Error during query: {result['errors']}") - docs_and_scores = [] - for res in result["data"]["Get"][collection_name]: - text = res.pop(Field.TEXT_KEY) - score = 1 - res["_additional"]["distance"] - docs_and_scores.append((Document(page_content=text, metadata=res), score)) + docs: list[Document] = [] + for obj in res.objects: + properties = dict(obj.properties or {}) + text = properties.pop(Field.TEXT_KEY.value, "") + distance = (obj.metadata.distance if obj.metadata else None) or 1.0 + score = 1.0 - distance - docs = [] - for doc, score in docs_and_scores: - score_threshold = float(kwargs.get("score_threshold") or 0.0) - # check score threshold - if score >= score_threshold: - if doc.metadata is not None: - doc.metadata["score"] = score - docs.append(doc) - # Sort the documents by score in descending order - docs = sorted(docs, key=lambda x: x.metadata.get("score", 0) if x.metadata else 0, reverse=True) + if score > score_threshold: + properties["score"] = score + docs.append(Document(page_content=text, metadata=properties)) + + docs.sort(key=lambda d: d.metadata.get("score", 0.0), reverse=True) return docs def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]: - """Return docs using BM25F. - - Args: - query: Text to look up documents similar to. - - Returns: - List of Documents most similar to the query. """ - collection_name = self._collection_name - content: dict[str, Any] = {"concepts": [query]} - properties = self._attributes - properties.append(Field.TEXT_KEY) - if kwargs.get("search_distance"): - content["certainty"] = kwargs.get("search_distance") - query_obj = self._client.query.get(collection_name, properties) - document_ids_filter = kwargs.get("document_ids_filter") - if document_ids_filter: - operands = [] - for document_id_filter in document_ids_filter: - operands.append({"path": ["document_id"], "operator": "Equal", "valueText": document_id_filter}) - where_filter = {"operator": "Or", "operands": operands} - query_obj = query_obj.with_where(where_filter) - query_obj = query_obj.with_additional(["vector"]) - properties = ["text"] - result = query_obj.with_bm25(query=query, properties=properties).with_limit(kwargs.get("top_k", 4)).do() - if "errors" in result: - raise ValueError(f"Error during query: {result['errors']}") - docs = [] - for res in result["data"]["Get"][collection_name]: - text = res.pop(Field.TEXT_KEY) - additional = res.pop("_additional") - docs.append(Document(page_content=text, vector=additional["vector"], metadata=res)) + Performs BM25 full-text search on document content. + + Filters by document IDs if provided and returns matching documents with vectors. + """ + if not self._client.collections.exists(self._collection_name): + return [] + + col = self._client.collections.use(self._collection_name) + props = list({*self._attributes, Field.TEXT_KEY.value}) + + where = None + doc_ids = kwargs.get("document_ids_filter") or [] + if doc_ids: + ors = [Filter.by_property("document_id").equal(x) for x in doc_ids] + where = ors[0] + for f in ors[1:]: + where = where | f + + top_k = int(kwargs.get("top_k", 4)) + + res = col.query.bm25( + query=query, + query_properties=[Field.TEXT_KEY.value], + limit=top_k, + return_properties=props, + include_vector=True, + filters=where, + ) + + docs: list[Document] = [] + for obj in res.objects: + properties = dict(obj.properties or {}) + text = properties.pop(Field.TEXT_KEY.value, "") + + vec = obj.vector + if isinstance(vec, dict): + vec = vec.get("default") or next(iter(vec.values()), None) + + docs.append(Document(page_content=text, vector=vec, metadata=properties)) return docs - def _default_schema(self, index_name: str): - return { - "class": index_name, - "properties": [ - { - "name": "text", - "dataType": ["text"], - } - ], - } - - def _json_serializable(self, value: Any): + def _json_serializable(self, value: Any) -> Any: + """Converts values to JSON-serializable format, handling datetime objects.""" if isinstance(value, datetime.datetime): return value.isoformat() return value class WeaviateVectorFactory(AbstractVectorFactory): + """Factory class for creating WeaviateVector instances.""" + def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> WeaviateVector: + """ + Initializes a WeaviateVector instance for the given dataset. + + Uses existing collection name from dataset index structure or generates a new one. + Updates dataset index structure if not already set. + """ if dataset.index_struct_dict: class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"] collection_name = class_prefix @@ -281,7 +425,6 @@ class WeaviateVectorFactory(AbstractVectorFactory): dataset_id = dataset.id collection_name = Dataset.gen_collection_name_by_id(dataset_id) dataset.index_struct = json.dumps(self.gen_index_struct_dict(VectorType.WEAVIATE, collection_name)) - return WeaviateVector( collection_name=collection_name, config=WeaviateConfig( diff --git a/api/pyproject.toml b/api/pyproject.toml index 0ee2b1a291..1fb5747edc 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -86,6 +86,7 @@ dependencies = [ "sendgrid~=6.12.3", "flask-restx~=1.3.0", "packaging~=23.2", + "weaviate-client==4.17.0", ] # Before adding new dependency, consider place it in # alphabet order (a-z) and suitable group. @@ -214,7 +215,7 @@ vdb = [ "tidb-vector==0.0.9", "upstash-vector==0.6.0", "volcengine-compat~=1.0.0", - "weaviate-client~=3.24.0", + "weaviate-client>=4.0.0,<5.0.0", "xinference-client~=1.2.2", "mo-vector~=0.1.13", "mysql-connector-python>=9.3.0", diff --git a/api/uv.lock b/api/uv.lock index 87d96fb418..7914cd6b65 100644 --- a/api/uv.lock +++ b/api/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.11, <3.13" resolution-markers = [ "python_full_version >= '3.12.4' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'", @@ -587,16 +587,16 @@ wheels = [ [[package]] name = "boto3-stubs" -version = "1.40.50" +version = "1.40.51" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "botocore-stubs" }, { name = "types-s3transfer" }, { name = "typing-extensions", marker = "python_full_version < '3.12'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/35/c8/06584145c4ccc80e3297a97874bfaa43e6b2fb9f8a69bcc38e29a1457bf5/boto3_stubs-1.40.50.tar.gz", hash = "sha256:29828adfcb8629b5e285468eb89610f1fc71f964ad0913de3049a0a9d5de0be1", size = 100836, upload-time = "2025-10-10T20:32:34.867Z" } +sdist = { url = "https://files.pythonhosted.org/packages/82/4d/b07f9ee0fe432fa8ec6dc368ee7a0409e2b6d9df2c5a2a88265c9b6fd878/boto3_stubs-1.40.51.tar.gz", hash = "sha256:0281e820813a310954e15fb7c1d470c24c34c1cccc7b1ddad977fa293a1080a9", size = 100890, upload-time = "2025-10-13T19:25:36.126Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7b/69/f18c7135dc8a2b74e21b4a2375fa455e4d9e7e47f7838bc175d52005054a/boto3_stubs-1.40.50-py3-none-any.whl", hash = "sha256:01b9c67df62f26371a4a7473c616eece988a5305e7f7cb3fbc014d178685ac4e", size = 69689, upload-time = "2025-10-10T20:32:25.77Z" }, + { url = "https://files.pythonhosted.org/packages/d3/2e/4476431f11fc3bf7a7e0f4f5c275f17607aa127da7c0d8685a4dc6bf6291/boto3_stubs-1.40.51-py3-none-any.whl", hash = "sha256:896d0ffaa298ce1749eea1a54946320a0f4e07c6912f8e1f8c0744a708ee25a4", size = 69709, upload-time = "2025-10-13T19:25:23.116Z" }, ] [package.optional-dependencies] @@ -620,14 +620,14 @@ wheels = [ [[package]] name = "botocore-stubs" -version = "1.40.50" +version = "1.40.51" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "types-awscrt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/20/4b/86ad2d24ea36eed159c8e1f85a2645bfeedae34ccb8c77ea8c99abbd66d1/botocore_stubs-1.40.50.tar.gz", hash = "sha256:d772b2d3aea6b4e464963fe45b2d504eee7bc3842f047cebbae5492b3993e0fd", size = 42250, upload-time = "2025-10-11T23:08:59.925Z" } +sdist = { url = "https://files.pythonhosted.org/packages/55/ca/429fadb6e037cb7b300d508a0b24b59a71961db12539e21749cbec7e7422/botocore_stubs-1.40.51.tar.gz", hash = "sha256:8ddbeb1f68e39382533bb53f3b968d29e640406016af00ad8bbd6e1a2bd59536", size = 42249, upload-time = "2025-10-13T20:26:57.777Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b9/c1/4a736155b2d5dd7fdd09af8fba9ed59693c565d6e2bc1b5adc769da36cb5/botocore_stubs-1.40.50-py3-none-any.whl", hash = "sha256:7cb8d636e061e600929cd03339c3bbc162c21435b4bfeb6413cf7b0b612e7de0", size = 66541, upload-time = "2025-10-11T23:08:57.678Z" }, + { url = "https://files.pythonhosted.org/packages/c9/b9/5f1296bc46f293f284a1a6259f3c1f21f4161088dc6f70428698841b56a7/botocore_stubs-1.40.51-py3-none-any.whl", hash = "sha256:9a028104979205c9be0b68bb59ba679e4fe452e017eec3d40f6c2b41c590a73c", size = 66541, upload-time = "2025-10-13T20:26:55.559Z" }, ] [[package]] @@ -1372,6 +1372,7 @@ dependencies = [ { name = "transformers" }, { name = "unstructured", extra = ["docx", "epub", "md", "ppt", "pptx"] }, { name = "weave" }, + { name = "weaviate-client" }, { name = "webvtt-py" }, { name = "yarl" }, ] @@ -1561,6 +1562,7 @@ requires-dist = [ { name = "transformers", specifier = "~=4.56.1" }, { name = "unstructured", extras = ["docx", "epub", "md", "ppt", "pptx"], specifier = "~=0.16.1" }, { name = "weave", specifier = "~=0.51.0" }, + { name = "weaviate-client", specifier = "==4.17.0" }, { name = "webvtt-py", specifier = "~=0.5.1" }, { name = "yarl", specifier = "~=1.18.3" }, ] @@ -1667,7 +1669,7 @@ vdb = [ { name = "tidb-vector", specifier = "==0.0.9" }, { name = "upstash-vector", specifier = "==0.6.0" }, { name = "volcengine-compat", specifier = "~=1.0.0" }, - { name = "weaviate-client", specifier = "~=3.24.0" }, + { name = "weaviate-client", specifier = ">=4.0.0,<5.0.0" }, { name = "xinference-client", specifier = "~=1.2.2" }, ] @@ -6901,16 +6903,20 @@ wheels = [ [[package]] name = "weaviate-client" -version = "3.24.2" +version = "4.17.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "authlib" }, - { name = "requests" }, + { name = "deprecation" }, + { name = "grpcio" }, + { name = "httpx" }, + { name = "protobuf" }, + { name = "pydantic" }, { name = "validators" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/1f/c1/3285a21d8885f2b09aabb65edb9a8e062a35c2d7175e1bb024fa096582ab/weaviate-client-3.24.2.tar.gz", hash = "sha256:6914c48c9a7e5ad0be9399271f9cb85d6f59ab77476c6d4e56a3925bf149edaa", size = 199332, upload-time = "2023-10-04T08:37:54.26Z" } +sdist = { url = "https://files.pythonhosted.org/packages/bd/0e/e4582b007427187a9fde55fa575db4b766c81929d2b43a3dd8becce50567/weaviate_client-4.17.0.tar.gz", hash = "sha256:731d58d84b0989df4db399b686357ed285fb95971a492ccca8dec90bb2343c51", size = 769019, upload-time = "2025-09-26T11:20:27.381Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ab/98/3136d05f93e30cf29e1db280eaadf766df18d812dfe7994bcced653b2340/weaviate_client-3.24.2-py3-none-any.whl", hash = "sha256:bc50ca5fcebcd48de0d00f66700b0cf7c31a97c4cd3d29b4036d77c5d1d9479b", size = 107968, upload-time = "2023-10-04T08:37:52.511Z" }, + { url = "https://files.pythonhosted.org/packages/5b/c5/2da3a45866da7a935dab8ad07be05dcaee48b3ad4955144583b651929be7/weaviate_client-4.17.0-py3-none-any.whl", hash = "sha256:60e4a355b90537ee1e942ab0b76a94750897a13d9cf13c5a6decbd166d0ca8b5", size = 582763, upload-time = "2025-09-26T11:20:25.864Z" }, ] [[package]] diff --git a/docker/docker-compose-template.yaml b/docker/docker-compose-template.yaml index 5253f750b9..5483e2d554 100644 --- a/docker/docker-compose-template.yaml +++ b/docker/docker-compose-template.yaml @@ -329,7 +329,7 @@ services: # The Weaviate vector store. weaviate: - image: semitechnologies/weaviate:1.19.0 + image: semitechnologies/weaviate:1.27.0 profiles: - "" - weaviate diff --git a/docker/docker-compose.middleware.yaml b/docker/docker-compose.middleware.yaml index d350503f27..ebc619a50f 100644 --- a/docker/docker-compose.middleware.yaml +++ b/docker/docker-compose.middleware.yaml @@ -181,7 +181,7 @@ services: # The Weaviate vector store. weaviate: - image: semitechnologies/weaviate:1.19.0 + image: semitechnologies/weaviate:1.27.0 profiles: - "" - weaviate @@ -206,6 +206,7 @@ services: AUTHORIZATION_ADMINLIST_USERS: ${WEAVIATE_AUTHORIZATION_ADMINLIST_USERS:-hello@dify.ai} ports: - "${EXPOSE_WEAVIATE_PORT:-8080}:8080" + - "${EXPOSE_WEAVIATE_GRPC_PORT:-50051}:50051" networks: # create a network between sandbox, api and ssrf_proxy, and can not access outside. diff --git a/docker/docker-compose.override.yml b/docker/docker-compose.override.yml new file mode 100644 index 0000000000..8f2ab1cb43 --- /dev/null +++ b/docker/docker-compose.override.yml @@ -0,0 +1,9 @@ +services: + api: + volumes: + - ../api/core/rag/datasource/vdb/weaviate/weaviate_vector.py:/app/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py:ro + command: > + sh -c " + pip install --no-cache-dir 'weaviate>=4.0.0' && + /bin/bash /entrypoint.sh + " diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 0df648f38f..46b4a750ea 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -936,7 +936,7 @@ services: # The Weaviate vector store. weaviate: - image: semitechnologies/weaviate:1.19.0 + image: semitechnologies/weaviate:1.27.0 profiles: - "" - weaviate