From 928b17d0f194e6b370b3a950e91c5b23adc2e3c3 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Wed, 24 Sep 2025 12:45:08 +0000 Subject: [PATCH 1/2] [autofix.ci] apply automated fixes --- api/configs/middleware/vdb/pinecone_config.py | 8 +- .../vdb/pinecone/pinecone_vector.py | 83 +++++++------------ .../vdb/pinecone/test_pinecone.py | 3 +- 3 files changed, 37 insertions(+), 57 deletions(-) diff --git a/api/configs/middleware/vdb/pinecone_config.py b/api/configs/middleware/vdb/pinecone_config.py index 3b3ca186ff..4cae306f53 100644 --- a/api/configs/middleware/vdb/pinecone_config.py +++ b/api/configs/middleware/vdb/pinecone_config.py @@ -9,17 +9,17 @@ class PineconeConfig(BaseSettings): Configuration settings for Pinecone vector database """ - PINECONE_API_KEY: Optional[str] = Field( + PINECONE_API_KEY: str | None = Field( description="API key for authenticating with Pinecone service", default=None, ) - PINECONE_ENVIRONMENT: Optional[str] = Field( + PINECONE_ENVIRONMENT: str | None = Field( description="Pinecone environment (e.g., 'us-west1-gcp', 'us-east-1-aws')", default=None, ) - PINECONE_INDEX_NAME: Optional[str] = Field( + PINECONE_INDEX_NAME: str | None = Field( description="Default Pinecone index name", default=None, ) @@ -37,4 +37,4 @@ class PineconeConfig(BaseSettings): PINECONE_METRIC: str = Field( description="Distance metric for Pinecone index (cosine, euclidean, dotproduct)", default="cosine", - ) \ No newline at end of file + ) diff --git a/api/core/rag/datasource/vdb/pinecone/pinecone_vector.py b/api/core/rag/datasource/vdb/pinecone/pinecone_vector.py index 5deafe6245..2357d57e38 100644 --- a/api/core/rag/datasource/vdb/pinecone/pinecone_vector.py +++ b/api/core/rag/datasource/vdb/pinecone/pinecone_vector.py @@ -18,9 +18,10 @@ from models.dataset import Dataset, DatasetCollectionBinding class PineconeConfig(BaseModel): """Pinecone configuration class""" + api_key: str environment: str - index_name: Optional[str] = None + index_name: str | None = None timeout: float = 30 batch_size: int = 100 metric: str = "cosine" @@ -48,14 +49,15 @@ class PineconeVector(BaseVector): # Normalize index name: lowercase, only a-z0-9- and <=45 chars import hashlib import re + base_name = collection_name.lower() - base_name = re.sub(r'[^a-z0-9-]+', '-', base_name) # replace invalid chars with '-' - base_name = re.sub(r'-+', '-', base_name).strip('-') + base_name = re.sub(r"[^a-z0-9-]+", "-", base_name) # replace invalid chars with '-' + base_name = re.sub(r"-+", "-", base_name).strip("-") # Use longer secure suffix to reduce collision risk suffix_len = 24 # 24 hex digits (96-bit entropy) if len(base_name) > 45: hash_suffix = hashlib.sha256(base_name.encode()).hexdigest()[:suffix_len] - truncated_name = base_name[:45 - (suffix_len + 1)].rstrip('-') + truncated_name = base_name[: 45 - (suffix_len + 1)].rstrip("-") self._index_name = f"{truncated_name}-{hash_suffix}" else: self._index_name = base_name @@ -63,7 +65,7 @@ class PineconeVector(BaseVector): if not self._index_name: self._index_name = f"index-{hashlib.sha256(collection_name.encode()).hexdigest()[:suffix_len]}" self._index = None - + def get_type(self) -> str: """Return vector database type identifier""" return "pinecone" @@ -79,14 +81,11 @@ class PineconeVector(BaseVector): else: raise ValueError("Index not initialized. Please ingest documents to create index.") except Exception: - raise + raise def to_index_struct(self) -> dict: """Generate index structure dictionary""" - return { - "type": self.get_type(), - "vector_store": {"class_prefix": self._collection_name} - } + return {"type": self.get_type(), "vector_store": {"class_prefix": self._collection_name}} def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs): """Create vector index""" @@ -120,14 +119,11 @@ class PineconeVector(BaseVector): name=self._index_name, dimension=dimension, metric=self._client_config.metric, - spec=ServerlessSpec( - cloud='aws', - region=self._client_config.environment - ) + spec=ServerlessSpec(cloud="aws", region=self._client_config.environment), ) # Wait for index creation to complete - while not self._pc.describe_index(self._index_name).status['ready']: + while not self._pc.describe_index(self._index_name).status["ready"]: time.sleep(1) else: # Get index instance @@ -135,7 +131,7 @@ class PineconeVector(BaseVector): # Set cache redis_client.set(index_exist_cache_key, 1, ex=3600) - + def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs): """Batch add document vectors""" if not self._index: @@ -150,9 +146,9 @@ class PineconeVector(BaseVector): # Batch processing total_batches = (total_docs + batch_size - 1) // batch_size # Ceiling division for batch_idx, i in enumerate(range(0, len(documents), batch_size), 1): - batch_documents = documents[i:i + batch_size] - batch_embeddings = embeddings[i:i + batch_size] - batch_uuids = uuids[i:i + batch_size] + batch_documents = documents[i : i + batch_size] + batch_embeddings = embeddings[i : i + batch_size] + batch_uuids = uuids[i : i + batch_size] batch_size_actual = len(batch_documents) # Build Pinecone vector data (metadata must be primitives or list[str]) @@ -162,12 +158,8 @@ class PineconeVector(BaseVector): safe_meta: dict[str, Any] = {} # lift common identifiers to top-level fields for filtering for k, v in raw_meta.items(): - if ( - isinstance(v, (str, int, float, bool)) - or ( - isinstance(v, list) - and all(isinstance(x, str) for x in v) - ) + if isinstance(v, (str, int, float, bool)) or ( + isinstance(v, list) and all(isinstance(x, str) for x in v) ): safe_meta[k] = v else: @@ -178,11 +170,7 @@ class PineconeVector(BaseVector): # group id as string safe_meta[Field.GROUP_KEY.value] = str(self._group_id) - vectors_to_upsert.append({ - "id": doc_id, - "values": embedding, - "metadata": safe_meta - }) + vectors_to_upsert.append({"id": doc_id, "values": embedding, "metadata": safe_meta}) # Batch insert to Pinecone try: @@ -192,7 +180,7 @@ class PineconeVector(BaseVector): raise return added_ids - + def search_by_vector(self, query_vector: list[float], **kwargs) -> list[Document]: """Vector similarity search""" # Lazily attach to an existing index if needed @@ -211,12 +199,7 @@ class PineconeVector(BaseVector): # Execute search try: - response = self._index.query( - vector=query_vector, - top_k=top_k, - include_metadata=True, - filter=filter_dict - ) + response = self._index.query(vector=query_vector, top_k=top_k, include_metadata=True, filter=filter_dict) except Exception as e: raise @@ -240,11 +223,11 @@ class PineconeVector(BaseVector): docs.sort(key=lambda x: x.metadata.get("score", 0), reverse=True) return docs - + def search_by_full_text(self, query: str, **kwargs) -> list[Document]: """Full-text search - Pinecone does not natively support it, returns empty list""" return [] - + def delete_by_metadata_field(self, key: str, value: str): """Delete by metadata field""" self._ensure_index_initialized() @@ -253,7 +236,7 @@ class PineconeVector(BaseVector): # Build filter conditions filter_dict = { Field.GROUP_KEY.value: {"$eq": self._group_id}, - f"{Field.METADATA_KEY.value}.{key}": {"$eq": value} + f"{Field.METADATA_KEY.value}.{key}": {"$eq": value}, } # Pinecone delete operation @@ -261,7 +244,7 @@ class PineconeVector(BaseVector): except Exception as e: # Ignore delete errors pass - + def delete_by_ids(self, ids: list[str]) -> None: """Batch delete by ID list""" self._ensure_index_initialized() @@ -271,7 +254,7 @@ class PineconeVector(BaseVector): self._index.delete(ids=ids) except Exception as e: raise - + def delete(self) -> None: """Delete all vector data for the entire dataset""" self._ensure_index_initialized() @@ -282,7 +265,7 @@ class PineconeVector(BaseVector): self._index.delete(filter=filter_dict) except Exception as e: raise - + def text_exists(self, id: str) -> bool: """Check if document exists""" try: @@ -301,10 +284,10 @@ class PineconeVector(BaseVector): class PineconeVectorFactory(AbstractVectorFactory): """Pinecone vector database factory class""" - + def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> PineconeVector: """Create PineconeVector instance""" - + # Determine index name if dataset.collection_binding_id: dataset_collection_binding = ( @@ -323,13 +306,11 @@ class PineconeVectorFactory(AbstractVectorFactory): else: dataset_id = dataset.id collection_name = Dataset.gen_collection_name_by_id(dataset_id) - + # Set index structure if not dataset.index_struct_dict: - dataset.index_struct = json.dumps( - self.gen_index_struct_dict("pinecone", collection_name) - ) - + dataset.index_struct = json.dumps(self.gen_index_struct_dict("pinecone", collection_name)) + # Create PineconeVector instance return PineconeVector( collection_name=collection_name, @@ -342,4 +323,4 @@ class PineconeVectorFactory(AbstractVectorFactory): batch_size=dify_config.PINECONE_BATCH_SIZE, metric=dify_config.PINECONE_METRIC, ), - ) \ No newline at end of file + ) diff --git a/api/tests/integration_tests/vdb/pinecone/test_pinecone.py b/api/tests/integration_tests/vdb/pinecone/test_pinecone.py index cb4f9f4534..1071e6e28f 100644 --- a/api/tests/integration_tests/vdb/pinecone/test_pinecone.py +++ b/api/tests/integration_tests/vdb/pinecone/test_pinecone.py @@ -24,5 +24,4 @@ class PineconeVectorTest(AbstractVectorTest): def test_pinecone_vector(): - - PineconeVectorTest().run_all_tests() \ No newline at end of file + PineconeVectorTest().run_all_tests() From 80d9c81439ce18cae3510f3a980a34d49570e433 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Wed, 24 Sep 2025 12:47:03 +0000 Subject: [PATCH 2/2] [autofix.ci] apply automated fixes (attempt 2/3) --- api/configs/middleware/vdb/pinecone_config.py | 2 -- api/core/rag/datasource/vdb/pinecone/pinecone_vector.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/api/configs/middleware/vdb/pinecone_config.py b/api/configs/middleware/vdb/pinecone_config.py index 4cae306f53..aee9d460aa 100644 --- a/api/configs/middleware/vdb/pinecone_config.py +++ b/api/configs/middleware/vdb/pinecone_config.py @@ -1,5 +1,3 @@ -from typing import Optional - from pydantic import Field, PositiveInt from pydantic_settings import BaseSettings diff --git a/api/core/rag/datasource/vdb/pinecone/pinecone_vector.py b/api/core/rag/datasource/vdb/pinecone/pinecone_vector.py index 2357d57e38..ba46ba9a1c 100644 --- a/api/core/rag/datasource/vdb/pinecone/pinecone_vector.py +++ b/api/core/rag/datasource/vdb/pinecone/pinecone_vector.py @@ -1,6 +1,6 @@ import json import time -from typing import Any, Optional +from typing import Any from pinecone import Pinecone, ServerlessSpec from pydantic import BaseModel