fix: test

Frederick2313072 2025-09-26 22:48:37 +08:00
commit ee0d181caa
3 changed files with 34 additions and 49 deletions

View File

@@ -1,5 +1,3 @@
-from typing import Optional
 from pydantic import Field, PositiveInt
 from pydantic_settings import BaseSettings
@@ -9,17 +7,17 @@ class PineconeConfig(BaseSettings):
     Configuration settings for Pinecone vector database
     """
-    PINECONE_API_KEY: Optional[str] = Field(
+    PINECONE_API_KEY: str | None = Field(
         description="API key for authenticating with Pinecone service",
         default=None,
     )
-    PINECONE_ENVIRONMENT: Optional[str] = Field(
+    PINECONE_ENVIRONMENT: str | None = Field(
         description="Pinecone environment (e.g., 'us-west1-gcp', 'us-east-1-aws')",
         default=None,
     )
-    PINECONE_INDEX_NAME: Optional[str] = Field(
+    PINECONE_INDEX_NAME: str | None = Field(
         description="Default Pinecone index name",
         default=None,
     )
@@ -37,4 +35,4 @@ class PineconeConfig(BaseSettings):
     PINECONE_METRIC: str = Field(
         description="Distance metric for Pinecone index (cosine, euclidean, dotproduct)",
         default="cosine",
     )
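A minimal usage sketch, assuming the standard pydantic-settings behaviour of reading fields from same-named environment variables; the values below are placeholders and not part of this commit.

import os

# Placeholder credentials; pydantic-settings picks these up when PineconeConfig()
# (assumed importable from the configs package) is constructed.
os.environ["PINECONE_API_KEY"] = "pc-xxxxxxxx"
os.environ["PINECONE_ENVIRONMENT"] = "us-east-1"
os.environ["PINECONE_INDEX_NAME"] = "example-index"

config = PineconeConfig()
print(config.PINECONE_METRIC)  # "cosine" by default, per the Field definition above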

View File

@@ -1,6 +1,6 @@
 import json
 import time
-from typing import Any, Optional
+from typing import Any
 from pinecone import Pinecone, ServerlessSpec
 from pydantic import BaseModel
@@ -19,9 +19,10 @@ from models.dataset import Dataset, DatasetCollectionBinding
 class PineconeConfig(BaseModel):
     """Pinecone configuration class"""
     api_key: str
     environment: str
-    index_name: Optional[str] = None
+    index_name: str | None = None
     timeout: float = 30
     batch_size: int = 100
     metric: str = "cosine"
@@ -49,14 +50,15 @@ class PineconeVector(BaseVector):
         # Normalize index name: lowercase, only a-z0-9- and <=45 chars
         import hashlib
         import re
         base_name = collection_name.lower()
-        base_name = re.sub(r'[^a-z0-9-]+', '-', base_name)  # replace invalid chars with '-'
-        base_name = re.sub(r'-+', '-', base_name).strip('-')
+        base_name = re.sub(r"[^a-z0-9-]+", "-", base_name)  # replace invalid chars with '-'
+        base_name = re.sub(r"-+", "-", base_name).strip("-")
         # Use longer secure suffix to reduce collision risk
         suffix_len = 24  # 24 hex digits (96-bit entropy)
         if len(base_name) > 45:
             hash_suffix = hashlib.sha256(base_name.encode()).hexdigest()[:suffix_len]
-            truncated_name = base_name[:45 - (suffix_len + 1)].rstrip('-')
+            truncated_name = base_name[: 45 - (suffix_len + 1)].rstrip("-")
             self._index_name = f"{truncated_name}-{hash_suffix}"
         else:
             self._index_name = base_name
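The normalization above can be read as a standalone helper; here is a sketch under that assumption, with an illustrative call (the input name is made up).

import hashlib
import re


def normalize_index_name(collection_name: str, max_len: int = 45, suffix_len: int = 24) -> str:
    """Lowercase, keep only a-z0-9-, cap at max_len with a deterministic hash suffix."""
    base = collection_name.lower()
    base = re.sub(r"[^a-z0-9-]+", "-", base)    # replace invalid characters with '-'
    base = re.sub(r"-+", "-", base).strip("-")  # collapse runs of '-' and trim the ends
    if len(base) > max_len:
        suffix = hashlib.sha256(base.encode()).hexdigest()[:suffix_len]
        return f"{base[: max_len - (suffix_len + 1)].rstrip('-')}-{suffix}"
    return base


# Illustrative call: a long, mixed-case collection name is reduced to a <=45-character,
# lowercase, dash-separated index name ending in a 24-hex-digit suffix.
normalize_index_name("Vector_index_1234567890abcdef1234567890abcdef_Node")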
@@ -81,14 +83,11 @@ class PineconeVector(BaseVector):
             else:
                 raise ValueError("Index not initialized. Please ingest documents to create index.")
         except Exception:
             raise

     def to_index_struct(self) -> dict:
         """Generate index structure dictionary"""
-        return {
-            "type": self.get_type(),
-            "vector_store": {"class_prefix": self._collection_name}
-        }
+        return {"type": self.get_type(), "vector_store": {"class_prefix": self._collection_name}}

     def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
         """Create vector index"""
@@ -122,14 +121,11 @@ class PineconeVector(BaseVector):
                 name=self._index_name,
                 dimension=dimension,
                 metric=self._client_config.metric,
-                spec=ServerlessSpec(
-                    cloud='aws',
-                    region=self._client_config.environment
-                )
+                spec=ServerlessSpec(cloud="aws", region=self._client_config.environment),
             )
             # Wait for index creation to complete
-            while not self._pc.describe_index(self._index_name).status['ready']:
+            while not self._pc.describe_index(self._index_name).status["ready"]:
                 time.sleep(1)
         else:
             # Get index instance
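For context, a minimal sketch of the Pinecone SDK calls this branch relies on (assuming the v3+ client with serverless indexes); the API key, index name, and dimension are placeholders.

import time

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="pc-xxxxxxxx")  # placeholder key
if "example-index" not in pc.list_indexes().names():
    pc.create_index(
        name="example-index",
        dimension=1536,  # must match the embedding dimension
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll until the serverless index reports ready, mirroring the wait loop above.
    while not pc.describe_index("example-index").status["ready"]:
        time.sleep(1)
index = pc.Index("example-index")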
@@ -137,7 +133,7 @@ class PineconeVector(BaseVector):
         # Set cache
         redis_client.set(index_exist_cache_key, 1, ex=3600)

     def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
         """Batch add document vectors"""
         if not self._index:
@@ -152,9 +148,9 @@ class PineconeVector(BaseVector):
         # Batch processing
         total_batches = (total_docs + batch_size - 1) // batch_size  # Ceiling division
         for batch_idx, i in enumerate(range(0, len(documents), batch_size), 1):
-            batch_documents = documents[i:i + batch_size]
-            batch_embeddings = embeddings[i:i + batch_size]
-            batch_uuids = uuids[i:i + batch_size]
+            batch_documents = documents[i : i + batch_size]
+            batch_embeddings = embeddings[i : i + batch_size]
+            batch_uuids = uuids[i : i + batch_size]
             batch_size_actual = len(batch_documents)
             # Build Pinecone vector data (metadata must be primitives or list[str])
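A quick worked check of the ceiling-division batch count used above (numbers are illustrative).

# 250 documents in batches of 100 -> 3 batches (100, 100, 50).
total_docs, batch_size = 250, 100
total_batches = (total_docs + batch_size - 1) // batch_size
assert total_batches == 3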
@@ -164,12 +160,8 @@ class PineconeVector(BaseVector):
                 safe_meta: dict[str, Any] = {}
                 # lift common identifiers to top-level fields for filtering
                 for k, v in raw_meta.items():
-                    if (
-                        isinstance(v, (str, int, float, bool))
-                        or (
-                            isinstance(v, list)
-                            and all(isinstance(x, str) for x in v)
-                        )
+                    if isinstance(v, (str, int, float, bool)) or (
+                        isinstance(v, list) and all(isinstance(x, str) for x in v)
                     ):
                         safe_meta[k] = v
                     else:
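The check above keeps only metadata values Pinecone accepts (strings, numbers, booleans, or lists of strings). A standalone sketch with an illustrative metadata dict follows; the else branch is an assumption, since its body is not shown in this hunk.

import json
from typing import Any

raw_meta = {
    "document_id": "doc-123",       # str -> kept as-is
    "position": 4,                  # int -> kept as-is
    "keywords": ["alpha", "beta"],  # list[str] -> kept as-is
    "extra": {"nested": True},      # dict -> not a Pinecone-supported primitive
}

safe_meta: dict[str, Any] = {}
for k, v in raw_meta.items():
    if isinstance(v, (str, int, float, bool)) or (
        isinstance(v, list) and all(isinstance(x, str) for x in v)
    ):
        safe_meta[k] = v
    else:
        # Assumption: unsupported values are JSON-encoded rather than dropped.
        safe_meta[k] = json.dumps(v, ensure_ascii=False)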
@@ -180,11 +172,7 @@ class PineconeVector(BaseVector):
                 # group id as string
                 safe_meta[Field.GROUP_KEY.value] = str(self._group_id)
-                vectors_to_upsert.append({
-                    "id": doc_id,
-                    "values": embedding,
-                    "metadata": safe_meta
-                })
+                vectors_to_upsert.append({"id": doc_id, "values": embedding, "metadata": safe_meta})

             # Batch insert to Pinecone
             try:
@@ -194,7 +182,7 @@ class PineconeVector(BaseVector):
                 raise
         return added_ids

     def search_by_vector(self, query_vector: list[float], **kwargs) -> list[Document]:
         """Vector similarity search"""
         # Lazily attach to an existing index if needed
@@ -244,11 +232,11 @@ class PineconeVector(BaseVector):
         docs.sort(key=lambda x: x.metadata.get("score", 0), reverse=True)
         return docs

     def search_by_full_text(self, query: str, **kwargs) -> list[Document]:
         """Full-text search - Pinecone does not natively support it, returns empty list"""
         return []

     def delete_by_metadata_field(self, key: str, value: str):
         """Delete by metadata field"""
         self._ensure_index_initialized()
@@ -257,7 +245,7 @@ class PineconeVector(BaseVector):
         # Build filter conditions
         filter_dict = {
             Field.GROUP_KEY.value: {"$eq": self._group_id},
-            f"{Field.METADATA_KEY.value}.{key}": {"$eq": value}
+            f"{Field.METADATA_KEY.value}.{key}": {"$eq": value},
         }
         # Pinecone delete operation
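The filter built here follows Pinecone's metadata filter syntax, with one $eq clause per field. A sketch of the resulting structure with placeholder values; the literal key names depend on the Field enum, which is not shown in this diff.

# Approximate shape of filter_dict with placeholder values.
filter_dict = {
    "group_id": {"$eq": "0b6a8c2e-placeholder-group-id"},  # Field.GROUP_KEY.value
    "metadata.document_id": {"$eq": "doc-123"},            # f"{Field.METADATA_KEY.value}.{key}"
}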
@@ -267,7 +255,7 @@ class PineconeVector(BaseVector):
         except Exception as e:
             # Ignore delete errors
             pass

     def delete_by_ids(self, ids: list[str]) -> None:
         """Batch delete by ID list"""
         self._ensure_index_initialized()
@@ -279,7 +267,7 @@ class PineconeVector(BaseVector):
             index.delete(ids=ids)
         except Exception as e:
             raise

     def delete(self) -> None:
         """Delete all vector data for the entire dataset"""
         self._ensure_index_initialized()
@@ -292,7 +280,7 @@ class PineconeVector(BaseVector):
             index.delete(filter=filter_dict)
         except Exception as e:
             raise

     def text_exists(self, id: str) -> bool:
         """Check if document exists"""
         try:
@@ -313,10 +301,10 @@
 class PineconeVectorFactory(AbstractVectorFactory):
     """Pinecone vector database factory class"""

     def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> PineconeVector:
         """Create PineconeVector instance"""
         # Determine index name
         if dataset.collection_binding_id:
             dataset_collection_binding = (
@@ -335,7 +323,7 @@ class PineconeVectorFactory(AbstractVectorFactory):
         else:
             dataset_id = dataset.id
             collection_name = Dataset.gen_collection_name_by_id(dataset_id)

         # Set index structure
         if not dataset.index_struct_dict:
             dataset.index_struct = json.dumps(
@@ -354,4 +342,4 @@ class PineconeVectorFactory(AbstractVectorFactory):
                 batch_size=dify_config.PINECONE_BATCH_SIZE,
                 metric=dify_config.PINECONE_METRIC,
             ),
         )

View File

@@ -24,5 +24,4 @@ class PineconeVectorTest(AbstractVectorTest):
 def test_pinecone_vector():
-    PineconeVectorTest().run_all_tests()
+    PineconeVectorTest().run_all_tests()
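The test can be exercised on its own once Pinecone credentials or a stub client are available; the file path is not shown in this view, so the invocation below is illustrative only.

if __name__ == "__main__":
    # Illustrative direct invocation; normally this runs via `pytest -k test_pinecone_vector`.
    test_pinecone_vector()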