mirror of
https://github.com/langgenius/dify.git
synced 2026-05-03 15:57:06 +08:00
fix: test
This commit is contained in:
commit
ee0d181caa
@ -1,5 +1,3 @@
|
|||||||
from typing import Optional
|
|
||||||
|
|
||||||
from pydantic import Field, PositiveInt
|
from pydantic import Field, PositiveInt
|
||||||
from pydantic_settings import BaseSettings
|
from pydantic_settings import BaseSettings
|
||||||
|
|
||||||
@ -9,17 +7,17 @@ class PineconeConfig(BaseSettings):
|
|||||||
Configuration settings for Pinecone vector database
|
Configuration settings for Pinecone vector database
|
||||||
"""
|
"""
|
||||||
|
|
||||||
PINECONE_API_KEY: Optional[str] = Field(
|
PINECONE_API_KEY: str | None = Field(
|
||||||
description="API key for authenticating with Pinecone service",
|
description="API key for authenticating with Pinecone service",
|
||||||
default=None,
|
default=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
PINECONE_ENVIRONMENT: Optional[str] = Field(
|
PINECONE_ENVIRONMENT: str | None = Field(
|
||||||
description="Pinecone environment (e.g., 'us-west1-gcp', 'us-east-1-aws')",
|
description="Pinecone environment (e.g., 'us-west1-gcp', 'us-east-1-aws')",
|
||||||
default=None,
|
default=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
PINECONE_INDEX_NAME: Optional[str] = Field(
|
PINECONE_INDEX_NAME: str | None = Field(
|
||||||
description="Default Pinecone index name",
|
description="Default Pinecone index name",
|
||||||
default=None,
|
default=None,
|
||||||
)
|
)
|
||||||
@ -37,4 +35,4 @@ class PineconeConfig(BaseSettings):
|
|||||||
PINECONE_METRIC: str = Field(
|
PINECONE_METRIC: str = Field(
|
||||||
description="Distance metric for Pinecone index (cosine, euclidean, dotproduct)",
|
description="Distance metric for Pinecone index (cosine, euclidean, dotproduct)",
|
||||||
default="cosine",
|
default="cosine",
|
||||||
)
|
)
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
from typing import Any, Optional
|
from typing import Any
|
||||||
|
|
||||||
from pinecone import Pinecone, ServerlessSpec
|
from pinecone import Pinecone, ServerlessSpec
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
@ -19,9 +19,10 @@ from models.dataset import Dataset, DatasetCollectionBinding
|
|||||||
|
|
||||||
class PineconeConfig(BaseModel):
|
class PineconeConfig(BaseModel):
|
||||||
"""Pinecone configuration class"""
|
"""Pinecone configuration class"""
|
||||||
|
|
||||||
api_key: str
|
api_key: str
|
||||||
environment: str
|
environment: str
|
||||||
index_name: Optional[str] = None
|
index_name: str | None = None
|
||||||
timeout: float = 30
|
timeout: float = 30
|
||||||
batch_size: int = 100
|
batch_size: int = 100
|
||||||
metric: str = "cosine"
|
metric: str = "cosine"
|
||||||
@ -49,14 +50,15 @@ class PineconeVector(BaseVector):
|
|||||||
# Normalize index name: lowercase, only a-z0-9- and <=45 chars
|
# Normalize index name: lowercase, only a-z0-9- and <=45 chars
|
||||||
import hashlib
|
import hashlib
|
||||||
import re
|
import re
|
||||||
|
|
||||||
base_name = collection_name.lower()
|
base_name = collection_name.lower()
|
||||||
base_name = re.sub(r'[^a-z0-9-]+', '-', base_name) # replace invalid chars with '-'
|
base_name = re.sub(r"[^a-z0-9-]+", "-", base_name) # replace invalid chars with '-'
|
||||||
base_name = re.sub(r'-+', '-', base_name).strip('-')
|
base_name = re.sub(r"-+", "-", base_name).strip("-")
|
||||||
# Use longer secure suffix to reduce collision risk
|
# Use longer secure suffix to reduce collision risk
|
||||||
suffix_len = 24 # 24 hex digits (96-bit entropy)
|
suffix_len = 24 # 24 hex digits (96-bit entropy)
|
||||||
if len(base_name) > 45:
|
if len(base_name) > 45:
|
||||||
hash_suffix = hashlib.sha256(base_name.encode()).hexdigest()[:suffix_len]
|
hash_suffix = hashlib.sha256(base_name.encode()).hexdigest()[:suffix_len]
|
||||||
truncated_name = base_name[:45 - (suffix_len + 1)].rstrip('-')
|
truncated_name = base_name[: 45 - (suffix_len + 1)].rstrip("-")
|
||||||
self._index_name = f"{truncated_name}-{hash_suffix}"
|
self._index_name = f"{truncated_name}-{hash_suffix}"
|
||||||
else:
|
else:
|
||||||
self._index_name = base_name
|
self._index_name = base_name
|
||||||
@ -81,14 +83,11 @@ class PineconeVector(BaseVector):
|
|||||||
else:
|
else:
|
||||||
raise ValueError("Index not initialized. Please ingest documents to create index.")
|
raise ValueError("Index not initialized. Please ingest documents to create index.")
|
||||||
except Exception:
|
except Exception:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
def to_index_struct(self) -> dict:
|
def to_index_struct(self) -> dict:
|
||||||
"""Generate index structure dictionary"""
|
"""Generate index structure dictionary"""
|
||||||
return {
|
return {"type": self.get_type(), "vector_store": {"class_prefix": self._collection_name}}
|
||||||
"type": self.get_type(),
|
|
||||||
"vector_store": {"class_prefix": self._collection_name}
|
|
||||||
}
|
|
||||||
|
|
||||||
def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
|
def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
|
||||||
"""Create vector index"""
|
"""Create vector index"""
|
||||||
@ -122,14 +121,11 @@ class PineconeVector(BaseVector):
|
|||||||
name=self._index_name,
|
name=self._index_name,
|
||||||
dimension=dimension,
|
dimension=dimension,
|
||||||
metric=self._client_config.metric,
|
metric=self._client_config.metric,
|
||||||
spec=ServerlessSpec(
|
spec=ServerlessSpec(cloud="aws", region=self._client_config.environment),
|
||||||
cloud='aws',
|
|
||||||
region=self._client_config.environment
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Wait for index creation to complete
|
# Wait for index creation to complete
|
||||||
while not self._pc.describe_index(self._index_name).status['ready']:
|
while not self._pc.describe_index(self._index_name).status["ready"]:
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
else:
|
else:
|
||||||
# Get index instance
|
# Get index instance
|
||||||
@ -137,7 +133,7 @@ class PineconeVector(BaseVector):
|
|||||||
|
|
||||||
# Set cache
|
# Set cache
|
||||||
redis_client.set(index_exist_cache_key, 1, ex=3600)
|
redis_client.set(index_exist_cache_key, 1, ex=3600)
|
||||||
|
|
||||||
def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
|
def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
|
||||||
"""Batch add document vectors"""
|
"""Batch add document vectors"""
|
||||||
if not self._index:
|
if not self._index:
|
||||||
@ -152,9 +148,9 @@ class PineconeVector(BaseVector):
|
|||||||
# Batch processing
|
# Batch processing
|
||||||
total_batches = (total_docs + batch_size - 1) // batch_size # Ceiling division
|
total_batches = (total_docs + batch_size - 1) // batch_size # Ceiling division
|
||||||
for batch_idx, i in enumerate(range(0, len(documents), batch_size), 1):
|
for batch_idx, i in enumerate(range(0, len(documents), batch_size), 1):
|
||||||
batch_documents = documents[i:i + batch_size]
|
batch_documents = documents[i : i + batch_size]
|
||||||
batch_embeddings = embeddings[i:i + batch_size]
|
batch_embeddings = embeddings[i : i + batch_size]
|
||||||
batch_uuids = uuids[i:i + batch_size]
|
batch_uuids = uuids[i : i + batch_size]
|
||||||
batch_size_actual = len(batch_documents)
|
batch_size_actual = len(batch_documents)
|
||||||
|
|
||||||
# Build Pinecone vector data (metadata must be primitives or list[str])
|
# Build Pinecone vector data (metadata must be primitives or list[str])
|
||||||
@ -164,12 +160,8 @@ class PineconeVector(BaseVector):
|
|||||||
safe_meta: dict[str, Any] = {}
|
safe_meta: dict[str, Any] = {}
|
||||||
# lift common identifiers to top-level fields for filtering
|
# lift common identifiers to top-level fields for filtering
|
||||||
for k, v in raw_meta.items():
|
for k, v in raw_meta.items():
|
||||||
if (
|
if isinstance(v, (str, int, float, bool)) or (
|
||||||
isinstance(v, (str, int, float, bool))
|
isinstance(v, list) and all(isinstance(x, str) for x in v)
|
||||||
or (
|
|
||||||
isinstance(v, list)
|
|
||||||
and all(isinstance(x, str) for x in v)
|
|
||||||
)
|
|
||||||
):
|
):
|
||||||
safe_meta[k] = v
|
safe_meta[k] = v
|
||||||
else:
|
else:
|
||||||
@ -180,11 +172,7 @@ class PineconeVector(BaseVector):
|
|||||||
# group id as string
|
# group id as string
|
||||||
safe_meta[Field.GROUP_KEY.value] = str(self._group_id)
|
safe_meta[Field.GROUP_KEY.value] = str(self._group_id)
|
||||||
|
|
||||||
vectors_to_upsert.append({
|
vectors_to_upsert.append({"id": doc_id, "values": embedding, "metadata": safe_meta})
|
||||||
"id": doc_id,
|
|
||||||
"values": embedding,
|
|
||||||
"metadata": safe_meta
|
|
||||||
})
|
|
||||||
|
|
||||||
# Batch insert to Pinecone
|
# Batch insert to Pinecone
|
||||||
try:
|
try:
|
||||||
@ -194,7 +182,7 @@ class PineconeVector(BaseVector):
|
|||||||
raise
|
raise
|
||||||
|
|
||||||
return added_ids
|
return added_ids
|
||||||
|
|
||||||
def search_by_vector(self, query_vector: list[float], **kwargs) -> list[Document]:
|
def search_by_vector(self, query_vector: list[float], **kwargs) -> list[Document]:
|
||||||
"""Vector similarity search"""
|
"""Vector similarity search"""
|
||||||
# Lazily attach to an existing index if needed
|
# Lazily attach to an existing index if needed
|
||||||
@ -244,11 +232,11 @@ class PineconeVector(BaseVector):
|
|||||||
docs.sort(key=lambda x: x.metadata.get("score", 0), reverse=True)
|
docs.sort(key=lambda x: x.metadata.get("score", 0), reverse=True)
|
||||||
|
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
def search_by_full_text(self, query: str, **kwargs) -> list[Document]:
|
def search_by_full_text(self, query: str, **kwargs) -> list[Document]:
|
||||||
"""Full-text search - Pinecone does not natively support it, returns empty list"""
|
"""Full-text search - Pinecone does not natively support it, returns empty list"""
|
||||||
return []
|
return []
|
||||||
|
|
||||||
def delete_by_metadata_field(self, key: str, value: str):
|
def delete_by_metadata_field(self, key: str, value: str):
|
||||||
"""Delete by metadata field"""
|
"""Delete by metadata field"""
|
||||||
self._ensure_index_initialized()
|
self._ensure_index_initialized()
|
||||||
@ -257,7 +245,7 @@ class PineconeVector(BaseVector):
|
|||||||
# Build filter conditions
|
# Build filter conditions
|
||||||
filter_dict = {
|
filter_dict = {
|
||||||
Field.GROUP_KEY.value: {"$eq": self._group_id},
|
Field.GROUP_KEY.value: {"$eq": self._group_id},
|
||||||
f"{Field.METADATA_KEY.value}.{key}": {"$eq": value}
|
f"{Field.METADATA_KEY.value}.{key}": {"$eq": value},
|
||||||
}
|
}
|
||||||
|
|
||||||
# Pinecone delete operation
|
# Pinecone delete operation
|
||||||
@ -267,7 +255,7 @@ class PineconeVector(BaseVector):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Ignore delete errors
|
# Ignore delete errors
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def delete_by_ids(self, ids: list[str]) -> None:
|
def delete_by_ids(self, ids: list[str]) -> None:
|
||||||
"""Batch delete by ID list"""
|
"""Batch delete by ID list"""
|
||||||
self._ensure_index_initialized()
|
self._ensure_index_initialized()
|
||||||
@ -279,7 +267,7 @@ class PineconeVector(BaseVector):
|
|||||||
index.delete(ids=ids)
|
index.delete(ids=ids)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
def delete(self) -> None:
|
def delete(self) -> None:
|
||||||
"""Delete all vector data for the entire dataset"""
|
"""Delete all vector data for the entire dataset"""
|
||||||
self._ensure_index_initialized()
|
self._ensure_index_initialized()
|
||||||
@ -292,7 +280,7 @@ class PineconeVector(BaseVector):
|
|||||||
index.delete(filter=filter_dict)
|
index.delete(filter=filter_dict)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
def text_exists(self, id: str) -> bool:
|
def text_exists(self, id: str) -> bool:
|
||||||
"""Check if document exists"""
|
"""Check if document exists"""
|
||||||
try:
|
try:
|
||||||
@ -313,10 +301,10 @@ class PineconeVector(BaseVector):
|
|||||||
|
|
||||||
class PineconeVectorFactory(AbstractVectorFactory):
|
class PineconeVectorFactory(AbstractVectorFactory):
|
||||||
"""Pinecone vector database factory class"""
|
"""Pinecone vector database factory class"""
|
||||||
|
|
||||||
def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> PineconeVector:
|
def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> PineconeVector:
|
||||||
"""Create PineconeVector instance"""
|
"""Create PineconeVector instance"""
|
||||||
|
|
||||||
# Determine index name
|
# Determine index name
|
||||||
if dataset.collection_binding_id:
|
if dataset.collection_binding_id:
|
||||||
dataset_collection_binding = (
|
dataset_collection_binding = (
|
||||||
@ -335,7 +323,7 @@ class PineconeVectorFactory(AbstractVectorFactory):
|
|||||||
else:
|
else:
|
||||||
dataset_id = dataset.id
|
dataset_id = dataset.id
|
||||||
collection_name = Dataset.gen_collection_name_by_id(dataset_id)
|
collection_name = Dataset.gen_collection_name_by_id(dataset_id)
|
||||||
|
|
||||||
# Set index structure
|
# Set index structure
|
||||||
if not dataset.index_struct_dict:
|
if not dataset.index_struct_dict:
|
||||||
dataset.index_struct = json.dumps(
|
dataset.index_struct = json.dumps(
|
||||||
@ -354,4 +342,4 @@ class PineconeVectorFactory(AbstractVectorFactory):
|
|||||||
batch_size=dify_config.PINECONE_BATCH_SIZE,
|
batch_size=dify_config.PINECONE_BATCH_SIZE,
|
||||||
metric=dify_config.PINECONE_METRIC,
|
metric=dify_config.PINECONE_METRIC,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|||||||
@ -24,5 +24,4 @@ class PineconeVectorTest(AbstractVectorTest):
|
|||||||
|
|
||||||
|
|
||||||
def test_pinecone_vector():
|
def test_pinecone_vector():
|
||||||
|
PineconeVectorTest().run_all_tests()
|
||||||
PineconeVectorTest().run_all_tests()
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user