fix(pinecone): normalize index names and sanitize metadata to meet API constraints

Frederick2313072 2025-09-20 02:56:53 +08:00
parent 90fc5a1f12
commit 1cbe9eedb6
5 changed files with 94 additions and 35 deletions
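
Pinecone index names may only contain lowercase letters, digits, and hyphens, and are capped at 45 characters; metadata values must be strings, numbers, booleans, or lists of strings. The constructor change below enforces the naming rule at index-creation time. For reference, a minimal standalone sketch of the same rule (the helper name and the md5-suffix scheme mirror the diff and are illustrative, not a public Dify API):

import hashlib
import re


def normalize_index_name(collection_name: str, max_len: int = 45) -> str:
    """Map an arbitrary collection name onto Pinecone's index-name rules:
    lowercase, only a-z, 0-9 and '-', at most max_len characters."""
    name = re.sub(r"[^a-z0-9-]+", "-", collection_name.lower())
    name = re.sub(r"-+", "-", name).strip("-")
    if len(name) > max_len:
        # keep the name unique and deterministic by appending a short content hash
        suffix = hashlib.md5(name.encode()).hexdigest()[:8]
        name = f"{name[:max_len - 9].rstrip('-')}-{suffix}"
    if not name:
        name = f"index-{hashlib.md5(collection_name.encode()).hexdigest()[:8]}"
    return name


# e.g. normalize_index_name("Vector_index_ABC123_Node") -> "vector-index-abc123-node"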


@@ -712,6 +712,7 @@ class DatasetRetrievalSettingMockApi(Resource):
| VectorType.BAIDU
| VectorType.VIKINGDB
| VectorType.UPSTASH
| VectorType.PINECONE
):
return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]}
case (
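
The hunk above adds VectorType.PINECONE to the group of stores for which the retrieval-settings endpoint advertises semantic search only. A self-contained sketch of that routing pattern (the enums here are reduced stand-ins for Dify's VectorType and RetrievalMethod; their members and values are illustrative):

from enum import Enum


class VectorType(str, Enum):
    # reduced stand-in for Dify's VectorType enum
    PINECONE = "pinecone"
    UPSTASH = "upstash"
    WEAVIATE = "weaviate"


class RetrievalMethod(str, Enum):
    # reduced stand-in for Dify's RetrievalMethod enum
    SEMANTIC_SEARCH = "semantic_search"
    FULL_TEXT_SEARCH = "full_text_search"
    HYBRID_SEARCH = "hybrid_search"


def supported_retrieval_methods(vector_type: VectorType) -> list[str]:
    match vector_type:
        case VectorType.PINECONE | VectorType.UPSTASH:
            # vector-only stores: no keyword index, so only semantic search is offered
            return [RetrievalMethod.SEMANTIC_SEARCH.value]
        case _:
            # stores with a keyword index also report full-text and hybrid search
            return [
                RetrievalMethod.SEMANTIC_SEARCH.value,
                RetrievalMethod.FULL_TEXT_SEARCH.value,
                RetrievalMethod.HYBRID_SEARCH.value,
            ]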


@@ -35,39 +35,64 @@ class PineconeVector(BaseVector):
super().__init__(collection_name)
self._client_config = config
self._group_id = group_id
# Initialize Pinecone client
self._pc = Pinecone(api_key=config.api_key)
# Use collection_name as index name
self._index_name = collection_name
# Initialize Pinecone client with SSL configuration
try:
self._pc = Pinecone(
api_key=config.api_key,
# Configure SSL to handle connection issues
ssl_ca_certs=None, # Use system default CA certificates
)
except Exception as e:
# Fallback to basic initialization if SSL config fails
import logging
logging.warning(f"Failed to initialize Pinecone with SSL config: {e}, using basic config")
self._pc = Pinecone(api_key=config.api_key)
# Normalize index name: lowercase, only a-z0-9- and <=45 chars
import re, hashlib
base_name = collection_name.lower()
base_name = re.sub(r'[^a-z0-9-]+', '-', base_name) # replace invalid chars with '-'
base_name = re.sub(r'-+', '-', base_name).strip('-')
if len(base_name) > 45:
hash_suffix = hashlib.md5(base_name.encode()).hexdigest()[:8]
truncated_name = base_name[:45-9].rstrip('-')
self._index_name = f"{truncated_name}-{hash_suffix}"
else:
self._index_name = base_name
# Guard empty name
if not self._index_name:
self._index_name = f"index-{hashlib.md5(collection_name.encode()).hexdigest()[:8]}"
self._index = None
def get_type(self) -> str:
"""Return vector database type identifier"""
return "pinecone"
def to_index_struct(self) -> dict:
"""Generate index structure dictionary"""
return {
"type": self.get_type(),
"vector_store": {"class_prefix": self._collection_name}
}
def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
"""Create vector index"""
if texts:
# Get vector dimension
vector_size = len(embeddings[0])
# Create Pinecone index
self.create_index(vector_size)
# Add vector data
self.add_texts(texts, embeddings, **kwargs)
def create_index(self, dimension: int):
"""Create Pinecone index"""
# Debug: Log the index name being used
import logging
logging.warning(f"Pinecone: Creating index with name: {self._index_name} (length: {len(self._index_name)})")
lock_name = f"vector_indexing_lock_{self._index_name}"
with redis_client.lock(lock_name, timeout=30):
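
The rest of create_index is not shown in this hunk. For orientation, a minimal sketch of serverless index creation with the Pinecone v3+ SDK, which is presumably what the locked section performs (cloud, region, and metric are placeholder values, not taken from this commit):

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="...")
index_name = "example-index"   # already normalized as in __init__ above
dimension = 1536               # e.g. len(embeddings[0]) from create()

# create the index only if it does not exist yet
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",  # placeholder; PINECONE_METRIC below defaults to cosine
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),  # placeholder cloud/region
    )
index = pc.Index(index_name)
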
@@ -117,19 +142,29 @@ class PineconeVector(BaseVector):
batch_embeddings = embeddings[i:i + batch_size]
batch_uuids = uuids[i:i + batch_size]
# Build Pinecone vector data (metadata must be primitives or list[str])
vectors_to_upsert = []
for doc, embedding, doc_id in zip(batch_documents, batch_embeddings, batch_uuids):
metadata = {
Field.CONTENT_KEY.value: doc.page_content,
Field.METADATA_KEY.value: doc.metadata or {},
Field.GROUP_KEY.value: self._group_id,
}
raw_meta = doc.metadata or {}
safe_meta: dict[str, Any] = {}
# lift common identifiers to top-level fields for filtering
for k, v in raw_meta.items():
if isinstance(v, (str, int, float, bool)):
safe_meta[k] = v
elif isinstance(v, list) and all(isinstance(x, str) for x in v):
safe_meta[k] = v
else:
safe_meta[k] = json.dumps(v, ensure_ascii=False)
# keep content as string metadata if needed
safe_meta[Field.CONTENT_KEY.value] = doc.page_content
# group id as string
safe_meta[Field.GROUP_KEY.value] = str(self._group_id)
vectors_to_upsert.append({
"id": doc_id,
"values": embedding,
"metadata": metadata
"metadata": safe_meta
})
# Batch insert to Pinecone
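
Pinecone rejects nested objects in metadata: values must be strings, numbers, booleans, or lists of strings. The upsert change above keeps only primitive values and JSON-encodes the rest; a standalone sketch of the same rule (the helper name is illustrative):

import json
from typing import Any


def sanitize_metadata(raw_meta: dict[str, Any]) -> dict[str, Any]:
    """Keep values Pinecone accepts (str/int/float/bool or list[str]);
    JSON-encode anything else so no information is silently dropped."""
    safe_meta: dict[str, Any] = {}
    for key, value in raw_meta.items():
        if isinstance(value, (str, int, float, bool)):
            safe_meta[key] = value
        elif isinstance(value, list) and all(isinstance(x, str) for x in value):
            safe_meta[key] = value
        else:
            safe_meta[key] = json.dumps(value, ensure_ascii=False)
    return safe_meta


# e.g. sanitize_metadata({"doc_id": "a1", "tags": ["x", "y"], "extra": {"page": 3}})
# -> {'doc_id': 'a1', 'tags': ['x', 'y'], 'extra': '{"page": 3}'}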


@@ -11,10 +11,10 @@ services:
PGDATA: ${PGDATA:-/var/lib/postgresql/data/pgdata}
command: >
postgres -c 'max_connections=${POSTGRES_MAX_CONNECTIONS:-100}'
-c 'shared_buffers=${POSTGRES_SHARED_BUFFERS:-128MB}'
-c 'work_mem=${POSTGRES_WORK_MEM:-4MB}'
-c 'maintenance_work_mem=${POSTGRES_MAINTENANCE_WORK_MEM:-64MB}'
-c 'effective_cache_size=${POSTGRES_EFFECTIVE_CACHE_SIZE:-4096MB}'
volumes:
- ${PGDATA_HOST_VOLUME:-./volumes/db/data}:/var/lib/postgresql/data
ports:


@@ -151,6 +151,12 @@ x-shared-env: &shared-api-worker-env
VECTOR_INDEX_NAME_PREFIX: ${VECTOR_INDEX_NAME_PREFIX:-Vector_index}
WEAVIATE_ENDPOINT: ${WEAVIATE_ENDPOINT:-http://weaviate:8080}
WEAVIATE_API_KEY: ${WEAVIATE_API_KEY:-WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih}
PINECONE_API_KEY: ${PINECONE_API_KEY:-}
PINECONE_ENVIRONMENT: ${PINECONE_ENVIRONMENT:-}
PINECONE_INDEX_NAME: ${PINECONE_INDEX_NAME:-}
PINECONE_CLIENT_TIMEOUT: ${PINECONE_CLIENT_TIMEOUT:-30}
PINECONE_BATCH_SIZE: ${PINECONE_BATCH_SIZE:-100}
PINECONE_METRIC: ${PINECONE_METRIC:-cosine}
QDRANT_URL: ${QDRANT_URL:-http://qdrant:6333}
QDRANT_API_KEY: ${QDRANT_API_KEY:-difyai123456}
QDRANT_CLIENT_TIMEOUT: ${QDRANT_CLIENT_TIMEOUT:-20}
@@ -582,7 +588,9 @@ x-shared-env: &shared-api-worker-env
services:
# API service
api:
image: langgenius/dify-api:1.8.0
build:
context: ../api
dockerfile: Dockerfile
restart: always
environment:
# Use the shared environment variables.
@@ -611,7 +619,9 @@ services:
# worker service
# The Celery worker for processing the queue.
worker:
image: langgenius/dify-api:1.8.0
build:
context: ../api
dockerfile: Dockerfile
restart: always
environment:
# Use the shared environment variables.
@@ -638,7 +648,9 @@ services:
# worker_beat service
# Celery beat for scheduling periodic tasks.
worker_beat:
image: langgenius/dify-api:1.8.0
build:
context: ../api
dockerfile: Dockerfile
restart: always
environment:
# Use the shared environment variables.
@@ -656,7 +668,7 @@ services:
# Frontend web application.
web:
image: langgenius/dify-web:1.8.0
image: langgenius/dify-web:release-e-1.8.2
restart: always
environment:
CONSOLE_API_URL: ${CONSOLE_API_URL:-}
@@ -691,10 +703,10 @@ services:
PGDATA: ${PGDATA:-/var/lib/postgresql/data/pgdata}
command: >
postgres -c 'max_connections=${POSTGRES_MAX_CONNECTIONS:-100}'
-c 'shared_buffers=${POSTGRES_SHARED_BUFFERS:-128MB}'
-c 'work_mem=${POSTGRES_WORK_MEM:-4MB}'
-c 'maintenance_work_mem=${POSTGRES_MAINTENANCE_WORK_MEM:-64MB}'
-c 'effective_cache_size=${POSTGRES_EFFECTIVE_CACHE_SIZE:-4096MB}'
volumes:
- ./volumes/db/data:/var/lib/postgresql/data
healthcheck:
@@ -719,7 +731,7 @@ services:
# The DifySandbox
sandbox:
image: langgenius/dify-sandbox:0.2.12
image: langgenius/dify-sandbox:latest
restart: always
environment:
# The DifySandbox configurations
@@ -743,7 +755,7 @@ services:
# plugin daemon
plugin_daemon:
image: langgenius/dify-plugin-daemon:0.2.0-local
image: langgenius/dify-plugin-daemon:deploy-dev-local
restart: always
environment:
# Use the shared environment variables.
@@ -910,7 +922,7 @@ services:
# Qdrant vector store.
# (if used, you need to set VECTOR_STORE to qdrant in the api & worker service.)
qdrant:
image: langgenius/qdrant:v1.7.3
image: langgenius/qdrant:latest
profiles:
- qdrant
restart: always


@@ -79,6 +79,17 @@ WEAVIATE_AUTHORIZATION_ADMINLIST_ENABLED=true
WEAVIATE_AUTHORIZATION_ADMINLIST_USERS=hello@dify.ai
WEAVIATE_HOST_VOLUME=./volumes/weaviate
# ------------------------------
# Environment Variables for Pinecone Vector Database
# ------------------------------
# Get your API key from: https://app.pinecone.io/
# PINECONE_API_KEY=your-pinecone-api-key
# PINECONE_ENVIRONMENT=us-west1-gcp
# PINECONE_INDEX_NAME=dify-pinecone-index
# PINECONE_CLIENT_TIMEOUT=30
# PINECONE_BATCH_SIZE=100
# PINECONE_METRIC=cosine
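
These variables mirror the PINECONE_* entries added to the shared API/worker environment above. How Dify's settings classes consume them is outside this commit; a minimal sketch of reading them straight from the environment (variable names and defaults are taken from this diff, the dataclass itself is illustrative). Selecting Pinecone as the active store presumably also requires VECTOR_STORE=pinecone, which this file does not show.

import os
from dataclasses import dataclass


@dataclass
class PineconeSettings:
    # illustrative container, not Dify's actual configuration class
    api_key: str
    environment: str
    index_name: str
    client_timeout: int
    batch_size: int
    metric: str


def load_pinecone_settings() -> PineconeSettings:
    # variable names and defaults match the docker-compose/.env entries in this commit
    return PineconeSettings(
        api_key=os.environ.get("PINECONE_API_KEY", ""),
        environment=os.environ.get("PINECONE_ENVIRONMENT", ""),
        index_name=os.environ.get("PINECONE_INDEX_NAME", ""),
        client_timeout=int(os.environ.get("PINECONE_CLIENT_TIMEOUT", "30")),
        batch_size=int(os.environ.get("PINECONE_BATCH_SIZE", "100")),
        metric=os.environ.get("PINECONE_METRIC", "cosine"),
    )
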
# ------------------------------
# Docker Compose Service Expose Host Port Configurations
# ------------------------------