mirror of https://github.com/langgenius/dify.git
fix(pinecone): normalize index names and sanitize metadata to meet API constraints
parent 90fc5a1f12
commit 1cbe9eedb6
@@ -712,6 +712,7 @@ class DatasetRetrievalSettingMockApi(Resource):
                 | VectorType.BAIDU
                 | VectorType.VIKINGDB
                 | VectorType.UPSTASH
+                | VectorType.PINECONE
             ):
                 return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]}
             case (
@@ -35,39 +35,64 @@ class PineconeVector(BaseVector):
         super().__init__(collection_name)
         self._client_config = config
         self._group_id = group_id

-        # Initialize Pinecone client
-        self._pc = Pinecone(api_key=config.api_key)
-
-        # Use collection_name as index name
-        self._index_name = collection_name
+        # Initialize Pinecone client with SSL configuration
+        try:
+            self._pc = Pinecone(
+                api_key=config.api_key,
+                # Configure SSL to handle connection issues
+                ssl_ca_certs=None, # Use system default CA certificates
+            )
+        except Exception as e:
+            # Fallback to basic initialization if SSL config fails
+            import logging
+            logging.warning(f"Failed to initialize Pinecone with SSL config: {e}, using basic config")
+            self._pc = Pinecone(api_key=config.api_key)
+
+        # Normalize index name: lowercase, only a-z0-9- and <=45 chars
+        import re, hashlib
+        base_name = collection_name.lower()
+        base_name = re.sub(r'[^a-z0-9-]+', '-', base_name) # replace invalid chars with '-'
+        base_name = re.sub(r'-+', '-', base_name).strip('-')
+        if len(base_name) > 45:
+            hash_suffix = hashlib.md5(base_name.encode()).hexdigest()[:8]
+            truncated_name = base_name[:45-9].rstrip('-')
+            self._index_name = f"{truncated_name}-{hash_suffix}"
+        else:
+            self._index_name = base_name
+        # Guard empty name
+        if not self._index_name:
+            self._index_name = f"index-{hashlib.md5(collection_name.encode()).hexdigest()[:8]}"
         self._index = None

     def get_type(self) -> str:
         """Return vector database type identifier"""
         return "pinecone"

     def to_index_struct(self) -> dict:
         """Generate index structure dictionary"""
         return {
             "type": self.get_type(),
             "vector_store": {"class_prefix": self._collection_name}
         }

     def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
         """Create vector index"""
         if texts:
             # Get vector dimension
             vector_size = len(embeddings[0])

             # Create Pinecone index
             self.create_index(vector_size)

             # Add vector data
             self.add_texts(texts, embeddings, **kwargs)

     def create_index(self, dimension: int):
         """Create Pinecone index"""
+        # Debug: Log the index name being used
+        import logging
+        logging.warning(f"Pinecone: Creating index with name: {self._index_name} (length: {len(self._index_name)})")
         lock_name = f"vector_indexing_lock_{self._index_name}"

         with redis_client.lock(lock_name, timeout=30):
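For reference, a minimal standalone sketch of the normalization rule introduced in the constructor above, assuming only the constraints stated in the diff (lowercase, characters a-z, 0-9 and '-', at most 45 characters, with an 8-character md5 suffix appended when a name has to be truncated). The function name and the sample collection name are illustrative, not part of the change:

import hashlib
import re

def normalize_index_name(collection_name: str) -> str:
    # Mirror of the rule above: lowercase, keep a-z0-9- only, cap at 45 chars.
    base = re.sub(r"[^a-z0-9-]+", "-", collection_name.lower())
    base = re.sub(r"-+", "-", base).strip("-")
    if len(base) > 45:
        suffix = hashlib.md5(base.encode()).hexdigest()[:8]  # 8-char disambiguator
        base = f"{base[:45 - 9].rstrip('-')}-{suffix}"        # 36 + '-' + 8 = 45 chars
    if not base:
        # Guard against names that normalize to an empty string.
        base = f"index-{hashlib.md5(collection_name.encode()).hexdigest()[:8]}"
    return base

print(normalize_index_name("Vector_index_AbC123_Node"))  # -> vector-index-abc123-node

Because the hash is taken over the full normalized name, two long collection names that become identical after truncation still end up with different suffixes.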
@@ -117,19 +142,29 @@ class PineconeVector(BaseVector):
             batch_embeddings = embeddings[i:i + batch_size]
             batch_uuids = uuids[i:i + batch_size]

-            # Build Pinecone vector data
+            # Build Pinecone vector data (metadata must be primitives or list[str])
             vectors_to_upsert = []
             for doc, embedding, doc_id in zip(batch_documents, batch_embeddings, batch_uuids):
-                metadata = {
-                    Field.CONTENT_KEY.value: doc.page_content,
-                    Field.METADATA_KEY.value: doc.metadata or {},
-                    Field.GROUP_KEY.value: self._group_id,
-                }
-
+                raw_meta = doc.metadata or {}
+                safe_meta: dict[str, Any] = {}
+                # lift common identifiers to top-level fields for filtering
+                for k, v in raw_meta.items():
+                    if isinstance(v, (str, int, float, bool)):
+                        safe_meta[k] = v
+                    elif isinstance(v, list) and all(isinstance(x, str) for x in v):
+                        safe_meta[k] = v
+                    else:
+                        safe_meta[k] = json.dumps(v, ensure_ascii=False)
+
+                # keep content as string metadata if needed
+                safe_meta[Field.CONTENT_KEY.value] = doc.page_content
+                # group id as string
+                safe_meta[Field.GROUP_KEY.value] = str(self._group_id)
+
                 vectors_to_upsert.append({
                     "id": doc_id,
                     "values": embedding,
-                    "metadata": metadata
+                    "metadata": safe_meta
                 })

             # Batch insert to Pinecone
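As a quick illustration of the sanitization applied above: Pinecone accepts metadata values that are strings, numbers, booleans, or lists of strings, so anything else is serialized to a JSON string rather than dropped. The helper name and the sample metadata below are made up for the example:

import json
from typing import Any

def sanitize_metadata(raw_meta: dict[str, Any]) -> dict[str, Any]:
    # Keep primitives and lists of strings as-is; JSON-encode everything else.
    safe: dict[str, Any] = {}
    for key, value in raw_meta.items():
        if isinstance(value, (str, int, float, bool)):
            safe[key] = value
        elif isinstance(value, list) and all(isinstance(x, str) for x in value):
            safe[key] = value
        else:
            safe[key] = json.dumps(value, ensure_ascii=False)
    return safe

example = {"doc_id": "abc", "score": 0.9, "tags": ["a", "b"], "source": {"dataset_id": "d1", "page": 3}}
print(sanitize_metadata(example))
# {'doc_id': 'abc', 'score': 0.9, 'tags': ['a', 'b'], 'source': '{"dataset_id": "d1", "page": 3}'}

A JSON-encoded value can no longer be used directly in a metadata filter, which is why the diff keeps simple identifiers as plain top-level values.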
@@ -11,10 +11,10 @@ services:
       PGDATA: ${PGDATA:-/var/lib/postgresql/data/pgdata}
     command: >
       postgres -c 'max_connections=${POSTGRES_MAX_CONNECTIONS:-100}'
-               -c 'shared_buffers=${POSTGRES_SHARED_BUFFERS:-128MB}'
-               -c 'work_mem=${POSTGRES_WORK_MEM:-4MB}'
-               -c 'maintenance_work_mem=${POSTGRES_MAINTENANCE_WORK_MEM:-64MB}'
-               -c 'effective_cache_size=${POSTGRES_EFFECTIVE_CACHE_SIZE:-4096MB}'
+               -c 'shared_buffers=${POSTGRES_SHARED_BUFFERS:-128MB}'
+               -c 'work_mem=${POSTGRES_WORK_MEM:-4MB}'
+               -c 'maintenance_work_mem=${POSTGRES_MAINTENANCE_WORK_MEM:-64MB}'
+               -c 'effective_cache_size=${POSTGRES_EFFECTIVE_CACHE_SIZE:-4096MB}'
     volumes:
       - ${PGDATA_HOST_VOLUME:-./volumes/db/data}:/var/lib/postgresql/data
     ports:
@@ -151,6 +151,12 @@ x-shared-env: &shared-api-worker-env
   VECTOR_INDEX_NAME_PREFIX: ${VECTOR_INDEX_NAME_PREFIX:-Vector_index}
   WEAVIATE_ENDPOINT: ${WEAVIATE_ENDPOINT:-http://weaviate:8080}
   WEAVIATE_API_KEY: ${WEAVIATE_API_KEY:-WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih}
+  PINECONE_API_KEY: ${PINECONE_API_KEY:-}
+  PINECONE_ENVIRONMENT: ${PINECONE_ENVIRONMENT:-}
+  PINECONE_INDEX_NAME: ${PINECONE_INDEX_NAME:-}
+  PINECONE_CLIENT_TIMEOUT: ${PINECONE_CLIENT_TIMEOUT:-30}
+  PINECONE_BATCH_SIZE: ${PINECONE_BATCH_SIZE:-100}
+  PINECONE_METRIC: ${PINECONE_METRIC:-cosine}
   QDRANT_URL: ${QDRANT_URL:-http://qdrant:6333}
   QDRANT_API_KEY: ${QDRANT_API_KEY:-difyai123456}
   QDRANT_CLIENT_TIMEOUT: ${QDRANT_CLIENT_TIMEOUT:-20}
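A minimal sketch of how these PINECONE_* variables could be read into a client configuration at startup; the PineconeSettings dataclass and loader below are illustrative, not Dify's actual config class, though the defaults mirror the compose entries above:

import os
from dataclasses import dataclass

@dataclass
class PineconeSettings:
    # Illustrative container for the PINECONE_* variables defined above.
    api_key: str
    environment: str
    index_name: str
    client_timeout: int
    batch_size: int
    metric: str

def load_pinecone_settings() -> PineconeSettings:
    return PineconeSettings(
        api_key=os.getenv("PINECONE_API_KEY", ""),
        environment=os.getenv("PINECONE_ENVIRONMENT", ""),
        index_name=os.getenv("PINECONE_INDEX_NAME", ""),
        client_timeout=int(os.getenv("PINECONE_CLIENT_TIMEOUT", "30")),
        batch_size=int(os.getenv("PINECONE_BATCH_SIZE", "100")),
        metric=os.getenv("PINECONE_METRIC", "cosine"),
    )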
@@ -582,7 +588,9 @@ x-shared-env: &shared-api-worker-env
 services:
   # API service
   api:
-    image: langgenius/dify-api:1.8.0
+    build:
+      context: ../api
+      dockerfile: Dockerfile
     restart: always
     environment:
       # Use the shared environment variables.
@@ -611,7 +619,9 @@ services:
   # worker service
   # The Celery worker for processing the queue.
   worker:
-    image: langgenius/dify-api:1.8.0
+    build:
+      context: ../api
+      dockerfile: Dockerfile
     restart: always
     environment:
       # Use the shared environment variables.
@@ -638,7 +648,9 @@ services:
   # worker_beat service
   # Celery beat for scheduling periodic tasks.
   worker_beat:
-    image: langgenius/dify-api:1.8.0
+    build:
+      context: ../api
+      dockerfile: Dockerfile
     restart: always
     environment:
       # Use the shared environment variables.
@@ -656,7 +668,7 @@ services:

   # Frontend web application.
   web:
-    image: langgenius/dify-web:1.8.0
+    image: langgenius/dify-web:release-e-1.8.2
     restart: always
     environment:
       CONSOLE_API_URL: ${CONSOLE_API_URL:-}
@@ -691,10 +703,10 @@ services:
       PGDATA: ${PGDATA:-/var/lib/postgresql/data/pgdata}
     command: >
       postgres -c 'max_connections=${POSTGRES_MAX_CONNECTIONS:-100}'
-               -c 'shared_buffers=${POSTGRES_SHARED_BUFFERS:-128MB}'
-               -c 'work_mem=${POSTGRES_WORK_MEM:-4MB}'
-               -c 'maintenance_work_mem=${POSTGRES_MAINTENANCE_WORK_MEM:-64MB}'
-               -c 'effective_cache_size=${POSTGRES_EFFECTIVE_CACHE_SIZE:-4096MB}'
+               -c 'shared_buffers=${POSTGRES_SHARED_BUFFERS:-128MB}'
+               -c 'work_mem=${POSTGRES_WORK_MEM:-4MB}'
+               -c 'maintenance_work_mem=${POSTGRES_MAINTENANCE_WORK_MEM:-64MB}'
+               -c 'effective_cache_size=${POSTGRES_EFFECTIVE_CACHE_SIZE:-4096MB}'
     volumes:
       - ./volumes/db/data:/var/lib/postgresql/data
     healthcheck:
@@ -719,7 +731,7 @@ services:

   # The DifySandbox
   sandbox:
-    image: langgenius/dify-sandbox:0.2.12
+    image: langgenius/dify-sandbox:latest
     restart: always
     environment:
       # The DifySandbox configurations
@@ -743,7 +755,7 @@ services:

   # plugin daemon
   plugin_daemon:
-    image: langgenius/dify-plugin-daemon:0.2.0-local
+    image: langgenius/dify-plugin-daemon:deploy-dev-local
     restart: always
     environment:
       # Use the shared environment variables.
@@ -910,7 +922,7 @@ services:
   # Qdrant vector store.
   # (if used, you need to set VECTOR_STORE to qdrant in the api & worker service.)
   qdrant:
-    image: langgenius/qdrant:v1.7.3
+    image: langgenius/qdrant:latest
     profiles:
       - qdrant
     restart: always
@@ -79,6 +79,17 @@ WEAVIATE_AUTHORIZATION_ADMINLIST_ENABLED=true
 WEAVIATE_AUTHORIZATION_ADMINLIST_USERS=hello@dify.ai
 WEAVIATE_HOST_VOLUME=./volumes/weaviate

+# ------------------------------
+# Environment Variables for Pinecone Vector Database
+# ------------------------------
+# Get your API key from: https://app.pinecone.io/
+# PINECONE_API_KEY=your-pinecone-api-key
+# PINECONE_ENVIRONMENT=us-west1-gcp
+# PINECONE_INDEX_NAME=dify-pinecone-index
+# PINECONE_CLIENT_TIMEOUT=30
+# PINECONE_BATCH_SIZE=100
+# PINECONE_METRIC=cosine
+
 # ------------------------------
 # Docker Compose Service Expose Host Port Configurations
 # ------------------------------