mirror of
https://github.com/langgenius/dify.git
synced 2026-04-28 11:56:55 +08:00
fix(pinecone): normalize index names and sanitize metadata to meet API constraints
This commit is contained in:
parent
90fc5a1f12
commit
1cbe9eedb6
@ -712,6 +712,7 @@ class DatasetRetrievalSettingMockApi(Resource):
|
|||||||
| VectorType.BAIDU
|
| VectorType.BAIDU
|
||||||
| VectorType.VIKINGDB
|
| VectorType.VIKINGDB
|
||||||
| VectorType.UPSTASH
|
| VectorType.UPSTASH
|
||||||
|
| VectorType.PINECONE
|
||||||
):
|
):
|
||||||
return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]}
|
return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]}
|
||||||
case (
|
case (
|
||||||
|
|||||||
@ -35,39 +35,64 @@ class PineconeVector(BaseVector):
|
|||||||
super().__init__(collection_name)
|
super().__init__(collection_name)
|
||||||
self._client_config = config
|
self._client_config = config
|
||||||
self._group_id = group_id
|
self._group_id = group_id
|
||||||
|
|
||||||
# Initialize Pinecone client
|
# Initialize Pinecone client with SSL configuration
|
||||||
self._pc = Pinecone(api_key=config.api_key)
|
try:
|
||||||
|
self._pc = Pinecone(
|
||||||
# Use collection_name as index name
|
api_key=config.api_key,
|
||||||
self._index_name = collection_name
|
# Configure SSL to handle connection issues
|
||||||
|
ssl_ca_certs=None, # Use system default CA certificates
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
# Fallback to basic initialization if SSL config fails
|
||||||
|
import logging
|
||||||
|
logging.warning(f"Failed to initialize Pinecone with SSL config: {e}, using basic config")
|
||||||
|
self._pc = Pinecone(api_key=config.api_key)
|
||||||
|
|
||||||
|
# Normalize index name: lowercase, only a-z0-9- and <=45 chars
|
||||||
|
import re, hashlib
|
||||||
|
base_name = collection_name.lower()
|
||||||
|
base_name = re.sub(r'[^a-z0-9-]+', '-', base_name) # replace invalid chars with '-'
|
||||||
|
base_name = re.sub(r'-+', '-', base_name).strip('-')
|
||||||
|
if len(base_name) > 45:
|
||||||
|
hash_suffix = hashlib.md5(base_name.encode()).hexdigest()[:8]
|
||||||
|
truncated_name = base_name[:45-9].rstrip('-')
|
||||||
|
self._index_name = f"{truncated_name}-{hash_suffix}"
|
||||||
|
else:
|
||||||
|
self._index_name = base_name
|
||||||
|
# Guard empty name
|
||||||
|
if not self._index_name:
|
||||||
|
self._index_name = f"index-{hashlib.md5(collection_name.encode()).hexdigest()[:8]}"
|
||||||
self._index = None
|
self._index = None
|
||||||
|
|
||||||
def get_type(self) -> str:
|
def get_type(self) -> str:
|
||||||
"""Return vector database type identifier"""
|
"""Return vector database type identifier"""
|
||||||
return "pinecone"
|
return "pinecone"
|
||||||
|
|
||||||
def to_index_struct(self) -> dict:
|
def to_index_struct(self) -> dict:
|
||||||
"""Generate index structure dictionary"""
|
"""Generate index structure dictionary"""
|
||||||
return {
|
return {
|
||||||
"type": self.get_type(),
|
"type": self.get_type(),
|
||||||
"vector_store": {"class_prefix": self._collection_name}
|
"vector_store": {"class_prefix": self._collection_name}
|
||||||
}
|
}
|
||||||
|
|
||||||
def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
|
def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
|
||||||
"""Create vector index"""
|
"""Create vector index"""
|
||||||
if texts:
|
if texts:
|
||||||
# Get vector dimension
|
# Get vector dimension
|
||||||
vector_size = len(embeddings[0])
|
vector_size = len(embeddings[0])
|
||||||
|
|
||||||
# Create Pinecone index
|
# Create Pinecone index
|
||||||
self.create_index(vector_size)
|
self.create_index(vector_size)
|
||||||
|
|
||||||
# Add vector data
|
# Add vector data
|
||||||
self.add_texts(texts, embeddings, **kwargs)
|
self.add_texts(texts, embeddings, **kwargs)
|
||||||
|
|
||||||
def create_index(self, dimension: int):
|
def create_index(self, dimension: int):
|
||||||
"""Create Pinecone index"""
|
"""Create Pinecone index"""
|
||||||
|
# Debug: Log the index name being used
|
||||||
|
import logging
|
||||||
|
logging.warning(f"Pinecone: Creating index with name: {self._index_name} (length: {len(self._index_name)})")
|
||||||
lock_name = f"vector_indexing_lock_{self._index_name}"
|
lock_name = f"vector_indexing_lock_{self._index_name}"
|
||||||
|
|
||||||
with redis_client.lock(lock_name, timeout=30):
|
with redis_client.lock(lock_name, timeout=30):
|
||||||
@ -117,19 +142,29 @@ class PineconeVector(BaseVector):
|
|||||||
batch_embeddings = embeddings[i:i + batch_size]
|
batch_embeddings = embeddings[i:i + batch_size]
|
||||||
batch_uuids = uuids[i:i + batch_size]
|
batch_uuids = uuids[i:i + batch_size]
|
||||||
|
|
||||||
# Build Pinecone vector data
|
# Build Pinecone vector data (metadata must be primitives or list[str])
|
||||||
vectors_to_upsert = []
|
vectors_to_upsert = []
|
||||||
for doc, embedding, doc_id in zip(batch_documents, batch_embeddings, batch_uuids):
|
for doc, embedding, doc_id in zip(batch_documents, batch_embeddings, batch_uuids):
|
||||||
metadata = {
|
raw_meta = doc.metadata or {}
|
||||||
Field.CONTENT_KEY.value: doc.page_content,
|
safe_meta: dict[str, Any] = {}
|
||||||
Field.METADATA_KEY.value: doc.metadata or {},
|
# lift common identifiers to top-level fields for filtering
|
||||||
Field.GROUP_KEY.value: self._group_id,
|
for k, v in raw_meta.items():
|
||||||
}
|
if isinstance(v, (str, int, float, bool)):
|
||||||
|
safe_meta[k] = v
|
||||||
|
elif isinstance(v, list) and all(isinstance(x, str) for x in v):
|
||||||
|
safe_meta[k] = v
|
||||||
|
else:
|
||||||
|
safe_meta[k] = json.dumps(v, ensure_ascii=False)
|
||||||
|
|
||||||
|
# keep content as string metadata if needed
|
||||||
|
safe_meta[Field.CONTENT_KEY.value] = doc.page_content
|
||||||
|
# group id as string
|
||||||
|
safe_meta[Field.GROUP_KEY.value] = str(self._group_id)
|
||||||
|
|
||||||
vectors_to_upsert.append({
|
vectors_to_upsert.append({
|
||||||
"id": doc_id,
|
"id": doc_id,
|
||||||
"values": embedding,
|
"values": embedding,
|
||||||
"metadata": metadata
|
"metadata": safe_meta
|
||||||
})
|
})
|
||||||
|
|
||||||
# Batch insert to Pinecone
|
# Batch insert to Pinecone
|
||||||
|
|||||||
@ -11,10 +11,10 @@ services:
|
|||||||
PGDATA: ${PGDATA:-/var/lib/postgresql/data/pgdata}
|
PGDATA: ${PGDATA:-/var/lib/postgresql/data/pgdata}
|
||||||
command: >
|
command: >
|
||||||
postgres -c 'max_connections=${POSTGRES_MAX_CONNECTIONS:-100}'
|
postgres -c 'max_connections=${POSTGRES_MAX_CONNECTIONS:-100}'
|
||||||
-c 'shared_buffers=${POSTGRES_SHARED_BUFFERS:-128MB}'
|
-c 'shared_buffers=${POSTGRES_SHARED_BUFFERS:-128MB}'
|
||||||
-c 'work_mem=${POSTGRES_WORK_MEM:-4MB}'
|
-c 'work_mem=${POSTGRES_WORK_MEM:-4MB}'
|
||||||
-c 'maintenance_work_mem=${POSTGRES_MAINTENANCE_WORK_MEM:-64MB}'
|
-c 'maintenance_work_mem=${POSTGRES_MAINTENANCE_WORK_MEM:-64MB}'
|
||||||
-c 'effective_cache_size=${POSTGRES_EFFECTIVE_CACHE_SIZE:-4096MB}'
|
-c 'effective_cache_size=${POSTGRES_EFFECTIVE_CACHE_SIZE:-4096MB}'
|
||||||
volumes:
|
volumes:
|
||||||
- ${PGDATA_HOST_VOLUME:-./volumes/db/data}:/var/lib/postgresql/data
|
- ${PGDATA_HOST_VOLUME:-./volumes/db/data}:/var/lib/postgresql/data
|
||||||
ports:
|
ports:
|
||||||
|
|||||||
@ -151,6 +151,12 @@ x-shared-env: &shared-api-worker-env
|
|||||||
VECTOR_INDEX_NAME_PREFIX: ${VECTOR_INDEX_NAME_PREFIX:-Vector_index}
|
VECTOR_INDEX_NAME_PREFIX: ${VECTOR_INDEX_NAME_PREFIX:-Vector_index}
|
||||||
WEAVIATE_ENDPOINT: ${WEAVIATE_ENDPOINT:-http://weaviate:8080}
|
WEAVIATE_ENDPOINT: ${WEAVIATE_ENDPOINT:-http://weaviate:8080}
|
||||||
WEAVIATE_API_KEY: ${WEAVIATE_API_KEY:-WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih}
|
WEAVIATE_API_KEY: ${WEAVIATE_API_KEY:-WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih}
|
||||||
|
PINECONE_API_KEY: ${PINECONE_API_KEY:-}
|
||||||
|
PINECONE_ENVIRONMENT: ${PINECONE_ENVIRONMENT:-}
|
||||||
|
PINECONE_INDEX_NAME: ${PINECONE_INDEX_NAME:-}
|
||||||
|
PINECONE_CLIENT_TIMEOUT: ${PINECONE_CLIENT_TIMEOUT:-30}
|
||||||
|
PINECONE_BATCH_SIZE: ${PINECONE_BATCH_SIZE:-100}
|
||||||
|
PINECONE_METRIC: ${PINECONE_METRIC:-cosine}
|
||||||
QDRANT_URL: ${QDRANT_URL:-http://qdrant:6333}
|
QDRANT_URL: ${QDRANT_URL:-http://qdrant:6333}
|
||||||
QDRANT_API_KEY: ${QDRANT_API_KEY:-difyai123456}
|
QDRANT_API_KEY: ${QDRANT_API_KEY:-difyai123456}
|
||||||
QDRANT_CLIENT_TIMEOUT: ${QDRANT_CLIENT_TIMEOUT:-20}
|
QDRANT_CLIENT_TIMEOUT: ${QDRANT_CLIENT_TIMEOUT:-20}
|
||||||
@ -582,7 +588,9 @@ x-shared-env: &shared-api-worker-env
|
|||||||
services:
|
services:
|
||||||
# API service
|
# API service
|
||||||
api:
|
api:
|
||||||
image: langgenius/dify-api:1.8.0
|
build:
|
||||||
|
context: ../api
|
||||||
|
dockerfile: Dockerfile
|
||||||
restart: always
|
restart: always
|
||||||
environment:
|
environment:
|
||||||
# Use the shared environment variables.
|
# Use the shared environment variables.
|
||||||
@ -611,7 +619,9 @@ services:
|
|||||||
# worker service
|
# worker service
|
||||||
# The Celery worker for processing the queue.
|
# The Celery worker for processing the queue.
|
||||||
worker:
|
worker:
|
||||||
image: langgenius/dify-api:1.8.0
|
build:
|
||||||
|
context: ../api
|
||||||
|
dockerfile: Dockerfile
|
||||||
restart: always
|
restart: always
|
||||||
environment:
|
environment:
|
||||||
# Use the shared environment variables.
|
# Use the shared environment variables.
|
||||||
@ -638,7 +648,9 @@ services:
|
|||||||
# worker_beat service
|
# worker_beat service
|
||||||
# Celery beat for scheduling periodic tasks.
|
# Celery beat for scheduling periodic tasks.
|
||||||
worker_beat:
|
worker_beat:
|
||||||
image: langgenius/dify-api:1.8.0
|
build:
|
||||||
|
context: ../api
|
||||||
|
dockerfile: Dockerfile
|
||||||
restart: always
|
restart: always
|
||||||
environment:
|
environment:
|
||||||
# Use the shared environment variables.
|
# Use the shared environment variables.
|
||||||
@ -656,7 +668,7 @@ services:
|
|||||||
|
|
||||||
# Frontend web application.
|
# Frontend web application.
|
||||||
web:
|
web:
|
||||||
image: langgenius/dify-web:1.8.0
|
image: langgenius/dify-web:release-e-1.8.2
|
||||||
restart: always
|
restart: always
|
||||||
environment:
|
environment:
|
||||||
CONSOLE_API_URL: ${CONSOLE_API_URL:-}
|
CONSOLE_API_URL: ${CONSOLE_API_URL:-}
|
||||||
@ -691,10 +703,10 @@ services:
|
|||||||
PGDATA: ${PGDATA:-/var/lib/postgresql/data/pgdata}
|
PGDATA: ${PGDATA:-/var/lib/postgresql/data/pgdata}
|
||||||
command: >
|
command: >
|
||||||
postgres -c 'max_connections=${POSTGRES_MAX_CONNECTIONS:-100}'
|
postgres -c 'max_connections=${POSTGRES_MAX_CONNECTIONS:-100}'
|
||||||
-c 'shared_buffers=${POSTGRES_SHARED_BUFFERS:-128MB}'
|
-c 'shared_buffers=${POSTGRES_SHARED_BUFFERS:-128MB}'
|
||||||
-c 'work_mem=${POSTGRES_WORK_MEM:-4MB}'
|
-c 'work_mem=${POSTGRES_WORK_MEM:-4MB}'
|
||||||
-c 'maintenance_work_mem=${POSTGRES_MAINTENANCE_WORK_MEM:-64MB}'
|
-c 'maintenance_work_mem=${POSTGRES_MAINTENANCE_WORK_MEM:-64MB}'
|
||||||
-c 'effective_cache_size=${POSTGRES_EFFECTIVE_CACHE_SIZE:-4096MB}'
|
-c 'effective_cache_size=${POSTGRES_EFFECTIVE_CACHE_SIZE:-4096MB}'
|
||||||
volumes:
|
volumes:
|
||||||
- ./volumes/db/data:/var/lib/postgresql/data
|
- ./volumes/db/data:/var/lib/postgresql/data
|
||||||
healthcheck:
|
healthcheck:
|
||||||
@ -719,7 +731,7 @@ services:
|
|||||||
|
|
||||||
# The DifySandbox
|
# The DifySandbox
|
||||||
sandbox:
|
sandbox:
|
||||||
image: langgenius/dify-sandbox:0.2.12
|
image: langgenius/dify-sandbox:latest
|
||||||
restart: always
|
restart: always
|
||||||
environment:
|
environment:
|
||||||
# The DifySandbox configurations
|
# The DifySandbox configurations
|
||||||
@ -743,7 +755,7 @@ services:
|
|||||||
|
|
||||||
# plugin daemon
|
# plugin daemon
|
||||||
plugin_daemon:
|
plugin_daemon:
|
||||||
image: langgenius/dify-plugin-daemon:0.2.0-local
|
image: langgenius/dify-plugin-daemon:deploy-dev-local
|
||||||
restart: always
|
restart: always
|
||||||
environment:
|
environment:
|
||||||
# Use the shared environment variables.
|
# Use the shared environment variables.
|
||||||
@ -910,7 +922,7 @@ services:
|
|||||||
# Qdrant vector store.
|
# Qdrant vector store.
|
||||||
# (if used, you need to set VECTOR_STORE to qdrant in the api & worker service.)
|
# (if used, you need to set VECTOR_STORE to qdrant in the api & worker service.)
|
||||||
qdrant:
|
qdrant:
|
||||||
image: langgenius/qdrant:v1.7.3
|
image: langgenius/qdrant:latest
|
||||||
profiles:
|
profiles:
|
||||||
- qdrant
|
- qdrant
|
||||||
restart: always
|
restart: always
|
||||||
|
|||||||
@ -79,6 +79,17 @@ WEAVIATE_AUTHORIZATION_ADMINLIST_ENABLED=true
|
|||||||
WEAVIATE_AUTHORIZATION_ADMINLIST_USERS=hello@dify.ai
|
WEAVIATE_AUTHORIZATION_ADMINLIST_USERS=hello@dify.ai
|
||||||
WEAVIATE_HOST_VOLUME=./volumes/weaviate
|
WEAVIATE_HOST_VOLUME=./volumes/weaviate
|
||||||
|
|
||||||
|
# ------------------------------
|
||||||
|
# Environment Variables for Pinecone Vector Database
|
||||||
|
# ------------------------------
|
||||||
|
# Get your API key from: https://app.pinecone.io/
|
||||||
|
# PINECONE_API_KEY=your-pinecone-api-key
|
||||||
|
# PINECONE_ENVIRONMENT=us-west1-gcp
|
||||||
|
# PINECONE_INDEX_NAME=dify-pinecone-index
|
||||||
|
# PINECONE_CLIENT_TIMEOUT=30
|
||||||
|
# PINECONE_BATCH_SIZE=100
|
||||||
|
# PINECONE_METRIC=cosine
|
||||||
|
|
||||||
# ------------------------------
|
# ------------------------------
|
||||||
# Docker Compose Service Expose Host Port Configurations
|
# Docker Compose Service Expose Host Port Configurations
|
||||||
# ------------------------------
|
# ------------------------------
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user