fix(pinecone): normalize index names and sanitize metadata to meet API constraints

Frederick2313072 2025-09-20 02:56:53 +08:00
parent 90fc5a1f12
commit 1cbe9eedb6
5 changed files with 94 additions and 35 deletions
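
Pinecone index names may only contain lowercase letters, digits, and hyphens, and are capped at 45 characters; metadata values must be strings, numbers, booleans, or lists of strings. The constructor change below enforces the naming rule at index-creation time. For reference, a minimal standalone sketch of the same rule (the helper name and the md5-suffix scheme mirror the diff and are illustrative, not a public Dify API):

import hashlib
import re


def normalize_index_name(collection_name: str, max_len: int = 45) -> str:
    """Map an arbitrary collection name onto Pinecone's index-name rules:
    lowercase, only a-z, 0-9 and '-', at most max_len characters."""
    name = re.sub(r"[^a-z0-9-]+", "-", collection_name.lower())
    name = re.sub(r"-+", "-", name).strip("-")
    if len(name) > max_len:
        # keep the name unique and deterministic by appending a short content hash
        suffix = hashlib.md5(name.encode()).hexdigest()[:8]
        name = f"{name[:max_len - 9].rstrip('-')}-{suffix}"
    if not name:
        name = f"index-{hashlib.md5(collection_name.encode()).hexdigest()[:8]}"
    return name


# e.g. normalize_index_name("Vector_index_ABC123_Node") -> "vector-index-abc123-node"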


@@ -712,6 +712,7 @@ class DatasetRetrievalSettingMockApi(Resource):
| VectorType.BAIDU
| VectorType.VIKINGDB
| VectorType.UPSTASH
| VectorType.PINECONE
):
return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]}
case (
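
The hunk above adds VectorType.PINECONE to the group of stores for which the retrieval-settings endpoint advertises semantic search only. A self-contained sketch of that routing pattern (the enums here are reduced stand-ins for Dify's VectorType and RetrievalMethod; their members and values are illustrative):

from enum import Enum


class VectorType(str, Enum):
    # reduced stand-in for Dify's VectorType enum
    PINECONE = "pinecone"
    UPSTASH = "upstash"
    WEAVIATE = "weaviate"


class RetrievalMethod(str, Enum):
    # reduced stand-in for Dify's RetrievalMethod enum
    SEMANTIC_SEARCH = "semantic_search"
    FULL_TEXT_SEARCH = "full_text_search"
    HYBRID_SEARCH = "hybrid_search"


def supported_retrieval_methods(vector_type: VectorType) -> list[str]:
    match vector_type:
        case VectorType.PINECONE | VectorType.UPSTASH:
            # vector-only stores: no keyword index, so only semantic search is offered
            return [RetrievalMethod.SEMANTIC_SEARCH.value]
        case _:
            # stores with a keyword index also report full-text and hybrid search
            return [
                RetrievalMethod.SEMANTIC_SEARCH.value,
                RetrievalMethod.FULL_TEXT_SEARCH.value,
                RetrievalMethod.HYBRID_SEARCH.value,
            ]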


@@ -35,39 +35,64 @@ class PineconeVector(BaseVector):
super().__init__(collection_name)
self._client_config = config
self._group_id = group_id
# Initialize Pinecone client
self._pc = Pinecone(api_key=config.api_key)
# Use collection_name as index name
self._index_name = collection_name
# Initialize Pinecone client with SSL configuration
try:
self._pc = Pinecone(
api_key=config.api_key,
# Configure SSL to handle connection issues
ssl_ca_certs=None, # Use system default CA certificates
)
except Exception as e:
# Fallback to basic initialization if SSL config fails
import logging
logging.warning(f"Failed to initialize Pinecone with SSL config: {e}, using basic config")
self._pc = Pinecone(api_key=config.api_key)
# Normalize index name: lowercase, only a-z0-9- and <=45 chars
import re, hashlib
base_name = collection_name.lower()
base_name = re.sub(r'[^a-z0-9-]+', '-', base_name) # replace invalid chars with '-'
base_name = re.sub(r'-+', '-', base_name).strip('-')
if len(base_name) > 45:
hash_suffix = hashlib.md5(base_name.encode()).hexdigest()[:8]
truncated_name = base_name[:45-9].rstrip('-')
self._index_name = f"{truncated_name}-{hash_suffix}"
else:
self._index_name = base_name
# Guard empty name
if not self._index_name:
self._index_name = f"index-{hashlib.md5(collection_name.encode()).hexdigest()[:8]}"
self._index = None
def get_type(self) -> str:
"""Return vector database type identifier"""
return "pinecone"
def to_index_struct(self) -> dict:
"""Generate index structure dictionary"""
return {
"type": self.get_type(),
"vector_store": {"class_prefix": self._collection_name}
}
def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
"""Create vector index"""
if texts:
# Get vector dimension
vector_size = len(embeddings[0])
# Create Pinecone index
self.create_index(vector_size)
# Add vector data
self.add_texts(texts, embeddings, **kwargs)
def create_index(self, dimension: int):
"""Create Pinecone index"""
# Debug: Log the index name being used
import logging
logging.warning(f"Pinecone: Creating index with name: {self._index_name} (length: {len(self._index_name)})")
lock_name = f"vector_indexing_lock_{self._index_name}"
with redis_client.lock(lock_name, timeout=30):
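
The rest of create_index is not shown in this hunk. For orientation, a minimal sketch of serverless index creation with the Pinecone v3+ SDK, which is presumably what the locked section performs (cloud, region, and metric are placeholder values, not taken from this commit):

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="...")
index_name = "example-index"   # already normalized as in __init__ above
dimension = 1536               # e.g. len(embeddings[0]) from create()

# create the index only if it does not exist yet
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",  # placeholder; PINECONE_METRIC below defaults to cosine
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),  # placeholder cloud/region
    )
index = pc.Index(index_name)
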
@@ -117,19 +142,29 @@ class PineconeVector(BaseVector):
batch_embeddings = embeddings[i:i + batch_size]
batch_uuids = uuids[i:i + batch_size]
# Build Pinecone vector data (metadata must be primitives or list[str])
vectors_to_upsert = []
for doc, embedding, doc_id in zip(batch_documents, batch_embeddings, batch_uuids):
metadata = {
Field.CONTENT_KEY.value: doc.page_content,
Field.METADATA_KEY.value: doc.metadata or {},
Field.GROUP_KEY.value: self._group_id,
}
raw_meta = doc.metadata or {}
safe_meta: dict[str, Any] = {}
# lift common identifiers to top-level fields for filtering
for k, v in raw_meta.items():
if isinstance(v, (str, int, float, bool)):
safe_meta[k] = v
elif isinstance(v, list) and all(isinstance(x, str) for x in v):
safe_meta[k] = v
else:
safe_meta[k] = json.dumps(v, ensure_ascii=False)
# keep content as string metadata if needed
safe_meta[Field.CONTENT_KEY.value] = doc.page_content
# group id as string
safe_meta[Field.GROUP_KEY.value] = str(self._group_id)
vectors_to_upsert.append({
"id": doc_id,
"values": embedding,
"metadata": metadata
"metadata": safe_meta
})
# Batch insert to Pinecone
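
Pinecone rejects nested objects in metadata: values must be strings, numbers, booleans, or lists of strings. The upsert change above keeps only primitive values and JSON-encodes the rest; a standalone sketch of the same rule (the helper name is illustrative):

import json
from typing import Any


def sanitize_metadata(raw_meta: dict[str, Any]) -> dict[str, Any]:
    """Keep values Pinecone accepts (str/int/float/bool or list[str]);
    JSON-encode anything else so no information is silently dropped."""
    safe_meta: dict[str, Any] = {}
    for key, value in raw_meta.items():
        if isinstance(value, (str, int, float, bool)):
            safe_meta[key] = value
        elif isinstance(value, list) and all(isinstance(x, str) for x in value):
            safe_meta[key] = value
        else:
            safe_meta[key] = json.dumps(value, ensure_ascii=False)
    return safe_meta


# e.g. sanitize_metadata({"doc_id": "a1", "tags": ["x", "y"], "extra": {"page": 3}})
# -> {'doc_id': 'a1', 'tags': ['x', 'y'], 'extra': '{"page": 3}'}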


@@ -11,10 +11,10 @@ services:
PGDATA: ${PGDATA:-/var/lib/postgresql/data/pgdata}
command: >
postgres -c 'max_connections=${POSTGRES_MAX_CONNECTIONS:-100}'
-c 'shared_buffers=${POSTGRES_SHARED_BUFFERS:-128MB}'
-c 'work_mem=${POSTGRES_WORK_MEM:-4MB}'
-c 'maintenance_work_mem=${POSTGRES_MAINTENANCE_WORK_MEM:-64MB}'
-c 'effective_cache_size=${POSTGRES_EFFECTIVE_CACHE_SIZE:-4096MB}'
volumes:
- ${PGDATA_HOST_VOLUME:-./volumes/db/data}:/var/lib/postgresql/data
ports:


@@ -151,6 +151,12 @@ x-shared-env: &shared-api-worker-env
VECTOR_INDEX_NAME_PREFIX: ${VECTOR_INDEX_NAME_PREFIX:-Vector_index}
WEAVIATE_ENDPOINT: ${WEAVIATE_ENDPOINT:-http://weaviate:8080}
WEAVIATE_API_KEY: ${WEAVIATE_API_KEY:-WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih}
PINECONE_API_KEY: ${PINECONE_API_KEY:-}
PINECONE_ENVIRONMENT: ${PINECONE_ENVIRONMENT:-}
PINECONE_INDEX_NAME: ${PINECONE_INDEX_NAME:-}
PINECONE_CLIENT_TIMEOUT: ${PINECONE_CLIENT_TIMEOUT:-30}
PINECONE_BATCH_SIZE: ${PINECONE_BATCH_SIZE:-100}
PINECONE_METRIC: ${PINECONE_METRIC:-cosine}
QDRANT_URL: ${QDRANT_URL:-http://qdrant:6333}
QDRANT_API_KEY: ${QDRANT_API_KEY:-difyai123456}
QDRANT_CLIENT_TIMEOUT: ${QDRANT_CLIENT_TIMEOUT:-20}
@@ -582,7 +588,9 @@ x-shared-env: &shared-api-worker-env
services:
# API service
api:
image: langgenius/dify-api:1.8.0
build:
context: ../api
dockerfile: Dockerfile
restart: always
environment:
# Use the shared environment variables.
@@ -611,7 +619,9 @@ services:
# worker service
# The Celery worker for processing the queue.
worker:
image: langgenius/dify-api:1.8.0
build:
context: ../api
dockerfile: Dockerfile
restart: always
environment:
# Use the shared environment variables.
@@ -638,7 +648,9 @@ services:
# worker_beat service
# Celery beat for scheduling periodic tasks.
worker_beat:
image: langgenius/dify-api:1.8.0
build:
context: ../api
dockerfile: Dockerfile
restart: always
environment:
# Use the shared environment variables.
@@ -656,7 +668,7 @@ services:
# Frontend web application.
web:
image: langgenius/dify-web:1.8.0
image: langgenius/dify-web:release-e-1.8.2
restart: always
environment:
CONSOLE_API_URL: ${CONSOLE_API_URL:-}
@@ -691,10 +703,10 @@ services:
PGDATA: ${PGDATA:-/var/lib/postgresql/data/pgdata}
command: >
postgres -c 'max_connections=${POSTGRES_MAX_CONNECTIONS:-100}'
-c 'shared_buffers=${POSTGRES_SHARED_BUFFERS:-128MB}'
-c 'work_mem=${POSTGRES_WORK_MEM:-4MB}'
-c 'maintenance_work_mem=${POSTGRES_MAINTENANCE_WORK_MEM:-64MB}'
-c 'effective_cache_size=${POSTGRES_EFFECTIVE_CACHE_SIZE:-4096MB}'
volumes:
- ./volumes/db/data:/var/lib/postgresql/data
healthcheck:
@@ -719,7 +731,7 @@ services:
# The DifySandbox
sandbox:
image: langgenius/dify-sandbox:0.2.12
image: langgenius/dify-sandbox:latest
restart: always
environment:
# The DifySandbox configurations
@@ -743,7 +755,7 @@ services:
# plugin daemon
plugin_daemon:
image: langgenius/dify-plugin-daemon:0.2.0-local
image: langgenius/dify-plugin-daemon:deploy-dev-local
restart: always
environment:
# Use the shared environment variables.
@@ -910,7 +922,7 @@ services:
# Qdrant vector store.
# (if used, you need to set VECTOR_STORE to qdrant in the api & worker service.)
qdrant:
image: langgenius/qdrant:v1.7.3
image: langgenius/qdrant:latest
profiles:
- qdrant
restart: always


@@ -79,6 +79,17 @@ WEAVIATE_AUTHORIZATION_ADMINLIST_ENABLED=true
WEAVIATE_AUTHORIZATION_ADMINLIST_USERS=hello@dify.ai
WEAVIATE_HOST_VOLUME=./volumes/weaviate
# ------------------------------
# Environment Variables for Pinecone Vector Database
# ------------------------------
# Get your API key from: https://app.pinecone.io/
# PINECONE_API_KEY=your-pinecone-api-key
# PINECONE_ENVIRONMENT=us-west1-gcp
# PINECONE_INDEX_NAME=dify-pinecone-index
# PINECONE_CLIENT_TIMEOUT=30
# PINECONE_BATCH_SIZE=100
# PINECONE_METRIC=cosine
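
These variables mirror the PINECONE_* entries added to the shared API/worker environment above. How Dify's settings classes consume them is outside this commit; a minimal sketch of reading them straight from the environment (variable names and defaults are taken from this diff, the dataclass itself is illustrative). Selecting Pinecone as the active store presumably also requires VECTOR_STORE=pinecone, which this file does not show.

import os
from dataclasses import dataclass


@dataclass
class PineconeSettings:
    # illustrative container, not Dify's actual configuration class
    api_key: str
    environment: str
    index_name: str
    client_timeout: int
    batch_size: int
    metric: str


def load_pinecone_settings() -> PineconeSettings:
    # variable names and defaults match the docker-compose/.env entries in this commit
    return PineconeSettings(
        api_key=os.environ.get("PINECONE_API_KEY", ""),
        environment=os.environ.get("PINECONE_ENVIRONMENT", ""),
        index_name=os.environ.get("PINECONE_INDEX_NAME", ""),
        client_timeout=int(os.environ.get("PINECONE_CLIENT_TIMEOUT", "30")),
        batch_size=int(os.environ.get("PINECONE_BATCH_SIZE", "100")),
        metric=os.environ.get("PINECONE_METRIC", "cosine"),
    )
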
# ------------------------------
# Docker Compose Service Expose Host Port Configurations
# ------------------------------