mirror of https://github.com/langgenius/dify.git
pinecone
This commit is contained in:
parent 41dfdf1ac0
commit 90fc5a1f12
@ -0,0 +1,581 @@
# Your App secret key will be used for securely signing the session cookie
# Make sure you are changing this key for your deployment with a strong key.
# You can generate a strong key using `openssl rand -base64 42`.
# Alternatively you can set it with `SECRET_KEY` environment variable.
SECRET_KEY=9k4es1n7e13xdGP+Rs+DmRR2gTvSdZL9KIc4Bgah+uGLEcUgmf5+FUUc

# Ensure UTF-8 encoding
LANG=en_US.UTF-8
LC_ALL=en_US.UTF-8
PYTHONIOENCODING=utf-8

# Console API base URL
CONSOLE_API_URL=http://localhost:5001
CONSOLE_WEB_URL=http://localhost:3000

# Service API base URL
SERVICE_API_URL=http://localhost:5001

# Web APP base URL
APP_WEB_URL=http://localhost:3000

# Files URL
FILES_URL=http://localhost:5001

# INTERNAL_FILES_URL is used for plugin daemon communication within Docker network.
# Set this to the internal Docker service URL for proper plugin file access.
# Example: INTERNAL_FILES_URL=http://api:5001
INTERNAL_FILES_URL=http://127.0.0.1:5001

# The time in seconds after which the signature is rejected
FILES_ACCESS_TIMEOUT=300

# Access token expiration time in minutes
ACCESS_TOKEN_EXPIRE_MINUTES=60

# Refresh token expiration time in days
REFRESH_TOKEN_EXPIRE_DAYS=30

# Redis configuration
REDIS_HOST=localhost
REDIS_PORT=6379
REDIS_USERNAME=
REDIS_PASSWORD=difyai123456
REDIS_USE_SSL=false
# SSL configuration for Redis (when REDIS_USE_SSL=true)
REDIS_SSL_CERT_REQS=CERT_NONE
# Options: CERT_NONE, CERT_OPTIONAL, CERT_REQUIRED
REDIS_SSL_CA_CERTS=
# Path to CA certificate file for SSL verification
REDIS_SSL_CERTFILE=
# Path to client certificate file for SSL authentication
REDIS_SSL_KEYFILE=
# Path to client private key file for SSL authentication
REDIS_DB=0

# Redis Sentinel configuration.
REDIS_USE_SENTINEL=false
REDIS_SENTINELS=
REDIS_SENTINEL_SERVICE_NAME=
REDIS_SENTINEL_USERNAME=
REDIS_SENTINEL_PASSWORD=
REDIS_SENTINEL_SOCKET_TIMEOUT=0.1

# Redis Cluster configuration.
REDIS_USE_CLUSTERS=false
REDIS_CLUSTERS=
REDIS_CLUSTERS_PASSWORD=

# Celery configuration
CELERY_BROKER_URL=redis://:difyai123456@localhost:${REDIS_PORT}/1
CELERY_BACKEND=redis

# PostgreSQL database configuration
DB_USERNAME=postgres
DB_PASSWORD=difyai123456
DB_HOST=localhost
DB_PORT=5432
DB_DATABASE=dify

# Storage configuration
# Used to store uploaded files, private keys...
# Storage type: opendal, s3, aliyun-oss, azure-blob, baidu-obs, google-storage, huawei-obs, oci-storage, tencent-cos, volcengine-tos, supabase
STORAGE_TYPE=opendal

# Apache OpenDAL storage configuration, refer to https://github.com/apache/opendal
OPENDAL_SCHEME=fs
OPENDAL_FS_ROOT=storage

# S3 Storage configuration
S3_USE_AWS_MANAGED_IAM=false
S3_ENDPOINT=https://your-bucket-name.storage.s3.cloudflare.com
S3_BUCKET_NAME=your-bucket-name
S3_ACCESS_KEY=your-access-key
S3_SECRET_KEY=your-secret-key
S3_REGION=your-region

# Azure Blob Storage configuration
AZURE_BLOB_ACCOUNT_NAME=your-account-name
AZURE_BLOB_ACCOUNT_KEY=your-account-key
AZURE_BLOB_CONTAINER_NAME=your-container-name
AZURE_BLOB_ACCOUNT_URL=https://<your_account_name>.blob.core.windows.net

# Aliyun OSS Storage configuration
ALIYUN_OSS_BUCKET_NAME=your-bucket-name
ALIYUN_OSS_ACCESS_KEY=your-access-key
ALIYUN_OSS_SECRET_KEY=your-secret-key
ALIYUN_OSS_ENDPOINT=your-endpoint
ALIYUN_OSS_AUTH_VERSION=v1
ALIYUN_OSS_REGION=your-region
# Don't start with '/'. OSS doesn't support leading slash in object names.
ALIYUN_OSS_PATH=your-path

# Google Storage configuration
GOOGLE_STORAGE_BUCKET_NAME=your-bucket-name
GOOGLE_STORAGE_SERVICE_ACCOUNT_JSON_BASE64=your-google-service-account-json-base64-string

# Tencent COS Storage configuration
TENCENT_COS_BUCKET_NAME=your-bucket-name
TENCENT_COS_SECRET_KEY=your-secret-key
TENCENT_COS_SECRET_ID=your-secret-id
TENCENT_COS_REGION=your-region
TENCENT_COS_SCHEME=your-scheme

# Huawei OBS Storage Configuration
HUAWEI_OBS_BUCKET_NAME=your-bucket-name
HUAWEI_OBS_SECRET_KEY=your-secret-key
HUAWEI_OBS_ACCESS_KEY=your-access-key
HUAWEI_OBS_SERVER=your-server-url

# Baidu OBS Storage Configuration
BAIDU_OBS_BUCKET_NAME=your-bucket-name
BAIDU_OBS_SECRET_KEY=your-secret-key
BAIDU_OBS_ACCESS_KEY=your-access-key
BAIDU_OBS_ENDPOINT=your-server-url

# OCI Storage configuration
OCI_ENDPOINT=your-endpoint
OCI_BUCKET_NAME=your-bucket-name
OCI_ACCESS_KEY=your-access-key
OCI_SECRET_KEY=your-secret-key
OCI_REGION=your-region

# Volcengine TOS Storage configuration
VOLCENGINE_TOS_ENDPOINT=your-endpoint
VOLCENGINE_TOS_BUCKET_NAME=your-bucket-name
VOLCENGINE_TOS_ACCESS_KEY=your-access-key
VOLCENGINE_TOS_SECRET_KEY=your-secret-key
VOLCENGINE_TOS_REGION=your-region

# Supabase Storage Configuration
SUPABASE_BUCKET_NAME=your-bucket-name
SUPABASE_API_KEY=your-access-key
SUPABASE_URL=your-server-url

# CORS configuration
WEB_API_CORS_ALLOW_ORIGINS=http://localhost:3000,*
CONSOLE_CORS_ALLOW_ORIGINS=http://localhost:3000,*

# Vector database configuration
# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`, `opengauss`, `tablestore`, `vastbase`, `tidb`, `tidb_on_qdrant`, `baidu`, `lindorm`, `huawei_cloud`, `upstash`, `matrixone`, `pinecone`.
VECTOR_STORE=weaviate
# Prefix used to create collection name in vector database
VECTOR_INDEX_NAME_PREFIX=Vector_index

# Weaviate configuration
WEAVIATE_ENDPOINT=http://localhost:8080
WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih
WEAVIATE_GRPC_ENABLED=false
WEAVIATE_BATCH_SIZE=100

# Qdrant configuration, use `http://localhost:6333` for local mode or `https://your-qdrant-cluster-url.qdrant.io` for remote mode
QDRANT_URL=http://localhost:6333
QDRANT_API_KEY=difyai123456
QDRANT_CLIENT_TIMEOUT=20
QDRANT_GRPC_ENABLED=false
QDRANT_GRPC_PORT=6334
QDRANT_REPLICATION_FACTOR=1

# Couchbase configuration
COUCHBASE_CONNECTION_STRING=127.0.0.1
COUCHBASE_USER=Administrator
COUCHBASE_PASSWORD=password
COUCHBASE_BUCKET_NAME=Embeddings
COUCHBASE_SCOPE_NAME=_default

# Milvus configuration
MILVUS_URI=http://127.0.0.1:19530
MILVUS_TOKEN=
MILVUS_USER=root
MILVUS_PASSWORD=Milvus
MILVUS_ANALYZER_PARAMS=

# MyScale configuration
MYSCALE_HOST=127.0.0.1
MYSCALE_PORT=8123
MYSCALE_USER=default
MYSCALE_PASSWORD=
MYSCALE_DATABASE=default
MYSCALE_FTS_PARAMS=

# Relyt configuration
RELYT_HOST=127.0.0.1
RELYT_PORT=5432
RELYT_USER=postgres
RELYT_PASSWORD=postgres
RELYT_DATABASE=postgres

# Tencent configuration
TENCENT_VECTOR_DB_URL=http://127.0.0.1
TENCENT_VECTOR_DB_API_KEY=dify
TENCENT_VECTOR_DB_TIMEOUT=30
TENCENT_VECTOR_DB_USERNAME=dify
TENCENT_VECTOR_DB_DATABASE=dify
TENCENT_VECTOR_DB_SHARD=1
TENCENT_VECTOR_DB_REPLICAS=2
TENCENT_VECTOR_DB_ENABLE_HYBRID_SEARCH=false

# ElasticSearch configuration
ELASTICSEARCH_HOST=127.0.0.1
ELASTICSEARCH_PORT=9200
ELASTICSEARCH_USERNAME=elastic
ELASTICSEARCH_PASSWORD=elastic

# PGVECTO_RS configuration
PGVECTO_RS_HOST=localhost
PGVECTO_RS_PORT=5431
PGVECTO_RS_USER=postgres
PGVECTO_RS_PASSWORD=difyai123456
PGVECTO_RS_DATABASE=postgres

# PGVector configuration
PGVECTOR_HOST=127.0.0.1
PGVECTOR_PORT=5433
PGVECTOR_USER=postgres
PGVECTOR_PASSWORD=postgres
PGVECTOR_DATABASE=postgres
PGVECTOR_MIN_CONNECTION=1
PGVECTOR_MAX_CONNECTION=5

# TableStore Vector configuration
TABLESTORE_ENDPOINT=https://instance-name.cn-hangzhou.ots.aliyuncs.com
TABLESTORE_INSTANCE_NAME=instance-name
TABLESTORE_ACCESS_KEY_ID=xxx
TABLESTORE_ACCESS_KEY_SECRET=xxx
TABLESTORE_NORMALIZE_FULLTEXT_BM25_SCORE=false

# TiDB Vector configuration
TIDB_VECTOR_HOST=xxx.eu-central-1.xxx.aws.tidbcloud.com
TIDB_VECTOR_PORT=4000
TIDB_VECTOR_USER=xxx.root
TIDB_VECTOR_PASSWORD=xxxxxx
TIDB_VECTOR_DATABASE=dify

# TiDB on Qdrant configuration
TIDB_ON_QDRANT_URL=http://127.0.0.1
TIDB_ON_QDRANT_API_KEY=dify
TIDB_ON_QDRANT_CLIENT_TIMEOUT=20
TIDB_ON_QDRANT_GRPC_ENABLED=false
TIDB_ON_QDRANT_GRPC_PORT=6334
TIDB_PUBLIC_KEY=dify
TIDB_PRIVATE_KEY=dify
TIDB_API_URL=http://127.0.0.1
TIDB_IAM_API_URL=http://127.0.0.1
TIDB_REGION=regions/aws-us-east-1
TIDB_PROJECT_ID=dify
TIDB_SPEND_LIMIT=100

# Chroma configuration
CHROMA_HOST=127.0.0.1
CHROMA_PORT=8000
CHROMA_TENANT=default_tenant
CHROMA_DATABASE=default_database
CHROMA_AUTH_PROVIDER=chromadb.auth.token_authn.TokenAuthenticationServerProvider
CHROMA_AUTH_CREDENTIALS=difyai123456

# AnalyticDB configuration
ANALYTICDB_KEY_ID=your-ak
ANALYTICDB_KEY_SECRET=your-sk
ANALYTICDB_REGION_ID=cn-hangzhou
ANALYTICDB_INSTANCE_ID=gp-ab123456
ANALYTICDB_ACCOUNT=testaccount
ANALYTICDB_PASSWORD=testpassword
ANALYTICDB_NAMESPACE=dify
ANALYTICDB_NAMESPACE_PASSWORD=difypassword
ANALYTICDB_HOST=gp-test.aliyuncs.com
ANALYTICDB_PORT=5432
ANALYTICDB_MIN_CONNECTION=1
ANALYTICDB_MAX_CONNECTION=5

# OpenSearch configuration
OPENSEARCH_HOST=127.0.0.1
OPENSEARCH_PORT=9200
OPENSEARCH_USER=admin
OPENSEARCH_PASSWORD=admin
OPENSEARCH_SECURE=true
OPENSEARCH_VERIFY_CERTS=true

# Baidu configuration
BAIDU_VECTOR_DB_ENDPOINT=http://127.0.0.1:5287
BAIDU_VECTOR_DB_CONNECTION_TIMEOUT_MS=30000
BAIDU_VECTOR_DB_ACCOUNT=root
BAIDU_VECTOR_DB_API_KEY=dify
BAIDU_VECTOR_DB_DATABASE=dify
BAIDU_VECTOR_DB_SHARD=1
BAIDU_VECTOR_DB_REPLICAS=3

# Upstash configuration
UPSTASH_VECTOR_URL=your-server-url
UPSTASH_VECTOR_TOKEN=your-access-token

# VikingDB configuration
VIKINGDB_ACCESS_KEY=your-ak
VIKINGDB_SECRET_KEY=your-sk
VIKINGDB_REGION=cn-shanghai
VIKINGDB_HOST=api-vikingdb.xxx.volces.com
VIKINGDB_SCHEMA=http
VIKINGDB_CONNECTION_TIMEOUT=30
VIKINGDB_SOCKET_TIMEOUT=30

# Matrixone configuration
MATRIXONE_HOST=127.0.0.1
MATRIXONE_PORT=6001
MATRIXONE_USER=dump
MATRIXONE_PASSWORD=111
MATRIXONE_DATABASE=dify

# Lindorm configuration
LINDORM_URL=http://ld-*******************-proxy-search-pub.lindorm.aliyuncs.com:30070
LINDORM_USERNAME=admin
LINDORM_PASSWORD=admin
USING_UGC_INDEX=False
LINDORM_QUERY_TIMEOUT=1

# OceanBase Vector configuration
OCEANBASE_VECTOR_HOST=127.0.0.1
OCEANBASE_VECTOR_PORT=2881
OCEANBASE_VECTOR_USER=root@test
OCEANBASE_VECTOR_PASSWORD=difyai123456
OCEANBASE_VECTOR_DATABASE=test
OCEANBASE_MEMORY_LIMIT=6G
OCEANBASE_ENABLE_HYBRID_SEARCH=false

# openGauss configuration
OPENGAUSS_HOST=127.0.0.1
OPENGAUSS_PORT=6600
OPENGAUSS_USER=postgres
OPENGAUSS_PASSWORD=Dify@123
OPENGAUSS_DATABASE=dify
OPENGAUSS_MIN_CONNECTION=1
OPENGAUSS_MAX_CONNECTION=5

# Upload configuration
UPLOAD_FILE_SIZE_LIMIT=15
UPLOAD_FILE_BATCH_LIMIT=5
UPLOAD_IMAGE_FILE_SIZE_LIMIT=10
UPLOAD_VIDEO_FILE_SIZE_LIMIT=100
UPLOAD_AUDIO_FILE_SIZE_LIMIT=50

# Model configuration
MULTIMODAL_SEND_FORMAT=base64
PROMPT_GENERATION_MAX_TOKENS=512
CODE_GENERATION_MAX_TOKENS=1024
PLUGIN_BASED_TOKEN_COUNTING_ENABLED=false


# Pinecone configuration, only available when VECTOR_STORE is `pinecone`
PINECONE_API_KEY=your-pinecone-api-key
PINECONE_ENVIRONMENT=your-pinecone-environment
PINECONE_INDEX_NAME=dify-index
PINECONE_CLIENT_TIMEOUT=30
PINECONE_BATCH_SIZE=100
PINECONE_METRIC=cosine
PINECONE_PODS=1
PINECONE_POD_TYPE=s1

# Mail configuration, supported types: resend, smtp, sendgrid
MAIL_TYPE=
# If using SendGrid, use the 'from' field for authentication if necessary.
MAIL_DEFAULT_SEND_FROM=no-reply <no-reply@dify.ai>
# Resend configuration
RESEND_API_KEY=
RESEND_API_URL=https://api.resend.com
# SMTP configuration
SMTP_SERVER=smtp.gmail.com
SMTP_PORT=465
SMTP_USERNAME=123
SMTP_PASSWORD=abc
SMTP_USE_TLS=true
SMTP_OPPORTUNISTIC_TLS=false
# SendGrid configuration
SENDGRID_API_KEY=
# Sentry configuration
SENTRY_DSN=

# DEBUG
DEBUG=false
ENABLE_REQUEST_LOGGING=False
SQLALCHEMY_ECHO=false

# Notion import configuration, supports public and internal integrations
NOTION_INTEGRATION_TYPE=public
NOTION_CLIENT_SECRET=your-client-secret
NOTION_CLIENT_ID=your-client-id
NOTION_INTERNAL_SECRET=your-internal-secret

ETL_TYPE=dify
UNSTRUCTURED_API_URL=
UNSTRUCTURED_API_KEY=
SCARF_NO_ANALYTICS=true

# SSRF proxy configuration
SSRF_PROXY_HTTP_URL=
SSRF_PROXY_HTTPS_URL=
SSRF_DEFAULT_MAX_RETRIES=3
SSRF_DEFAULT_TIME_OUT=5
SSRF_DEFAULT_CONNECT_TIME_OUT=5
SSRF_DEFAULT_READ_TIME_OUT=5
SSRF_DEFAULT_WRITE_TIME_OUT=5

BATCH_UPLOAD_LIMIT=10
KEYWORD_DATA_SOURCE_TYPE=database

# Workflow file upload limit
WORKFLOW_FILE_UPLOAD_LIMIT=10

# CODE EXECUTION CONFIGURATION
CODE_EXECUTION_ENDPOINT=http://127.0.0.1:8194
CODE_EXECUTION_API_KEY=dify-sandbox
CODE_MAX_NUMBER=9223372036854775807
CODE_MIN_NUMBER=-9223372036854775808
CODE_MAX_STRING_LENGTH=80000
TEMPLATE_TRANSFORM_MAX_LENGTH=80000
CODE_MAX_STRING_ARRAY_LENGTH=30
CODE_MAX_OBJECT_ARRAY_LENGTH=30
CODE_MAX_NUMBER_ARRAY_LENGTH=1000

# API Tool configuration
API_TOOL_DEFAULT_CONNECT_TIMEOUT=10
API_TOOL_DEFAULT_READ_TIMEOUT=60

# HTTP Node configuration
HTTP_REQUEST_MAX_CONNECT_TIMEOUT=300
HTTP_REQUEST_MAX_READ_TIMEOUT=600
HTTP_REQUEST_MAX_WRITE_TIMEOUT=600
HTTP_REQUEST_NODE_MAX_BINARY_SIZE=10485760
HTTP_REQUEST_NODE_MAX_TEXT_SIZE=1048576
HTTP_REQUEST_NODE_SSL_VERIFY=True

# Respect X-* headers to redirect clients
RESPECT_XFORWARD_HEADERS_ENABLED=false

# Log file path
LOG_FILE=
# Log file max size, the unit is MB
LOG_FILE_MAX_SIZE=20
# Log file max backup count
LOG_FILE_BACKUP_COUNT=5
# Log dateformat
LOG_DATEFORMAT=%Y-%m-%d %H:%M:%S
# Log Timezone
LOG_TZ=UTC
# Log format
LOG_FORMAT=%(asctime)s,%(msecs)d %(levelname)-2s [%(filename)s:%(lineno)d] %(req_id)s %(message)s

# Indexing configuration
INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=4000

# Workflow runtime configuration
WORKFLOW_MAX_EXECUTION_STEPS=500
WORKFLOW_MAX_EXECUTION_TIME=1200
WORKFLOW_CALL_MAX_DEPTH=5
WORKFLOW_PARALLEL_DEPTH_LIMIT=3
MAX_VARIABLE_SIZE=204800

# Workflow storage configuration
# Options: rdbms, hybrid
# rdbms: Use only the relational database (default)
# hybrid: Save new data to object storage, read from both object storage and RDBMS
WORKFLOW_NODE_EXECUTION_STORAGE=rdbms

# Repository configuration
# Core workflow execution repository implementation
CORE_WORKFLOW_EXECUTION_REPOSITORY=core.repositories.sqlalchemy_workflow_execution_repository.SQLAlchemyWorkflowExecutionRepository

# Core workflow node execution repository implementation
CORE_WORKFLOW_NODE_EXECUTION_REPOSITORY=core.repositories.sqlalchemy_workflow_node_execution_repository.SQLAlchemyWorkflowNodeExecutionRepository

# API workflow node execution repository implementation
API_WORKFLOW_NODE_EXECUTION_REPOSITORY=repositories.sqlalchemy_api_workflow_node_execution_repository.DifyAPISQLAlchemyWorkflowNodeExecutionRepository

# API workflow run repository implementation
API_WORKFLOW_RUN_REPOSITORY=repositories.sqlalchemy_api_workflow_run_repository.DifyAPISQLAlchemyWorkflowRunRepository

# Workflow log cleanup configuration
# Enable automatic cleanup of workflow run logs to manage database size
WORKFLOW_LOG_CLEANUP_ENABLED=true
# Number of days to retain workflow run logs (default: 30 days)
WORKFLOW_LOG_RETENTION_DAYS=30
# Batch size for workflow log cleanup operations (default: 100)
WORKFLOW_LOG_CLEANUP_BATCH_SIZE=100

# App configuration
APP_MAX_EXECUTION_TIME=1200
APP_MAX_ACTIVE_REQUESTS=0

# Celery beat configuration
CELERY_BEAT_SCHEDULER_TIME=1

# Celery schedule tasks configuration
ENABLE_CLEAN_EMBEDDING_CACHE_TASK=false
ENABLE_CLEAN_UNUSED_DATASETS_TASK=false
ENABLE_CREATE_TIDB_SERVERLESS_TASK=false
ENABLE_UPDATE_TIDB_SERVERLESS_STATUS_TASK=false
ENABLE_CLEAN_MESSAGES=false
ENABLE_MAIL_CLEAN_DOCUMENT_NOTIFY_TASK=false
ENABLE_DATASETS_QUEUE_MONITOR=false
ENABLE_CHECK_UPGRADABLE_PLUGIN_TASK=true

# Position configuration
POSITION_TOOL_PINS=
POSITION_TOOL_INCLUDES=
POSITION_TOOL_EXCLUDES=

POSITION_PROVIDER_PINS=
POSITION_PROVIDER_INCLUDES=
POSITION_PROVIDER_EXCLUDES=

# Plugin configuration
PLUGIN_DAEMON_KEY=lYkiYYT6owG+71oLerGzA7GXCgOT++6ovaezWAjpCjf+Sjc3ZtU+qUEi
PLUGIN_DAEMON_URL=http://127.0.0.1:5002
PLUGIN_REMOTE_INSTALL_PORT=5003
PLUGIN_REMOTE_INSTALL_HOST=localhost
PLUGIN_MAX_PACKAGE_SIZE=15728640
INNER_API_KEY_FOR_PLUGIN=QaHbTe77CtuXmsfyhR7+vRjI/+XbV1AaFy691iy+kGDv2Jvy0/eAh8Y1

# Marketplace configuration
MARKETPLACE_ENABLED=true
MARKETPLACE_API_URL=https://marketplace.dify.ai

# Endpoint configuration
ENDPOINT_URL_TEMPLATE=http://localhost:5002/e/{hook_id}

# Reset password token expiry minutes
RESET_PASSWORD_TOKEN_EXPIRY_MINUTES=5
CHANGE_EMAIL_TOKEN_EXPIRY_MINUTES=5
OWNER_TRANSFER_TOKEN_EXPIRY_MINUTES=5

CREATE_TIDB_SERVICE_JOB_ENABLED=false

# Maximum number of submitted thread count in a ThreadPool for parallel node execution
MAX_SUBMIT_COUNT=100
# Lockout duration in seconds
LOGIN_LOCKOUT_DURATION=86400

# Enable OpenTelemetry
ENABLE_OTEL=false
OTLP_TRACE_ENDPOINT=
OTLP_METRIC_ENDPOINT=
OTLP_BASE_ENDPOINT=http://localhost:4318
OTLP_API_KEY=
OTEL_EXPORTER_OTLP_PROTOCOL=
OTEL_EXPORTER_TYPE=otlp
OTEL_SAMPLING_RATE=0.1
OTEL_BATCH_EXPORT_SCHEDULE_DELAY=5000
OTEL_MAX_QUEUE_SIZE=2048
OTEL_MAX_EXPORT_BATCH_SIZE=512
OTEL_METRIC_EXPORT_INTERVAL=60000
OTEL_BATCH_EXPORT_TIMEOUT=10000
OTEL_METRIC_EXPORT_TIMEOUT=30000

# Prevent Clickjacking
ALLOW_EMBED=false

# Dataset queue monitor configuration
QUEUE_MONITOR_THRESHOLD=200
# You can configure multiple ones, separated by commas. eg: test1@dify.ai,test2@dify.ai
QUEUE_MONITOR_ALERT_EMAILS=
# Monitor interval in minutes, default is 30 minutes
QUEUE_MONITOR_INTERVAL=30

# Swagger UI configuration
SWAGGER_UI_ENABLED=true
SWAGGER_UI_PATH=/swagger-ui.html
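The SECRET_KEY comment near the top of this file recommends `openssl rand -base64 42`. A Python-only equivalent, a minimal sketch using just the standard library (not part of this commit), would be:

import base64
import secrets

# 42 random bytes, base64-encoded -- the same amount of entropy as `openssl rand -base64 42`
print(base64.b64encode(secrets.token_bytes(42)).decode())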
@ -156,7 +156,7 @@ WEB_API_CORS_ALLOW_ORIGINS=http://localhost:3000,*
CONSOLE_CORS_ALLOW_ORIGINS=http://localhost:3000,*

# Vector database configuration
# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`, `opengauss`, `tablestore`, `vastbase`, `tidb`, `tidb_on_qdrant`, `baidu`, `lindorm`, `huawei_cloud`, `upstash`, `matrixone`.
# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`, `opengauss`, `tablestore`, `vastbase`, `tidb`, `tidb_on_qdrant`, `baidu`, `lindorm`, `huawei_cloud`, `upstash`, `matrixone`, `pinecone`.
VECTOR_STORE=weaviate
# Prefix used to create collection name in vector database
VECTOR_INDEX_NAME_PREFIX=Vector_index

@ -361,6 +361,17 @@ PROMPT_GENERATION_MAX_TOKENS=512
CODE_GENERATION_MAX_TOKENS=1024
PLUGIN_BASED_TOKEN_COUNTING_ENABLED=false


# Pinecone configuration, only available when VECTOR_STORE is `pinecone`
PINECONE_API_KEY=your-pinecone-api-key
PINECONE_ENVIRONMENT=your-pinecone-environment
PINECONE_INDEX_NAME=dify-index
PINECONE_CLIENT_TIMEOUT=30
PINECONE_BATCH_SIZE=100
PINECONE_METRIC=cosine
PINECONE_PODS=1
PINECONE_POD_TYPE=s1

# Mail configuration, supported types: resend, smtp, sendgrid
MAIL_TYPE=
# If using SendGrid, use the 'from' field for authentication if necessary.
@ -74,9 +74,13 @@ class DifyConfig(
    # **Before using, please contact business@dify.ai by email to inquire about licensing matters.**
    EnterpriseFeatureConfig,
):
    # Get the project root directory (parent of api directory)
    _project_root = Path(__file__).parent.parent.parent
    _env_file = _project_root / "api" / ".env"

    model_config = SettingsConfigDict(
        # read from dotenv format config file
        env_file=".env",
        env_file=str(_env_file),
        env_file_encoding="utf-8",
        # ignore extra attributes
        extra="ignore",
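For reference, a small sketch of how the new _env_file path resolves. The concrete module location used below is an assumption based on the parent.parent.parent chain and the "parent of api directory" comment; it is not stated in this diff:

from pathlib import Path

# Hypothetical module location: <repo>/api/configs/app_config.py
module_file = Path("/repo/api/configs/app_config.py")
project_root = module_file.parent.parent.parent  # -> /repo
env_file = project_root / "api" / ".env"         # -> /repo/api/.env
print(env_file)  # /repo/api/.env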
@ -35,6 +35,7 @@ from .vdb.opensearch_config import OpenSearchConfig
from .vdb.oracle_config import OracleConfig
from .vdb.pgvector_config import PGVectorConfig
from .vdb.pgvectors_config import PGVectoRSConfig
from .vdb.pinecone_config import PineconeConfig
from .vdb.qdrant_config import QdrantConfig
from .vdb.relyt_config import RelytConfig
from .vdb.tablestore_config import TableStoreConfig
@ -331,6 +332,7 @@ class MiddlewareConfig(
    PGVectorConfig,
    VastbaseVectorConfig,
    PGVectoRSConfig,
    PineconeConfig,
    QdrantConfig,
    RelytConfig,
    TencentVectorDBConfig,
@ -0,0 +1,41 @@
from typing import Optional

from pydantic import Field, PositiveInt
from pydantic_settings import BaseSettings


class PineconeConfig(BaseSettings):
    """
    Configuration settings for Pinecone vector database
    """

    PINECONE_API_KEY: Optional[str] = Field(
        description="API key for authenticating with Pinecone service",
        default=None,
    )

    PINECONE_ENVIRONMENT: Optional[str] = Field(
        description="Pinecone environment (e.g., 'us-west1-gcp', 'us-east-1-aws')",
        default=None,
    )

    PINECONE_INDEX_NAME: Optional[str] = Field(
        description="Default Pinecone index name",
        default=None,
    )

    PINECONE_CLIENT_TIMEOUT: PositiveInt = Field(
        description="Timeout in seconds for Pinecone client operations (default is 30 seconds)",
        default=30,
    )

    PINECONE_BATCH_SIZE: PositiveInt = Field(
        description="Batch size for Pinecone operations (default is 100)",
        default=100,
    )

    PINECONE_METRIC: str = Field(
        description="Distance metric for Pinecone index (cosine, euclidean, dotproduct)",
        default="cosine",
    )
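A minimal sketch (not part of this commit) of how a pydantic-settings class like the one above picks up the PINECONE_* variables from the environment; the trimmed class below is a copy for demonstration only, and the variable value is a placeholder:

import os
from typing import Optional

from pydantic import Field, PositiveInt
from pydantic_settings import BaseSettings


class PineconeConfigDemo(BaseSettings):
    # Trimmed copy of the settings above; field names double as environment variable names.
    PINECONE_API_KEY: Optional[str] = Field(default=None)
    PINECONE_BATCH_SIZE: PositiveInt = Field(default=100)
    PINECONE_METRIC: str = Field(default="cosine")


os.environ["PINECONE_API_KEY"] = "your-pinecone-api-key"  # normally exported via the .env file above

config = PineconeConfigDemo()
print(config.PINECONE_API_KEY)     # 'your-pinecone-api-key'
print(config.PINECONE_BATCH_SIZE)  # 100 (default, variable not set)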
@ -660,6 +660,7 @@ class DatasetRetrievalSettingApi(Resource):
                | VectorType.BAIDU
                | VectorType.VIKINGDB
                | VectorType.UPSTASH
                | VectorType.PINECONE
            ):
                return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]}
            case (
@ -0,0 +1,285 @@
import json
import time
import uuid
from typing import Any, Optional, Union

from pinecone import Pinecone, ServerlessSpec
from pydantic import BaseModel

from configs import dify_config
from core.rag.datasource.vdb.field import Field
from core.rag.datasource.vdb.vector_base import BaseVector
from core.rag.datasource.vdb.vector_factory import AbstractVectorFactory
from core.rag.datasource.vdb.vector_type import VectorType
from core.rag.embedding.embedding_base import Embeddings
from core.rag.models.document import Document
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import Dataset, DatasetCollectionBinding


class PineconeConfig(BaseModel):
    """Pinecone configuration class"""

    api_key: str
    environment: str
    index_name: Optional[str] = None
    timeout: float = 30
    batch_size: int = 100
    metric: str = "cosine"


class PineconeVector(BaseVector):
    """Pinecone vector database concrete implementation class"""

    def __init__(self, collection_name: str, group_id: str, config: PineconeConfig):
        super().__init__(collection_name)
        self._client_config = config
        self._group_id = group_id

        # Initialize Pinecone client
        self._pc = Pinecone(api_key=config.api_key)

        # Use collection_name as index name
        self._index_name = collection_name
        self._index = None

    def get_type(self) -> str:
        """Return vector database type identifier"""
        return "pinecone"

    def to_index_struct(self) -> dict:
        """Generate index structure dictionary"""
        return {
            "type": self.get_type(),
            "vector_store": {"class_prefix": self._collection_name},
        }

    def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
        """Create vector index"""
        if texts:
            # Get vector dimension
            vector_size = len(embeddings[0])

            # Create Pinecone index
            self.create_index(vector_size)

            # Add vector data
            self.add_texts(texts, embeddings, **kwargs)

    def create_index(self, dimension: int):
        """Create Pinecone index"""
        lock_name = f"vector_indexing_lock_{self._index_name}"

        with redis_client.lock(lock_name, timeout=30):
            # Check Redis cache
            index_exist_cache_key = f"vector_indexing_{self._index_name}"
            if redis_client.get(index_exist_cache_key):
                self._index = self._pc.Index(self._index_name)
                return

            # Check if index already exists
            existing_indexes = self._pc.list_indexes().names()

            if self._index_name not in existing_indexes:
                # Create new index using ServerlessSpec
                self._pc.create_index(
                    name=self._index_name,
                    dimension=dimension,
                    metric=self._client_config.metric,
                    spec=ServerlessSpec(
                        cloud='aws',
                        region=self._client_config.environment
                    )
                )

                # Wait for index creation to complete
                while not self._pc.describe_index(self._index_name).status['ready']:
                    time.sleep(1)

            # Get index instance
            self._index = self._pc.Index(self._index_name)

            # Set cache
            redis_client.set(index_exist_cache_key, 1, ex=3600)

    def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
        """Batch add document vectors"""
        if not self._index:
            raise ValueError("Index not initialized. Call create() first.")

        uuids = self._get_uuids(documents)
        batch_size = self._client_config.batch_size
        added_ids = []

        # Batch processing
        for i in range(0, len(documents), batch_size):
            batch_documents = documents[i:i + batch_size]
            batch_embeddings = embeddings[i:i + batch_size]
            batch_uuids = uuids[i:i + batch_size]

            # Build Pinecone vector data
            vectors_to_upsert = []
            for doc, embedding, doc_id in zip(batch_documents, batch_embeddings, batch_uuids):
                metadata = {
                    Field.CONTENT_KEY.value: doc.page_content,
                    Field.METADATA_KEY.value: doc.metadata or {},
                    Field.GROUP_KEY.value: self._group_id,
                }

                vectors_to_upsert.append({
                    "id": doc_id,
                    "values": embedding,
                    "metadata": metadata
                })

            # Batch insert to Pinecone
            self._index.upsert(vectors=vectors_to_upsert)
            added_ids.extend(batch_uuids)

        return added_ids

    def search_by_vector(self, query_vector: list[float], **kwargs) -> list[Document]:
        """Vector similarity search"""
        if not self._index:
            raise ValueError("Index not initialized.")

        top_k = kwargs.get("top_k", 4)
        score_threshold = float(kwargs.get("score_threshold", 0.0))

        # Build filter conditions
        filter_dict = {Field.GROUP_KEY.value: {"$eq": self._group_id}}

        # Document scope filtering
        document_ids_filter = kwargs.get("document_ids_filter")
        if document_ids_filter:
            filter_dict[f"{Field.METADATA_KEY.value}.document_id"] = {"$in": document_ids_filter}

        # Execute search
        response = self._index.query(
            vector=query_vector,
            top_k=top_k,
            include_metadata=True,
            filter=filter_dict
        )

        # Convert results
        docs = []
        for match in response.matches:
            if match.score >= score_threshold:
                metadata = match.metadata.get(Field.METADATA_KEY.value, {})
                metadata["score"] = match.score

                doc = Document(
                    page_content=match.metadata.get(Field.CONTENT_KEY.value, ""),
                    metadata=metadata,
                )
                docs.append(doc)

        # Sort by similarity score in descending order
        docs.sort(key=lambda x: x.metadata.get("score", 0), reverse=True)
        return docs

    def search_by_full_text(self, query: str, **kwargs) -> list[Document]:
        """Full-text search - Pinecone does not natively support it, returns empty list"""
        return []

    def delete_by_metadata_field(self, key: str, value: str):
        """Delete by metadata field"""
        if not self._index:
            return

        try:
            # Build filter conditions
            filter_dict = {
                Field.GROUP_KEY.value: {"$eq": self._group_id},
                f"{Field.METADATA_KEY.value}.{key}": {"$eq": value}
            }

            # Pinecone delete operation
            self._index.delete(filter=filter_dict)
        except Exception:
            # Ignore delete errors
            pass

    def delete_by_ids(self, ids: list[str]) -> None:
        """Batch delete by ID list"""
        if not self._index:
            return

        try:
            # Pinecone delete by ID
            self._index.delete(ids=ids)
        except Exception:
            # Ignore delete errors
            pass

    def delete(self) -> None:
        """Delete all vector data for the entire dataset"""
        if not self._index:
            return

        try:
            # Delete all vectors by group_id
            filter_dict = {Field.GROUP_KEY.value: {"$eq": self._group_id}}
            self._index.delete(filter=filter_dict)
        except Exception:
            # Ignore delete errors
            pass

    def text_exists(self, id: str) -> bool:
        """Check if document exists"""
        if not self._index:
            return False

        try:
            # Check if vector exists through query
            response = self._index.fetch(ids=[id])
            return id in response.vectors
        except Exception:
            return False


class PineconeVectorFactory(AbstractVectorFactory):
    """Pinecone vector database factory class"""

    def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> PineconeVector:
        """Create PineconeVector instance"""

        # Determine index name
        if dataset.collection_binding_id:
            dataset_collection_binding = (
                db.session.query(DatasetCollectionBinding)
                .where(DatasetCollectionBinding.id == dataset.collection_binding_id)
                .one_or_none()
            )
            if dataset_collection_binding:
                collection_name = dataset_collection_binding.collection_name
            else:
                raise ValueError("Dataset Collection Bindings does not exist!")
        else:
            if dataset.index_struct_dict:
                class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"]
                collection_name = class_prefix
            else:
                dataset_id = dataset.id
                collection_name = Dataset.gen_collection_name_by_id(dataset_id)

        # Set index structure
        if not dataset.index_struct_dict:
            dataset.index_struct = json.dumps(
                self.gen_index_struct_dict("pinecone", collection_name)
            )

        # Create PineconeVector instance
        return PineconeVector(
            collection_name=collection_name,
            group_id=dataset.id,
            config=PineconeConfig(
                api_key=dify_config.PINECONE_API_KEY or "",
                environment=dify_config.PINECONE_ENVIRONMENT or "",
                index_name=dify_config.PINECONE_INDEX_NAME,
                timeout=dify_config.PINECONE_CLIENT_TIMEOUT,
                batch_size=dify_config.PINECONE_BATCH_SIZE,
                metric=dify_config.PINECONE_METRIC,
            ),
        )
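A minimal usage sketch for the class above (not part of this commit). It assumes a valid Pinecone API key, a reachable Redis instance (create_index uses redis_client for locking and caching), and embeddings whose dimension matches the index; all values below are placeholders:

from core.rag.datasource.vdb.pinecone.pinecone_vector import PineconeConfig, PineconeVector
from core.rag.models.document import Document

vector = PineconeVector(
    collection_name="vector_index_demo",  # also used as the Pinecone index name (hypothetical)
    group_id="dataset-id",                # stored in metadata and used as a search filter
    config=PineconeConfig(api_key="your-pinecone-api-key", environment="us-east-1"),
)

docs = [Document(page_content="hello dify", metadata={"document_id": "doc-1"})]
embeddings = [[0.1, 0.2, 0.3, 0.4]]  # toy 4-dimensional embeddings

vector.create(docs, embeddings)  # creates the serverless index, then upserts the vectors
hits = vector.search_by_vector([0.1, 0.2, 0.3, 0.4], top_k=1, score_threshold=0.0)
print([doc.page_content for doc in hits])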
@ -86,6 +86,10 @@ class Vector:
                from core.rag.datasource.vdb.pgvecto_rs.pgvecto_rs import PGVectoRSFactory

                return PGVectoRSFactory
            case VectorType.PINECONE:
                from core.rag.datasource.vdb.pinecone.pinecone_vector import PineconeVectorFactory

                return PineconeVectorFactory
            case VectorType.QDRANT:
                from core.rag.datasource.vdb.qdrant.qdrant_vector import QdrantVectorFactory
@ -31,3 +31,4 @@ class VectorType(StrEnum):
    HUAWEI_CLOUD = "huawei_cloud"
    MATRIXONE = "matrixone"
    CLICKZETTA = "clickzetta"
    PINECONE = "pinecone"
@ -10,6 +10,23 @@ from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document


def _format_cell_value(value) -> str:
    if pd.isna(value):
        return ""

    if isinstance(value, (int, float)):
        if isinstance(value, float):
            if value.is_integer():
                return str(int(value))
            else:
                formatted = f"{value:f}"
                return formatted.rstrip('0').rstrip('.')
        else:
            return str(value)

    return str(value)


class ExcelExtractor(BaseExtractor):
    """Load Excel files.
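A short standalone demo (not part of this commit) of what the helper above changes compared with plain str(); the function body is copied verbatim so the snippet runs on its own:

import pandas as pd


def _format_cell_value(value) -> str:
    if pd.isna(value):
        return ""

    if isinstance(value, (int, float)):
        if isinstance(value, float):
            if value.is_integer():
                return str(int(value))
            else:
                formatted = f"{value:f}"
                return formatted.rstrip('0').rstrip('.')
        else:
            return str(value)

    return str(value)


print(str(1e16))                         # '1e+16'  (scientific notation from plain str())
print(_format_cell_value(1e16))          # '10000000000000000'
print(_format_cell_value(2.5))           # '2.5'
print(_format_cell_value(float("nan")))  # ''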
@ -49,10 +66,12 @@ class ExcelExtractor(BaseExtractor):
                            row=cast(int, index) + 2, column=col_index + 1
                        )  # +2 to account for header and 1-based index
                        if cell.hyperlink:
                            value = f"[{v}]({cell.hyperlink.target})"
                            formatted_v = _format_cell_value(v)
                            value = f"[{formatted_v}]({cell.hyperlink.target})"
                            page_content.append(f'"{k}":"{value}"')
                        else:
                            page_content.append(f'"{k}":"{v}"')
                            formatted_v = _format_cell_value(v)
                            page_content.append(f'"{k}":"{formatted_v}"')
                documents.append(
                    Document(page_content=";".join(page_content), metadata={"source": self._file_path})
                )
@ -67,7 +86,8 @@ class ExcelExtractor(BaseExtractor):
                page_content = []
                for k, v in row.items():
                    if pd.notna(v):
                        page_content.append(f'"{k}":"{v}"')
                        formatted_v = _format_cell_value(v)
                        page_content.append(f'"{k}":"{formatted_v}"')
                documents.append(
                    Document(page_content=";".join(page_content), metadata={"source": self._file_path})
                )
@ -485,6 +485,24 @@ def _extract_text_from_csv(file_content: bytes) -> str:
        raise TextExtractionError(f"Failed to extract text from CSV: {str(e)}") from e


def _format_cell_value_for_markdown(value) -> str:
    """Format a cell value as a string, avoiding scientific notation."""
    if pd.isna(value):
        return ""

    if isinstance(value, (int, float)):
        if isinstance(value, float):
            if value.is_integer():
                return str(int(value))
            else:
                formatted = f"{value:f}"
                return formatted.rstrip('0').rstrip('.')
        else:
            return str(value)

    return str(value)


def _extract_text_from_excel(file_content: bytes) -> str:
    """Extract text from an Excel file using pandas."""
@ -499,7 +517,8 @@ def _extract_text_from_excel(file_content: bytes) -> str:
            # Construct the data rows
            data_rows = []
            for _, row in df.iterrows():
                data_row = "| " + " | ".join(map(str, row)) + " |"
                formatted_row = [_format_cell_value_for_markdown(cell) for cell in row]
                data_row = "| " + " | ".join(formatted_row) + " |"
                data_rows.append(data_row)

            # Combine all rows into a single string
@ -88,6 +88,7 @@ dependencies = [
    "httpx-sse>=0.4.0",
    "sendgrid~=6.12.3",
    "flask-restx>=1.3.0",
    "pinecone>=7.3.0",
]
# Before adding a new dependency, consider placing it in
# alphabetical order (a-z) and in a suitable group.
@ -0,0 +1,30 @@
from core.rag.datasource.vdb.pinecone.pinecone_vector import PineconeConfig, PineconeVector
from core.rag.models.document import Document
from tests.integration_tests.vdb.test_vector_store import (
    AbstractVectorTest,
    setup_mock_redis,
)


class PineconeVectorTest(AbstractVectorTest):
    def __init__(self):
        super().__init__()
        self.attributes = ["doc_id", "dataset_id", "document_id", "doc_hash"]
        self.vector = PineconeVector(
            collection_name=self.collection_name,
            group_id=self.dataset_id,
            config=PineconeConfig(
                api_key="test_api_key",
                environment="test_environment",
                index_name="test_index",
            ),
        )

    def search_by_vector(self):
        super().search_by_vector()


def test_pinecone_vector(setup_mock_redis):
    PineconeVectorTest().run_all_tests()
4290 api/uv.lock
File diff suppressed because it is too large