This commit is contained in:
Frederick2313072 2025-09-16 08:57:46 +08:00
parent 41dfdf1ac0
commit 90fc5a1f12
16 changed files with 3171 additions and 2131 deletions

581
api/.env.bak Normal file
View File

@ -0,0 +1,581 @@
# Your App secret key will be used for securely signing the session cookie
# Make sure you are changing this key for your deployment with a strong key.
# You can generate a strong key using `openssl rand -base64 42`.
# Alternatively you can set it with `SECRET_KEY` environment variable.
SECRET_KEY=9k4es1n7e13xdGP+Rs+DmRR2gTvSdZL9KIc4Bgah+uGLEcUgmf5+FUUc
# Ensure UTF-8 encoding
LANG=en_US.UTF-8
LC_ALL=en_US.UTF-8
PYTHONIOENCODING=utf-8
# Console API base URL
CONSOLE_API_URL=http://localhost:5001
CONSOLE_WEB_URL=http://localhost:3000
# Service API base URL
SERVICE_API_URL=http://localhost:5001
# Web APP base URL
APP_WEB_URL=http://localhost:3000
# Files URL
FILES_URL=http://localhost:5001
# INTERNAL_FILES_URL is used for plugin daemon communication within Docker network.
# Set this to the internal Docker service URL for proper plugin file access.
# Example: INTERNAL_FILES_URL=http://api:5001
INTERNAL_FILES_URL=http://127.0.0.1:5001
# The time in seconds after which the signature is rejected
FILES_ACCESS_TIMEOUT=300
# Access token expiration time in minutes
ACCESS_TOKEN_EXPIRE_MINUTES=60
# Refresh token expiration time in days
REFRESH_TOKEN_EXPIRE_DAYS=30
# redis configuration
REDIS_HOST=localhost
REDIS_PORT=6379
REDIS_USERNAME=
REDIS_PASSWORD=difyai123456
REDIS_USE_SSL=false
# SSL configuration for Redis (when REDIS_USE_SSL=true)
REDIS_SSL_CERT_REQS=CERT_NONE
# Options: CERT_NONE, CERT_OPTIONAL, CERT_REQUIRED
REDIS_SSL_CA_CERTS=
# Path to CA certificate file for SSL verification
REDIS_SSL_CERTFILE=
# Path to client certificate file for SSL authentication
REDIS_SSL_KEYFILE=
# Path to client private key file for SSL authentication
REDIS_DB=0
# redis Sentinel configuration.
REDIS_USE_SENTINEL=false
REDIS_SENTINELS=
REDIS_SENTINEL_SERVICE_NAME=
REDIS_SENTINEL_USERNAME=
REDIS_SENTINEL_PASSWORD=
REDIS_SENTINEL_SOCKET_TIMEOUT=0.1
# redis Cluster configuration.
REDIS_USE_CLUSTERS=false
REDIS_CLUSTERS=
REDIS_CLUSTERS_PASSWORD=
# celery configuration
CELERY_BROKER_URL=redis://:difyai123456@localhost:${REDIS_PORT}/1
CELERY_BACKEND=redis
# PostgreSQL database configuration
DB_USERNAME=postgres
DB_PASSWORD=difyai123456
DB_HOST=localhost
DB_PORT=5432
DB_DATABASE=dify
# Storage configuration
# Used to store uploaded files, private keys, etc.
# storage type: opendal, s3, aliyun-oss, azure-blob, baidu-obs, google-storage, huawei-obs, oci-storage, tencent-cos, volcengine-tos, supabase
STORAGE_TYPE=opendal
# Apache OpenDAL storage configuration, refer to https://github.com/apache/opendal
OPENDAL_SCHEME=fs
OPENDAL_FS_ROOT=storage
# S3 Storage configuration
S3_USE_AWS_MANAGED_IAM=false
S3_ENDPOINT=https://your-bucket-name.storage.s3.cloudflare.com
S3_BUCKET_NAME=your-bucket-name
S3_ACCESS_KEY=your-access-key
S3_SECRET_KEY=your-secret-key
S3_REGION=your-region
# Azure Blob Storage configuration
AZURE_BLOB_ACCOUNT_NAME=your-account-name
AZURE_BLOB_ACCOUNT_KEY=your-account-key
AZURE_BLOB_CONTAINER_NAME=your-container-name
AZURE_BLOB_ACCOUNT_URL=https://<your_account_name>.blob.core.windows.net
# Aliyun oss Storage configuration
ALIYUN_OSS_BUCKET_NAME=your-bucket-name
ALIYUN_OSS_ACCESS_KEY=your-access-key
ALIYUN_OSS_SECRET_KEY=your-secret-key
ALIYUN_OSS_ENDPOINT=your-endpoint
ALIYUN_OSS_AUTH_VERSION=v1
ALIYUN_OSS_REGION=your-region
# Don't start with '/'. OSS doesn't support leading slash in object names.
ALIYUN_OSS_PATH=your-path
# Google Storage configuration
GOOGLE_STORAGE_BUCKET_NAME=your-bucket-name
GOOGLE_STORAGE_SERVICE_ACCOUNT_JSON_BASE64=your-google-service-account-json-base64-string
# Tencent COS Storage configuration
TENCENT_COS_BUCKET_NAME=your-bucket-name
TENCENT_COS_SECRET_KEY=your-secret-key
TENCENT_COS_SECRET_ID=your-secret-id
TENCENT_COS_REGION=your-region
TENCENT_COS_SCHEME=your-scheme
# Huawei OBS Storage Configuration
HUAWEI_OBS_BUCKET_NAME=your-bucket-name
HUAWEI_OBS_SECRET_KEY=your-secret-key
HUAWEI_OBS_ACCESS_KEY=your-access-key
HUAWEI_OBS_SERVER=your-server-url
# Baidu OBS Storage Configuration
BAIDU_OBS_BUCKET_NAME=your-bucket-name
BAIDU_OBS_SECRET_KEY=your-secret-key
BAIDU_OBS_ACCESS_KEY=your-access-key
BAIDU_OBS_ENDPOINT=your-server-url
# OCI Storage configuration
OCI_ENDPOINT=your-endpoint
OCI_BUCKET_NAME=your-bucket-name
OCI_ACCESS_KEY=your-access-key
OCI_SECRET_KEY=your-secret-key
OCI_REGION=your-region
# Volcengine tos Storage configuration
VOLCENGINE_TOS_ENDPOINT=your-endpoint
VOLCENGINE_TOS_BUCKET_NAME=your-bucket-name
VOLCENGINE_TOS_ACCESS_KEY=your-access-key
VOLCENGINE_TOS_SECRET_KEY=your-secret-key
VOLCENGINE_TOS_REGION=your-region
# Supabase Storage Configuration
SUPABASE_BUCKET_NAME=your-bucket-name
SUPABASE_API_KEY=your-access-key
SUPABASE_URL=your-server-url
# CORS configuration
WEB_API_CORS_ALLOW_ORIGINS=http://localhost:3000,*
CONSOLE_CORS_ALLOW_ORIGINS=http://localhost:3000,*
# Vector database configuration
# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`, `opengauss`, `tablestore`,`vastbase`,`tidb`,`tidb_on_qdrant`,`baidu`,`lindorm`,`huawei_cloud`,`upstash`, `matrixone`, `pinecone`.
VECTOR_STORE=weaviate
# Prefix used to create collection name in vector database
VECTOR_INDEX_NAME_PREFIX=Vector_index
# Weaviate configuration
WEAVIATE_ENDPOINT=http://localhost:8080
WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih
WEAVIATE_GRPC_ENABLED=false
WEAVIATE_BATCH_SIZE=100
# Qdrant configuration, use `http://localhost:6333` for local mode or `https://your-qdrant-cluster-url.qdrant.io` for remote mode
QDRANT_URL=http://localhost:6333
QDRANT_API_KEY=difyai123456
QDRANT_CLIENT_TIMEOUT=20
QDRANT_GRPC_ENABLED=false
QDRANT_GRPC_PORT=6334
QDRANT_REPLICATION_FACTOR=1
#Couchbase configuration
COUCHBASE_CONNECTION_STRING=127.0.0.1
COUCHBASE_USER=Administrator
COUCHBASE_PASSWORD=password
COUCHBASE_BUCKET_NAME=Embeddings
COUCHBASE_SCOPE_NAME=_default
# Milvus configuration
MILVUS_URI=http://127.0.0.1:19530
MILVUS_TOKEN=
MILVUS_USER=root
MILVUS_PASSWORD=Milvus
MILVUS_ANALYZER_PARAMS=
# MyScale configuration
MYSCALE_HOST=127.0.0.1
MYSCALE_PORT=8123
MYSCALE_USER=default
MYSCALE_PASSWORD=
MYSCALE_DATABASE=default
MYSCALE_FTS_PARAMS=
# Relyt configuration
RELYT_HOST=127.0.0.1
RELYT_PORT=5432
RELYT_USER=postgres
RELYT_PASSWORD=postgres
RELYT_DATABASE=postgres
# Tencent configuration
TENCENT_VECTOR_DB_URL=http://127.0.0.1
TENCENT_VECTOR_DB_API_KEY=dify
TENCENT_VECTOR_DB_TIMEOUT=30
TENCENT_VECTOR_DB_USERNAME=dify
TENCENT_VECTOR_DB_DATABASE=dify
TENCENT_VECTOR_DB_SHARD=1
TENCENT_VECTOR_DB_REPLICAS=2
TENCENT_VECTOR_DB_ENABLE_HYBRID_SEARCH=false
# ElasticSearch configuration
ELASTICSEARCH_HOST=127.0.0.1
ELASTICSEARCH_PORT=9200
ELASTICSEARCH_USERNAME=elastic
ELASTICSEARCH_PASSWORD=elastic
# PGVECTO_RS configuration
PGVECTO_RS_HOST=localhost
PGVECTO_RS_PORT=5431
PGVECTO_RS_USER=postgres
PGVECTO_RS_PASSWORD=difyai123456
PGVECTO_RS_DATABASE=postgres
# PGVector configuration
PGVECTOR_HOST=127.0.0.1
PGVECTOR_PORT=5433
PGVECTOR_USER=postgres
PGVECTOR_PASSWORD=postgres
PGVECTOR_DATABASE=postgres
PGVECTOR_MIN_CONNECTION=1
PGVECTOR_MAX_CONNECTION=5
# TableStore Vector configuration
TABLESTORE_ENDPOINT=https://instance-name.cn-hangzhou.ots.aliyuncs.com
TABLESTORE_INSTANCE_NAME=instance-name
TABLESTORE_ACCESS_KEY_ID=xxx
TABLESTORE_ACCESS_KEY_SECRET=xxx
TABLESTORE_NORMALIZE_FULLTEXT_BM25_SCORE=false
# Tidb Vector configuration
TIDB_VECTOR_HOST=xxx.eu-central-1.xxx.aws.tidbcloud.com
TIDB_VECTOR_PORT=4000
TIDB_VECTOR_USER=xxx.root
TIDB_VECTOR_PASSWORD=xxxxxx
TIDB_VECTOR_DATABASE=dify
# Tidb on qdrant configuration
TIDB_ON_QDRANT_URL=http://127.0.0.1
TIDB_ON_QDRANT_API_KEY=dify
TIDB_ON_QDRANT_CLIENT_TIMEOUT=20
TIDB_ON_QDRANT_GRPC_ENABLED=false
TIDB_ON_QDRANT_GRPC_PORT=6334
TIDB_PUBLIC_KEY=dify
TIDB_PRIVATE_KEY=dify
TIDB_API_URL=http://127.0.0.1
TIDB_IAM_API_URL=http://127.0.0.1
TIDB_REGION=regions/aws-us-east-1
TIDB_PROJECT_ID=dify
TIDB_SPEND_LIMIT=100
# Chroma configuration
CHROMA_HOST=127.0.0.1
CHROMA_PORT=8000
CHROMA_TENANT=default_tenant
CHROMA_DATABASE=default_database
CHROMA_AUTH_PROVIDER=chromadb.auth.token_authn.TokenAuthenticationServerProvider
CHROMA_AUTH_CREDENTIALS=difyai123456
# AnalyticDB configuration
ANALYTICDB_KEY_ID=your-ak
ANALYTICDB_KEY_SECRET=your-sk
ANALYTICDB_REGION_ID=cn-hangzhou
ANALYTICDB_INSTANCE_ID=gp-ab123456
ANALYTICDB_ACCOUNT=testaccount
ANALYTICDB_PASSWORD=testpassword
ANALYTICDB_NAMESPACE=dify
ANALYTICDB_NAMESPACE_PASSWORD=difypassword
ANALYTICDB_HOST=gp-test.aliyuncs.com
ANALYTICDB_PORT=5432
ANALYTICDB_MIN_CONNECTION=1
ANALYTICDB_MAX_CONNECTION=5
# OpenSearch configuration
OPENSEARCH_HOST=127.0.0.1
OPENSEARCH_PORT=9200
OPENSEARCH_USER=admin
OPENSEARCH_PASSWORD=admin
OPENSEARCH_SECURE=true
OPENSEARCH_VERIFY_CERTS=true
# Baidu configuration
BAIDU_VECTOR_DB_ENDPOINT=http://127.0.0.1:5287
BAIDU_VECTOR_DB_CONNECTION_TIMEOUT_MS=30000
BAIDU_VECTOR_DB_ACCOUNT=root
BAIDU_VECTOR_DB_API_KEY=dify
BAIDU_VECTOR_DB_DATABASE=dify
BAIDU_VECTOR_DB_SHARD=1
BAIDU_VECTOR_DB_REPLICAS=3
# Upstash configuration
UPSTASH_VECTOR_URL=your-server-url
UPSTASH_VECTOR_TOKEN=your-access-token
# ViKingDB configuration
VIKINGDB_ACCESS_KEY=your-ak
VIKINGDB_SECRET_KEY=your-sk
VIKINGDB_REGION=cn-shanghai
VIKINGDB_HOST=api-vikingdb.xxx.volces.com
VIKINGDB_SCHEMA=http
VIKINGDB_CONNECTION_TIMEOUT=30
VIKINGDB_SOCKET_TIMEOUT=30
# Matrixone configuration
MATRIXONE_HOST=127.0.0.1
MATRIXONE_PORT=6001
MATRIXONE_USER=dump
MATRIXONE_PASSWORD=111
MATRIXONE_DATABASE=dify
# Lindorm configuration
LINDORM_URL=http://ld-*******************-proxy-search-pub.lindorm.aliyuncs.com:30070
LINDORM_USERNAME=admin
LINDORM_PASSWORD=admin
USING_UGC_INDEX=False
LINDORM_QUERY_TIMEOUT=1
# OceanBase Vector configuration
OCEANBASE_VECTOR_HOST=127.0.0.1
OCEANBASE_VECTOR_PORT=2881
OCEANBASE_VECTOR_USER=root@test
OCEANBASE_VECTOR_PASSWORD=difyai123456
OCEANBASE_VECTOR_DATABASE=test
OCEANBASE_MEMORY_LIMIT=6G
OCEANBASE_ENABLE_HYBRID_SEARCH=false
# openGauss configuration
OPENGAUSS_HOST=127.0.0.1
OPENGAUSS_PORT=6600
OPENGAUSS_USER=postgres
OPENGAUSS_PASSWORD=Dify@123
OPENGAUSS_DATABASE=dify
OPENGAUSS_MIN_CONNECTION=1
OPENGAUSS_MAX_CONNECTION=5
# Upload configuration
UPLOAD_FILE_SIZE_LIMIT=15
UPLOAD_FILE_BATCH_LIMIT=5
UPLOAD_IMAGE_FILE_SIZE_LIMIT=10
UPLOAD_VIDEO_FILE_SIZE_LIMIT=100
UPLOAD_AUDIO_FILE_SIZE_LIMIT=50
# Model configuration
MULTIMODAL_SEND_FORMAT=base64
PROMPT_GENERATION_MAX_TOKENS=512
CODE_GENERATION_MAX_TOKENS=1024
PLUGIN_BASED_TOKEN_COUNTING_ENABLED=false
# Pinecone configuration, only available when VECTOR_STORE is `pinecone`
PINECONE_API_KEY=your-pinecone-api-key
PINECONE_ENVIRONMENT=your-pinecone-environment
PINECONE_INDEX_NAME=dify-index
PINECONE_CLIENT_TIMEOUT=30
PINECONE_BATCH_SIZE=100
PINECONE_METRIC=cosine
PINECONE_PODS=1
PINECONE_POD_TYPE=s1
# Mail configuration, support: resend, smtp, sendgrid
MAIL_TYPE=
# If using SendGrid, use the 'from' field for authentication if necessary.
MAIL_DEFAULT_SEND_FROM=no-reply <no-reply@dify.ai>
# resend configuration
RESEND_API_KEY=
RESEND_API_URL=https://api.resend.com
# smtp configuration
SMTP_SERVER=smtp.gmail.com
SMTP_PORT=465
SMTP_USERNAME=123
SMTP_PASSWORD=abc
SMTP_USE_TLS=true
SMTP_OPPORTUNISTIC_TLS=false
# SendGrid configuration
SENDGRID_API_KEY=
# Sentry configuration
SENTRY_DSN=
# DEBUG
DEBUG=false
ENABLE_REQUEST_LOGGING=False
SQLALCHEMY_ECHO=false
# Notion import configuration, support public and internal
NOTION_INTEGRATION_TYPE=public
NOTION_CLIENT_SECRET=you-client-secret
NOTION_CLIENT_ID=you-client-id
NOTION_INTERNAL_SECRET=you-internal-secret
ETL_TYPE=dify
UNSTRUCTURED_API_URL=
UNSTRUCTURED_API_KEY=
SCARF_NO_ANALYTICS=true
#ssrf
SSRF_PROXY_HTTP_URL=
SSRF_PROXY_HTTPS_URL=
SSRF_DEFAULT_MAX_RETRIES=3
SSRF_DEFAULT_TIME_OUT=5
SSRF_DEFAULT_CONNECT_TIME_OUT=5
SSRF_DEFAULT_READ_TIME_OUT=5
SSRF_DEFAULT_WRITE_TIME_OUT=5
BATCH_UPLOAD_LIMIT=10
KEYWORD_DATA_SOURCE_TYPE=database
# Workflow file upload limit
WORKFLOW_FILE_UPLOAD_LIMIT=10
# CODE EXECUTION CONFIGURATION
CODE_EXECUTION_ENDPOINT=http://127.0.0.1:8194
CODE_EXECUTION_API_KEY=dify-sandbox
CODE_MAX_NUMBER=9223372036854775807
CODE_MIN_NUMBER=-9223372036854775808
CODE_MAX_STRING_LENGTH=80000
TEMPLATE_TRANSFORM_MAX_LENGTH=80000
CODE_MAX_STRING_ARRAY_LENGTH=30
CODE_MAX_OBJECT_ARRAY_LENGTH=30
CODE_MAX_NUMBER_ARRAY_LENGTH=1000
# API Tool configuration
API_TOOL_DEFAULT_CONNECT_TIMEOUT=10
API_TOOL_DEFAULT_READ_TIMEOUT=60
# HTTP Node configuration
HTTP_REQUEST_MAX_CONNECT_TIMEOUT=300
HTTP_REQUEST_MAX_READ_TIMEOUT=600
HTTP_REQUEST_MAX_WRITE_TIMEOUT=600
HTTP_REQUEST_NODE_MAX_BINARY_SIZE=10485760
HTTP_REQUEST_NODE_MAX_TEXT_SIZE=1048576
HTTP_REQUEST_NODE_SSL_VERIFY=True
# Respect X-* headers to redirect clients
RESPECT_XFORWARD_HEADERS_ENABLED=false
# Log file path
LOG_FILE=
# Log file max size, the unit is MB
LOG_FILE_MAX_SIZE=20
# Log file max backup count
LOG_FILE_BACKUP_COUNT=5
# Log dateformat
LOG_DATEFORMAT=%Y-%m-%d %H:%M:%S
# Log Timezone
LOG_TZ=UTC
# Log format
LOG_FORMAT=%(asctime)s,%(msecs)d %(levelname)-2s [%(filename)s:%(lineno)d] %(req_id)s %(message)s
# Indexing configuration
INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=4000
# Workflow runtime configuration
WORKFLOW_MAX_EXECUTION_STEPS=500
WORKFLOW_MAX_EXECUTION_TIME=1200
WORKFLOW_CALL_MAX_DEPTH=5
WORKFLOW_PARALLEL_DEPTH_LIMIT=3
MAX_VARIABLE_SIZE=204800
# Workflow storage configuration
# Options: rdbms, hybrid
# rdbms: Use only the relational database (default)
# hybrid: Save new data to object storage, read from both object storage and RDBMS
WORKFLOW_NODE_EXECUTION_STORAGE=rdbms
# Repository configuration
# Core workflow execution repository implementation
CORE_WORKFLOW_EXECUTION_REPOSITORY=core.repositories.sqlalchemy_workflow_execution_repository.SQLAlchemyWorkflowExecutionRepository
# Core workflow node execution repository implementation
CORE_WORKFLOW_NODE_EXECUTION_REPOSITORY=core.repositories.sqlalchemy_workflow_node_execution_repository.SQLAlchemyWorkflowNodeExecutionRepository
# API workflow node execution repository implementation
API_WORKFLOW_NODE_EXECUTION_REPOSITORY=repositories.sqlalchemy_api_workflow_node_execution_repository.DifyAPISQLAlchemyWorkflowNodeExecutionRepository
# API workflow run repository implementation
API_WORKFLOW_RUN_REPOSITORY=repositories.sqlalchemy_api_workflow_run_repository.DifyAPISQLAlchemyWorkflowRunRepository
# Workflow log cleanup configuration
# Enable automatic cleanup of workflow run logs to manage database size
WORKFLOW_LOG_CLEANUP_ENABLED=true
# Number of days to retain workflow run logs (default: 30 days)
WORKFLOW_LOG_RETENTION_DAYS=30
# Batch size for workflow log cleanup operations (default: 100)
WORKFLOW_LOG_CLEANUP_BATCH_SIZE=100
# App configuration
APP_MAX_EXECUTION_TIME=1200
APP_MAX_ACTIVE_REQUESTS=0
# Celery beat configuration
CELERY_BEAT_SCHEDULER_TIME=1
# Celery schedule tasks configuration
ENABLE_CLEAN_EMBEDDING_CACHE_TASK=false
ENABLE_CLEAN_UNUSED_DATASETS_TASK=false
ENABLE_CREATE_TIDB_SERVERLESS_TASK=false
ENABLE_UPDATE_TIDB_SERVERLESS_STATUS_TASK=false
ENABLE_CLEAN_MESSAGES=false
ENABLE_MAIL_CLEAN_DOCUMENT_NOTIFY_TASK=false
ENABLE_DATASETS_QUEUE_MONITOR=false
ENABLE_CHECK_UPGRADABLE_PLUGIN_TASK=true
# Position configuration
POSITION_TOOL_PINS=
POSITION_TOOL_INCLUDES=
POSITION_TOOL_EXCLUDES=
POSITION_PROVIDER_PINS=
POSITION_PROVIDER_INCLUDES=
POSITION_PROVIDER_EXCLUDES=
# Plugin configuration
PLUGIN_DAEMON_KEY=lYkiYYT6owG+71oLerGzA7GXCgOT++6ovaezWAjpCjf+Sjc3ZtU+qUEi
PLUGIN_DAEMON_URL=http://127.0.0.1:5002
PLUGIN_REMOTE_INSTALL_PORT=5003
PLUGIN_REMOTE_INSTALL_HOST=localhost
PLUGIN_MAX_PACKAGE_SIZE=15728640
INNER_API_KEY_FOR_PLUGIN=QaHbTe77CtuXmsfyhR7+vRjI/+XbV1AaFy691iy+kGDv2Jvy0/eAh8Y1
# Marketplace configuration
MARKETPLACE_ENABLED=true
MARKETPLACE_API_URL=https://marketplace.dify.ai
# Endpoint configuration
ENDPOINT_URL_TEMPLATE=http://localhost:5002/e/{hook_id}
# Reset password token expiry minutes
RESET_PASSWORD_TOKEN_EXPIRY_MINUTES=5
CHANGE_EMAIL_TOKEN_EXPIRY_MINUTES=5
OWNER_TRANSFER_TOKEN_EXPIRY_MINUTES=5
CREATE_TIDB_SERVICE_JOB_ENABLED=false
# Maximum number of submitted thread count in a ThreadPool for parallel node execution
MAX_SUBMIT_COUNT=100
# Lockout duration in seconds
LOGIN_LOCKOUT_DURATION=86400
# Enable OpenTelemetry
ENABLE_OTEL=false
OTLP_TRACE_ENDPOINT=
OTLP_METRIC_ENDPOINT=
OTLP_BASE_ENDPOINT=http://localhost:4318
OTLP_API_KEY=
OTEL_EXPORTER_OTLP_PROTOCOL=
OTEL_EXPORTER_TYPE=otlp
OTEL_SAMPLING_RATE=0.1
OTEL_BATCH_EXPORT_SCHEDULE_DELAY=5000
OTEL_MAX_QUEUE_SIZE=2048
OTEL_MAX_EXPORT_BATCH_SIZE=512
OTEL_METRIC_EXPORT_INTERVAL=60000
OTEL_BATCH_EXPORT_TIMEOUT=10000
OTEL_METRIC_EXPORT_TIMEOUT=30000
# Prevent Clickjacking
ALLOW_EMBED=false
# Dataset queue monitor configuration
QUEUE_MONITOR_THRESHOLD=200
# You can configure multiple addresses, separated by commas, e.g.: test1@dify.ai,test2@dify.ai
QUEUE_MONITOR_ALERT_EMAILS=
# Monitor interval in minutes, default is 30 minutes
QUEUE_MONITOR_INTERVAL=30
# Swagger UI configuration
SWAGGER_UI_ENABLED=true
SWAGGER_UI_PATH=/swagger-ui.html

View File

@ -156,7 +156,7 @@ WEB_API_CORS_ALLOW_ORIGINS=http://localhost:3000,*
CONSOLE_CORS_ALLOW_ORIGINS=http://localhost:3000,*
# Vector database configuration
# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`, `opengauss`, `tablestore`,`vastbase`,`tidb`,`tidb_on_qdrant`,`baidu`,`lindorm`,`huawei_cloud`,`upstash`, `matrixone`.
# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`, `opengauss`, `tablestore`,`vastbase`,`tidb`,`tidb_on_qdrant`,`baidu`,`lindorm`,`huawei_cloud`,`upstash`, `matrixone`, `pinecone`.
VECTOR_STORE=weaviate
# Prefix used to create collection name in vector database
VECTOR_INDEX_NAME_PREFIX=Vector_index
@ -361,6 +361,17 @@ PROMPT_GENERATION_MAX_TOKENS=512
CODE_GENERATION_MAX_TOKENS=1024
PLUGIN_BASED_TOKEN_COUNTING_ENABLED=false
# Pinecone configuration, only available when VECTOR_STORE is `pinecone`
PINECONE_API_KEY=your-pinecone-api-key
PINECONE_ENVIRONMENT=your-pinecone-environment
PINECONE_INDEX_NAME=dify-index
PINECONE_CLIENT_TIMEOUT=30
PINECONE_BATCH_SIZE=100
PINECONE_METRIC=cosine
PINECONE_PODS=1
PINECONE_POD_TYPE=s1
# Mail configuration, support: resend, smtp, sendgrid
MAIL_TYPE=
# If using SendGrid, use the 'from' field for authentication if necessary.

View File

@ -74,9 +74,13 @@ class DifyConfig(
# **Before using, please contact business@dify.ai by email to inquire about licensing matters.**
EnterpriseFeatureConfig,
):
# Get the project root directory (parent of api directory)
_project_root = Path(__file__).parent.parent.parent
_env_file = _project_root / "api" / ".env"
model_config = SettingsConfigDict(
# read from dotenv format config file
env_file=".env",
env_file=str(_env_file),
env_file_encoding="utf-8",
# ignore extra attributes
extra="ignore",

View File

@ -35,6 +35,7 @@ from .vdb.opensearch_config import OpenSearchConfig
from .vdb.oracle_config import OracleConfig
from .vdb.pgvector_config import PGVectorConfig
from .vdb.pgvectors_config import PGVectoRSConfig
from .vdb.pinecone_config import PineconeConfig
from .vdb.qdrant_config import QdrantConfig
from .vdb.relyt_config import RelytConfig
from .vdb.tablestore_config import TableStoreConfig
@ -331,6 +332,7 @@ class MiddlewareConfig(
PGVectorConfig,
VastbaseVectorConfig,
PGVectoRSConfig,
PineconeConfig,
QdrantConfig,
RelytConfig,
TencentVectorDBConfig,

View File

@ -0,0 +1,41 @@
from typing import Optional
from pydantic import Field, PositiveInt
from pydantic_settings import BaseSettings
class PineconeConfig(BaseSettings):
    """
    Settings for the Pinecone vector database.

    Each attribute is populated from the setting of the same name; the three
    connection values are optional and default to ``None`` when unset.
    """

    # --- connection -------------------------------------------------------
    PINECONE_API_KEY: Optional[str] = Field(
        default=None,
        description="API key for authenticating with Pinecone service",
    )

    PINECONE_ENVIRONMENT: Optional[str] = Field(
        default=None,
        description="Pinecone environment (e.g., 'us-west1-gcp', 'us-east-1-aws')",
    )

    PINECONE_INDEX_NAME: Optional[str] = Field(
        default=None,
        description="Default Pinecone index name",
    )

    # --- client tuning (must be strictly positive integers) ---------------
    PINECONE_CLIENT_TIMEOUT: PositiveInt = Field(
        default=30,
        description="Timeout in seconds for Pinecone client operations (default is 30 seconds)",
    )

    PINECONE_BATCH_SIZE: PositiveInt = Field(
        default=100,
        description="Batch size for Pinecone operations (default is 100)",
    )

    # --- index behaviour --------------------------------------------------
    PINECONE_METRIC: str = Field(
        default="cosine",
        description="Distance metric for Pinecone index (cosine, euclidean, dotproduct)",
    )

View File

@ -660,6 +660,7 @@ class DatasetRetrievalSettingApi(Resource):
| VectorType.BAIDU
| VectorType.VIKINGDB
| VectorType.UPSTASH
| VectorType.PINECONE
):
return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]}
case (

View File

@ -0,0 +1,285 @@
import json
import logging
import time
import uuid
from typing import Any, Optional, Union

from pinecone import Pinecone, ServerlessSpec
from pydantic import BaseModel

from configs import dify_config
from core.rag.datasource.vdb.field import Field
from core.rag.datasource.vdb.vector_base import BaseVector
from core.rag.datasource.vdb.vector_factory import AbstractVectorFactory
from core.rag.datasource.vdb.vector_type import VectorType
from core.rag.embedding.embedding_base import Embeddings
from core.rag.models.document import Document
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import Dataset, DatasetCollectionBinding
class PineconeConfig(BaseModel):
    """Connection and tuning parameters handed to ``PineconeVector``.

    This is the per-instance counterpart of the application-level Pinecone
    settings; the factory in this module fills it from ``dify_config``.
    """

    # API key passed to the Pinecone client constructor.
    api_key: str
    # Passed as the ``region`` of the serverless spec when an index is created.
    environment: str
    # NOTE(review): not read by PineconeVector, which names the index after
    # its collection name — confirm whether this field is still needed.
    index_name: Optional[str] = None
    # NOTE(review): not referenced by the vector store code in this module;
    # presumably a client timeout in seconds — confirm intended use.
    timeout: float = 30
    # Number of vectors sent per upsert request.
    batch_size: int = 100
    # Distance metric used when a new index is created.
    metric: str = "cosine"
logger = logging.getLogger(__name__)


class PineconeVector(BaseVector):
    """Pinecone-backed vector store for one collection.

    The Pinecone index is named after ``collection_name``. Every vector is
    tagged with ``group_id`` so that queries and deletions can be restricted
    to that group.

    Vector metadata is stored flat (document metadata keys side by side with
    the content and group keys) because Pinecone does not accept nested
    objects as metadata values.

    NOTE(review): Pinecone restricts index names (lowercase alphanumerics and
    hyphens, bounded length). The collection name is used unchanged, so
    confirm that the names produced by the caller satisfy that rule.
    """

    # Upper bound, in seconds, on waiting for a newly created index to report
    # ready; without it a stalled creation would block the worker forever.
    _INDEX_READY_TIMEOUT = 60
    # The Redis lock has to outlive the readiness wait so that it does not
    # expire while the index is still being created.
    _INDEX_LOCK_TIMEOUT = _INDEX_READY_TIMEOUT + 30

    def __init__(self, collection_name: str, group_id: str, config: PineconeConfig):
        """Create the Pinecone client; the index handle is resolved lazily.

        Args:
            collection_name: Name of the collection, also used as index name.
            group_id: Value stored under the group key of every vector.
            config: Connection and tuning parameters.
        """
        super().__init__(collection_name)
        self._client_config = config
        self._group_id = group_id
        self._pc = Pinecone(api_key=config.api_key)
        self._index_name = collection_name
        # Populated by create_index() or, for an existing index, on first use.
        self._index = None

    def get_type(self) -> str:
        """Return the vector database type identifier."""
        return VectorType.PINECONE

    def to_index_struct(self) -> dict:
        """Return the index structure dictionary persisted with the dataset."""
        return {
            "type": self.get_type(),
            "vector_store": {"class_prefix": self._collection_name},
        }

    def _get_index(self) -> Optional[Any]:
        """Return the index handle, connecting on first use.

        A new instance is built for every request, so the handle cannot be
        assumed to have been set by an earlier ``create()`` call.

        Returns:
            The index handle, or ``None`` when the index does not exist yet.
        """
        if self._index is None and self._index_name in self._pc.list_indexes().names():
            self._index = self._pc.Index(self._index_name)
        return self._index

    def _build_metadata(self, doc: Document) -> dict:
        """Flatten a document into the flat metadata mapping Pinecone accepts."""
        flat: dict = {}
        for key, value in (doc.metadata or {}).items():
            if value is None:
                # Null metadata values are rejected by Pinecone.
                continue
            if isinstance(value, (str, int, float, bool)):
                flat[key] = value
            elif isinstance(value, list) and all(isinstance(item, str) for item in value):
                flat[key] = value
            else:
                # Unsupported value type: keep the information as a JSON string.
                flat[key] = json.dumps(value, default=str)
        flat[Field.CONTENT_KEY.value] = doc.page_content
        flat[Field.GROUP_KEY.value] = self._group_id
        return flat

    def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
        """Create the index (sized from the first embedding) and add ``texts``.

        Does nothing when ``texts`` is empty.
        """
        if not texts:
            return
        self.create_index(len(embeddings[0]))
        self.add_texts(texts, embeddings, **kwargs)

    def create_index(self, dimension: int):
        """Create the Pinecone index if it is missing and bind to it.

        Creation is serialized through a Redis lock, and a Redis key caches
        the fact that the index exists for one hour.

        Args:
            dimension: Dimensionality of the vectors stored in the index.

        Raises:
            TimeoutError: If a newly created index does not become ready
                within ``_INDEX_READY_TIMEOUT`` seconds.
        """
        lock_name = f"vector_indexing_lock_{self._index_name}"
        with redis_client.lock(lock_name, timeout=self._INDEX_LOCK_TIMEOUT):
            index_exist_cache_key = f"vector_indexing_{self._index_name}"
            if redis_client.get(index_exist_cache_key):
                self._index = self._pc.Index(self._index_name)
                return

            if self._index_name not in self._pc.list_indexes().names():
                self._pc.create_index(
                    name=self._index_name,
                    dimension=dimension,
                    metric=self._client_config.metric,
                    spec=ServerlessSpec(
                        cloud="aws",
                        region=self._client_config.environment,
                    ),
                )
                # Poll until ready, but never longer than the deadline.
                deadline = time.monotonic() + self._INDEX_READY_TIMEOUT
                while not self._pc.describe_index(self._index_name).status["ready"]:
                    if time.monotonic() >= deadline:
                        raise TimeoutError(
                            f"Pinecone index {self._index_name} was not ready "
                            f"after {self._INDEX_READY_TIMEOUT} seconds"
                        )
                    time.sleep(1)

            self._index = self._pc.Index(self._index_name)
            redis_client.set(index_exist_cache_key, 1, ex=3600)

    def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
        """Upsert documents and their embeddings in batches.

        Returns:
            The list of vector ids that were written.

        Raises:
            ValueError: If the index does not exist.
        """
        index = self._get_index()
        if index is None:
            raise ValueError("Index not initialized. Call create() first.")

        uuids = self._get_uuids(documents)
        batch_size = self._client_config.batch_size
        added_ids = []
        for start in range(0, len(documents), batch_size):
            stop = start + batch_size
            batch_uuids = uuids[start:stop]
            vectors_to_upsert = [
                {"id": doc_id, "values": embedding, "metadata": self._build_metadata(doc)}
                for doc, embedding, doc_id in zip(documents[start:stop], embeddings[start:stop], batch_uuids)
            ]
            index.upsert(vectors=vectors_to_upsert)
            added_ids.extend(batch_uuids)
        return added_ids

    def search_by_vector(self, query_vector: list[float], **kwargs) -> list[Document]:
        """Return the documents most similar to ``query_vector``.

        Recognized keyword arguments: ``top_k`` (default 4),
        ``score_threshold`` (default 0.0; ``None`` is treated as 0.0) and
        ``document_ids_filter`` (restricts matches to those document ids).

        Raises:
            ValueError: If the index does not exist.
        """
        index = self._get_index()
        if index is None:
            raise ValueError("Index not initialized.")

        top_k = kwargs.get("top_k", 4)
        # Callers may pass score_threshold=None explicitly.
        score_threshold = float(kwargs.get("score_threshold") or 0.0)

        filter_dict = {Field.GROUP_KEY.value: {"$eq": self._group_id}}
        document_ids_filter = kwargs.get("document_ids_filter")
        if document_ids_filter:
            filter_dict["document_id"] = {"$in": document_ids_filter}

        response = index.query(
            vector=query_vector,
            top_k=top_k,
            include_metadata=True,
            filter=filter_dict,
        )

        docs = []
        for match in response.matches:
            if match.score < score_threshold:
                continue
            metadata = dict(match.metadata or {})
            page_content = metadata.pop(Field.CONTENT_KEY.value, "")
            # The group key is storage bookkeeping, not document metadata.
            metadata.pop(Field.GROUP_KEY.value, None)
            metadata["score"] = match.score
            docs.append(Document(page_content=page_content, metadata=metadata))

        docs.sort(key=lambda doc: doc.metadata.get("score", 0), reverse=True)
        return docs

    def search_by_full_text(self, query: str, **kwargs) -> list[Document]:
        """Full-text search is not implemented for Pinecone; returns an empty list."""
        return []

    def delete_by_metadata_field(self, key: str, value: str):
        """Best-effort delete of this group's vectors whose ``key`` equals ``value``.

        NOTE(review): confirm that delete-by-metadata-filter is supported for
        the serverless index type created by this class.
        """
        try:
            index = self._get_index()
            if index is None:
                return
            index.delete(
                filter={
                    Field.GROUP_KEY.value: {"$eq": self._group_id},
                    key: {"$eq": value},
                }
            )
        except Exception:
            # Deletion stays best-effort, but the failure must be visible.
            logger.exception("Failed to delete vectors by %s from Pinecone index %s", key, self._index_name)

    def delete_by_ids(self, ids: list[str]) -> None:
        """Best-effort delete of the vectors with the given ids."""
        if not ids:
            return
        try:
            index = self._get_index()
            if index is None:
                return
            index.delete(ids=ids)
        except Exception:
            logger.exception("Failed to delete vectors by id from Pinecone index %s", self._index_name)

    def delete(self) -> None:
        """Best-effort delete of every vector belonging to this group."""
        try:
            index = self._get_index()
            if index is None:
                return
            index.delete(filter={Field.GROUP_KEY.value: {"$eq": self._group_id}})
        except Exception:
            logger.exception("Failed to delete group vectors from Pinecone index %s", self._index_name)

    def text_exists(self, id: str) -> bool:
        """Return True when a vector with the given id is stored in the index."""
        try:
            index = self._get_index()
            if index is None:
                return False
            return id in index.fetch(ids=[id]).vectors
        except Exception:
            logger.exception("Failed to look up vector %s in Pinecone index %s", id, self._index_name)
            return False
class PineconeVectorFactory(AbstractVectorFactory):
    """Builds the Pinecone vector store for a dataset."""

    def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> PineconeVector:
        """Resolve the dataset's collection name and return its PineconeVector.

        The name comes from the dataset's collection binding when one is set,
        otherwise from the stored index structure, otherwise it is generated
        from the dataset id. When the dataset has no index structure yet, one
        is written onto ``dataset.index_struct``.

        Raises:
            ValueError: If the dataset references a collection binding that
                cannot be found.
        """
        binding_id = dataset.collection_binding_id
        if binding_id:
            binding = (
                db.session.query(DatasetCollectionBinding)
                .where(DatasetCollectionBinding.id == binding_id)
                .one_or_none()
            )
            if not binding:
                raise ValueError("Dataset Collection Bindings does not exist!")
            collection_name = binding.collection_name
        elif dataset.index_struct_dict:
            collection_name = dataset.index_struct_dict["vector_store"]["class_prefix"]
        else:
            collection_name = Dataset.gen_collection_name_by_id(dataset.id)

        # First use of this dataset: record which store and collection it uses.
        if not dataset.index_struct_dict:
            index_struct = self.gen_index_struct_dict("pinecone", collection_name)
            dataset.index_struct = json.dumps(index_struct)

        # Missing key/environment settings fall back to empty strings.
        vector_config = PineconeConfig(
            api_key=dify_config.PINECONE_API_KEY or "",
            environment=dify_config.PINECONE_ENVIRONMENT or "",
            index_name=dify_config.PINECONE_INDEX_NAME,
            timeout=dify_config.PINECONE_CLIENT_TIMEOUT,
            batch_size=dify_config.PINECONE_BATCH_SIZE,
            metric=dify_config.PINECONE_METRIC,
        )
        return PineconeVector(
            collection_name=collection_name,
            group_id=dataset.id,
            config=vector_config,
        )

View File

@ -86,6 +86,10 @@ class Vector:
from core.rag.datasource.vdb.pgvecto_rs.pgvecto_rs import PGVectoRSFactory
return PGVectoRSFactory
case VectorType.PINECONE:
from core.rag.datasource.vdb.pinecone.pinecone_vector import PineconeVectorFactory
return PineconeVectorFactory
case VectorType.QDRANT:
from core.rag.datasource.vdb.qdrant.qdrant_vector import QdrantVectorFactory

View File

@ -31,3 +31,4 @@ class VectorType(StrEnum):
HUAWEI_CLOUD = "huawei_cloud"
MATRIXONE = "matrixone"
CLICKZETTA = "clickzetta"
PINECONE = "pinecone"

View File

@ -10,6 +10,23 @@ from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
def _format_cell_value(value) -> str:
if pd.isna(value):
return ""
if isinstance(value, (int, float)):
if isinstance(value, float):
if value.is_integer():
return str(int(value))
else:
formatted = f"{value:f}"
return formatted.rstrip('0').rstrip('.')
else:
return str(value)
return str(value)
class ExcelExtractor(BaseExtractor):
"""Load Excel files.
@ -49,10 +66,12 @@ class ExcelExtractor(BaseExtractor):
row=cast(int, index) + 2, column=col_index + 1
) # +2 to account for header and 1-based index
if cell.hyperlink:
value = f"[{v}]({cell.hyperlink.target})"
formatted_v = _format_cell_value(v)
value = f"[{formatted_v}]({cell.hyperlink.target})"
page_content.append(f'"{k}":"{value}"')
else:
page_content.append(f'"{k}":"{v}"')
formatted_v = _format_cell_value(v)
page_content.append(f'"{k}":"{formatted_v}"')
documents.append(
Document(page_content=";".join(page_content), metadata={"source": self._file_path})
)
@ -67,7 +86,8 @@ class ExcelExtractor(BaseExtractor):
page_content = []
for k, v in row.items():
if pd.notna(v):
page_content.append(f'"{k}":"{v}"')
formatted_v = _format_cell_value(v)
page_content.append(f'"{k}":"{formatted_v}"')
documents.append(
Document(page_content=";".join(page_content), metadata={"source": self._file_path})
)

View File

@ -485,6 +485,24 @@ def _extract_text_from_csv(file_content: bytes) -> str:
raise TextExtractionError(f"Failed to extract text from CSV: {str(e)}") from e
def _format_cell_value_for_markdown(value) -> str:
"""格式化单元格值,避免科学计数法"""
if pd.isna(value):
return ""
if isinstance(value, (int, float)):
if isinstance(value, float):
if value.is_integer():
return str(int(value))
else:
formatted = f"{value:f}"
return formatted.rstrip('0').rstrip('.')
else:
return str(value)
return str(value)
def _extract_text_from_excel(file_content: bytes) -> str:
"""Extract text from an Excel file using pandas."""
@ -499,7 +517,8 @@ def _extract_text_from_excel(file_content: bytes) -> str:
# Construct the data rows
data_rows = []
for _, row in df.iterrows():
data_row = "| " + " | ".join(map(str, row)) + " |"
formatted_row = [_format_cell_value_for_markdown(cell) for cell in row]
data_row = "| " + " | ".join(formatted_row) + " |"
data_rows.append(data_row)
# Combine all rows into a single string

View File

@ -88,6 +88,7 @@ dependencies = [
"httpx-sse>=0.4.0",
"sendgrid~=6.12.3",
"flask-restx>=1.3.0",
"pinecone>=7.3.0",
]
# Before adding new dependency, consider place it in
# alphabet order (a-z) and suitable group.

View File

@ -0,0 +1,30 @@
from core.rag.datasource.vdb.pinecone.pinecone_vector import PineconeConfig, PineconeVector
from core.rag.models.document import Document
from tests.integration_tests.vdb.test_vector_store import (
AbstractVectorTest,
setup_mock_redis,
)
class PineconeVectorTest(AbstractVectorTest):
    """Runs the shared vector-store test scenario against PineconeVector."""

    def __init__(self):
        super().__init__()
        self.attributes = ["doc_id", "dataset_id", "document_id", "doc_hash"]
        # Placeholder credentials only.
        pinecone_config = PineconeConfig(
            api_key="test_api_key",
            environment="test_environment",
            index_name="test_index",
        )
        self.vector = PineconeVector(
            collection_name=self.collection_name,
            group_id=self.dataset_id,
            config=pinecone_config,
        )

    def search_by_vector(self):
        # Delegates unchanged to the shared scenario.
        super().search_by_vector()
def test_pinecone_vector(setup_mock_redis):
    """Run every shared vector-store check against the Pinecone implementation."""
    vector_test = PineconeVectorTest()
    vector_test.run_all_tests()

File diff suppressed because it is too large Load Diff