From 71fa14f7915ce39bb2f8b51d6aac993b602a1c8a Mon Sep 17 00:00:00 2001
From: le0zh <newlight@qq.com>
Date: Wed, 22 Jan 2025 15:18:23 +0800
Subject: [PATCH 1/8] fix: resolve clipboard.writeText failure under HTTP
 protocol (#12936)

---
 web/app/components/develop/code.tsx |  3 ++-
 web/utils/clipboard.ts              | 35 +++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)
 create mode 100644 web/utils/clipboard.ts
diff --git a/web/app/components/develop/code.tsx b/web/app/components/develop/code.tsx
index c1fbaa1cf8..7716cd4c93 100644
--- a/web/app/components/develop/code.tsx
+++ b/web/app/components/develop/code.tsx
@@ -10,6 +10,7 @@ import {
 import { Tab } from '@headlessui/react'
 import { Tag } from './tag'
 import classNames from '@/utils/classnames'
+import { writeTextToClipboard } from '@/utils/clipboard'
 
 const languageNames = {
   js: 'JavaScript',
@@ -71,7 +72,7 @@ function CopyButton({ code }: { code: string }) {
           : 'bg-white/5 hover:bg-white/7.5 dark:bg-white/2.5 dark:hover:bg-white/5',
       )}
       onClick={() => {
-        window.navigator.clipboard.writeText(code).then(() => {
+        writeTextToClipboard(code).then(() => {
           setCopyCount(count => count + 1)
         })
       }}
diff --git a/web/utils/clipboard.ts b/web/utils/clipboard.ts
new file mode 100644
index 0000000000..8e7a4495b3
--- /dev/null
+++ b/web/utils/clipboard.ts
@@ -0,0 +1,35 @@
+/** Copies `text` with the async Clipboard API, or with the execCommand fallback when that API is unavailable. */
+export async function writeTextToClipboard(text: string): Promise<void> {
+  if (!navigator.clipboard?.writeText)
+    return fallbackCopyTextToClipboard(text)
+  return navigator.clipboard.writeText(text)
+}
+
+/**
+ * Legacy path: copies via a temporary <textarea> and document.execCommand('copy'); rejects with an Error on failure.
+ */
+async function fallbackCopyTextToClipboard(text: string): Promise<void> {
+  const textArea = document.createElement('textarea')
+  textArea.value = text
+  textArea.style.position = 'fixed' // Avoid scrolling to bottom
+  document.body.appendChild(textArea)
+  textArea.focus()
+  textArea.select()
+  try {
+    if (!document.execCommand('copy'))
+      throw new Error('document.execCommand failed')
+  }
+  catch (err) {
+    throw convertAnyToError(err)
+  }
+  finally {
+    document.body.removeChild(textArea)
+  }
+}
+
+/** Returns `err` unchanged when it is already an Error; otherwise wraps its string form in a new Error. */
+function convertAnyToError(err: unknown): Error {
+  if (err instanceof Error)
+    return err
+  return new Error(`Caught: ${String(err)}`)
+}

From d167d5b1be67d0c32e808fc975d2dd1007a21f45 Mon Sep 17 00:00:00 2001
From: sino <sino2322@gmail.com>
Date: Wed, 22 Jan 2025 15:25:57 +0800
Subject: [PATCH 2/8] feat(ark): support doubao 1.5 series of models (#12935)

---
 .../volcengine_maas/llm/models.py             | 57 +++++++++++++------
 .../volcengine_maas/volcengine_maas.yaml      | 24 ++++++++
 2 files changed, 63 insertions(+), 18 deletions(-)

diff --git a/api/core/model_runtime/model_providers/volcengine_maas/llm/models.py b/api/core/model_runtime/model_providers/volcengine_maas/llm/models.py
index 7c37368086..94315cd026 100644
--- a/api/core/model_runtime/model_providers/volcengine_maas/llm/models.py
+++ b/api/core/model_runtime/model_providers/volcengine_maas/llm/models.py
@@ -18,72 +18,93 @@ class ModelConfig(BaseModel):
 
 
 configs: dict[str, ModelConfig] = {
+    "Doubao-1.5-vision-pro-32k": ModelConfig(
+        properties=ModelProperties(context_size=32768, max_tokens=12288, mode=LLMMode.CHAT),
+        features=[ModelFeature.AGENT_THOUGHT, ModelFeature.VISION],
+    ),
+    "Doubao-1.5-pro-32k": ModelConfig(
+        properties=ModelProperties(context_size=32768, max_tokens=12288, mode=LLMMode.CHAT),
+        features=[ModelFeature.AGENT_THOUGHT],
+    ),
+    "Doubao-1.5-lite-32k": ModelConfig(
+        properties=ModelProperties(context_size=32768, max_tokens=12288, mode=LLMMode.CHAT),
+        features=[ModelFeature.AGENT_THOUGHT],
+    ),
+    "Doubao-1.5-pro-256k": ModelConfig(
+        properties=ModelProperties(context_size=262144, max_tokens=12288, mode=LLMMode.CHAT),
+        features=[ModelFeature.AGENT_THOUGHT],
+    ),
     "Doubao-vision-pro-32k": ModelConfig(
         properties=ModelProperties(context_size=32768, max_tokens=4096, mode=LLMMode.CHAT),
-        features=[ModelFeature.VISION],
+        features=[ModelFeature.AGENT_THOUGHT, ModelFeature.VISION],
     ),
     "Doubao-vision-lite-32k": ModelConfig(
         properties=ModelProperties(context_size=32768, max_tokens=4096, mode=LLMMode.CHAT),
-        features=[ModelFeature.VISION],
+        features=[ModelFeature.AGENT_THOUGHT, ModelFeature.VISION],
     ),
     "Doubao-pro-4k": ModelConfig(
         properties=ModelProperties(context_size=4096, max_tokens=4096, mode=LLMMode.CHAT),
-        features=[ModelFeature.TOOL_CALL],
+        features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL],
     ),
     "Doubao-lite-4k": ModelConfig(
         properties=ModelProperties(context_size=4096, max_tokens=4096, mode=LLMMode.CHAT),
-        features=[ModelFeature.TOOL_CALL],
+        features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL],
     ),
     "Doubao-pro-32k": ModelConfig(
         properties=ModelProperties(context_size=32768, max_tokens=4096, mode=LLMMode.CHAT),
-        features=[ModelFeature.TOOL_CALL],
+        features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL],
     ),
     "Doubao-lite-32k": ModelConfig(
         properties=ModelProperties(context_size=32768, max_tokens=4096, mode=LLMMode.CHAT),
-        features=[ModelFeature.TOOL_CALL],
+        features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL],
     ),
     "Doubao-pro-256k": ModelConfig(
         properties=ModelProperties(context_size=262144, max_tokens=4096, mode=LLMMode.CHAT),
-        features=[],
+        features=[ModelFeature.AGENT_THOUGHT],
     ),
     "Doubao-pro-128k": ModelConfig(
         properties=ModelProperties(context_size=131072, max_tokens=4096, mode=LLMMode.CHAT),
-        features=[ModelFeature.TOOL_CALL],
+        features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL],
     ),
     "Doubao-lite-128k": ModelConfig(
-        properties=ModelProperties(context_size=131072, max_tokens=4096, mode=LLMMode.CHAT), features=[]
+        properties=ModelProperties(context_size=131072, max_tokens=4096, mode=LLMMode.CHAT),
+        features=[ModelFeature.AGENT_THOUGHT],
     ),
     "Skylark2-pro-4k": ModelConfig(
-        properties=ModelProperties(context_size=4096, max_tokens=4096, mode=LLMMode.CHAT), features=[]
+        properties=ModelProperties(context_size=4096, max_tokens=4096, mode=LLMMode.CHAT),
+        features=[ModelFeature.AGENT_THOUGHT],
     ),
     "Llama3-8B": ModelConfig(
-        properties=ModelProperties(context_size=8192, max_tokens=8192, mode=LLMMode.CHAT), features=[]
+        properties=ModelProperties(context_size=8192, max_tokens=8192, mode=LLMMode.CHAT),
+        features=[ModelFeature.AGENT_THOUGHT],
     ),
     "Llama3-70B": ModelConfig(
-        properties=ModelProperties(context_size=8192, max_tokens=8192, mode=LLMMode.CHAT), features=[]
+        properties=ModelProperties(context_size=8192, max_tokens=8192, mode=LLMMode.CHAT),
+        features=[ModelFeature.AGENT_THOUGHT],
     ),
     "Moonshot-v1-8k": ModelConfig(
         properties=ModelProperties(context_size=8192, max_tokens=4096, mode=LLMMode.CHAT),
-        features=[ModelFeature.TOOL_CALL],
+        features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL],
     ),
     "Moonshot-v1-32k": ModelConfig(
         properties=ModelProperties(context_size=32768, max_tokens=16384, mode=LLMMode.CHAT),
-        features=[ModelFeature.TOOL_CALL],
+        features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL],
     ),
     "Moonshot-v1-128k": ModelConfig(
         properties=ModelProperties(context_size=131072, max_tokens=65536, mode=LLMMode.CHAT),
-        features=[ModelFeature.TOOL_CALL],
+        features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL],
     ),
     "GLM3-130B": ModelConfig(
         properties=ModelProperties(context_size=8192, max_tokens=4096, mode=LLMMode.CHAT),
-        features=[ModelFeature.TOOL_CALL],
+        features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL],
     ),
     "GLM3-130B-Fin": ModelConfig(
         properties=ModelProperties(context_size=8192, max_tokens=4096, mode=LLMMode.CHAT),
-        features=[ModelFeature.TOOL_CALL],
+        features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL],
     ),
     "Mistral-7B": ModelConfig(
-        properties=ModelProperties(context_size=8192, max_tokens=2048, mode=LLMMode.CHAT), features=[]
+        properties=ModelProperties(context_size=8192, max_tokens=2048, mode=LLMMode.CHAT),
+        features=[ModelFeature.AGENT_THOUGHT],
     ),
 }
 
diff --git a/api/core/model_runtime/model_providers/volcengine_maas/volcengine_maas.yaml b/api/core/model_runtime/model_providers/volcengine_maas/volcengine_maas.yaml
index 2ddb612546..976cd26ac9 100644
--- a/api/core/model_runtime/model_providers/volcengine_maas/volcengine_maas.yaml
+++ b/api/core/model_runtime/model_providers/volcengine_maas/volcengine_maas.yaml
@@ -118,6 +118,30 @@ model_credential_schema:
       type: select
       required: true
       options:
+        - label:
+            en_US: Doubao-1.5-vision-pro-32k
+          value: Doubao-1.5-vision-pro-32k
+          show_on:
+            - variable: __model_type
+              value: llm
+        - label:
+            en_US: Doubao-1.5-pro-32k
+          value: Doubao-1.5-pro-32k
+          show_on:
+            - variable: __model_type
+              value: llm
+        - label:
+            en_US: Doubao-1.5-lite-32k
+          value: Doubao-1.5-lite-32k
+          show_on:
+            - variable: __model_type
+              value: llm
+        - label:
+            en_US: Doubao-1.5-pro-256k
+          value: Doubao-1.5-pro-256k
+          show_on:
+            - variable: __model_type
+              value: llm
         - label:
             en_US: Doubao-vision-pro-32k
           value: Doubao-vision-pro-32k

From 1e73f63ff8b6181963244cf72f3841ab70de11a4 Mon Sep 17 00:00:00 2001
From: -LAN- <laipz8200@outlook.com>
Date: Wed, 22 Jan 2025 16:40:44 +0800
Subject: [PATCH 3/8] chore: update version to 0.15.2 in packaging and docker
 configurations (#12940)

Signed-off-by: -LAN- <laipz8200@outlook.com>
---
 api/configs/packaging/__init__.py   | 2 +-
 docker-legacy/docker-compose.yaml   | 6 +++---
 docker/docker-compose-template.yaml | 6 +++---
 docker/docker-compose.yaml          | 6 +++---
 web/package.json                    | 2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/api/configs/packaging/__init__.py b/api/configs/packaging/__init__.py
index a54c5bf5ee..20c1f58c99 100644
--- a/api/configs/packaging/__init__.py
+++ b/api/configs/packaging/__init__.py
@@ -9,7 +9,7 @@ class PackagingInfo(BaseSettings):
 
     CURRENT_VERSION: str = Field(
         description="Dify version",
-        default="0.15.1",
+        default="0.15.2",
     )
 
     COMMIT_SHA: str = Field(
diff --git a/docker-legacy/docker-compose.yaml b/docker-legacy/docker-compose.yaml
index 6e4c8a748e..d2b6689453 100644
--- a/docker-legacy/docker-compose.yaml
+++ b/docker-legacy/docker-compose.yaml
@@ -2,7 +2,7 @@ version: '3'
 services:
   # API service
   api:
-    image: langgenius/dify-api:0.15.1
+    image: langgenius/dify-api:0.15.2
     restart: always
     environment:
       # Startup mode, 'api' starts the API server.
@@ -227,7 +227,7 @@ services:
   # worker service
   # The Celery worker for processing the queue.
   worker:
-    image: langgenius/dify-api:0.15.1
+    image: langgenius/dify-api:0.15.2
     restart: always
     environment:
       CONSOLE_WEB_URL: ''
@@ -397,7 +397,7 @@ services:
 
   # Frontend web application.
   web:
-    image: langgenius/dify-web:0.15.1
+    image: langgenius/dify-web:0.15.2
     restart: always
     environment:
       # The base URL of console application api server, refers to the Console base URL of WEB service if console domain is
diff --git a/docker/docker-compose-template.yaml b/docker/docker-compose-template.yaml
index d24e7c181f..8aafc61888 100644
--- a/docker/docker-compose-template.yaml
+++ b/docker/docker-compose-template.yaml
@@ -2,7 +2,7 @@ x-shared-env: &shared-api-worker-env
 services:
   # API service
   api:
-    image: langgenius/dify-api:0.15.1
+    image: langgenius/dify-api:0.15.2
     restart: always
     environment:
       # Use the shared environment variables.
@@ -25,7 +25,7 @@ services:
   # worker service
   # The Celery worker for processing the queue.
   worker:
-    image: langgenius/dify-api:0.15.1
+    image: langgenius/dify-api:0.15.2
     restart: always
     environment:
       # Use the shared environment variables.
@@ -47,7 +47,7 @@ services:
 
   # Frontend web application.
   web:
-    image: langgenius/dify-web:0.15.1
+    image: langgenius/dify-web:0.15.2
     restart: always
     environment:
       CONSOLE_API_URL: ${CONSOLE_API_URL:-}
diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml
index 21e72a4cd6..a11ec261f3 100644
--- a/docker/docker-compose.yaml
+++ b/docker/docker-compose.yaml
@@ -393,7 +393,7 @@ x-shared-env: &shared-api-worker-env
 services:
   # API service
   api:
-    image: langgenius/dify-api:0.15.1
+    image: langgenius/dify-api:0.15.2
     restart: always
     environment:
       # Use the shared environment variables.
@@ -416,7 +416,7 @@ services:
   # worker service
   # The Celery worker for processing the queue.
   worker:
-    image: langgenius/dify-api:0.15.1
+    image: langgenius/dify-api:0.15.2
     restart: always
     environment:
       # Use the shared environment variables.
@@ -438,7 +438,7 @@ services:
 
   # Frontend web application.
   web:
-    image: langgenius/dify-web:0.15.1
+    image: langgenius/dify-web:0.15.2
     restart: always
     environment:
       CONSOLE_API_URL: ${CONSOLE_API_URL:-}
diff --git a/web/package.json b/web/package.json
index 879b87c596..6ae11d71b4 100644
--- a/web/package.json
+++ b/web/package.json
@@ -1,6 +1,6 @@
 {
   "name": "dify-web",
-  "version": "0.15.1",
+  "version": "0.15.2",
   "private": true,
   "engines": {
     "node": ">=18.17.0"

From 4c3076f2a474fd66233a1a0f9fe0331423aea546 Mon Sep 17 00:00:00 2001
From: huangzhuo1949 <167434202+huangzhuo1949@users.noreply.github.com>
Date: Wed, 22 Jan 2025 17:07:18 +0800
Subject: [PATCH 4/8] feat: add pg vector index (#12338)

Co-authored-by: huangzhuo <huangzhuo1@xiaomi.com>
---
 api/core/rag/datasource/vdb/pgvector/pgvector.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/api/core/rag/datasource/vdb/pgvector/pgvector.py b/api/core/rag/datasource/vdb/pgvector/pgvector.py
index de443ba580..c8a1e4f90c 100644
--- a/api/core/rag/datasource/vdb/pgvector/pgvector.py
+++ b/api/core/rag/datasource/vdb/pgvector/pgvector.py
@@ -57,6 +57,11 @@ CREATE TABLE IF NOT EXISTS {table_name} (
 ) using heap;
 """
 
+SQL_CREATE_INDEX = """
+CREATE INDEX IF NOT EXISTS embedding_cosine_v1_idx_{index_hash} ON {table_name}
+USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 64);
+"""
+
 
 class PGVector(BaseVector):
     def __init__(self, collection_name: str, config: PGVectorConfig):
@@ -205,7 +210,14 @@ class PGVector(BaseVector):
             with self._get_cursor() as cur:
                 cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
                 cur.execute(SQL_CREATE_TABLE.format(table_name=self.table_name, dimension=dimension))
-                # TODO: create index https://github.com/pgvector/pgvector?tab=readme-ov-file#indexing
+                # The pgvector hnsw index only supports 2000 dimensions or fewer
+                # ref: https://github.com/pgvector/pgvector?tab=readme-ov-file#indexing
+                if dimension <= 2000:
+                    # Index names are unique per schema, so a fixed name would make IF NOT EXISTS
+                    # skip every table after the first; derive a stable per-table suffix instead.
+                    import hashlib
+                    index_hash = hashlib.sha256(self.table_name.encode()).hexdigest()[:8]
+                    cur.execute(SQL_CREATE_INDEX.format(table_name=self.table_name, index_hash=index_hash))
             redis_client.set(collection_exist_cache_key, 1, ex=3600)
 
 

From dd0904f95c97dab8404f71e06212ee182db9c554 Mon Sep 17 00:00:00 2001
From: jiandanfeng <chenjh3@wangsu.com>
Date: Wed, 22 Jan 2025 19:26:25 +0800
Subject: [PATCH 5/8] feat: add giteeAI risk control identification. (#12946)

---
 .../builtin/gitee_ai/tools/risk-control.py    | 26 +++++++++++++++
 .../builtin/gitee_ai/tools/risk-control.yaml  | 32 +++++++++++++++++++
 2 files changed, 58 insertions(+)
 create mode 100644 api/core/tools/provider/builtin/gitee_ai/tools/risk-control.py
 create mode 100644 api/core/tools/provider/builtin/gitee_ai/tools/risk-control.yaml

diff --git a/api/core/tools/provider/builtin/gitee_ai/tools/risk-control.py b/api/core/tools/provider/builtin/gitee_ai/tools/risk-control.py
new file mode 100644
index 0000000000..e3558ce699
--- /dev/null
+++ b/api/core/tools/provider/builtin/gitee_ai/tools/risk-control.py
@@ -0,0 +1,26 @@
+from typing import Any, Union
+
+import requests
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
+class GiteeAIToolRiskControl(BuiltinTool):
+    """Builtin tool that posts text to the Gitee AI moderations endpoint and returns the response body."""
+
+    def _invoke(
+        self, user_id: str, tool_parameters: dict[str, Any]
+    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
+        headers = {
+            "content-type": "application/json",
+            "authorization": f"Bearer {self.runtime.credentials['api_key']}",
+        }
+        inputs = [{"type": "text", "text": tool_parameters.get("input-text")}]
+        payload = {"model": tool_parameters.get("model", "Security-semantic-filtering"), "input": inputs}
+        url = "https://ai.gitee.com/v1/moderations"
+        # Bound the request so an unresponsive endpoint cannot block the caller forever.
+        response = requests.post(url, json=payload, headers=headers, timeout=60)
+        if response.status_code != 200:
+            return self.create_text_message(f"Got Error Response:{response.text}")
+        return [self.create_text_message(response.content.decode("utf-8"))]
diff --git a/api/core/tools/provider/builtin/gitee_ai/tools/risk-control.yaml b/api/core/tools/provider/builtin/gitee_ai/tools/risk-control.yaml
new file mode 100644
index 0000000000..6e7229dc1c
--- /dev/null
+++ b/api/core/tools/provider/builtin/gitee_ai/tools/risk-control.yaml
@@ -0,0 +1,32 @@
+identity:
+  name: risk control
+  author: gitee_ai
+  label:
+    en_US: risk control identification
+    zh_Hans: 风控识别
+  icon: icon.svg
+description:
+  human:
+    en_US: Ensuring the protection and compliance of sensitive information through the filtering and analysis of data semantics
+    zh_Hans: 通过对数据语义的过滤和分析，确保敏感信息的保护和合规性
+  llm: This tool is used for risk control identification.
+parameters:
+  - name: model
+    type: string
+    required: true
+    default: Security-semantic-filtering
+    label:
+      en_US: Service Model
+      zh_Hans: 服务模型
+    form: form
+  - name: input-text
+    type: string
+    required: true
+    label:
+      en_US: Input Text
+      zh_Hans: 输入文本
+    human_description:
+      en_US: The text input for filtering and analysis.
+      zh_Hans: 用于分析过滤的文本
+    llm_description: The text input for filtering and analysis.
+    form: llm

From fd4afe09f877255d672ab20113471e52acc73fe1 Mon Sep 17 00:00:00 2001
From: Jhvcc <37662342+Jhvcc@users.noreply.github.com>
Date: Wed, 22 Jan 2025 19:27:02 +0800
Subject: [PATCH 6/8] fix: tools translate search (#12950)

Co-authored-by: lowell <lowell.hu@zkteco.in>
---
 web/app/components/tools/provider-list.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web/app/components/tools/provider-list.tsx b/web/app/components/tools/provider-list.tsx
index 6f17835589..73c7363641 100644
--- a/web/app/components/tools/provider-list.tsx
+++ b/web/app/components/tools/provider-list.tsx
@@ -46,7 +46,7 @@ const ProviderList = () => {
       if (tagFilterValue.length > 0 && (!collection.labels || collection.labels.every(label => !tagFilterValue.includes(label))))
         return false
       if (keywords)
-        return collection.name.toLowerCase().includes(keywords.toLowerCase())
+        return Object.values(collection.label).some(value => value.toLowerCase().includes(keywords.toLowerCase()))
       return true
     })
   }, [activeTab, tagFilterValue, keywords, collectionList])

From f565f08aa0793eaa75adbf9392d1ecd68dc53bfc Mon Sep 17 00:00:00 2001
From: Joel <iamjoel007@gmail.com>
Date: Thu, 23 Jan 2025 11:02:29 +0800
Subject: [PATCH 7/8] fix: get property of string type variable caused page
 crash (#12969)

---
 .../workflow/nodes/_base/components/variable/utils.ts         | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/web/app/components/workflow/nodes/_base/components/variable/utils.ts b/web/app/components/workflow/nodes/_base/components/variable/utils.ts
index 715ad1c7b1..24c2b73fed 100644
--- a/web/app/components/workflow/nodes/_base/components/variable/utils.ts
+++ b/web/app/components/workflow/nodes/_base/components/variable/utils.ts
@@ -546,7 +546,9 @@ export const getVarType = ({
   else {
     (valueSelector as ValueSelector).slice(1).forEach((key, i) => {
       const isLast = i === valueSelector.length - 2
-      curr = curr?.find((v: any) => v.variable === key)
+      if (Array.isArray(curr))
+        curr = curr?.find((v: any) => v.variable === key)
+
       if (isLast) {
         type = curr?.type
       }

From 6024d8a42d1a39d3e0d6b81df806e3d624839519 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adem=C3=ADlson=20Tonato?= <ademilsonft@outlook.com>
Date: Thu, 23 Jan 2025 03:14:48 +0000
Subject: [PATCH 8/8] refactor: Update Firecrawl to use v1 API (#12574)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Ademílson Tonato <ademilson.tonato@refurbed.com>
---
 .../rag/extractor/firecrawl/firecrawl_app.py  | 105 +++++++++---------
 api/services/auth/firecrawl/firecrawl.py      |   8 +-
 api/services/website_service.py               |  30 ++---
 .../rag/extractor/firecrawl/test_firecrawl.py |  19 ++--
 4 files changed, 81 insertions(+), 81 deletions(-)

diff --git a/api/core/rag/extractor/firecrawl/firecrawl_app.py b/api/core/rag/extractor/firecrawl/firecrawl_app.py
index eac08aeb8b..836a1398bf 100644
--- a/api/core/rag/extractor/firecrawl/firecrawl_app.py
+++ b/api/core/rag/extractor/firecrawl/firecrawl_app.py
@@ -1,6 +1,6 @@
 import json
 import time
-from typing import cast
+from typing import Any, cast
 
 import requests
 
@@ -14,48 +14,47 @@ class FirecrawlApp:
         if self.api_key is None and self.base_url == "https://api.firecrawl.dev":
             raise ValueError("No API key provided")
 
-    def scrape_url(self, url, params=None) -> dict:
-        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
-        json_data = {"url": url}
+    def scrape_url(self, url, params=None) -> dict[str, Any]:
+        # Documentation: https://docs.firecrawl.dev/api-reference/endpoint/scrape
+        headers = self._prepare_headers()
+        json_data = {
+            "url": url,
+            "formats": ["markdown"],
+            "onlyMainContent": True,
+            "timeout": 30000,
+        }
         if params:
             json_data.update(params)
-        response = requests.post(f"{self.base_url}/v0/scrape", headers=headers, json=json_data)
+        response = self._post_request(f"{self.base_url}/v1/scrape", json_data, headers)
         if response.status_code == 200:
             response_data = response.json()
-            if response_data["success"] == True:
-                data = response_data["data"]
-                return {
-                    "title": data.get("metadata").get("title"),
-                    "description": data.get("metadata").get("description"),
-                    "source_url": data.get("metadata").get("sourceURL"),
-                    "markdown": data.get("markdown"),
-                }
-            else:
-                raise Exception(f"Failed to scrape URL. Error: {response_data['error']}")
-
-        elif response.status_code in {402, 409, 500}:
-            error_message = response.json().get("error", "Unknown error occurred")
-            raise Exception(f"Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}")
+            data = response_data["data"]
+            return self._extract_common_fields(data)
+        elif response.status_code in {402, 409, 500, 429, 408}:
+            self._handle_error(response, "scrape URL")
+            return {}  # Avoid additional exception after handling error
         else:
             raise Exception(f"Failed to scrape URL. Status code: {response.status_code}")
 
     def crawl_url(self, url, params=None) -> str:
+        # Documentation: https://docs.firecrawl.dev/api-reference/endpoint/crawl-post
         headers = self._prepare_headers()
         json_data = {"url": url}
         if params:
             json_data.update(params)
-        response = self._post_request(f"{self.base_url}/v0/crawl", json_data, headers)
+        response = self._post_request(f"{self.base_url}/v1/crawl", json_data, headers)
         if response.status_code == 200:
-            job_id = response.json().get("jobId")
+            # There's also another two fields in the response: "success" (bool) and "url" (str)
+            job_id = response.json().get("id")
             return cast(str, job_id)
         else:
             self._handle_error(response, "start crawl job")
             # FIXME: unreachable code for mypy
             return ""  # unreachable
 
-    def check_crawl_status(self, job_id) -> dict:
+    def check_crawl_status(self, job_id) -> dict[str, Any]:
         headers = self._prepare_headers()
-        response = self._get_request(f"{self.base_url}/v0/crawl/status/{job_id}", headers)
+        response = self._get_request(f"{self.base_url}/v1/crawl/{job_id}", headers)
         if response.status_code == 200:
             crawl_status_response = response.json()
             if crawl_status_response.get("status") == "completed":
@@ -66,42 +65,48 @@ class FirecrawlApp:
                 url_data_list = []
                 for item in data:
                     if isinstance(item, dict) and "metadata" in item and "markdown" in item:
-                        url_data = {
-                            "title": item.get("metadata", {}).get("title"),
-                            "description": item.get("metadata", {}).get("description"),
-                            "source_url": item.get("metadata", {}).get("sourceURL"),
-                            "markdown": item.get("markdown"),
-                        }
+                        url_data = self._extract_common_fields(item)
                         url_data_list.append(url_data)
                 if url_data_list:
                     file_key = "website_files/" + job_id + ".txt"
-                    if storage.exists(file_key):
-                        storage.delete(file_key)
-                    storage.save(file_key, json.dumps(url_data_list).encode("utf-8"))
-                return {
-                    "status": "completed",
-                    "total": crawl_status_response.get("total"),
-                    "current": crawl_status_response.get("current"),
-                    "data": url_data_list,
-                }
-
+                    try:
+                        if storage.exists(file_key):
+                            storage.delete(file_key)
+                        storage.save(file_key, json.dumps(url_data_list).encode("utf-8"))
+                    except Exception as e:
+                        raise Exception(f"Error saving crawl data: {e}")
+                return self._format_crawl_status_response("completed", crawl_status_response, url_data_list)
             else:
-                return {
-                    "status": crawl_status_response.get("status"),
-                    "total": crawl_status_response.get("total"),
-                    "current": crawl_status_response.get("current"),
-                    "data": [],
-                }
-
+                return self._format_crawl_status_response(
+                    crawl_status_response.get("status"), crawl_status_response, []
+                )
         else:
             self._handle_error(response, "check crawl status")
             # FIXME: unreachable code for mypy
             return {}  # unreachable
 
-    def _prepare_headers(self):
+    def _format_crawl_status_response(
+        self, status: str, crawl_status_response: dict[str, Any], url_data_list: list[dict[str, Any]]
+    ) -> dict[str, Any]:
+        """
+        Assemble the crawl-status dict: the response's "completed" value is reported under "current".
+        """
+        total = crawl_status_response.get("total")
+        current = crawl_status_response.get("completed")
+        return {"status": status, "total": total, "current": current, "data": url_data_list}
+
+    def _extract_common_fields(self, item: dict[str, Any]) -> dict[str, Any]:
+        """
+        Reduce one Firecrawl page dict to title/description/source_url/markdown; absent values become None.
+        """
+        metadata = item.get("metadata") or {}  # a missing or null "metadata" entry is treated as empty
+        title, description, source_url = (metadata.get(key) for key in ("title", "description", "sourceURL"))
+        return {"title": title, "description": description, "source_url": source_url, "markdown": item.get("markdown")}
+
+    def _prepare_headers(self) -> dict[str, Any]:
         return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
 
-    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5) -> requests.Response:
         for attempt in range(retries):
             response = requests.post(url, headers=headers, json=data)
             if response.status_code == 502:
@@ -110,7 +115,7 @@ class FirecrawlApp:
                 return response
         return response
 
-    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+    def _get_request(self, url, headers, retries=3, backoff_factor=0.5) -> requests.Response:
         for attempt in range(retries):
             response = requests.get(url, headers=headers)
             if response.status_code == 502:
@@ -119,6 +124,6 @@ class FirecrawlApp:
                 return response
         return response
 
-    def _handle_error(self, response, action):
+    def _handle_error(self, response, action) -> None:
         error_message = response.json().get("error", "Unknown error occurred")
         raise Exception(f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}")
diff --git a/api/services/auth/firecrawl/firecrawl.py b/api/services/auth/firecrawl/firecrawl.py
index 50e4edff14..cc6eaaa42a 100644
--- a/api/services/auth/firecrawl/firecrawl.py
+++ b/api/services/auth/firecrawl/firecrawl.py
@@ -21,10 +21,12 @@ class FirecrawlAuth(ApiKeyAuthBase):
         headers = self._prepare_headers()
         options = {
             "url": "https://example.com",
-            "crawlerOptions": {"excludes": [], "includes": [], "limit": 1},
-            "pageOptions": {"onlyMainContent": True},
+            "excludePaths": [],  # v1 crawl body uses excludePaths/includePaths (v0: excludes/includes)
+            "includePaths": [],
+            "limit": 1,
+            "scrapeOptions": {"onlyMainContent": True},
         }
-        response = self._post_request(f"{self.base_url}/v0/crawl", options, headers)
+        response = self._post_request(f"{self.base_url}/v1/crawl", options, headers)
         if response.status_code == 200:
             return True
         else:
diff --git a/api/services/website_service.py b/api/services/website_service.py
index 1ad7d0399d..b30e2205f7 100644
--- a/api/services/website_service.py
+++ b/api/services/website_service.py
@@ -38,30 +38,24 @@ class WebsiteService:
             only_main_content = options.get("only_main_content", False)
             if not crawl_sub_pages:
                 params = {
-                    "crawlerOptions": {
-                        "includes": [],
-                        "excludes": [],
-                        "generateImgAltText": True,
-                        "limit": 1,
-                        "returnOnlyUrls": False,
-                        "pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False},
-                    }
+                    "includePaths": [],
+                    "excludePaths": [],
+                    # v1 request body: the v0-only "generateImgAltText" option is no longer sent.
+                    "limit": 1,
+                    "scrapeOptions": {"onlyMainContent": only_main_content},
                 }
             else:
                 includes = options.get("includes").split(",") if options.get("includes") else []
                 excludes = options.get("excludes").split(",") if options.get("excludes") else []
                 params = {
-                    "crawlerOptions": {
-                        "includes": includes,
-                        "excludes": excludes,
-                        "generateImgAltText": True,
-                        "limit": options.get("limit", 1),
-                        "returnOnlyUrls": False,
-                        "pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False},
-                    }
+                    "includePaths": includes,
+                    "excludePaths": excludes,
+                    # v1 request body: the v0-only "generateImgAltText" option is no longer sent.
+                    "limit": options.get("limit", 1),
+                    "scrapeOptions": {"onlyMainContent": only_main_content},
                 }
                 if options.get("max_depth"):
-                    params["crawlerOptions"]["maxDepth"] = options.get("max_depth")
+                    params["maxDepth"] = options.get("max_depth")
             job_id = firecrawl_app.crawl_url(url, params)
             website_crawl_time_cache_key = f"website_crawl_{job_id}"
             time = str(datetime.datetime.now().timestamp())
@@ -228,7 +222,7 @@ class WebsiteService:
             # decrypt api_key
             api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
             firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
-            params = {"pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False}}
+            params = {"onlyMainContent": only_main_content}
             result = firecrawl_app.scrape_url(url, params)
             return result
         else:
diff --git a/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py b/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py
index 8fcdf2e8e5..120ca9c8ea 100644
--- a/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py
+++ b/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py
@@ -10,19 +10,18 @@ def test_firecrawl_web_extractor_crawl_mode(mocker):
     base_url = "https://api.firecrawl.dev"
     firecrawl_app = FirecrawlApp(api_key=api_key, base_url=base_url)
     params = {
-        "crawlerOptions": {
-            "includes": [],
-            "excludes": [],
-            "generateImgAltText": True,
-            "maxDepth": 1,
-            "limit": 1,
-            "returnOnlyUrls": False,
-        }
+        "includes": [],
+        "excludes": [],
+        "generateImgAltText": True,
+        "maxDepth": 1,
+        "limit": 1,
     }
     mocked_firecrawl = {
-        "jobId": "test",
+        "id": "test",
     }
     mocker.patch("requests.post", return_value=_mock_response(mocked_firecrawl))
     job_id = firecrawl_app.crawl_url(url, params)
-    print(job_id)
+    print(f"job_id: {job_id}")
+
+    assert job_id is not None
     assert isinstance(job_id, str)