From 71fa14f7915ce39bb2f8b51d6aac993b602a1c8a Mon Sep 17 00:00:00 2001 From: le0zh Date: Wed, 22 Jan 2025 15:18:23 +0800 Subject: [PATCH 1/8] fix: resolve clipboard.writeText failure under HTTP protocol (#12936) --- web/app/components/develop/code.tsx | 3 ++- web/utils/clipboard.ts | 35 +++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 web/utils/clipboard.ts diff --git a/web/app/components/develop/code.tsx b/web/app/components/develop/code.tsx index c1fbaa1cf8..7716cd4c93 100644 --- a/web/app/components/develop/code.tsx +++ b/web/app/components/develop/code.tsx @@ -10,6 +10,7 @@ import { import { Tab } from '@headlessui/react' import { Tag } from './tag' import classNames from '@/utils/classnames' +import { writeTextToClipboard } from '@/utils/clipboard' const languageNames = { js: 'JavaScript', @@ -71,7 +72,7 @@ function CopyButton({ code }: { code: string }) { : 'bg-white/5 hover:bg-white/7.5 dark:bg-white/2.5 dark:hover:bg-white/5', )} onClick={() => { - window.navigator.clipboard.writeText(code).then(() => { + writeTextToClipboard(code).then(() => { setCopyCount(count => count + 1) }) }} diff --git a/web/utils/clipboard.ts b/web/utils/clipboard.ts new file mode 100644 index 0000000000..8e7a4495b3 --- /dev/null +++ b/web/utils/clipboard.ts @@ -0,0 +1,35 @@ +export async function writeTextToClipboard(text: string): Promise { + if (navigator.clipboard && navigator.clipboard.writeText) + return navigator.clipboard.writeText(text) + + return fallbackCopyTextToClipboard(text) +} + +async function fallbackCopyTextToClipboard(text: string): Promise { + const textArea = document.createElement('textarea') + textArea.value = text + textArea.style.position = 'fixed' // Avoid scrolling to bottom + document.body.appendChild(textArea) + textArea.focus() + textArea.select() + try { + const successful = document.execCommand('copy') + if (successful) + return Promise.resolve() + + return Promise.reject(new Error('document.execCommand failed')) + } + catch (err) { + return Promise.reject(convertAnyToError(err)) + } + finally { + document.body.removeChild(textArea) + } +} + +function convertAnyToError(err: any): Error { + if (err instanceof Error) + return err + + return new Error(`Caught: ${String(err)}`) +} From d167d5b1be67d0c32e808fc975d2dd1007a21f45 Mon Sep 17 00:00:00 2001 From: sino Date: Wed, 22 Jan 2025 15:25:57 +0800 Subject: [PATCH 2/8] feat(ark): support doubao 1.5 series of models (#12935) --- .../volcengine_maas/llm/models.py | 57 +++++++++++++------ .../volcengine_maas/volcengine_maas.yaml | 24 ++++++++ 2 files changed, 63 insertions(+), 18 deletions(-) diff --git a/api/core/model_runtime/model_providers/volcengine_maas/llm/models.py b/api/core/model_runtime/model_providers/volcengine_maas/llm/models.py index 7c37368086..94315cd026 100644 --- a/api/core/model_runtime/model_providers/volcengine_maas/llm/models.py +++ b/api/core/model_runtime/model_providers/volcengine_maas/llm/models.py @@ -18,72 +18,93 @@ class ModelConfig(BaseModel): configs: dict[str, ModelConfig] = { + "Doubao-1.5-vision-pro-32k": ModelConfig( + properties=ModelProperties(context_size=32768, max_tokens=12288, mode=LLMMode.CHAT), + features=[ModelFeature.AGENT_THOUGHT, ModelFeature.VISION], + ), + "Doubao-1.5-pro-32k": ModelConfig( + properties=ModelProperties(context_size=32768, max_tokens=12288, mode=LLMMode.CHAT), + features=[ModelFeature.AGENT_THOUGHT], + ), + "Doubao-1.5-lite-32k": ModelConfig( + properties=ModelProperties(context_size=32768, max_tokens=12288, mode=LLMMode.CHAT), + features=[ModelFeature.AGENT_THOUGHT], + ), + "Doubao-1.5-pro-256k": ModelConfig( + properties=ModelProperties(context_size=262144, max_tokens=12288, mode=LLMMode.CHAT), + features=[ModelFeature.AGENT_THOUGHT], + ), "Doubao-vision-pro-32k": ModelConfig( properties=ModelProperties(context_size=32768, max_tokens=4096, mode=LLMMode.CHAT), - features=[ModelFeature.VISION], + features=[ModelFeature.AGENT_THOUGHT, ModelFeature.VISION], ), "Doubao-vision-lite-32k": ModelConfig( properties=ModelProperties(context_size=32768, max_tokens=4096, mode=LLMMode.CHAT), - features=[ModelFeature.VISION], + features=[ModelFeature.AGENT_THOUGHT, ModelFeature.VISION], ), "Doubao-pro-4k": ModelConfig( properties=ModelProperties(context_size=4096, max_tokens=4096, mode=LLMMode.CHAT), - features=[ModelFeature.TOOL_CALL], + features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL], ), "Doubao-lite-4k": ModelConfig( properties=ModelProperties(context_size=4096, max_tokens=4096, mode=LLMMode.CHAT), - features=[ModelFeature.TOOL_CALL], + features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL], ), "Doubao-pro-32k": ModelConfig( properties=ModelProperties(context_size=32768, max_tokens=4096, mode=LLMMode.CHAT), - features=[ModelFeature.TOOL_CALL], + features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL], ), "Doubao-lite-32k": ModelConfig( properties=ModelProperties(context_size=32768, max_tokens=4096, mode=LLMMode.CHAT), - features=[ModelFeature.TOOL_CALL], + features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL], ), "Doubao-pro-256k": ModelConfig( properties=ModelProperties(context_size=262144, max_tokens=4096, mode=LLMMode.CHAT), - features=[], + features=[ModelFeature.AGENT_THOUGHT], ), "Doubao-pro-128k": ModelConfig( properties=ModelProperties(context_size=131072, max_tokens=4096, mode=LLMMode.CHAT), - features=[ModelFeature.TOOL_CALL], + features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL], ), "Doubao-lite-128k": ModelConfig( - properties=ModelProperties(context_size=131072, max_tokens=4096, mode=LLMMode.CHAT), features=[] + properties=ModelProperties(context_size=131072, max_tokens=4096, mode=LLMMode.CHAT), + features=[ModelFeature.AGENT_THOUGHT], ), "Skylark2-pro-4k": ModelConfig( - properties=ModelProperties(context_size=4096, max_tokens=4096, mode=LLMMode.CHAT), features=[] + properties=ModelProperties(context_size=4096, max_tokens=4096, mode=LLMMode.CHAT), + features=[ModelFeature.AGENT_THOUGHT], ), "Llama3-8B": ModelConfig( - properties=ModelProperties(context_size=8192, max_tokens=8192, mode=LLMMode.CHAT), features=[] + properties=ModelProperties(context_size=8192, max_tokens=8192, mode=LLMMode.CHAT), + features=[ModelFeature.AGENT_THOUGHT], ), "Llama3-70B": ModelConfig( - properties=ModelProperties(context_size=8192, max_tokens=8192, mode=LLMMode.CHAT), features=[] + properties=ModelProperties(context_size=8192, max_tokens=8192, mode=LLMMode.CHAT), + features=[ModelFeature.AGENT_THOUGHT], ), "Moonshot-v1-8k": ModelConfig( properties=ModelProperties(context_size=8192, max_tokens=4096, mode=LLMMode.CHAT), - features=[ModelFeature.TOOL_CALL], + features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL], ), "Moonshot-v1-32k": ModelConfig( properties=ModelProperties(context_size=32768, max_tokens=16384, mode=LLMMode.CHAT), - features=[ModelFeature.TOOL_CALL], + features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL], ), "Moonshot-v1-128k": ModelConfig( properties=ModelProperties(context_size=131072, max_tokens=65536, mode=LLMMode.CHAT), - features=[ModelFeature.TOOL_CALL], + features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL], ), "GLM3-130B": ModelConfig( properties=ModelProperties(context_size=8192, max_tokens=4096, mode=LLMMode.CHAT), - features=[ModelFeature.TOOL_CALL], + features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL], ), "GLM3-130B-Fin": ModelConfig( properties=ModelProperties(context_size=8192, max_tokens=4096, mode=LLMMode.CHAT), - features=[ModelFeature.TOOL_CALL], + features=[ModelFeature.AGENT_THOUGHT, ModelFeature.TOOL_CALL], ), "Mistral-7B": ModelConfig( - properties=ModelProperties(context_size=8192, max_tokens=2048, mode=LLMMode.CHAT), features=[] + properties=ModelProperties(context_size=8192, max_tokens=2048, mode=LLMMode.CHAT), + features=[ModelFeature.AGENT_THOUGHT], ), } diff --git a/api/core/model_runtime/model_providers/volcengine_maas/volcengine_maas.yaml b/api/core/model_runtime/model_providers/volcengine_maas/volcengine_maas.yaml index 2ddb612546..976cd26ac9 100644 --- a/api/core/model_runtime/model_providers/volcengine_maas/volcengine_maas.yaml +++ b/api/core/model_runtime/model_providers/volcengine_maas/volcengine_maas.yaml @@ -118,6 +118,30 @@ model_credential_schema: type: select required: true options: + - label: + en_US: Doubao-1.5-vision-pro-32k + value: Doubao-1.5-vision-pro-32k + show_on: + - variable: __model_type + value: llm + - label: + en_US: Doubao-1.5-pro-32k + value: Doubao-1.5-pro-32k + show_on: + - variable: __model_type + value: llm + - label: + en_US: Doubao-1.5-lite-32k + value: Doubao-1.5-lite-32k + show_on: + - variable: __model_type + value: llm + - label: + en_US: Doubao-1.5-pro-256k + value: Doubao-1.5-pro-256k + show_on: + - variable: __model_type + value: llm - label: en_US: Doubao-vision-pro-32k value: Doubao-vision-pro-32k From 1e73f63ff8b6181963244cf72f3841ab70de11a4 Mon Sep 17 00:00:00 2001 From: -LAN- Date: Wed, 22 Jan 2025 16:40:44 +0800 Subject: [PATCH 3/8] chore: update version to 0.15.2 in packaging and docker configurations (#12940) Signed-off-by: -LAN- --- api/configs/packaging/__init__.py | 2 +- docker-legacy/docker-compose.yaml | 6 +++--- docker/docker-compose-template.yaml | 6 +++--- docker/docker-compose.yaml | 6 +++--- web/package.json | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/api/configs/packaging/__init__.py b/api/configs/packaging/__init__.py index a54c5bf5ee..20c1f58c99 100644 --- a/api/configs/packaging/__init__.py +++ b/api/configs/packaging/__init__.py @@ -9,7 +9,7 @@ class PackagingInfo(BaseSettings): CURRENT_VERSION: str = Field( description="Dify version", - default="0.15.1", + default="0.15.2", ) COMMIT_SHA: str = Field( diff --git a/docker-legacy/docker-compose.yaml b/docker-legacy/docker-compose.yaml index 6e4c8a748e..d2b6689453 100644 --- a/docker-legacy/docker-compose.yaml +++ b/docker-legacy/docker-compose.yaml @@ -2,7 +2,7 @@ version: '3' services: # API service api: - image: langgenius/dify-api:0.15.1 + image: langgenius/dify-api:0.15.2 restart: always environment: # Startup mode, 'api' starts the API server. @@ -227,7 +227,7 @@ services: # worker service # The Celery worker for processing the queue. worker: - image: langgenius/dify-api:0.15.1 + image: langgenius/dify-api:0.15.2 restart: always environment: CONSOLE_WEB_URL: '' @@ -397,7 +397,7 @@ services: # Frontend web application. web: - image: langgenius/dify-web:0.15.1 + image: langgenius/dify-web:0.15.2 restart: always environment: # The base URL of console application api server, refers to the Console base URL of WEB service if console domain is diff --git a/docker/docker-compose-template.yaml b/docker/docker-compose-template.yaml index d24e7c181f..8aafc61888 100644 --- a/docker/docker-compose-template.yaml +++ b/docker/docker-compose-template.yaml @@ -2,7 +2,7 @@ x-shared-env: &shared-api-worker-env services: # API service api: - image: langgenius/dify-api:0.15.1 + image: langgenius/dify-api:0.15.2 restart: always environment: # Use the shared environment variables. @@ -25,7 +25,7 @@ services: # worker service # The Celery worker for processing the queue. worker: - image: langgenius/dify-api:0.15.1 + image: langgenius/dify-api:0.15.2 restart: always environment: # Use the shared environment variables. @@ -47,7 +47,7 @@ services: # Frontend web application. web: - image: langgenius/dify-web:0.15.1 + image: langgenius/dify-web:0.15.2 restart: always environment: CONSOLE_API_URL: ${CONSOLE_API_URL:-} diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 21e72a4cd6..a11ec261f3 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -393,7 +393,7 @@ x-shared-env: &shared-api-worker-env services: # API service api: - image: langgenius/dify-api:0.15.1 + image: langgenius/dify-api:0.15.2 restart: always environment: # Use the shared environment variables. @@ -416,7 +416,7 @@ services: # worker service # The Celery worker for processing the queue. worker: - image: langgenius/dify-api:0.15.1 + image: langgenius/dify-api:0.15.2 restart: always environment: # Use the shared environment variables. @@ -438,7 +438,7 @@ services: # Frontend web application. web: - image: langgenius/dify-web:0.15.1 + image: langgenius/dify-web:0.15.2 restart: always environment: CONSOLE_API_URL: ${CONSOLE_API_URL:-} diff --git a/web/package.json b/web/package.json index 879b87c596..6ae11d71b4 100644 --- a/web/package.json +++ b/web/package.json @@ -1,6 +1,6 @@ { "name": "dify-web", - "version": "0.15.1", + "version": "0.15.2", "private": true, "engines": { "node": ">=18.17.0" From 4c3076f2a474fd66233a1a0f9fe0331423aea546 Mon Sep 17 00:00:00 2001 From: huangzhuo1949 <167434202+huangzhuo1949@users.noreply.github.com> Date: Wed, 22 Jan 2025 17:07:18 +0800 Subject: [PATCH 4/8] feat: add pg vector index (#12338) Co-authored-by: huangzhuo --- api/core/rag/datasource/vdb/pgvector/pgvector.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/api/core/rag/datasource/vdb/pgvector/pgvector.py b/api/core/rag/datasource/vdb/pgvector/pgvector.py index de443ba580..c8a1e4f90c 100644 --- a/api/core/rag/datasource/vdb/pgvector/pgvector.py +++ b/api/core/rag/datasource/vdb/pgvector/pgvector.py @@ -57,6 +57,11 @@ CREATE TABLE IF NOT EXISTS {table_name} ( ) using heap; """ +SQL_CREATE_INDEX = """ +CREATE INDEX IF NOT EXISTS embedding_cosine_v1_idx ON {table_name} +USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 64); +""" + class PGVector(BaseVector): def __init__(self, collection_name: str, config: PGVectorConfig): @@ -205,7 +210,10 @@ class PGVector(BaseVector): with self._get_cursor() as cur: cur.execute("CREATE EXTENSION IF NOT EXISTS vector") cur.execute(SQL_CREATE_TABLE.format(table_name=self.table_name, dimension=dimension)) - # TODO: create index https://github.com/pgvector/pgvector?tab=readme-ov-file#indexing + # PG hnsw index only support 2000 dimension or less + # ref: https://github.com/pgvector/pgvector?tab=readme-ov-file#indexing + if dimension <= 2000: + cur.execute(SQL_CREATE_INDEX.format(table_name=self.table_name)) redis_client.set(collection_exist_cache_key, 1, ex=3600) From dd0904f95c97dab8404f71e06212ee182db9c554 Mon Sep 17 00:00:00 2001 From: jiandanfeng Date: Wed, 22 Jan 2025 19:26:25 +0800 Subject: [PATCH 5/8] feat: add giteeAI risk control identification. (#12946) --- .../builtin/gitee_ai/tools/risk-control.py | 26 +++++++++++++++ .../builtin/gitee_ai/tools/risk-control.yaml | 32 +++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 api/core/tools/provider/builtin/gitee_ai/tools/risk-control.py create mode 100644 api/core/tools/provider/builtin/gitee_ai/tools/risk-control.yaml diff --git a/api/core/tools/provider/builtin/gitee_ai/tools/risk-control.py b/api/core/tools/provider/builtin/gitee_ai/tools/risk-control.py new file mode 100644 index 0000000000..e3558ce699 --- /dev/null +++ b/api/core/tools/provider/builtin/gitee_ai/tools/risk-control.py @@ -0,0 +1,26 @@ +from typing import Any, Union + +import requests + +from core.tools.entities.tool_entities import ToolInvokeMessage +from core.tools.tool.builtin_tool import BuiltinTool + + +class GiteeAIToolRiskControl(BuiltinTool): + def _invoke( + self, user_id: str, tool_parameters: dict[str, Any] + ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]: + headers = { + "content-type": "application/json", + "authorization": f"Bearer {self.runtime.credentials['api_key']}", + } + + inputs = [{"type": "text", "text": tool_parameters.get("input-text")}] + model = tool_parameters.get("model", "Security-semantic-filtering") + payload = {"model": model, "input": inputs} + url = "https://ai.gitee.com/v1/moderations" + response = requests.post(url, json=payload, headers=headers) + if response.status_code != 200: + return self.create_text_message(f"Got Error Response:{response.text}") + + return [self.create_text_message(response.content.decode("utf-8"))] diff --git a/api/core/tools/provider/builtin/gitee_ai/tools/risk-control.yaml b/api/core/tools/provider/builtin/gitee_ai/tools/risk-control.yaml new file mode 100644 index 0000000000..6e7229dc1c --- /dev/null +++ b/api/core/tools/provider/builtin/gitee_ai/tools/risk-control.yaml @@ -0,0 +1,32 @@ +identity: + name: risk control + author: gitee_ai + label: + en_US: risk control identification + zh_Hans: 风控识别 + icon: icon.svg +description: + human: + en_US: Ensuring the protection and compliance of sensitive information through the filtering and analysis of data semantics + zh_Hans: 通过对数据语义的过滤和分析,确保敏感信息的保护和合规性 + llm: This tool is used to risk control identification. +parameters: + - name: model + type: string + required: true + default: Security-semantic-filtering + label: + en_US: Service Model + zh_Hans: 服务模型 + form: form + - name: input-text + type: string + required: true + label: + en_US: Input Text + zh_Hans: 输入文本 + human_description: + en_US: The text input for filtering and analysis. + zh_Hans: 用于分析过滤的文本 + llm_description: The text input for filtering and analysis. + form: llm From fd4afe09f877255d672ab20113471e52acc73fe1 Mon Sep 17 00:00:00 2001 From: Jhvcc <37662342+Jhvcc@users.noreply.github.com> Date: Wed, 22 Jan 2025 19:27:02 +0800 Subject: [PATCH 6/8] fix: tools translate search (#12950) Co-authored-by: lowell --- web/app/components/tools/provider-list.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/app/components/tools/provider-list.tsx b/web/app/components/tools/provider-list.tsx index 6f17835589..73c7363641 100644 --- a/web/app/components/tools/provider-list.tsx +++ b/web/app/components/tools/provider-list.tsx @@ -46,7 +46,7 @@ const ProviderList = () => { if (tagFilterValue.length > 0 && (!collection.labels || collection.labels.every(label => !tagFilterValue.includes(label)))) return false if (keywords) - return collection.name.toLowerCase().includes(keywords.toLowerCase()) + return Object.values(collection.label).some(value => value.toLowerCase().includes(keywords.toLowerCase())) return true }) }, [activeTab, tagFilterValue, keywords, collectionList]) From f565f08aa0793eaa75adbf9392d1ecd68dc53bfc Mon Sep 17 00:00:00 2001 From: Joel Date: Thu, 23 Jan 2025 11:02:29 +0800 Subject: [PATCH 7/8] fix: get property of string type variable caused page crash (#12969) --- .../workflow/nodes/_base/components/variable/utils.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/web/app/components/workflow/nodes/_base/components/variable/utils.ts b/web/app/components/workflow/nodes/_base/components/variable/utils.ts index 715ad1c7b1..24c2b73fed 100644 --- a/web/app/components/workflow/nodes/_base/components/variable/utils.ts +++ b/web/app/components/workflow/nodes/_base/components/variable/utils.ts @@ -546,7 +546,9 @@ export const getVarType = ({ else { (valueSelector as ValueSelector).slice(1).forEach((key, i) => { const isLast = i === valueSelector.length - 2 - curr = curr?.find((v: any) => v.variable === key) + if (Array.isArray(curr)) + curr = curr?.find((v: any) => v.variable === key) + if (isLast) { type = curr?.type } From 6024d8a42d1a39d3e0d6b81df806e3d624839519 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adem=C3=ADlson=20Tonato?= Date: Thu, 23 Jan 2025 03:14:48 +0000 Subject: [PATCH 8/8] refactor: Update Firecrawl to use v1 API (#12574) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Ademílson Tonato --- .../rag/extractor/firecrawl/firecrawl_app.py | 105 +++++++++--------- api/services/auth/firecrawl/firecrawl.py | 8 +- api/services/website_service.py | 30 ++--- .../rag/extractor/firecrawl/test_firecrawl.py | 19 ++-- 4 files changed, 81 insertions(+), 81 deletions(-) diff --git a/api/core/rag/extractor/firecrawl/firecrawl_app.py b/api/core/rag/extractor/firecrawl/firecrawl_app.py index eac08aeb8b..836a1398bf 100644 --- a/api/core/rag/extractor/firecrawl/firecrawl_app.py +++ b/api/core/rag/extractor/firecrawl/firecrawl_app.py @@ -1,6 +1,6 @@ import json import time -from typing import cast +from typing import Any, cast import requests @@ -14,48 +14,47 @@ class FirecrawlApp: if self.api_key is None and self.base_url == "https://api.firecrawl.dev": raise ValueError("No API key provided") - def scrape_url(self, url, params=None) -> dict: - headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} - json_data = {"url": url} + def scrape_url(self, url, params=None) -> dict[str, Any]: + # Documentation: https://docs.firecrawl.dev/api-reference/endpoint/scrape + headers = self._prepare_headers() + json_data = { + "url": url, + "formats": ["markdown"], + "onlyMainContent": True, + "timeout": 30000, + } if params: json_data.update(params) - response = requests.post(f"{self.base_url}/v0/scrape", headers=headers, json=json_data) + response = self._post_request(f"{self.base_url}/v1/scrape", json_data, headers) if response.status_code == 200: response_data = response.json() - if response_data["success"] == True: - data = response_data["data"] - return { - "title": data.get("metadata").get("title"), - "description": data.get("metadata").get("description"), - "source_url": data.get("metadata").get("sourceURL"), - "markdown": data.get("markdown"), - } - else: - raise Exception(f"Failed to scrape URL. Error: {response_data['error']}") - - elif response.status_code in {402, 409, 500}: - error_message = response.json().get("error", "Unknown error occurred") - raise Exception(f"Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}") + data = response_data["data"] + return self._extract_common_fields(data) + elif response.status_code in {402, 409, 500, 429, 408}: + self._handle_error(response, "scrape URL") + return {} # Avoid additional exception after handling error else: raise Exception(f"Failed to scrape URL. Status code: {response.status_code}") def crawl_url(self, url, params=None) -> str: + # Documentation: https://docs.firecrawl.dev/api-reference/endpoint/crawl-post headers = self._prepare_headers() json_data = {"url": url} if params: json_data.update(params) - response = self._post_request(f"{self.base_url}/v0/crawl", json_data, headers) + response = self._post_request(f"{self.base_url}/v1/crawl", json_data, headers) if response.status_code == 200: - job_id = response.json().get("jobId") + # There's also another two fields in the response: "success" (bool) and "url" (str) + job_id = response.json().get("id") return cast(str, job_id) else: self._handle_error(response, "start crawl job") # FIXME: unreachable code for mypy return "" # unreachable - def check_crawl_status(self, job_id) -> dict: + def check_crawl_status(self, job_id) -> dict[str, Any]: headers = self._prepare_headers() - response = self._get_request(f"{self.base_url}/v0/crawl/status/{job_id}", headers) + response = self._get_request(f"{self.base_url}/v1/crawl/{job_id}", headers) if response.status_code == 200: crawl_status_response = response.json() if crawl_status_response.get("status") == "completed": @@ -66,42 +65,48 @@ class FirecrawlApp: url_data_list = [] for item in data: if isinstance(item, dict) and "metadata" in item and "markdown" in item: - url_data = { - "title": item.get("metadata", {}).get("title"), - "description": item.get("metadata", {}).get("description"), - "source_url": item.get("metadata", {}).get("sourceURL"), - "markdown": item.get("markdown"), - } + url_data = self._extract_common_fields(item) url_data_list.append(url_data) if url_data_list: file_key = "website_files/" + job_id + ".txt" - if storage.exists(file_key): - storage.delete(file_key) - storage.save(file_key, json.dumps(url_data_list).encode("utf-8")) - return { - "status": "completed", - "total": crawl_status_response.get("total"), - "current": crawl_status_response.get("current"), - "data": url_data_list, - } - + try: + if storage.exists(file_key): + storage.delete(file_key) + storage.save(file_key, json.dumps(url_data_list).encode("utf-8")) + except Exception as e: + raise Exception(f"Error saving crawl data: {e}") + return self._format_crawl_status_response("completed", crawl_status_response, url_data_list) else: - return { - "status": crawl_status_response.get("status"), - "total": crawl_status_response.get("total"), - "current": crawl_status_response.get("current"), - "data": [], - } - + return self._format_crawl_status_response( + crawl_status_response.get("status"), crawl_status_response, [] + ) else: self._handle_error(response, "check crawl status") # FIXME: unreachable code for mypy return {} # unreachable - def _prepare_headers(self): + def _format_crawl_status_response( + self, status: str, crawl_status_response: dict[str, Any], url_data_list: list[dict[str, Any]] + ) -> dict[str, Any]: + return { + "status": status, + "total": crawl_status_response.get("total"), + "current": crawl_status_response.get("completed"), + "data": url_data_list, + } + + def _extract_common_fields(self, item: dict[str, Any]) -> dict[str, Any]: + return { + "title": item.get("metadata", {}).get("title"), + "description": item.get("metadata", {}).get("description"), + "source_url": item.get("metadata", {}).get("sourceURL"), + "markdown": item.get("markdown"), + } + + def _prepare_headers(self) -> dict[str, Any]: return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} - def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5): + def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5) -> requests.Response: for attempt in range(retries): response = requests.post(url, headers=headers, json=data) if response.status_code == 502: @@ -110,7 +115,7 @@ class FirecrawlApp: return response return response - def _get_request(self, url, headers, retries=3, backoff_factor=0.5): + def _get_request(self, url, headers, retries=3, backoff_factor=0.5) -> requests.Response: for attempt in range(retries): response = requests.get(url, headers=headers) if response.status_code == 502: @@ -119,6 +124,6 @@ class FirecrawlApp: return response return response - def _handle_error(self, response, action): + def _handle_error(self, response, action) -> None: error_message = response.json().get("error", "Unknown error occurred") raise Exception(f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}") diff --git a/api/services/auth/firecrawl/firecrawl.py b/api/services/auth/firecrawl/firecrawl.py index 50e4edff14..cc6eaaa42a 100644 --- a/api/services/auth/firecrawl/firecrawl.py +++ b/api/services/auth/firecrawl/firecrawl.py @@ -21,10 +21,12 @@ class FirecrawlAuth(ApiKeyAuthBase): headers = self._prepare_headers() options = { "url": "https://example.com", - "crawlerOptions": {"excludes": [], "includes": [], "limit": 1}, - "pageOptions": {"onlyMainContent": True}, + "excludes": [], + "includes": [], + "limit": 1, + "scrapeOptions": {"onlyMainContent": True}, } - response = self._post_request(f"{self.base_url}/v0/crawl", options, headers) + response = self._post_request(f"{self.base_url}/v1/crawl", options, headers) if response.status_code == 200: return True else: diff --git a/api/services/website_service.py b/api/services/website_service.py index 1ad7d0399d..b30e2205f7 100644 --- a/api/services/website_service.py +++ b/api/services/website_service.py @@ -38,30 +38,24 @@ class WebsiteService: only_main_content = options.get("only_main_content", False) if not crawl_sub_pages: params = { - "crawlerOptions": { - "includes": [], - "excludes": [], - "generateImgAltText": True, - "limit": 1, - "returnOnlyUrls": False, - "pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False}, - } + "includes": [], + "excludes": [], + "generateImgAltText": True, + "limit": 1, + "scrapeOptions": {"onlyMainContent": only_main_content}, } else: includes = options.get("includes").split(",") if options.get("includes") else [] excludes = options.get("excludes").split(",") if options.get("excludes") else [] params = { - "crawlerOptions": { - "includes": includes, - "excludes": excludes, - "generateImgAltText": True, - "limit": options.get("limit", 1), - "returnOnlyUrls": False, - "pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False}, - } + "includes": includes, + "excludes": excludes, + "generateImgAltText": True, + "limit": options.get("limit", 1), + "scrapeOptions": {"onlyMainContent": only_main_content}, } if options.get("max_depth"): - params["crawlerOptions"]["maxDepth"] = options.get("max_depth") + params["maxDepth"] = options.get("max_depth") job_id = firecrawl_app.crawl_url(url, params) website_crawl_time_cache_key = f"website_crawl_{job_id}" time = str(datetime.datetime.now().timestamp()) @@ -228,7 +222,7 @@ class WebsiteService: # decrypt api_key api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key")) firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None)) - params = {"pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False}} + params = {"onlyMainContent": only_main_content} result = firecrawl_app.scrape_url(url, params) return result else: diff --git a/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py b/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py index 8fcdf2e8e5..120ca9c8ea 100644 --- a/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py +++ b/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py @@ -10,19 +10,18 @@ def test_firecrawl_web_extractor_crawl_mode(mocker): base_url = "https://api.firecrawl.dev" firecrawl_app = FirecrawlApp(api_key=api_key, base_url=base_url) params = { - "crawlerOptions": { - "includes": [], - "excludes": [], - "generateImgAltText": True, - "maxDepth": 1, - "limit": 1, - "returnOnlyUrls": False, - } + "includes": [], + "excludes": [], + "generateImgAltText": True, + "maxDepth": 1, + "limit": 1, } mocked_firecrawl = { - "jobId": "test", + "id": "test", } mocker.patch("requests.post", return_value=_mock_response(mocked_firecrawl)) job_id = firecrawl_app.crawl_url(url, params) - print(job_id) + print(f"job_id: {job_id}") + + assert job_id is not None assert isinstance(job_id, str)