mirror of https://github.com/langgenius/dify.git
refactor: Update Firecrawl to use v2 API (#24734)
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
parent c39dae06d4
commit a16ef7e73c
@@ -25,7 +25,7 @@ class FirecrawlApp:
         }
         if params:
             json_data.update(params)
-        response = self._post_request(f"{self.base_url}/v1/scrape", json_data, headers)
+        response = self._post_request(f"{self.base_url}/v2/scrape", json_data, headers)
         if response.status_code == 200:
             response_data = response.json()
             data = response_data["data"]
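For orientation, a minimal usage sketch of the updated scrape path (not part of the commit). The import path and parameter values are assumptions, and scrape_url is taken to return the v2 "data" payload, as the hunk suggests.

    from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp  # import path assumed

    app = FirecrawlApp(api_key="fc-...", base_url="https://api.firecrawl.dev")
    # scrape_url now POSTs to {base_url}/v2/scrape; caller params merge into the request body
    data = app.scrape_url("https://example.com", params={"onlyMainContent": True})
    print(data)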
@@ -42,7 +42,7 @@ class FirecrawlApp:
         json_data = {"url": url}
         if params:
             json_data.update(params)
-        response = self._post_request(f"{self.base_url}/v1/crawl", json_data, headers)
+        response = self._post_request(f"{self.base_url}/v2/crawl", json_data, headers)
         if response.status_code == 200:
             # There's also another two fields in the response: "success" (bool) and "url" (str)
             job_id = response.json().get("id")
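The crawl kickoff pairs with the status check updated two hunks below. A rough polling sketch, reusing the app instance from the sketch above and assuming check_crawl_status surfaces the v2 "status" field as shown there:

    import time

    job_id = app.crawl_url("https://example.com", params={"limit": 5})  # POST /v2/crawl
    while True:
        status = app.check_crawl_status(job_id)  # GET /v2/crawl/{job_id}
        if status.get("status") == "completed":
            break
        time.sleep(5)  # polling interval is arbitrary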
@@ -51,9 +51,25 @@ class FirecrawlApp:
             self._handle_error(response, "start crawl job")
             return ""  # unreachable
 
+    def map(self, url: str, params: dict[str, Any] | None = None) -> dict[str, Any]:
+        # Documentation: https://docs.firecrawl.dev/api-reference/endpoint/map
+        headers = self._prepare_headers()
+        json_data: dict[str, Any] = {"url": url, "integration": "dify"}
+        if params:
+            # Pass through provided params, including optional "sitemap": "only" | "include" | "skip"
+            json_data.update(params)
+        response = self._post_request(f"{self.base_url}/v2/map", json_data, headers)
+        if response.status_code == 200:
+            return cast(dict[str, Any], response.json())
+        elif response.status_code in {402, 409, 500, 429, 408}:
+            self._handle_error(response, "start map job")
+            return {}
+        else:
+            raise Exception(f"Failed to start map job. Status code: {response.status_code}")
+
     def check_crawl_status(self, job_id) -> dict[str, Any]:
         headers = self._prepare_headers()
-        response = self._get_request(f"{self.base_url}/v1/crawl/{job_id}", headers)
+        response = self._get_request(f"{self.base_url}/v2/crawl/{job_id}", headers)
         if response.status_code == 200:
             crawl_status_response = response.json()
             if crawl_status_response.get("status") == "completed":
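A usage sketch for the new map() helper. The "sitemap" values come from the comment in the hunk; the response shape (a "links" list) follows the Firecrawl map docs referenced in the method, but treat it as an assumption:

    result = app.map("https://example.com", params={"sitemap": "include"})
    for link in result.get("links", []):  # response shape assumed from the v2 map docs
        print(link)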
@@ -135,12 +151,16 @@ class FirecrawlApp:
             "lang": "en",
             "country": "us",
             "timeout": 60000,
-            "ignoreInvalidURLs": False,
+            "ignoreInvalidURLs": True,
             "scrapeOptions": {},
+            "sources": [
+                {"type": "web"},
+            ],
+            "integration": "dify",
         }
         if params:
             json_data.update(params)
-        response = self._post_request(f"{self.base_url}/v1/search", json_data, headers)
+        response = self._post_request(f"{self.base_url}/v2/search", json_data, headers)
         if response.status_code == 200:
             response_data = response.json()
             if not response_data.get("success"):
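Reconstructed from the hunk above, the default body that search() now sends to /v2/search looks roughly like this; the query key is supplied by the caller and shown as a placeholder, and caller params still override these defaults:

    json_data = {
        "query": "<caller-supplied query>",  # placeholder
        "lang": "en",
        "country": "us",
        "timeout": 60000,
        "ignoreInvalidURLs": True,  # was False under v1
        "scrapeOptions": {},
        "sources": [{"type": "web"}],  # new: selects web results in v2 search
        "integration": "dify",  # new: identifies the caller to Firecrawl
    }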
@@ -23,6 +23,7 @@ class CrawlOptions:
     only_main_content: bool = False
     includes: str | None = None
    excludes: str | None = None
+    prompt: str | None = None
     max_depth: int | None = None
     use_sitemap: bool = True
 
@@ -70,6 +71,7 @@ class WebsiteCrawlApiRequest:
             only_main_content=self.options.get("only_main_content", False),
             includes=self.options.get("includes"),
             excludes=self.options.get("excludes"),
+            prompt=self.options.get("prompt"),
             max_depth=self.options.get("max_depth"),
             use_sitemap=self.options.get("use_sitemap", True),
         )
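The parsing hunk above reads the new key straight from the request's options dict, so a payload like the following now carries the prompt through (illustrative values; the other keys mirror the hunk):

    options = {
        "only_main_content": True,
        "includes": None,
        "excludes": None,
        "prompt": "Focus on documentation pages",  # new in this commit
        "max_depth": None,
        "use_sitemap": True,
    }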
@@ -174,6 +176,7 @@ class WebsiteService:
     def _crawl_with_firecrawl(cls, request: CrawlRequest, api_key: str, config: dict) -> dict[str, Any]:
         firecrawl_app = FirecrawlApp(api_key=api_key, base_url=config.get("base_url"))
 
+        params: dict[str, Any]
         if not request.options.crawl_sub_pages:
             params = {
                 "includePaths": [],
@@ -188,8 +191,10 @@ class WebsiteService:
                 "limit": request.options.limit,
                 "scrapeOptions": {"onlyMainContent": request.options.only_main_content},
             }
-            if request.options.max_depth:
-                params["maxDepth"] = request.options.max_depth
+            # Add optional prompt for Firecrawl v2 crawl-params compatibility
+            if request.options.prompt:
+                params["prompt"] = request.options.prompt
+
 
         job_id = firecrawl_app.crawl_url(request.url, params)
         website_crawl_time_cache_key = f"website_crawl_{job_id}"
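Putting the service-layer hunks together: with sub-page crawling and a prompt set, the params dict handed to crawl_url ends up roughly like this (keys reconstructed from the visible context lines; values illustrative):

    params = {
        "limit": 10,
        "scrapeOptions": {"onlyMainContent": True},
        "prompt": "Only crawl the docs section",  # forwarded to POST {base_url}/v2/crawl
    }
    job_id = firecrawl_app.crawl_url("https://example.com", params)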