mirror of
https://github.com/langgenius/dify.git
synced 2026-04-16 02:16:57 +08:00
refactor(api): add null safety to extractor_processor and firecrawl (#35209)
Co-authored-by: tmimmanuel <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>
This commit is contained in:
parent
e78558bc06
commit
ef28a63ad3
@ -95,9 +95,9 @@ class ExtractProcessor:
|
||||
) -> list[Document]:
|
||||
if extract_setting.datasource_type == DatasourceType.FILE:
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
upload_file = extract_setting.upload_file
|
||||
if not file_path:
|
||||
assert extract_setting.upload_file is not None, "upload_file is required"
|
||||
upload_file: UploadFile = extract_setting.upload_file
|
||||
assert upload_file is not None, "upload_file is required"
|
||||
suffix = Path(upload_file.key).suffix
|
||||
# FIXME mypy: Cannot determine type of 'tempfile._get_candidate_names' better not use it here
|
||||
file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}" # type: ignore
|
||||
@ -113,6 +113,7 @@ class ExtractProcessor:
|
||||
if file_extension in {".xlsx", ".xls"}:
|
||||
extractor = ExcelExtractor(file_path)
|
||||
elif file_extension == ".pdf":
|
||||
assert upload_file is not None
|
||||
extractor = PdfExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
|
||||
elif file_extension in {".md", ".markdown", ".mdx"}:
|
||||
extractor = (
|
||||
@ -123,6 +124,7 @@ class ExtractProcessor:
|
||||
elif file_extension in {".htm", ".html"}:
|
||||
extractor = HtmlExtractor(file_path)
|
||||
elif file_extension == ".docx":
|
||||
assert upload_file is not None
|
||||
extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
|
||||
elif file_extension == ".doc":
|
||||
extractor = UnstructuredWordExtractor(file_path, unstructured_api_url, unstructured_api_key)
|
||||
@ -149,12 +151,14 @@ class ExtractProcessor:
|
||||
if file_extension in {".xlsx", ".xls"}:
|
||||
extractor = ExcelExtractor(file_path)
|
||||
elif file_extension == ".pdf":
|
||||
assert upload_file is not None
|
||||
extractor = PdfExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
|
||||
elif file_extension in {".md", ".markdown", ".mdx"}:
|
||||
extractor = MarkdownExtractor(file_path, autodetect_encoding=True)
|
||||
elif file_extension in {".htm", ".html"}:
|
||||
extractor = HtmlExtractor(file_path)
|
||||
elif file_extension == ".docx":
|
||||
assert upload_file is not None
|
||||
extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
|
||||
elif file_extension == ".csv":
|
||||
extractor = CSVExtractor(file_path, autodetect_encoding=True)
|
||||
|
||||
@ -174,21 +174,25 @@ class FirecrawlApp:
|
||||
return f"{self.base_url.rstrip('/')}/{path.lstrip('/')}"
|
||||
|
||||
def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5) -> httpx.Response:
|
||||
response: httpx.Response | None = None
|
||||
for attempt in range(retries):
|
||||
response = httpx.post(url, headers=headers, json=data)
|
||||
if response.status_code == 502:
|
||||
time.sleep(backoff_factor * (2**attempt))
|
||||
else:
|
||||
return response
|
||||
assert response is not None, "retries must be at least 1"
|
||||
return response
|
||||
|
||||
def _get_request(self, url, headers, retries=3, backoff_factor=0.5) -> httpx.Response:
|
||||
response: httpx.Response | None = None
|
||||
for attempt in range(retries):
|
||||
response = httpx.get(url, headers=headers)
|
||||
if response.status_code == 502:
|
||||
time.sleep(backoff_factor * (2**attempt))
|
||||
else:
|
||||
return response
|
||||
assert response is not None, "retries must be at least 1"
|
||||
return response
|
||||
|
||||
def _handle_error(self, response, action):
|
||||
|
||||
Loading…
Reference in New Issue
Block a user