refactor(api): add null safety to extractor_processor and firecrawl (#35209)

Co-authored-by: tmimmanuel <redacted>
This commit is contained in:
tmimmanuel 2026-04-14 20:23:20 +02:00 committed by GitHub
parent e78558bc06
commit ef28a63ad3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 10 additions and 2 deletions

View File

@ -95,9 +95,9 @@ class ExtractProcessor:
) -> list[Document]:
if extract_setting.datasource_type == DatasourceType.FILE:
with tempfile.TemporaryDirectory() as temp_dir:
upload_file = extract_setting.upload_file
if not file_path:
assert extract_setting.upload_file is not None, "upload_file is required"
upload_file: UploadFile = extract_setting.upload_file
assert upload_file is not None, "upload_file is required"
suffix = Path(upload_file.key).suffix
# FIXME mypy: Cannot determine type of 'tempfile._get_candidate_names' better not use it here
file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}" # type: ignore
@ -113,6 +113,7 @@ class ExtractProcessor:
if file_extension in {".xlsx", ".xls"}:
extractor = ExcelExtractor(file_path)
elif file_extension == ".pdf":
assert upload_file is not None
extractor = PdfExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
elif file_extension in {".md", ".markdown", ".mdx"}:
extractor = (
@ -123,6 +124,7 @@ class ExtractProcessor:
elif file_extension in {".htm", ".html"}:
extractor = HtmlExtractor(file_path)
elif file_extension == ".docx":
assert upload_file is not None
extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
elif file_extension == ".doc":
extractor = UnstructuredWordExtractor(file_path, unstructured_api_url, unstructured_api_key)
@ -149,12 +151,14 @@ class ExtractProcessor:
if file_extension in {".xlsx", ".xls"}:
extractor = ExcelExtractor(file_path)
elif file_extension == ".pdf":
assert upload_file is not None
extractor = PdfExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
elif file_extension in {".md", ".markdown", ".mdx"}:
extractor = MarkdownExtractor(file_path, autodetect_encoding=True)
elif file_extension in {".htm", ".html"}:
extractor = HtmlExtractor(file_path)
elif file_extension == ".docx":
assert upload_file is not None
extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
elif file_extension == ".csv":
extractor = CSVExtractor(file_path, autodetect_encoding=True)

View File

@ -174,21 +174,25 @@ class FirecrawlApp:
return f"{self.base_url.rstrip('/')}/{path.lstrip('/')}"
def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5) -> httpx.Response:
response: httpx.Response | None = None
for attempt in range(retries):
response = httpx.post(url, headers=headers, json=data)
if response.status_code == 502:
time.sleep(backoff_factor * (2**attempt))
else:
return response
assert response is not None, "retries must be at least 1"
return response
def _get_request(self, url, headers, retries=3, backoff_factor=0.5) -> httpx.Response:
response: httpx.Response | None = None
for attempt in range(retries):
response = httpx.get(url, headers=headers)
if response.status_code == 502:
time.sleep(backoff_factor * (2**attempt))
else:
return response
assert response is not None, "retries must be at least 1"
return response
def _handle_error(self, response, action):