refactor(api): add null safety to extractor_processor and firecrawl (#35209)

Co-authored-by: tmimmanuel <REDACTED>
2026-05-13 08:57:28 +08:00 · 2026-04-14 20:23:20 +02:00 · 2026-04-14 20:23:20 +02:00 · ef28a63ad3
commit ef28a63ad3
parent e78558bc06
2 changed files with 10 additions and 2 deletions
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
@@ -95,9 +95,9 @@ class ExtractProcessor:
    ) -> list[Document]:
        if extract_setting.datasource_type == DatasourceType.FILE:
            with tempfile.TemporaryDirectory() as temp_dir:
+                upload_file = extract_setting.upload_file
                if not file_path:
-                    assert extract_setting.upload_file is not None, "upload_file is required"
-                    upload_file: UploadFile = extract_setting.upload_file
+                    assert upload_file is not None, "upload_file is required"
                    suffix = Path(upload_file.key).suffix
                    # FIXME mypy: Cannot determine type of 'tempfile._get_candidate_names' better not use it here
                    file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"  # type: ignore
@@ -113,6 +113,7 @@ class ExtractProcessor:
                    if file_extension in {".xlsx", ".xls"}:
                        extractor = ExcelExtractor(file_path)
                    elif file_extension == ".pdf":
+                        assert upload_file is not None
                        extractor = PdfExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
                    elif file_extension in {".md", ".markdown", ".mdx"}:
                        extractor = (
@@ -123,6 +124,7 @@ class ExtractProcessor:
                    elif file_extension in {".htm", ".html"}:
                        extractor = HtmlExtractor(file_path)
                    elif file_extension == ".docx":
+                        assert upload_file is not None
                        extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
                    elif file_extension == ".doc":
                        extractor = UnstructuredWordExtractor(file_path, unstructured_api_url, unstructured_api_key)
@@ -149,12 +151,14 @@ class ExtractProcessor:
                    if file_extension in {".xlsx", ".xls"}:
                        extractor = ExcelExtractor(file_path)
                    elif file_extension == ".pdf":
+                        assert upload_file is not None
                        extractor = PdfExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
                    elif file_extension in {".md", ".markdown", ".mdx"}:
                        extractor = MarkdownExtractor(file_path, autodetect_encoding=True)
                    elif file_extension in {".htm", ".html"}:
                        extractor = HtmlExtractor(file_path)
                    elif file_extension == ".docx":
+                        assert upload_file is not None
                        extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
                    elif file_extension == ".csv":
                        extractor = CSVExtractor(file_path, autodetect_encoding=True)
--- a/api/core/rag/extractor/firecrawl/firecrawl_app.py
+++ b/api/core/rag/extractor/firecrawl/firecrawl_app.py
@@ -174,21 +174,25 @@ class FirecrawlApp:
        return f"{self.base_url.rstrip('/')}/{path.lstrip('/')}"

    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5) -> httpx.Response:
+        response: httpx.Response | None = None
        for attempt in range(retries):
            response = httpx.post(url, headers=headers, json=data)
            if response.status_code == 502:
                time.sleep(backoff_factor * (2**attempt))
            else:
                return response
+        assert response is not None, "retries must be at least 1"
        return response

    def _get_request(self, url, headers, retries=3, backoff_factor=0.5) -> httpx.Response:
+        response: httpx.Response | None = None
        for attempt in range(retries):
            response = httpx.get(url, headers=headers)
            if response.status_code == 502:
                time.sleep(backoff_factor * (2**attempt))
            else:
                return response
+        assert response is not None, "retries must be at least 1"
        return response

    def _handle_error(self, response, action):