diff --git a/api/core/rag/extractor/watercrawl/client.py b/api/core/rag/extractor/watercrawl/client.py index 2cd2440253..1f4adc0d41 100644 --- a/api/core/rag/extractor/watercrawl/client.py +++ b/api/core/rag/extractor/watercrawl/client.py @@ -118,16 +118,18 @@ class WaterCrawlAPIClient(BaseAPIClient): response.raise_for_status() if response.status_code == 204: return None - if response.headers.get("Content-Type") == "application/json": + content_type = response.headers.get("Content-Type", "") + media_type = content_type.split(";", 1)[0].strip().lower() + if media_type == "application/json": return response.json() or {} - if response.headers.get("Content-Type") == "application/octet-stream": + if media_type == "application/octet-stream": return response.content - if response.headers.get("Content-Type") == "text/event-stream": + if media_type == "text/event-stream": return self.process_eventstream(response) - raise Exception(f"Unknown response type: {response.headers.get('Content-Type')}") + raise Exception(f"Unknown response type: {content_type}") def get_crawl_requests_list(self, page: int | None = None, page_size: int | None = None): query_params = {"page": page or 1, "page_size": page_size or 10} diff --git a/api/tests/unit_tests/core/rag/extractor/watercrawl/test_watercrawl.py b/api/tests/unit_tests/core/rag/extractor/watercrawl/test_watercrawl.py index bf5913faa3..35e581ccc1 100644 --- a/api/tests/unit_tests/core/rag/extractor/watercrawl/test_watercrawl.py +++ b/api/tests/unit_tests/core/rag/extractor/watercrawl/test_watercrawl.py @@ -168,6 +168,13 @@ class TestWaterCrawlAPIClient: assert client.process_response(_response(200, {"ok": True})) == {"ok": True} assert client.process_response(_response(200, None)) == {} + def test_process_response_accepts_json_content_type_parameters(self): + client = WaterCrawlAPIClient(api_key="k") + + response = _response(200, {"ok": True}, content_type="application/json; charset=utf-8") + + assert client.process_response(response) == {"ok": True} + def test_process_response_octet_stream_returns_bytes(self): client = WaterCrawlAPIClient(api_key="k") assert (