From a5709251305de47fe4f6ef9438f43365d69c474a Mon Sep 17 00:00:00 2001
From: jyong <718720800@qq.com>
Date: Thu, 18 Sep 2025 11:24:56 +0800
Subject: [PATCH] del website test

---
 api/services/plugin/dependencies_analysis.py | 2 +-
 .../services/test_website_service.py | 1391 -----------------
 2 files changed, 1 insertion(+), 1392 deletions(-)
 delete mode 100644 api/tests/test_containers_integration_tests/services/test_website_service.py

diff --git a/api/services/plugin/dependencies_analysis.py b/api/services/plugin/dependencies_analysis.py
index 623fa2740f..2f0c5ae3af 100644
--- a/api/services/plugin/dependencies_analysis.py
+++ b/api/services/plugin/dependencies_analysis.py
@@ -59,7 +59,7 @@ class DependenciesAnalysisService:
 version_match = _VERSION_REGEX.search(unique_identifier)
 if version_match:
 dependency.value.version = version_match.group("version")
-
+
 # Create and append the dependency (same for all types)
 leaked_dependencies.append(
 PluginDependency(
diff --git a/api/tests/test_containers_integration_tests/services/test_website_service.py b/api/tests/test_containers_integration_tests/services/test_website_service.py
deleted file mode 100644
index 897e31c88a..0000000000
--- a/api/tests/test_containers_integration_tests/services/test_website_service.py
+++ /dev/null
@@ -1,1391 +0,0 @@
-from datetime import datetime
-from unittest.mock import MagicMock, create_autospec, patch
-
-import pytest
-from faker import Faker
-
-from models.account import Account, Tenant, TenantAccountJoin, TenantAccountRole
-from services.datasource_provider_service import DatasourceProviderService
-from services.website_service import (
- CrawlOptions,
- ScrapeRequest,
- WebsiteCrawlApiRequest,
- WebsiteCrawlStatusApiRequest,
- WebsiteService,
-)
-
-
-class TestWebsiteService:
- """Integration tests for WebsiteService using testcontainers."""
-
- @pytest.fixture
- def mock_external_service_dependencies(self):
- """Mock setup for external service dependencies."""
- with (
- patch("services.website_service.FirecrawlApp") as mock_firecrawl_app,
- patch("services.website_service.WaterCrawlProvider") as mock_watercrawl_provider,
- patch("services.website_service.requests") as mock_requests,
- patch("services.website_service.redis_client") as mock_redis_client,
- patch("services.website_service.storage") as mock_storage,
- patch("services.website_service.encrypter") as mock_encrypter,
- patch(
- "services.website_service.DatasourceProviderService",
- ) as mock_datasource_provider_service,
- ):
- # Setup default mock returns
- mock_datasource_provider_service_instance = MagicMock(spec=DatasourceProviderService)
- mock_datasource_provider_service_instance.get_datasource_credentials.return_value = {
- "firecrawl_api_key": "firecrawl_api_key",
- "api_key": "api_key",
- }
- mock_datasource_provider_service.return_value = mock_datasource_provider_service_instance
-
- # Mock FirecrawlApp
- mock_firecrawl_instance = MagicMock()
- mock_firecrawl_instance.crawl_url.return_value = "test_job_id_123"
- mock_firecrawl_instance.check_crawl_status.return_value = {
- "status": "completed",
- "total": 5,
- "current": 5,
- "data": [{"source_url": "https://example.com", "title": "Test Page"}],
- }
- mock_firecrawl_app.return_value = mock_firecrawl_instance
-
- # Mock WaterCrawlProvider
- 
mock_watercrawl_instance = MagicMock() - mock_watercrawl_instance.crawl_url.return_value = {"status": "active", "job_id": "watercrawl_job_123"} - mock_watercrawl_instance.get_crawl_status.return_value = { - "status": "completed", - "job_id": "watercrawl_job_123", - "total": 3, - "current": 3, - "data": [], - } - mock_watercrawl_instance.get_crawl_url_data.return_value = { - "title": "WaterCrawl Page", - "source_url": "https://example.com", - "description": "Test description", - "markdown": "# Test Content", - } - mock_watercrawl_instance.scrape_url.return_value = { - "title": "Scraped Page", - "content": "Test content", - "url": "https://example.com", - } - mock_watercrawl_provider.return_value = mock_watercrawl_instance - - # Mock requests - mock_response = MagicMock() - mock_response.json.return_value = {"code": 200, "data": {"taskId": "jina_job_123"}} - mock_requests.get.return_value = mock_response - mock_requests.post.return_value = mock_response - - # Mock Redis - mock_redis_client.setex.return_value = None - mock_redis_client.get.return_value = str(datetime.now().timestamp()) - mock_redis_client.delete.return_value = None - - # Mock Storage - mock_storage.exists.return_value = False - mock_storage.load_once.return_value = None - - yield { - "mock_datasource_provider_service": mock_datasource_provider_service, - "mock_datasource_provider_service_instance": mock_datasource_provider_service_instance, - "firecrawl_app": mock_firecrawl_app, - "watercrawl_provider": mock_watercrawl_provider, - "requests": mock_requests, - "redis_client": mock_redis_client, - "storage": mock_storage, - "encrypter": mock_encrypter, - } - - def _create_test_account(self, db_session_with_containers, mock_external_service_dependencies): - """ - Helper method to create a test account with proper tenant setup. - - Args: - db_session_with_containers: Database session from testcontainers infrastructure - mock_external_service_dependencies: Mock dependencies - - Returns: - Account: Created account instance - """ - fake = Faker() - - # Create account - account = Account( - email=fake.email(), - name=fake.name(), - interface_language="en-US", - status="active", - ) - - from extensions.ext_database import db - - db.session.add(account) - db.session.commit() - - # Create tenant for the account - tenant = Tenant( - name=fake.company(), - status="normal", - ) - db.session.add(tenant) - db.session.commit() - - # Create tenant-account join - join = TenantAccountJoin( - tenant_id=tenant.id, - account_id=account.id, - role=TenantAccountRole.OWNER.value, - current=True, - ) - db.session.add(join) - db.session.commit() - - # Set current tenant for account - account.current_tenant = tenant - - return account - - def test_document_create_args_validate_success( - self, db_session_with_containers, mock_external_service_dependencies - ): - """ - Test successful argument validation for document creation. 
- - This test verifies: - - Valid arguments are accepted without errors - - All required fields are properly validated - - Optional fields are handled correctly - """ - # Arrange: Prepare valid arguments - valid_args = { - "provider": "firecrawl", - "url": "https://example.com", - "options": { - "limit": 5, - "crawl_sub_pages": True, - "only_main_content": False, - "includes": "blog,news", - "excludes": "admin,private", - "max_depth": 3, - "use_sitemap": True, - }, - } - - # Act: Validate arguments - WebsiteService.document_create_args_validate(valid_args) - - # Assert: No exception should be raised - # If we reach here, validation passed successfully - - def test_document_create_args_validate_missing_provider( - self, db_session_with_containers, mock_external_service_dependencies - ): - """ - Test argument validation fails when provider is missing. - - This test verifies: - - Missing provider raises ValueError - - Proper error message is provided - - Validation stops at first missing required field - """ - # Arrange: Prepare arguments without provider - invalid_args = {"url": "https://example.com", "options": {"limit": 5, "crawl_sub_pages": True}} - - # Act & Assert: Verify proper error handling - with pytest.raises(ValueError) as exc_info: - WebsiteService.document_create_args_validate(invalid_args) - - assert "Provider is required" in str(exc_info.value) - - def test_document_create_args_validate_missing_url( - self, db_session_with_containers, mock_external_service_dependencies - ): - """ - Test argument validation fails when URL is missing. - - This test verifies: - - Missing URL raises ValueError - - Proper error message is provided - - Validation continues after provider check - """ - # Arrange: Prepare arguments without URL - invalid_args = {"provider": "firecrawl", "options": {"limit": 5, "crawl_sub_pages": True}} - - # Act & Assert: Verify proper error handling - with pytest.raises(ValueError) as exc_info: - WebsiteService.document_create_args_validate(invalid_args) - - assert "URL is required" in str(exc_info.value) - - def test_crawl_url_firecrawl_success(self, db_session_with_containers, mock_external_service_dependencies): - """ - Test successful URL crawling with Firecrawl provider. 
-
- This test verifies:
- - Firecrawl provider is properly initialized
- - API credentials are retrieved and decrypted
- - Crawl parameters are correctly formatted
- - Job ID is returned with active status
- - Redis cache is properly set
- """
- # Arrange: Create test account and prepare request
- account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies)
- fake = Faker()
-
- # Mock current_user for the test
- mock_current_user = create_autospec(Account, instance=True)
- mock_current_user.current_tenant_id = account.current_tenant.id
-
- with patch("services.website_service.current_user", mock_current_user):
- # Create API request
- api_request = WebsiteCrawlApiRequest(
- provider="firecrawl",
- url="https://example.com",
- options={
- "limit": 10,
- "crawl_sub_pages": True,
- "only_main_content": True,
- "includes": "blog,news",
- "excludes": "admin,private",
- "max_depth": 2,
- "use_sitemap": True,
- },
- )
-
- mock_provider_instance = mock_external_service_dependencies["mock_datasource_provider_service_instance"]
- credential = {
- "firecrawl_api_key": "decrypted_api_key",
- "base_url": "https://api.example.com",
- }
- mock_provider_instance.get_datasource_credentials.return_value = credential
- # Act: Execute crawl operation
- result = WebsiteService.crawl_url(api_request)
-
- # Assert: Verify successful operation
- assert result is not None
- assert result["status"] == "active"
- assert result["job_id"] == "test_job_id_123"
-
- mock_provider_instance.get_datasource_credentials.assert_called_once_with(
- tenant_id=account.current_tenant.id,
- provider="firecrawl",
- plugin_id="langgenius/firecrawl_datasource",
- )
- # Verify external service interactions
- mock_external_service_dependencies["firecrawl_app"].assert_called_once_with(
- api_key="decrypted_api_key", base_url="https://api.example.com"
- )
-
- # Verify Redis cache was set
- mock_external_service_dependencies["redis_client"].setex.assert_called_once()
-
- def test_crawl_url_watercrawl_success(self, db_session_with_containers, mock_external_service_dependencies):
- """
- Test successful URL crawling with WaterCrawl provider.
-
- This test verifies:
- - WaterCrawl provider is properly initialized
- - API credentials are retrieved and decrypted
- - Crawl options are correctly passed to provider
- - Provider returns expected response format
- """
- # Arrange: Create test account and prepare request
- account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies)
-
- # Mock current_user for the test
- mock_current_user = create_autospec(Account, instance=True)
- mock_current_user.current_tenant_id = account.current_tenant.id
-
- with patch("services.website_service.current_user", mock_current_user):
- # Create API request
- api_request = WebsiteCrawlApiRequest(
- provider="watercrawl",
- url="https://example.com",
- options={
- "limit": 5,
- "crawl_sub_pages": False,
- "only_main_content": False,
- "includes": None,
- "excludes": None,
- "max_depth": None,
- "use_sitemap": False,
- },
- )
- mock_provider_instance = mock_external_service_dependencies["mock_datasource_provider_service_instance"]
- credential = {
- "api_key": "decrypted_api_key",
- "base_url": "https://api.example.com",
- }
- mock_provider_instance.get_datasource_credentials.return_value = credential
- # Act: Execute crawl operation
- result = WebsiteService.crawl_url(api_request)
-
- # Assert: Verify successful operation
- assert result is not None
- assert result["status"] == "active"
- assert result["job_id"] == "watercrawl_job_123"
-
- # Verify external service interactions
- mock_provider_instance.get_datasource_credentials.assert_called_once_with(
- tenant_id=account.current_tenant.id,
- provider="watercrawl",
- plugin_id="langgenius/watercrawl_datasource",
- )
- mock_external_service_dependencies["watercrawl_provider"].assert_called_once_with(
- api_key="decrypted_api_key", base_url="https://api.example.com"
- )
-
- def test_crawl_url_jinareader_success(self, db_session_with_containers, mock_external_service_dependencies):
- """
- Test successful URL crawling with JinaReader provider.
- - This test verifies: - - JinaReader provider handles single page crawling - - API credentials are retrieved and decrypted - - HTTP requests are made with proper headers - - Response is properly parsed and returned - """ - # Arrange: Create test account and prepare request - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Mock current_user for the test - mock_current_user = create_autospec(Account, instance=True) - mock_current_user.current_tenant_id = account.current_tenant.id - - with patch("services.website_service.current_user", mock_current_user): - # Create API request for single page crawling - api_request = WebsiteCrawlApiRequest( - provider="jinareader", - url="https://example.com", - options={ - "limit": 1, - "crawl_sub_pages": False, - "only_main_content": True, - "includes": None, - "excludes": None, - "max_depth": None, - "use_sitemap": False, - }, - ) - - # Act: Execute crawl operation - result = WebsiteService.crawl_url(api_request) - - # Assert: Verify successful operation - assert result is not None - assert result["status"] == "active" - assert result["data"] is not None - - # Verify HTTP request was made - mock_external_service_dependencies["requests"].get.assert_called_once_with( - "https://r.jina.ai/https://example.com", - headers={"Accept": "application/json", "Authorization": "Bearer decrypted_api_key"}, - ) - - def test_crawl_url_invalid_provider(self, db_session_with_containers, mock_external_service_dependencies): - """ - Test crawl operation fails with invalid provider. - - This test verifies: - - Invalid provider raises ValueError - - Proper error message is provided - - Service handles unsupported providers gracefully - """ - # Arrange: Create test account and prepare request with invalid provider - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Mock current_user for the test - mock_current_user = create_autospec(Account, instance=True) - mock_current_user.current_tenant_id = account.current_tenant.id - - with patch("services.website_service.current_user", mock_current_user): - # Create API request with invalid provider - api_request = WebsiteCrawlApiRequest( - provider="invalid_provider", - url="https://example.com", - options={"limit": 5, "crawl_sub_pages": False, "only_main_content": False}, - ) - - # Act & Assert: Verify proper error handling - with pytest.raises(ValueError) as exc_info: - WebsiteService.crawl_url(api_request) - - assert "Invalid provider" in str(exc_info.value) - - def test_get_crawl_status_firecrawl_success(self, db_session_with_containers, mock_external_service_dependencies): - """ - Test successful crawl status retrieval with Firecrawl provider. 
- - This test verifies: - - Firecrawl status is properly retrieved - - API credentials are retrieved and decrypted - - Status data includes all required fields - - Redis cache is properly managed for completed jobs - """ - # Arrange: Create test account and prepare request - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Mock current_user for the test - mock_current_user = create_autospec(Account, instance=True) - mock_current_user.current_tenant_id = account.current_tenant.id - - with patch("services.website_service.current_user", mock_current_user): - # Create API request - api_request = WebsiteCrawlStatusApiRequest(provider="firecrawl", job_id="test_job_id_123") - - # Act: Get crawl status - result = WebsiteService.get_crawl_status_typed(api_request) - - # Assert: Verify successful operation - assert result is not None - assert result["status"] == "completed" - assert result["job_id"] == "test_job_id_123" - assert result["total"] == 5 - assert result["current"] == 5 - assert "data" in result - assert "time_consuming" in result - - # Verify Redis cache was accessed and cleaned up - mock_external_service_dependencies["redis_client"].get.assert_called_once() - mock_external_service_dependencies["redis_client"].delete.assert_called_once() - - def test_get_crawl_status_watercrawl_success(self, db_session_with_containers, mock_external_service_dependencies): - """ - Test successful crawl status retrieval with WaterCrawl provider. - - This test verifies: - - WaterCrawl status is properly retrieved - - API credentials are retrieved and decrypted - - Provider returns expected status format - - All required status fields are present - """ - # Arrange: Create test account and prepare request - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Mock current_user for the test - mock_current_user = create_autospec(Account, instance=True) - mock_current_user.current_tenant_id = account.current_tenant.id - - with patch("services.website_service.current_user", mock_current_user): - # Create API request - api_request = WebsiteCrawlStatusApiRequest(provider="watercrawl", job_id="watercrawl_job_123") - - # Act: Get crawl status - result = WebsiteService.get_crawl_status_typed(api_request) - - # Assert: Verify successful operation - assert result is not None - assert result["status"] == "completed" - assert result["job_id"] == "watercrawl_job_123" - assert result["total"] == 3 - assert result["current"] == 3 - assert "data" in result - - def test_get_crawl_status_jinareader_success(self, db_session_with_containers, mock_external_service_dependencies): - """ - Test successful crawl status retrieval with JinaReader provider. 
- - This test verifies: - - JinaReader status is properly retrieved - - API credentials are retrieved and decrypted - - HTTP requests are made with proper parameters - - Status data is properly formatted and returned - """ - # Arrange: Create test account and prepare request - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Mock current_user for the test - mock_current_user = create_autospec(Account, instance=True) - mock_current_user.current_tenant_id = account.current_tenant.id - - with patch("services.website_service.current_user", mock_current_user): - # Create API request - api_request = WebsiteCrawlStatusApiRequest(provider="jinareader", job_id="jina_job_123") - - # Act: Get crawl status - result = WebsiteService.get_crawl_status_typed(api_request) - - # Assert: Verify successful operation - assert result is not None - assert result["status"] == "active" - assert result["job_id"] == "jina_job_123" - assert "total" in result - assert "current" in result - assert "data" in result - assert "time_consuming" in result - - # Verify HTTP request was made - mock_external_service_dependencies["requests"].post.assert_called_once() - - def test_get_crawl_status_invalid_provider(self, db_session_with_containers, mock_external_service_dependencies): - """ - Test crawl status retrieval fails with invalid provider. - - This test verifies: - - Invalid provider raises ValueError - - Proper error message is provided - - Service handles unsupported providers gracefully - """ - # Arrange: Create test account and prepare request with invalid provider - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Mock current_user for the test - mock_current_user = create_autospec(Account, instance=True) - mock_current_user.current_tenant_id = account.current_tenant.id - - with patch("services.website_service.current_user", mock_current_user): - # Create API request with invalid provider - api_request = WebsiteCrawlStatusApiRequest(provider="invalid_provider", job_id="test_job_id_123") - - # Act & Assert: Verify proper error handling - with pytest.raises(ValueError) as exc_info: - WebsiteService.get_crawl_status_typed(api_request) - - assert "Invalid provider" in str(exc_info.value) - - def test_get_crawl_status_missing_credentials(self, db_session_with_containers, mock_external_service_dependencies): - """ - Test crawl status retrieval fails when credentials are missing. 
-
- This test verifies:
- - Missing credentials raises ValueError
- - Proper error message is provided
- - Service handles authentication failures gracefully
- """
- # Arrange: Create test account and prepare request
- account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies)
-
- # Mock current_user for the test
- mock_current_user = create_autospec(Account, instance=True)
- mock_current_user.current_tenant_id = account.current_tenant.id
-
- with patch("services.website_service.current_user", mock_current_user):
- # Mock missing credentials on the instance mock (the service under test
- # calls DatasourceProviderService(), which returns this instance)
- mock_external_service_dependencies[
- "mock_datasource_provider_service_instance"
- ].get_datasource_credentials.return_value = None
-
- # Create API request
- api_request = WebsiteCrawlStatusApiRequest(provider="firecrawl", job_id="test_job_id_123")
-
- # Act & Assert: Verify proper error handling
- with pytest.raises(ValueError) as exc_info:
- WebsiteService.get_crawl_status_typed(api_request)
-
- assert "No valid credentials found for the provider" in str(exc_info.value)
-
- def test_get_crawl_status_missing_api_key(self, db_session_with_containers, mock_external_service_dependencies):
- """
- Test crawl status retrieval fails when API key is missing from config.
-
- This test verifies:
- - Missing API key raises ValueError
- - Proper error message is provided
- - Service handles configuration failures gracefully
- """
- # Arrange: Create test account and prepare request
- account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies)
-
- # Mock current_user for the test
- mock_current_user = create_autospec(Account, instance=True)
- mock_current_user.current_tenant_id = account.current_tenant.id
-
- with patch("services.website_service.current_user", mock_current_user):
- # Mock credentials that lack an API key
- mock_external_service_dependencies[
- "mock_datasource_provider_service_instance"
- ].get_datasource_credentials.return_value = {"base_url": "https://api.example.com"}
-
- # Create API request
- api_request = WebsiteCrawlStatusApiRequest(provider="firecrawl", job_id="test_job_id_123")
-
- # Act & Assert: Verify proper error handling
- with pytest.raises(ValueError) as exc_info:
- WebsiteService.get_crawl_status_typed(api_request)
-
- assert "API key not found in configuration" in str(exc_info.value)
-
- def test_get_crawl_url_data_firecrawl_success(self, db_session_with_containers, mock_external_service_dependencies):
- """
- Test successful URL data retrieval with Firecrawl provider.
- - This test verifies: - - Firecrawl URL data is properly retrieved - - API credentials are retrieved and decrypted - - Data is returned for matching URL - - Storage fallback works when needed - """ - # Arrange: Create test account and prepare request - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Mock storage to return existing data - mock_external_service_dependencies["storage"].exists.return_value = True - mock_external_service_dependencies["storage"].load_once.return_value = ( - b"[" - b'{"source_url": "https://example.com", "title": "Test Page", ' - b'"description": "Test Description", "markdown": "# Test Content"}' - b"]" - ) - - # Act: Get URL data - result = WebsiteService.get_crawl_url_data( - job_id="test_job_id_123", - provider="firecrawl", - url="https://example.com", - tenant_id=account.current_tenant.id, - ) - - # Assert: Verify successful operation - assert result is not None - assert result["source_url"] == "https://example.com" - assert result["title"] == "Test Page" - assert result["description"] == "Test Description" - assert result["markdown"] == "# Test Content" - - # Verify storage was accessed - mock_external_service_dependencies["storage"].exists.assert_called_once() - mock_external_service_dependencies["storage"].load_once.assert_called_once() - - def test_get_crawl_url_data_watercrawl_success( - self, db_session_with_containers, mock_external_service_dependencies - ): - """ - Test successful URL data retrieval with WaterCrawl provider. - - This test verifies: - - WaterCrawl URL data is properly retrieved - - API credentials are retrieved and decrypted - - Provider returns expected data format - - All required data fields are present - """ - # Arrange: Create test account and prepare request - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Act: Get URL data - result = WebsiteService.get_crawl_url_data( - job_id="watercrawl_job_123", - provider="watercrawl", - url="https://example.com", - tenant_id=account.current_tenant.id, - ) - - # Assert: Verify successful operation - assert result is not None - assert result["title"] == "WaterCrawl Page" - assert result["source_url"] == "https://example.com" - assert result["description"] == "Test description" - assert result["markdown"] == "# Test Content" - - def test_get_crawl_url_data_jinareader_success( - self, db_session_with_containers, mock_external_service_dependencies - ): - """ - Test successful URL data retrieval with JinaReader provider. 
- - This test verifies: - - JinaReader URL data is properly retrieved - - API credentials are retrieved and decrypted - - HTTP requests are made with proper parameters - - Data is properly formatted and returned - """ - # Arrange: Create test account and prepare request - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Mock successful response for JinaReader - mock_response = MagicMock() - mock_response.json.return_value = { - "code": 200, - "data": { - "title": "JinaReader Page", - "url": "https://example.com", - "description": "Test description", - "content": "# Test Content", - }, - } - mock_external_service_dependencies["requests"].get.return_value = mock_response - - # Act: Get URL data without job_id (single page scraping) - result = WebsiteService.get_crawl_url_data( - job_id="", provider="jinareader", url="https://example.com", tenant_id=account.current_tenant.id - ) - - # Assert: Verify successful operation - assert result is not None - assert result["title"] == "JinaReader Page" - assert result["url"] == "https://example.com" - assert result["description"] == "Test description" - assert result["content"] == "# Test Content" - - # Verify HTTP request was made - mock_external_service_dependencies["requests"].get.assert_called_once_with( - "https://r.jina.ai/https://example.com", - headers={"Accept": "application/json", "Authorization": "Bearer decrypted_api_key"}, - ) - - def test_get_scrape_url_data_firecrawl_success( - self, db_session_with_containers, mock_external_service_dependencies - ): - """ - Test successful URL scraping with Firecrawl provider. - - This test verifies: - - Firecrawl scraping is properly executed - - API credentials are retrieved and decrypted - - Scraping parameters are correctly passed - - Scraped data is returned in expected format - """ - # Arrange: Create test account and prepare request - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Mock FirecrawlApp scraping response - mock_firecrawl_instance = MagicMock() - mock_firecrawl_instance.scrape_url.return_value = { - "title": "Scraped Page Title", - "content": "This is the scraped content", - "url": "https://example.com", - "description": "Page description", - } - mock_external_service_dependencies["firecrawl_app"].return_value = mock_firecrawl_instance - - # Act: Scrape URL - result = WebsiteService.get_scrape_url_data( - provider="firecrawl", url="https://example.com", tenant_id=account.current_tenant.id, only_main_content=True - ) - - # Assert: Verify successful operation - assert result is not None - assert result["title"] == "Scraped Page Title" - assert result["content"] == "This is the scraped content" - assert result["url"] == "https://example.com" - assert result["description"] == "Page description" - - # Verify FirecrawlApp was called with correct parameters - mock_external_service_dependencies["firecrawl_app"].assert_called_once_with( - api_key="decrypted_api_key", base_url="https://api.example.com" - ) - mock_firecrawl_instance.scrape_url.assert_called_once_with( - url="https://example.com", params={"onlyMainContent": True} - ) - - def test_get_scrape_url_data_watercrawl_success( - self, db_session_with_containers, mock_external_service_dependencies - ): - """ - Test successful URL scraping with WaterCrawl provider. 
- - This test verifies: - - WaterCrawl scraping is properly executed - - API credentials are retrieved and decrypted - - Provider returns expected scraping format - - All required data fields are present - """ - # Arrange: Create test account and prepare request - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Act: Scrape URL - result = WebsiteService.get_scrape_url_data( - provider="watercrawl", - url="https://example.com", - tenant_id=account.current_tenant.id, - only_main_content=False, - ) - - # Assert: Verify successful operation - assert result is not None - assert result["title"] == "Scraped Page" - assert result["content"] == "Test content" - assert result["url"] == "https://example.com" - - # Verify WaterCrawlProvider was called with correct parameters - mock_external_service_dependencies["watercrawl_provider"].assert_called_once_with( - api_key="decrypted_api_key", base_url="https://api.example.com" - ) - - def test_get_scrape_url_data_invalid_provider(self, db_session_with_containers, mock_external_service_dependencies): - """ - Test URL scraping fails with invalid provider. - - This test verifies: - - Invalid provider raises ValueError - - Proper error message is provided - - Service handles unsupported providers gracefully - """ - # Arrange: Create test account and prepare request with invalid provider - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Act & Assert: Verify proper error handling - with pytest.raises(ValueError) as exc_info: - WebsiteService.get_scrape_url_data( - provider="invalid_provider", - url="https://example.com", - tenant_id=account.current_tenant.id, - only_main_content=False, - ) - - assert "Invalid provider" in str(exc_info.value) - - def test_crawl_options_include_exclude_paths(self, db_session_with_containers, mock_external_service_dependencies): - """ - Test CrawlOptions include and exclude path methods. - - This test verifies: - - Include paths are properly parsed from comma-separated string - - Exclude paths are properly parsed from comma-separated string - - Empty or None values are handled correctly - - Path lists are returned in expected format - """ - # Arrange: Create CrawlOptions with various path configurations - options_with_paths = CrawlOptions(includes="blog,news,articles", excludes="admin,private,test") - - options_without_paths = CrawlOptions(includes=None, excludes="") - - # Act: Get include and exclude paths - include_paths = options_with_paths.get_include_paths() - exclude_paths = options_with_paths.get_exclude_paths() - - empty_include_paths = options_without_paths.get_include_paths() - empty_exclude_paths = options_without_paths.get_exclude_paths() - - # Assert: Verify path parsing - assert include_paths == ["blog", "news", "articles"] - assert exclude_paths == ["admin", "private", "test"] - assert empty_include_paths == [] - assert empty_exclude_paths == [] - - def test_website_crawl_api_request_conversion(self, db_session_with_containers, mock_external_service_dependencies): - """ - Test WebsiteCrawlApiRequest conversion to CrawlRequest. 
- - This test verifies: - - API request is properly converted to internal CrawlRequest - - All options are correctly mapped - - Default values are applied when options are missing - - Conversion maintains data integrity - """ - # Arrange: Create API request with various options - api_request = WebsiteCrawlApiRequest( - provider="firecrawl", - url="https://example.com", - options={ - "limit": 10, - "crawl_sub_pages": True, - "only_main_content": True, - "includes": "blog,news", - "excludes": "admin,private", - "max_depth": 3, - "use_sitemap": False, - }, - ) - - # Act: Convert to CrawlRequest - crawl_request = api_request.to_crawl_request() - - # Assert: Verify conversion - assert crawl_request.url == "https://example.com" - assert crawl_request.provider == "firecrawl" - assert crawl_request.options.limit == 10 - assert crawl_request.options.crawl_sub_pages is True - assert crawl_request.options.only_main_content is True - assert crawl_request.options.includes == "blog,news" - assert crawl_request.options.excludes == "admin,private" - assert crawl_request.options.max_depth == 3 - assert crawl_request.options.use_sitemap is False - - def test_website_crawl_api_request_from_args(self, db_session_with_containers, mock_external_service_dependencies): - """ - Test WebsiteCrawlApiRequest creation from Flask arguments. - - This test verifies: - - Request is properly created from parsed arguments - - Required fields are validated - - Optional fields are handled correctly - - Validation errors are properly raised - """ - # Arrange: Prepare valid arguments - valid_args = {"provider": "watercrawl", "url": "https://example.com", "options": {"limit": 5}} - - # Act: Create request from args - request = WebsiteCrawlApiRequest.from_args(valid_args) - - # Assert: Verify request creation - assert request.provider == "watercrawl" - assert request.url == "https://example.com" - assert request.options == {"limit": 5} - - # Test missing provider - invalid_args = {"url": "https://example.com", "options": {}} - with pytest.raises(ValueError) as exc_info: - WebsiteCrawlApiRequest.from_args(invalid_args) - assert "Provider is required" in str(exc_info.value) - - # Test missing URL - invalid_args = {"provider": "watercrawl", "options": {}} - with pytest.raises(ValueError) as exc_info: - WebsiteCrawlApiRequest.from_args(invalid_args) - assert "URL is required" in str(exc_info.value) - - # Test missing options - invalid_args = {"provider": "watercrawl", "url": "https://example.com"} - with pytest.raises(ValueError) as exc_info: - WebsiteCrawlApiRequest.from_args(invalid_args) - assert "Options are required" in str(exc_info.value) - - def test_crawl_url_jinareader_sub_pages_success( - self, db_session_with_containers, mock_external_service_dependencies - ): - """ - Test successful URL crawling with JinaReader provider for sub-pages. 
- - This test verifies: - - JinaReader provider handles sub-page crawling correctly - - HTTP POST request is made with proper parameters - - Job ID is returned for multi-page crawling - - All required parameters are passed correctly - """ - # Arrange: Create test account and prepare request - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Mock current_user for the test - mock_current_user = create_autospec(Account, instance=True) - mock_current_user.current_tenant_id = account.current_tenant.id - - with patch("services.website_service.current_user", mock_current_user): - # Create API request for sub-page crawling - api_request = WebsiteCrawlApiRequest( - provider="jinareader", - url="https://example.com", - options={ - "limit": 5, - "crawl_sub_pages": True, - "only_main_content": False, - "includes": None, - "excludes": None, - "max_depth": None, - "use_sitemap": True, - }, - ) - - # Act: Execute crawl operation - result = WebsiteService.crawl_url(api_request) - - # Assert: Verify successful operation - assert result is not None - assert result["status"] == "active" - assert result["job_id"] == "jina_job_123" - - # Verify HTTP POST request was made for sub-page crawling - mock_external_service_dependencies["requests"].post.assert_called_once_with( - "https://adaptivecrawl-kir3wx7b3a-uc.a.run.app", - json={"url": "https://example.com", "maxPages": 5, "useSitemap": True}, - headers={"Content-Type": "application/json", "Authorization": "Bearer decrypted_api_key"}, - ) - - def test_crawl_url_jinareader_failed_response(self, db_session_with_containers, mock_external_service_dependencies): - """ - Test JinaReader crawling fails when API returns error. - - This test verifies: - - Failed API response raises ValueError - - Proper error message is provided - - Service handles API failures gracefully - """ - # Arrange: Create test account and prepare request - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Mock failed response - mock_failed_response = MagicMock() - mock_failed_response.json.return_value = {"code": 500, "error": "Internal server error"} - mock_external_service_dependencies["requests"].get.return_value = mock_failed_response - - # Mock current_user for the test - mock_current_user = create_autospec(Account, instance=True) - mock_current_user.current_tenant_id = account.current_tenant.id - - with patch("services.website_service.current_user", mock_current_user): - # Create API request - api_request = WebsiteCrawlApiRequest( - provider="jinareader", - url="https://example.com", - options={"limit": 1, "crawl_sub_pages": False, "only_main_content": True}, - ) - - # Act & Assert: Verify proper error handling - with pytest.raises(ValueError) as exc_info: - WebsiteService.crawl_url(api_request) - - assert "Failed to crawl" in str(exc_info.value) - - def test_get_crawl_status_firecrawl_active_job( - self, db_session_with_containers, mock_external_service_dependencies - ): - """ - Test Firecrawl status retrieval for active (not completed) job. 
- - This test verifies: - - Active job status is properly returned - - Redis cache is not deleted for active jobs - - Time consuming is not calculated for active jobs - - All required status fields are present - """ - # Arrange: Create test account and prepare request - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Mock active job status - mock_firecrawl_instance = MagicMock() - mock_firecrawl_instance.check_crawl_status.return_value = { - "status": "active", - "total": 10, - "current": 3, - "data": [], - } - mock_external_service_dependencies["firecrawl_app"].return_value = mock_firecrawl_instance - - # Mock current_user for the test - mock_current_user = create_autospec(Account, instance=True) - mock_current_user.current_tenant_id = account.current_tenant.id - - with patch("services.website_service.current_user", mock_current_user): - # Create API request - api_request = WebsiteCrawlStatusApiRequest(provider="firecrawl", job_id="active_job_123") - - # Act: Get crawl status - result = WebsiteService.get_crawl_status_typed(api_request) - - # Assert: Verify active job status - assert result is not None - assert result["status"] == "active" - assert result["job_id"] == "active_job_123" - assert result["total"] == 10 - assert result["current"] == 3 - assert "data" in result - assert "time_consuming" not in result - - # Verify Redis cache was not accessed for active jobs - mock_external_service_dependencies["redis_client"].get.assert_not_called() - mock_external_service_dependencies["redis_client"].delete.assert_not_called() - - def test_get_crawl_url_data_firecrawl_storage_fallback( - self, db_session_with_containers, mock_external_service_dependencies - ): - """ - Test Firecrawl URL data retrieval with storage fallback. - - This test verifies: - - Storage fallback works when storage has data - - API call is not made when storage has data - - Data is properly parsed from storage - - Correct URL data is returned - """ - # Arrange: Create test account and prepare request - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Mock storage to return existing data - mock_external_service_dependencies["storage"].exists.return_value = True - mock_external_service_dependencies["storage"].load_once.return_value = ( - b"[" - b'{"source_url": "https://example.com/page1", ' - b'"title": "Page 1", "description": "Description 1", "markdown": "# Page 1"}, ' - b'{"source_url": "https://example.com/page2", "title": "Page 2", ' - b'"description": "Description 2", "markdown": "# Page 2"}' - b"]" - ) - - # Act: Get URL data for specific URL - result = WebsiteService.get_crawl_url_data( - job_id="test_job_id_123", - provider="firecrawl", - url="https://example.com/page1", - tenant_id=account.current_tenant.id, - ) - - # Assert: Verify successful operation - assert result is not None - assert result["source_url"] == "https://example.com/page1" - assert result["title"] == "Page 1" - assert result["description"] == "Description 1" - assert result["markdown"] == "# Page 1" - - # Verify storage was accessed - mock_external_service_dependencies["storage"].exists.assert_called_once() - mock_external_service_dependencies["storage"].load_once.assert_called_once() - - def test_get_crawl_url_data_firecrawl_api_fallback( - self, db_session_with_containers, mock_external_service_dependencies - ): - """ - Test Firecrawl URL data retrieval with API fallback when storage is empty. 
- - This test verifies: - - API fallback works when storage has no data - - FirecrawlApp is called to get data - - Completed job status is checked - - Data is returned from API response - """ - # Arrange: Create test account and prepare request - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Mock storage to return no data - mock_external_service_dependencies["storage"].exists.return_value = False - - # Mock FirecrawlApp for API fallback - mock_firecrawl_instance = MagicMock() - mock_firecrawl_instance.check_crawl_status.return_value = { - "status": "completed", - "data": [ - { - "source_url": "https://example.com/api_page", - "title": "API Page", - "description": "API Description", - "markdown": "# API Content", - } - ], - } - mock_external_service_dependencies["firecrawl_app"].return_value = mock_firecrawl_instance - - # Act: Get URL data - result = WebsiteService.get_crawl_url_data( - job_id="test_job_id_123", - provider="firecrawl", - url="https://example.com/api_page", - tenant_id=account.current_tenant.id, - ) - - # Assert: Verify successful operation - assert result is not None - assert result["source_url"] == "https://example.com/api_page" - assert result["title"] == "API Page" - assert result["description"] == "API Description" - assert result["markdown"] == "# API Content" - - # Verify API was called - mock_external_service_dependencies["firecrawl_app"].assert_called_once() - - def test_get_crawl_url_data_firecrawl_incomplete_job( - self, db_session_with_containers, mock_external_service_dependencies - ): - """ - Test Firecrawl URL data retrieval fails for incomplete job. - - This test verifies: - - Incomplete job raises ValueError - - Proper error message is provided - - Service handles incomplete jobs gracefully - """ - # Arrange: Create test account and prepare request - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Mock storage to return no data - mock_external_service_dependencies["storage"].exists.return_value = False - - # Mock incomplete job status - mock_firecrawl_instance = MagicMock() - mock_firecrawl_instance.check_crawl_status.return_value = {"status": "active", "data": []} - mock_external_service_dependencies["firecrawl_app"].return_value = mock_firecrawl_instance - - # Act & Assert: Verify proper error handling - with pytest.raises(ValueError) as exc_info: - WebsiteService.get_crawl_url_data( - job_id="test_job_id_123", - provider="firecrawl", - url="https://example.com/page", - tenant_id=account.current_tenant.id, - ) - - assert "Crawl job is not completed" in str(exc_info.value) - - def test_get_crawl_url_data_jinareader_with_job_id( - self, db_session_with_containers, mock_external_service_dependencies - ): - """ - Test JinaReader URL data retrieval with job ID for multi-page crawling. 
- - This test verifies: - - JinaReader handles job ID-based data retrieval - - Status check is performed before data retrieval - - Processed data is properly formatted - - Correct URL data is returned - """ - # Arrange: Create test account and prepare request - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Mock successful status response - mock_status_response = MagicMock() - mock_status_response.json.return_value = { - "code": 200, - "data": { - "status": "completed", - "processed": { - "https://example.com/page1": { - "data": { - "title": "Page 1", - "url": "https://example.com/page1", - "description": "Description 1", - "content": "# Content 1", - } - } - }, - }, - } - mock_external_service_dependencies["requests"].post.return_value = mock_status_response - - # Act: Get URL data with job ID - result = WebsiteService.get_crawl_url_data( - job_id="jina_job_123", - provider="jinareader", - url="https://example.com/page1", - tenant_id=account.current_tenant.id, - ) - - # Assert: Verify successful operation - assert result is not None - assert result["title"] == "Page 1" - assert result["url"] == "https://example.com/page1" - assert result["description"] == "Description 1" - assert result["content"] == "# Content 1" - - # Verify HTTP requests were made - assert mock_external_service_dependencies["requests"].post.call_count == 2 - - def test_get_crawl_url_data_jinareader_incomplete_job( - self, db_session_with_containers, mock_external_service_dependencies - ): - """ - Test JinaReader URL data retrieval fails for incomplete job. - - This test verifies: - - Incomplete job raises ValueError - - Proper error message is provided - - Service handles incomplete jobs gracefully - """ - # Arrange: Create test account and prepare request - account = self._create_test_account(db_session_with_containers, mock_external_service_dependencies) - - # Mock incomplete job status - mock_status_response = MagicMock() - mock_status_response.json.return_value = {"code": 200, "data": {"status": "active", "processed": {}}} - mock_external_service_dependencies["requests"].post.return_value = mock_status_response - - # Act & Assert: Verify proper error handling - with pytest.raises(ValueError) as exc_info: - WebsiteService.get_crawl_url_data( - job_id="jina_job_123", - provider="jinareader", - url="https://example.com/page", - tenant_id=account.current_tenant.id, - ) - - assert "Crawl job is not completed" in str(exc_info.value) - - def test_crawl_options_default_values(self, db_session_with_containers, mock_external_service_dependencies): - """ - Test CrawlOptions default values and initialization. 
- - This test verifies: - - Default values are properly set - - Optional fields can be None - - Boolean fields have correct defaults - - Integer fields have correct defaults - """ - # Arrange: Create CrawlOptions with minimal parameters - options = CrawlOptions() - - # Assert: Verify default values - assert options.limit == 1 - assert options.crawl_sub_pages is False - assert options.only_main_content is False - assert options.includes is None - assert options.excludes is None - assert options.max_depth is None - assert options.use_sitemap is True - - # Test with custom values - custom_options = CrawlOptions( - limit=10, - crawl_sub_pages=True, - only_main_content=True, - includes="blog,news", - excludes="admin", - max_depth=3, - use_sitemap=False, - ) - - assert custom_options.limit == 10 - assert custom_options.crawl_sub_pages is True - assert custom_options.only_main_content is True - assert custom_options.includes == "blog,news" - assert custom_options.excludes == "admin" - assert custom_options.max_depth == 3 - assert custom_options.use_sitemap is False - - def test_website_crawl_status_api_request_from_args( - self, db_session_with_containers, mock_external_service_dependencies - ): - """ - Test WebsiteCrawlStatusApiRequest creation from Flask arguments. - - This test verifies: - - Request is properly created from parsed arguments - - Required fields are validated - - Job ID is properly handled - - Validation errors are properly raised - """ - # Arrange: Prepare valid arguments - valid_args = {"provider": "firecrawl"} - job_id = "test_job_123" - - # Act: Create request from args - request = WebsiteCrawlStatusApiRequest.from_args(valid_args, job_id) - - # Assert: Verify request creation - assert request.provider == "firecrawl" - assert request.job_id == "test_job_123" - - # Test missing provider - invalid_args = {} - with pytest.raises(ValueError) as exc_info: - WebsiteCrawlStatusApiRequest.from_args(invalid_args, job_id) - assert "Provider is required" in str(exc_info.value) - - # Test missing job ID - with pytest.raises(ValueError) as exc_info: - WebsiteCrawlStatusApiRequest.from_args(valid_args, "") - assert "Job ID is required" in str(exc_info.value) - - def test_scrape_request_initialization(self, db_session_with_containers, mock_external_service_dependencies): - """ - Test ScrapeRequest dataclass initialization and properties. - - This test verifies: - - ScrapeRequest is properly initialized - - All fields are correctly set - - Boolean field works correctly - - String fields are properly assigned - """ - # Arrange: Create ScrapeRequest - request = ScrapeRequest( - provider="firecrawl", url="https://example.com", tenant_id="tenant_123", only_main_content=True - ) - - # Assert: Verify initialization - assert request.provider == "firecrawl" - assert request.url == "https://example.com" - assert request.tenant_id == "tenant_123" - assert request.only_main_content is True - - # Test with different values - request2 = ScrapeRequest( - provider="watercrawl", url="https://test.com", tenant_id="tenant_456", only_main_content=False - ) - - assert request2.provider == "watercrawl" - assert request2.url == "https://test.com" - assert request2.tenant_id == "tenant_456" - assert request2.only_main_content is False
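
The deleted fixture patches the DatasourceProviderService class, so code under test that calls DatasourceProviderService() only ever sees mock_class.return_value; per-test overrides therefore belong on the instance mock, not on the patched class. Below is a minimal, self-contained sketch of that distinction (names such as CredentialService and load_api_key are hypothetical illustrations, not part of this repository):

from unittest.mock import MagicMock, patch


class CredentialService:
    """Stand-in for a service that the code under test instantiates itself."""

    def get_credentials(self):
        raise RuntimeError("real implementation would hit the network")


def load_api_key():
    service = CredentialService()  # instantiated inside the code under test
    creds = service.get_credentials()
    return creds.get("api_key") if creds else None


def test_missing_credentials():
    with patch(f"{__name__}.CredentialService") as mock_class:
        instance = MagicMock(spec=CredentialService)
        instance.get_credentials.return_value = None
        mock_class.return_value = instance  # what CredentialService() hands back

        # Configuring mock_class.get_credentials instead would be a silent no-op:
        # the code under test only ever talks to mock_class.return_value.
        assert load_api_key() is None


test_missing_credentials()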