diff --git a/api/core/datasource/entities/datasource_entities.py b/api/core/datasource/entities/datasource_entities.py
index d55e1392c3..2f8f48f0ea 100644
--- a/api/core/datasource/entities/datasource_entities.py
+++ b/api/core/datasource/entities/datasource_entities.py
@@ -127,7 +127,6 @@ class DatasourceEntity(BaseModel):
     description: I18nObject = Field(..., description="The label of the datasource")
     output_schema: Optional[dict] = None
 
-
     @field_validator("parameters", mode="before")
     @classmethod
     def set_parameters(cls, v, validation_info: ValidationInfo) -> list[DatasourceParameter]:
diff --git a/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py b/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py
index 09babaa8ce..38a2ffc4aa 100644
--- a/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py
+++ b/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py
@@ -1,4 +1,3 @@
-
 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
 from services.website_service import WebsiteService
@@ -35,9 +34,7 @@ class FirecrawlWebExtractor(BaseExtractor):
         """Extract content from the URL."""
         documents = []
         if self.mode == "crawl":
-            crawl_data = WebsiteService.get_crawl_url_data(
-                self.job_id, "firecrawl", self._url, self.tenant_id
-            )
+            crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "firecrawl", self._url, self.tenant_id)
             if crawl_data is None:
                 return []
             document = Document(
diff --git a/api/core/rag/extractor/jina_reader_extractor.py b/api/core/rag/extractor/jina_reader_extractor.py
index a3b9075136..67e9a3c60a 100644
--- a/api/core/rag/extractor/jina_reader_extractor.py
+++ b/api/core/rag/extractor/jina_reader_extractor.py
@@ -1,4 +1,3 @@
-
 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
 from services.website_service import WebsiteService
@@ -28,9 +27,7 @@ class JinaReaderWebExtractor(BaseExtractor):
         """Extract content from the URL."""
         documents = []
         if self.mode == "crawl":
-            crawl_data = WebsiteService.get_crawl_url_data(
-                self.job_id, "jinareader", self._url, self.tenant_id
-            )
+            crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "jinareader", self._url, self.tenant_id)
             if crawl_data is None:
                 return []
             document = Document(
diff --git a/api/core/rag/extractor/watercrawl/extractor.py b/api/core/rag/extractor/watercrawl/extractor.py
index f3663397d7..51a432d879 100644
--- a/api/core/rag/extractor/watercrawl/extractor.py
+++ b/api/core/rag/extractor/watercrawl/extractor.py
@@ -1,4 +1,3 @@
-
 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
 from services.website_service import WebsiteService
@@ -36,9 +35,7 @@ class WaterCrawlWebExtractor(BaseExtractor):
         """Extract content from the URL."""
         documents = []
         if self.mode == "crawl":
-            crawl_data = WebsiteService.get_crawl_url_data(
-                self.job_id, "watercrawl", self._url, self.tenant_id
-            )
+            crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "watercrawl", self._url, self.tenant_id)
             if crawl_data is None:
                 return []
             document = Document(
diff --git a/api/core/rag/index_processor/processor/parent_child_index_processor.py b/api/core/rag/index_processor/processor/parent_child_index_processor.py
index bb2efdafe0..09d3e3bad6 100644
--- a/api/core/rag/index_processor/processor/parent_child_index_processor.py
+++ b/api/core/rag/index_processor/processor/parent_child_index_processor.py
@@ -233,9 +233,11 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
         dataset_process_rule = DatasetProcessRule(
             dataset_id=dataset.id,
             mode="hierarchical",
-            rules=json.dumps({
-                "parent_mode": parent_childs.parent_mode,
-            }),
+            rules=json.dumps(
+                {
+                    "parent_mode": parent_childs.parent_mode,
+                }
+            ),
             created_by=document.created_by,
         )
         db.session.add(dataset_process_rule)
diff --git a/api/models/dataset.py b/api/models/dataset.py
index ebffcac2fb..a912a3d7eb 100644
--- a/api/models/dataset.py
+++ b/api/models/dataset.py
@@ -816,7 +816,7 @@ class DocumentSegment(Base):
             base_url = f"/files/{upload_file_id}/file-preview"
             signed_url = f"{base_url}?{params}"
             signed_urls.append((match.start(), match.end(), signed_url))
-            
+
         # For tools directory - direct file formats (e.g., .png, .jpg, etc.)
         pattern = r"/files/tools/([a-f0-9\-]+)\.([a-zA-Z0-9]+)(?:\?.*?)?"
         matches = re.finditer(pattern, text)
diff --git a/api/models/oauth.py b/api/models/oauth.py
index 23b204cf07..9869fb40ff 100644
--- a/api/models/oauth.py
+++ b/api/models/oauth.py
@@ -37,7 +37,7 @@ class DatasourceProvider(Base):
     encrypted_credentials: Mapped[dict] = db.Column(JSONB, nullable=False)
     avatar_url: Mapped[str] = db.Column(db.String(255), nullable=True, default="default")
     is_default: Mapped[bool] = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
-    expires_at: Mapped[int] = db.Column(db.Integer, nullable=False, server_default='-1')
+    expires_at: Mapped[int] = db.Column(db.Integer, nullable=False, server_default="-1")
     created_at: Mapped[datetime] = db.Column(db.DateTime, nullable=False, default=datetime.now)
     updated_at: Mapped[datetime] = db.Column(db.DateTime, nullable=False, default=datetime.now)
 
diff --git a/api/services/datasource_provider_service.py b/api/services/datasource_provider_service.py
index 2ba7cdaa6b..c2c0b0f713 100644
--- a/api/services/datasource_provider_service.py
+++ b/api/services/datasource_provider_service.py
@@ -2,7 +2,6 @@ import logging
 import time
 from typing import Any
 
-from core.plugin.impl.oauth import OAuthHandler
 from flask_login import current_user
 from sqlalchemy.orm import Session
 
@@ -14,6 +13,7 @@ from core.helper.provider_cache import NoOpProviderCredentialCache
 from core.model_runtime.entities.provider_entities import FormType
 from core.plugin.entities.plugin import DatasourceProviderID
 from core.plugin.impl.datasource import PluginDatasourceManager
+from core.plugin.impl.oauth import OAuthHandler
 from core.tools.entities.tool_entities import CredentialType
 from core.tools.utils.encryption import ProviderConfigCache, ProviderConfigEncrypter, create_provider_encrypter
 from extensions.ext_database import db
@@ -143,7 +143,7 @@ class DatasourceProviderService:
             plugin_id=plugin_id,
            provider=provider,
        )
-        
+
     def get_all_datasource_credentials_by_provider(
         self,
         tenant_id: str,
diff --git a/api/services/rag_pipeline/rag_pipeline_transform_service.py b/api/services/rag_pipeline/rag_pipeline_transform_service.py
index d732c4809c..f029bb4c3c 100644
--- a/api/services/rag_pipeline/rag_pipeline_transform_service.py
+++ b/api/services/rag_pipeline/rag_pipeline_transform_service.py
@@ -307,15 +307,17 @@ class RagPipelineTransformService:
                 if file_id:
                     file = db.session.query(UploadFile).filter(UploadFile.id == file_id).first()
                     if file:
-                        data_source_info = json.dumps({
-                            "real_file_id": file_id,
-                            "name": file.name,
-                            "size": file.size,
-                            "extension": file.extension,
-                            "mime_type": file.mime_type,
-                            "url": "",
-                            "transfer_method": "local_file",
-                        })
+                        data_source_info = json.dumps(
+                            {
+                                "real_file_id": file_id,
+                                "name": file.name,
+                                "size": file.size,
+                                "extension": file.extension,
+                                "mime_type": file.mime_type,
+                                "url": "",
+                                "transfer_method": "local_file",
+                            }
+                        )
                         document.data_source_info = data_source_info
                         document_pipeline_execution_log = DocumentPipelineExecutionLog(
                             document_id=document.id,
@@ -331,17 +333,19 @@ class RagPipelineTransformService:
                         db.session.add(document_pipeline_execution_log)
             elif document.data_source_type == "notion_import":
                 document.data_source_type = "online_document"
-                data_source_info = json.dumps({
-                    "workspace_id": data_source_info_dict.get("notion_workspace_id"),
-                    "page": {
-                        "page_id": data_source_info_dict.get("notion_page_id"),
-                        "page_name": document.name,
-                        "page_icon": data_source_info_dict.get("notion_page_icon"),
-                        "type": data_source_info_dict.get("type"),
-                        "last_edited_time": data_source_info_dict.get("last_edited_time"),
-                        "parent_id": None,
-                    },
-                })
+                data_source_info = json.dumps(
+                    {
+                        "workspace_id": data_source_info_dict.get("notion_workspace_id"),
+                        "page": {
+                            "page_id": data_source_info_dict.get("notion_page_id"),
+                            "page_name": document.name,
+                            "page_icon": data_source_info_dict.get("notion_page_icon"),
+                            "type": data_source_info_dict.get("type"),
+                            "last_edited_time": data_source_info_dict.get("last_edited_time"),
+                            "parent_id": None,
+                        },
+                    }
+                )
                 document.data_source_info = data_source_info
                 document_pipeline_execution_log = DocumentPipelineExecutionLog(
                     document_id=document.id,
@@ -357,12 +361,14 @@ class RagPipelineTransformService:
                 db.session.add(document_pipeline_execution_log)
             elif document.data_source_type == "website_crawl":
                 document.data_source_type = "website_crawl"
-                data_source_info = json.dumps({
-                    "source_url": data_source_info_dict.get("url"),
-                    "content": "",
-                    "title": document.name,
-                    "description": "",
-                })
+                data_source_info = json.dumps(
+                    {
+                        "source_url": data_source_info_dict.get("url"),
+                        "content": "",
+                        "title": document.name,
+                        "description": "",
+                    }
+                )
                 document.data_source_info = data_source_info
                 if data_source_info_dict.get("provider") == "firecrawl":
                     datasource_node_id = firecrawl_node_id
@@ -381,4 +387,4 @@ class RagPipelineTransformService:
                 datasource_node_id=datasource_node_id,
             )
             db.session.add(document)
-            db.session.add(document_pipeline_execution_log)
\ No newline at end of file
+            db.session.add(document_pipeline_execution_log)
diff --git a/api/services/website_service.py b/api/services/website_service.py
index 854c213d91..baaccb342d 100644
--- a/api/services/website_service.py
+++ b/api/services/website_service.py
@@ -98,6 +98,7 @@ class WebsiteCrawlStatusApiRequest:
 
     provider: str
     job_id: str
+
     @classmethod
     def from_args(cls, args: dict, job_id: str) -> "WebsiteCrawlStatusApiRequest":
         """Create from Flask-RESTful parsed arguments."""
@@ -114,9 +115,7 @@ class WebsiteService:
     """Service class for website crawling operations using different providers."""
 
     @classmethod
-    def _get_credentials_and_config(
-        cls, tenant_id: str, provider: str
-    ) -> tuple[Any, Any]:
+    def _get_credentials_and_config(cls, tenant_id: str, provider: str) -> tuple[Any, Any]:
         """Get and validate credentials for a provider."""
         if provider == "firecrawl":
             plugin_id = "langgenius/firecrawl_datasource"
@@ -158,9 +157,7 @@ class WebsiteService:
         """Crawl a URL using the specified provider with typed request."""
         request = api_request.to_crawl_request()
 
-        api_key, config = cls._get_credentials_and_config(
-            current_user.current_tenant_id, request.provider
-        )
+        api_key, config = cls._get_credentials_and_config(current_user.current_tenant_id, request.provider)
 
         if request.provider == "firecrawl":
             return cls._crawl_with_firecrawl(request=request, api_key=api_key, config=config)
@@ -250,9 +247,7 @@ class WebsiteService:
     @classmethod
     def get_crawl_status_typed(cls, api_request: WebsiteCrawlStatusApiRequest) -> dict[str, Any]:
         """Get crawl status using typed request."""
-        api_key, config = cls._get_credentials_and_config(
-            current_user.current_tenant_id, api_request.provider
-        )
+        api_key, config = cls._get_credentials_and_config(current_user.current_tenant_id, api_request.provider)
 
         if api_request.provider == "firecrawl":
             return cls._get_firecrawl_status(api_request.job_id, api_key, config)
@@ -325,9 +320,7 @@ class WebsiteService:
         return crawl_status_data
 
     @classmethod
-    def get_crawl_url_data(
-        cls, job_id: str, provider: str, url: str, tenant_id: str
-    ) -> dict[str, Any] | None:
+    def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str) -> dict[str, Any] | None:
         api_key, config = cls._get_credentials_and_config(tenant_id, provider)
 
         if provider == "firecrawl":
@@ -398,14 +391,10 @@ class WebsiteService:
         return None
 
     @classmethod
-    def get_scrape_url_data(
-        cls, provider: str, url: str, tenant_id: str, only_main_content: bool
-    ) -> dict[str, Any]:
+    def get_scrape_url_data(cls, provider: str, url: str, tenant_id: str, only_main_content: bool) -> dict[str, Any]:
         request = ScrapeRequest(provider=provider, url=url, tenant_id=tenant_id, only_main_content=only_main_content)
 
-        api_key, config = cls._get_credentials_and_config(
-            tenant_id=request.tenant_id, provider=request.provider
-        )
+        api_key, config = cls._get_credentials_and_config(tenant_id=request.tenant_id, provider=request.provider)
 
         if request.provider == "firecrawl":
             return cls._scrape_with_firecrawl(request=request, api_key=api_key, config=config)