mirror of
https://github.com/langgenius/dify.git
synced 2026-04-28 11:56:55 +08:00
Merge branch 'feat/rag-2' of https://github.com/langgenius/dify into feat/rag-2
This commit is contained in:
commit
1c813239c9
@ -464,7 +464,6 @@ class DatasetIndexingEstimateApi(Resource):
|
|||||||
"tenant_id": current_user.current_tenant_id,
|
"tenant_id": current_user.current_tenant_id,
|
||||||
"mode": "crawl",
|
"mode": "crawl",
|
||||||
"only_main_content": website_info_list["only_main_content"],
|
"only_main_content": website_info_list["only_main_content"],
|
||||||
"credential_id": website_info_list["credential_id"],
|
|
||||||
},
|
},
|
||||||
document_model=args["doc_form"],
|
document_model=args["doc_form"],
|
||||||
)
|
)
|
||||||
|
|||||||
@ -529,7 +529,6 @@ class DocumentBatchIndexingEstimateApi(DocumentResource):
|
|||||||
"tenant_id": current_user.current_tenant_id,
|
"tenant_id": current_user.current_tenant_id,
|
||||||
"mode": data_source_info["mode"],
|
"mode": data_source_info["mode"],
|
||||||
"only_main_content": data_source_info["only_main_content"],
|
"only_main_content": data_source_info["only_main_content"],
|
||||||
"credential_id": data_source_info["credential_id"],
|
|
||||||
},
|
},
|
||||||
document_model=document.doc_form,
|
document_model=document.doc_form,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -23,7 +23,6 @@ class WebsiteCrawlApi(Resource):
|
|||||||
)
|
)
|
||||||
parser.add_argument("url", type=str, required=True, nullable=True, location="json")
|
parser.add_argument("url", type=str, required=True, nullable=True, location="json")
|
||||||
parser.add_argument("options", type=dict, required=True, nullable=True, location="json")
|
parser.add_argument("options", type=dict, required=True, nullable=True, location="json")
|
||||||
parser.add_argument("credential_id", type=str, required=True, nullable=True, location="json")
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Create typed request and validate
|
# Create typed request and validate
|
||||||
|
|||||||
@ -392,7 +392,6 @@ class IndexingRunner:
|
|||||||
"url": data_source_info["url"],
|
"url": data_source_info["url"],
|
||||||
"mode": data_source_info["mode"],
|
"mode": data_source_info["mode"],
|
||||||
"only_main_content": data_source_info["only_main_content"],
|
"only_main_content": data_source_info["only_main_content"],
|
||||||
"credential_id": data_source_info["credential_id"],
|
|
||||||
},
|
},
|
||||||
document_model=dataset_document.doc_form,
|
document_model=dataset_document.doc_form,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -36,7 +36,6 @@ class WebsiteInfo(BaseModel):
|
|||||||
mode: str
|
mode: str
|
||||||
tenant_id: str
|
tenant_id: str
|
||||||
only_main_content: bool = False
|
only_main_content: bool = False
|
||||||
credential_id: Optional[str] = None
|
|
||||||
|
|
||||||
|
|
||||||
class ExtractSetting(BaseModel):
|
class ExtractSetting(BaseModel):
|
||||||
|
|||||||
@ -24,7 +24,6 @@ class FirecrawlWebExtractor(BaseExtractor):
|
|||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
mode: str = "crawl",
|
mode: str = "crawl",
|
||||||
only_main_content: bool = True,
|
only_main_content: bool = True,
|
||||||
credential_id: Optional[str] = None,
|
|
||||||
):
|
):
|
||||||
"""Initialize with url, api_key, base_url and mode."""
|
"""Initialize with url, api_key, base_url and mode."""
|
||||||
self._url = url
|
self._url = url
|
||||||
@ -32,14 +31,13 @@ class FirecrawlWebExtractor(BaseExtractor):
|
|||||||
self.tenant_id = tenant_id
|
self.tenant_id = tenant_id
|
||||||
self.mode = mode
|
self.mode = mode
|
||||||
self.only_main_content = only_main_content
|
self.only_main_content = only_main_content
|
||||||
self.credential_id = credential_id
|
|
||||||
|
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
"""Extract content from the URL."""
|
"""Extract content from the URL."""
|
||||||
documents = []
|
documents = []
|
||||||
if self.mode == "crawl":
|
if self.mode == "crawl":
|
||||||
crawl_data = WebsiteService.get_crawl_url_data(
|
crawl_data = WebsiteService.get_crawl_url_data(
|
||||||
self.job_id, "firecrawl", self._url, self.tenant_id, self.credential_id
|
self.job_id, "firecrawl", self._url, self.tenant_id
|
||||||
)
|
)
|
||||||
if crawl_data is None:
|
if crawl_data is None:
|
||||||
return []
|
return []
|
||||||
@ -54,7 +52,7 @@ class FirecrawlWebExtractor(BaseExtractor):
|
|||||||
documents.append(document)
|
documents.append(document)
|
||||||
elif self.mode == "scrape":
|
elif self.mode == "scrape":
|
||||||
scrape_data = WebsiteService.get_scrape_url_data(
|
scrape_data = WebsiteService.get_scrape_url_data(
|
||||||
"firecrawl", self._url, self.tenant_id, self.only_main_content, self.credential_id
|
"firecrawl", self._url, self.tenant_id, self.only_main_content
|
||||||
)
|
)
|
||||||
|
|
||||||
document = Document(
|
document = Document(
|
||||||
|
|||||||
@ -17,7 +17,6 @@ class JinaReaderWebExtractor(BaseExtractor):
|
|||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
mode: str = "crawl",
|
mode: str = "crawl",
|
||||||
only_main_content: bool = False,
|
only_main_content: bool = False,
|
||||||
credential_id: Optional[str] = None,
|
|
||||||
):
|
):
|
||||||
"""Initialize with url, api_key, base_url and mode."""
|
"""Initialize with url, api_key, base_url and mode."""
|
||||||
self._url = url
|
self._url = url
|
||||||
@ -25,14 +24,13 @@ class JinaReaderWebExtractor(BaseExtractor):
|
|||||||
self.tenant_id = tenant_id
|
self.tenant_id = tenant_id
|
||||||
self.mode = mode
|
self.mode = mode
|
||||||
self.only_main_content = only_main_content
|
self.only_main_content = only_main_content
|
||||||
self.credential_id = credential_id
|
|
||||||
|
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
"""Extract content from the URL."""
|
"""Extract content from the URL."""
|
||||||
documents = []
|
documents = []
|
||||||
if self.mode == "crawl":
|
if self.mode == "crawl":
|
||||||
crawl_data = WebsiteService.get_crawl_url_data(
|
crawl_data = WebsiteService.get_crawl_url_data(
|
||||||
self.job_id, "jinareader", self._url, self.tenant_id, self.credential_id
|
self.job_id, "jinareader", self._url, self.tenant_id
|
||||||
)
|
)
|
||||||
if crawl_data is None:
|
if crawl_data is None:
|
||||||
return []
|
return []
|
||||||
|
|||||||
@ -25,7 +25,6 @@ class WaterCrawlWebExtractor(BaseExtractor):
|
|||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
mode: str = "crawl",
|
mode: str = "crawl",
|
||||||
only_main_content: bool = True,
|
only_main_content: bool = True,
|
||||||
credential_id: Optional[str] = None,
|
|
||||||
):
|
):
|
||||||
"""Initialize with url, api_key, base_url and mode."""
|
"""Initialize with url, api_key, base_url and mode."""
|
||||||
self._url = url
|
self._url = url
|
||||||
@ -33,14 +32,13 @@ class WaterCrawlWebExtractor(BaseExtractor):
|
|||||||
self.tenant_id = tenant_id
|
self.tenant_id = tenant_id
|
||||||
self.mode = mode
|
self.mode = mode
|
||||||
self.only_main_content = only_main_content
|
self.only_main_content = only_main_content
|
||||||
self.credential_id = credential_id
|
|
||||||
|
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
"""Extract content from the URL."""
|
"""Extract content from the URL."""
|
||||||
documents = []
|
documents = []
|
||||||
if self.mode == "crawl":
|
if self.mode == "crawl":
|
||||||
crawl_data = WebsiteService.get_crawl_url_data(
|
crawl_data = WebsiteService.get_crawl_url_data(
|
||||||
self.job_id, "watercrawl", self._url, self.tenant_id, self.credential_id
|
self.job_id, "watercrawl", self._url, self.tenant_id
|
||||||
)
|
)
|
||||||
if crawl_data is None:
|
if crawl_data is None:
|
||||||
return []
|
return []
|
||||||
@ -55,7 +53,7 @@ class WaterCrawlWebExtractor(BaseExtractor):
|
|||||||
documents.append(document)
|
documents.append(document)
|
||||||
elif self.mode == "scrape":
|
elif self.mode == "scrape":
|
||||||
scrape_data = WebsiteService.get_scrape_url_data(
|
scrape_data = WebsiteService.get_scrape_url_data(
|
||||||
"watercrawl", self._url, self.tenant_id, self.only_main_content, self.credential_id
|
"watercrawl", self._url, self.tenant_id, self.only_main_content
|
||||||
)
|
)
|
||||||
|
|
||||||
document = Document(
|
document = Document(
|
||||||
|
|||||||
@ -82,6 +82,36 @@ class DatasourceProviderService:
|
|||||||
if key in credential_secret_variables:
|
if key in credential_secret_variables:
|
||||||
copy_credentials[key] = encrypter.decrypt_token(tenant_id, value)
|
copy_credentials[key] = encrypter.decrypt_token(tenant_id, value)
|
||||||
return copy_credentials
|
return copy_credentials
|
||||||
|
|
||||||
|
def get_default_real_credential(
|
||||||
|
self, tenant_id: str, provider: str, plugin_id: str
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
"""
|
||||||
|
get default credential
|
||||||
|
"""
|
||||||
|
with Session(db.engine) as session:
|
||||||
|
datasource_provider = (
|
||||||
|
session.query(DatasourceProvider).filter_by(tenant_id=tenant_id,
|
||||||
|
is_default=True,
|
||||||
|
provider=provider,
|
||||||
|
plugin_id=plugin_id).first()
|
||||||
|
)
|
||||||
|
if not datasource_provider:
|
||||||
|
return {}
|
||||||
|
encrypted_credentials = datasource_provider.encrypted_credentials
|
||||||
|
# Get provider credential secret variables
|
||||||
|
credential_secret_variables = self.extract_secret_variables(
|
||||||
|
tenant_id=tenant_id,
|
||||||
|
provider_id=f"{plugin_id}/{provider}",
|
||||||
|
credential_type=CredentialType.of(datasource_provider.auth_type),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Obfuscate provider credentials
|
||||||
|
copy_credentials = encrypted_credentials.copy()
|
||||||
|
for key, value in copy_credentials.items():
|
||||||
|
if key in credential_secret_variables:
|
||||||
|
copy_credentials[key] = encrypter.decrypt_token(tenant_id, value)
|
||||||
|
return copy_credentials
|
||||||
|
|
||||||
def update_datasource_provider_name(
|
def update_datasource_provider_name(
|
||||||
self, tenant_id: str, datasource_provider_id: DatasourceProviderID, name: str, credential_id: str
|
self, tenant_id: str, datasource_provider_id: DatasourceProviderID, name: str, credential_id: str
|
||||||
|
|||||||
@ -62,7 +62,6 @@ class WebsiteCrawlApiRequest:
|
|||||||
provider: str
|
provider: str
|
||||||
url: str
|
url: str
|
||||||
options: dict[str, Any]
|
options: dict[str, Any]
|
||||||
credential_id: Optional[str] = None
|
|
||||||
|
|
||||||
def to_crawl_request(self) -> CrawlRequest:
|
def to_crawl_request(self) -> CrawlRequest:
|
||||||
"""Convert API request to internal CrawlRequest."""
|
"""Convert API request to internal CrawlRequest."""
|
||||||
@ -121,29 +120,22 @@ class WebsiteService:
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _get_credentials_and_config(
|
def _get_credentials_and_config(
|
||||||
cls, tenant_id: str, provider: str, credential_id: Optional[str] = None
|
cls, tenant_id: str, provider: str
|
||||||
) -> tuple[Any, Any]:
|
) -> tuple[Any, Any]:
|
||||||
"""Get and validate credentials for a provider."""
|
"""Get and validate credentials for a provider."""
|
||||||
if credential_id:
|
if provider == "firecrawl":
|
||||||
if provider == "firecrawl":
|
plugin_id = "langgenius/firecrawl_datasource"
|
||||||
plugin_id = "langgenius/firecrawl_datasource"
|
elif provider == "watercrawl":
|
||||||
elif provider == "watercrawl":
|
plugin_id = "langgenius/watercrawl_datasource"
|
||||||
plugin_id = "langgenius/watercrawl_datasource"
|
elif provider == "jinareader":
|
||||||
elif provider == "jinareader":
|
plugin_id = "langgenius/jinareader_datasource"
|
||||||
plugin_id = "langgenius/jinareader_datasource"
|
datasource_provider_service = DatasourceProviderService()
|
||||||
datasource_provider_service = DatasourceProviderService()
|
credential = datasource_provider_service.get_default_real_credential(
|
||||||
credential = datasource_provider_service.get_real_credential_by_id(
|
tenant_id=tenant_id,
|
||||||
tenant_id=tenant_id,
|
provider=provider,
|
||||||
credential_id=credential_id,
|
plugin_id=plugin_id,
|
||||||
provider=provider,
|
)
|
||||||
plugin_id=plugin_id,
|
return credential.get("api_key"), credential
|
||||||
)
|
|
||||||
return credential.get("api_key"), credential
|
|
||||||
else:
|
|
||||||
credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
|
|
||||||
if not credentials or "config" not in credentials:
|
|
||||||
raise ValueError("No valid credentials found for the provider")
|
|
||||||
return credentials, credentials["config"]
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _get_decrypted_api_key(cls, tenant_id: str, config: dict) -> str:
|
def _get_decrypted_api_key(cls, tenant_id: str, config: dict) -> str:
|
||||||
@ -166,13 +158,9 @@ class WebsiteService:
|
|||||||
"""Crawl a URL using the specified provider with typed request."""
|
"""Crawl a URL using the specified provider with typed request."""
|
||||||
request = api_request.to_crawl_request()
|
request = api_request.to_crawl_request()
|
||||||
|
|
||||||
_, config = cls._get_credentials_and_config(
|
api_key, config = cls._get_credentials_and_config(
|
||||||
current_user.current_tenant_id, request.provider, api_request.credential_id
|
current_user.current_tenant_id, request.provider
|
||||||
)
|
)
|
||||||
if api_request.credential_id:
|
|
||||||
api_key = _
|
|
||||||
else:
|
|
||||||
api_key = cls._get_decrypted_api_key(current_user.current_tenant_id, config)
|
|
||||||
|
|
||||||
if request.provider == "firecrawl":
|
if request.provider == "firecrawl":
|
||||||
return cls._crawl_with_firecrawl(request=request, api_key=api_key, config=config)
|
return cls._crawl_with_firecrawl(request=request, api_key=api_key, config=config)
|
||||||
@ -262,13 +250,9 @@ class WebsiteService:
|
|||||||
@classmethod
|
@classmethod
|
||||||
def get_crawl_status_typed(cls, api_request: WebsiteCrawlStatusApiRequest) -> dict[str, Any]:
|
def get_crawl_status_typed(cls, api_request: WebsiteCrawlStatusApiRequest) -> dict[str, Any]:
|
||||||
"""Get crawl status using typed request."""
|
"""Get crawl status using typed request."""
|
||||||
_, config = cls._get_credentials_and_config(
|
api_key, config = cls._get_credentials_and_config(
|
||||||
current_user.current_tenant_id, api_request.provider, api_request.credential_id
|
current_user.current_tenant_id, api_request.provider
|
||||||
)
|
)
|
||||||
if api_request.credential_id:
|
|
||||||
api_key = _
|
|
||||||
else:
|
|
||||||
api_key = cls._get_decrypted_api_key(current_user.current_tenant_id, config)
|
|
||||||
|
|
||||||
if api_request.provider == "firecrawl":
|
if api_request.provider == "firecrawl":
|
||||||
return cls._get_firecrawl_status(api_request.job_id, api_key, config)
|
return cls._get_firecrawl_status(api_request.job_id, api_key, config)
|
||||||
@ -342,13 +326,9 @@ class WebsiteService:
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_crawl_url_data(
|
def get_crawl_url_data(
|
||||||
cls, job_id: str, provider: str, url: str, tenant_id: str, credential_id: Optional[str] = None
|
cls, job_id: str, provider: str, url: str, tenant_id: str
|
||||||
) -> dict[str, Any] | None:
|
) -> dict[str, Any] | None:
|
||||||
_, config = cls._get_credentials_and_config(tenant_id, provider, credential_id)
|
api_key, config = cls._get_credentials_and_config(tenant_id, provider)
|
||||||
if credential_id:
|
|
||||||
api_key = _
|
|
||||||
else:
|
|
||||||
api_key = cls._get_decrypted_api_key(tenant_id, config)
|
|
||||||
|
|
||||||
if provider == "firecrawl":
|
if provider == "firecrawl":
|
||||||
return cls._get_firecrawl_url_data(job_id, url, api_key, config)
|
return cls._get_firecrawl_url_data(job_id, url, api_key, config)
|
||||||
@ -419,17 +399,13 @@ class WebsiteService:
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_scrape_url_data(
|
def get_scrape_url_data(
|
||||||
cls, provider: str, url: str, tenant_id: str, only_main_content: bool, credential_id: Optional[str] = None
|
cls, provider: str, url: str, tenant_id: str, only_main_content: bool
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
request = ScrapeRequest(provider=provider, url=url, tenant_id=tenant_id, only_main_content=only_main_content)
|
request = ScrapeRequest(provider=provider, url=url, tenant_id=tenant_id, only_main_content=only_main_content)
|
||||||
|
|
||||||
_, config = cls._get_credentials_and_config(
|
api_key, config = cls._get_credentials_and_config(
|
||||||
tenant_id=request.tenant_id, provider=request.provider, credential_id=credential_id
|
tenant_id=request.tenant_id, provider=request.provider
|
||||||
)
|
)
|
||||||
if credential_id:
|
|
||||||
api_key = _
|
|
||||||
else:
|
|
||||||
api_key = cls._get_decrypted_api_key(tenant_id=request.tenant_id, config=config)
|
|
||||||
|
|
||||||
if request.provider == "firecrawl":
|
if request.provider == "firecrawl":
|
||||||
return cls._scrape_with_firecrawl(request=request, api_key=api_key, config=config)
|
return cls._scrape_with_firecrawl(request=request, api_key=api_key, config=config)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user