mirror of
https://github.com/langgenius/dify.git
synced 2026-06-07 16:23:44 +08:00
chore: add missing @override decorator to api/core/rag/extractor (#37013)
Co-authored-by: mac <mac@1234.local>
This commit is contained in:
parent
bb3c9929f9
commit
e14cb209a4
@ -12,7 +12,7 @@ import mimetypes
|
||||
from collections.abc import Generator, Mapping
|
||||
from io import BufferedReader, BytesIO
|
||||
from pathlib import Path, PurePath
|
||||
from typing import Any
|
||||
from typing import Any, override
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, model_validator
|
||||
|
||||
@ -139,6 +139,7 @@ class Blob(BaseModel):
|
||||
"""
|
||||
return cls(data=data, mimetype=mime_type, encoding=encoding, path=path)
|
||||
|
||||
@override
|
||||
def __repr__(self) -> str:
|
||||
"""Define the blob representation."""
|
||||
str_repr = f"Blob {id(self)}"
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
"""Abstract interface for document loader implementations."""
|
||||
|
||||
import csv
|
||||
from typing import Any
|
||||
from typing import Any, override
|
||||
|
||||
import pandas as pd
|
||||
|
||||
@ -33,6 +33,7 @@ class CSVExtractor(BaseExtractor):
|
||||
self.source_column = source_column
|
||||
self.csv_args = csv_args or {}
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
"""Load data into document objects."""
|
||||
docs = []
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
"""Abstract interface for document loader implementations."""
|
||||
|
||||
import os
|
||||
from typing import TypedDict
|
||||
from typing import TypedDict, override
|
||||
|
||||
import pandas as pd
|
||||
from openpyxl import load_workbook
|
||||
@ -30,6 +30,7 @@ class ExcelExtractor(BaseExtractor):
|
||||
self._encoding = encoding
|
||||
self._autodetect_encoding = autodetect_encoding
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
"""Load from Excel file in xls or xlsx format using Pandas and openpyxl."""
|
||||
documents = []
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
from typing import override
|
||||
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
from services.website_service import WebsiteService
|
||||
@ -30,6 +32,7 @@ class FirecrawlWebExtractor(BaseExtractor):
|
||||
self.mode = mode
|
||||
self.only_main_content = only_main_content
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
"""Extract content from the URL."""
|
||||
documents = []
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
"""Abstract interface for document loader implementations."""
|
||||
|
||||
from typing import override
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
@ -19,6 +21,7 @@ class HtmlExtractor(BaseExtractor):
|
||||
"""Initialize with file path."""
|
||||
self._file_path = file_path
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
return [Document(page_content=self._load_as_text())]
|
||||
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
from typing import override
|
||||
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
from services.website_service import WebsiteService
|
||||
@ -23,6 +25,7 @@ class JinaReaderWebExtractor(BaseExtractor):
|
||||
self.mode = mode
|
||||
self.only_main_content = only_main_content
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
"""Extract content from the URL."""
|
||||
documents = []
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import override
|
||||
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.extractor.helpers import detect_file_encodings
|
||||
@ -31,6 +32,7 @@ class MarkdownExtractor(BaseExtractor):
|
||||
self._encoding = encoding
|
||||
self._autodetect_encoding = autodetect_encoding
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
"""Load from file path."""
|
||||
tups = self.parse_tups(self._file_path)
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
import json
|
||||
import logging
|
||||
import operator
|
||||
from typing import Any, cast
|
||||
from typing import Any, cast, override
|
||||
|
||||
import httpx
|
||||
from sqlalchemy import update
|
||||
@ -67,6 +67,7 @@ class NotionExtractor(BaseExtractor):
|
||||
|
||||
self._notion_access_token = integration_token
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
self.update_last_edited_time(self._document_model)
|
||||
|
||||
|
||||
@ -5,6 +5,7 @@ import io
|
||||
import logging
|
||||
import uuid
|
||||
from collections.abc import Iterator
|
||||
from typing import override
|
||||
|
||||
import pypdfium2
|
||||
import pypdfium2.raw as pdfium_c
|
||||
@ -55,6 +56,7 @@ class PdfExtractor(BaseExtractor):
|
||||
self._user_id = user_id
|
||||
self._file_cache_key = file_cache_key
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
plaintext_file_exists = False
|
||||
if self._file_cache_key:
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
"""Abstract interface for document loader implementations."""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import override
|
||||
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.extractor.helpers import detect_file_encodings
|
||||
@ -21,6 +22,7 @@ class TextExtractor(BaseExtractor):
|
||||
self._encoding = encoding
|
||||
self._autodetect_encoding = autodetect_encoding
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
"""Load from file path."""
|
||||
text = ""
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import logging
|
||||
import os
|
||||
from typing import override
|
||||
|
||||
from configs import dify_config
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
@ -17,6 +18,7 @@ class UnstructuredWordExtractor(BaseExtractor):
|
||||
self._api_url = api_url
|
||||
self._api_key = api_key
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
from unstructured.__version__ import __version__ as __unstructured_version__
|
||||
from unstructured.file_utils.filetype import (
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
import base64
|
||||
import contextlib
|
||||
import logging
|
||||
from typing import override
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
@ -23,6 +24,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
|
||||
self._api_url = api_url
|
||||
self._api_key = api_key
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
if self._api_url:
|
||||
from unstructured.partition.api import partition_via_api
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
from typing import override
|
||||
|
||||
import pypandoc # type: ignore
|
||||
|
||||
@ -28,6 +29,7 @@ class UnstructuredEpubExtractor(BaseExtractor):
|
||||
self._api_url = api_url
|
||||
self._api_key = api_key
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
if self._api_url:
|
||||
from unstructured.partition.api import partition_via_api
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
from typing import override
|
||||
|
||||
from configs import dify_config
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
@ -22,6 +23,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
|
||||
self._api_url = api_url
|
||||
self._api_key = api_key
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
if self._api_url:
|
||||
from unstructured.partition.api import partition_via_api
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
from typing import override
|
||||
|
||||
from configs import dify_config
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
@ -21,6 +22,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
|
||||
self._api_url = api_url
|
||||
self._api_key = api_key
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
if self._api_url:
|
||||
from unstructured.partition.api import partition_via_api
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
from typing import override
|
||||
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
@ -20,6 +21,7 @@ class UnstructuredPPTExtractor(BaseExtractor):
|
||||
self._api_url = api_url
|
||||
self._api_key = api_key
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
if self._api_url:
|
||||
from unstructured.partition.api import partition_via_api
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
from typing import override
|
||||
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
@ -20,6 +21,7 @@ class UnstructuredPPTXExtractor(BaseExtractor):
|
||||
self._api_url = api_url
|
||||
self._api_key = api_key
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
if self._api_url:
|
||||
from unstructured.partition.api import partition_via_api
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
from typing import override
|
||||
|
||||
from configs import dify_config
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
@ -21,6 +22,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
|
||||
self._api_url = api_url
|
||||
self._api_key = api_key
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
if self._api_url:
|
||||
from unstructured.partition.api import partition_via_api
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import json
|
||||
from typing import override
|
||||
|
||||
|
||||
class WaterCrawlError(Exception):
|
||||
@ -18,15 +19,18 @@ class WaterCrawlBadRequestError(WaterCrawlError):
|
||||
def flat_errors(self):
|
||||
return json.dumps(self.errors)
|
||||
|
||||
@override
|
||||
def __str__(self):
|
||||
return f"WaterCrawlBadRequestError: {self.message} \n {self.flat_errors}"
|
||||
|
||||
|
||||
class WaterCrawlPermissionError(WaterCrawlBadRequestError):
|
||||
@override
|
||||
def __str__(self):
|
||||
return f"You are exceeding your WaterCrawl API limits. {self.message}"
|
||||
|
||||
|
||||
class WaterCrawlAuthenticationError(WaterCrawlBadRequestError):
|
||||
@override
|
||||
def __str__(self):
|
||||
return "WaterCrawl API key is invalid or expired. Please check your API key and try again."
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
from typing import override
|
||||
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
from services.website_service import WebsiteService
|
||||
@ -31,6 +33,7 @@ class WaterCrawlWebExtractor(BaseExtractor):
|
||||
self.mode = mode
|
||||
self.only_main_content = only_main_content
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
"""Extract content from the URL."""
|
||||
documents = []
|
||||
|
||||
@ -10,6 +10,7 @@ import os
|
||||
import re
|
||||
import tempfile
|
||||
import uuid
|
||||
from typing import override
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from docx import Document as DocxDocument
|
||||
@ -91,6 +92,7 @@ class WordExtractor(BaseExtractor):
|
||||
def __del__(self):
|
||||
self.close()
|
||||
|
||||
@override
|
||||
def extract(self) -> list[Document]:
|
||||
"""Load given path as single page."""
|
||||
content = self.parse_docx(self.file_path)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user