chore: add missing @override decorator to api/core/rag/extractor (#37013)

Co-authored-by: mac <mac@1234.local>
This commit is contained in:
Siyu/Audrey Xiao 2026-06-03 20:34:10 +08:00 committed by GitHub
parent bb3c9929f9
commit e14cb209a4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
21 changed files with 48 additions and 4 deletions

View File

@ -12,7 +12,7 @@ import mimetypes
from collections.abc import Generator, Mapping from collections.abc import Generator, Mapping
from io import BufferedReader, BytesIO from io import BufferedReader, BytesIO
from pathlib import Path, PurePath from pathlib import Path, PurePath
from typing import Any from typing import Any, override
from pydantic import BaseModel, ConfigDict, model_validator from pydantic import BaseModel, ConfigDict, model_validator
@ -139,6 +139,7 @@ class Blob(BaseModel):
""" """
return cls(data=data, mimetype=mime_type, encoding=encoding, path=path) return cls(data=data, mimetype=mime_type, encoding=encoding, path=path)
@override
def __repr__(self) -> str: def __repr__(self) -> str:
"""Define the blob representation.""" """Define the blob representation."""
str_repr = f"Blob {id(self)}" str_repr = f"Blob {id(self)}"

View File

@ -1,7 +1,7 @@
"""Abstract interface for document loader implementations.""" """Abstract interface for document loader implementations."""
import csv import csv
from typing import Any from typing import Any, override
import pandas as pd import pandas as pd
@ -33,6 +33,7 @@ class CSVExtractor(BaseExtractor):
self.source_column = source_column self.source_column = source_column
self.csv_args = csv_args or {} self.csv_args = csv_args or {}
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
"""Load data into document objects.""" """Load data into document objects."""
docs = [] docs = []

View File

@ -1,7 +1,7 @@
"""Abstract interface for document loader implementations.""" """Abstract interface for document loader implementations."""
import os import os
from typing import TypedDict from typing import TypedDict, override
import pandas as pd import pandas as pd
from openpyxl import load_workbook from openpyxl import load_workbook
@ -30,6 +30,7 @@ class ExcelExtractor(BaseExtractor):
self._encoding = encoding self._encoding = encoding
self._autodetect_encoding = autodetect_encoding self._autodetect_encoding = autodetect_encoding
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
"""Load from Excel file in xls or xlsx format using Pandas and openpyxl.""" """Load from Excel file in xls or xlsx format using Pandas and openpyxl."""
documents = [] documents = []

View File

@ -1,3 +1,5 @@
from typing import override
from core.rag.extractor.extractor_base import BaseExtractor from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document from core.rag.models.document import Document
from services.website_service import WebsiteService from services.website_service import WebsiteService
@ -30,6 +32,7 @@ class FirecrawlWebExtractor(BaseExtractor):
self.mode = mode self.mode = mode
self.only_main_content = only_main_content self.only_main_content = only_main_content
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
"""Extract content from the URL.""" """Extract content from the URL."""
documents = [] documents = []

View File

@ -1,5 +1,7 @@
"""Abstract interface for document loader implementations.""" """Abstract interface for document loader implementations."""
from typing import override
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from core.rag.extractor.extractor_base import BaseExtractor from core.rag.extractor.extractor_base import BaseExtractor
@ -19,6 +21,7 @@ class HtmlExtractor(BaseExtractor):
"""Initialize with file path.""" """Initialize with file path."""
self._file_path = file_path self._file_path = file_path
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
return [Document(page_content=self._load_as_text())] return [Document(page_content=self._load_as_text())]

View File

@ -1,3 +1,5 @@
from typing import override
from core.rag.extractor.extractor_base import BaseExtractor from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document from core.rag.models.document import Document
from services.website_service import WebsiteService from services.website_service import WebsiteService
@ -23,6 +25,7 @@ class JinaReaderWebExtractor(BaseExtractor):
self.mode = mode self.mode = mode
self.only_main_content = only_main_content self.only_main_content = only_main_content
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
"""Extract content from the URL.""" """Extract content from the URL."""
documents = [] documents = []

View File

@ -2,6 +2,7 @@
import re import re
from pathlib import Path from pathlib import Path
from typing import override
from core.rag.extractor.extractor_base import BaseExtractor from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.extractor.helpers import detect_file_encodings from core.rag.extractor.helpers import detect_file_encodings
@ -31,6 +32,7 @@ class MarkdownExtractor(BaseExtractor):
self._encoding = encoding self._encoding = encoding
self._autodetect_encoding = autodetect_encoding self._autodetect_encoding = autodetect_encoding
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
"""Load from file path.""" """Load from file path."""
tups = self.parse_tups(self._file_path) tups = self.parse_tups(self._file_path)

View File

@ -1,7 +1,7 @@
import json import json
import logging import logging
import operator import operator
from typing import Any, cast from typing import Any, cast, override
import httpx import httpx
from sqlalchemy import update from sqlalchemy import update
@ -67,6 +67,7 @@ class NotionExtractor(BaseExtractor):
self._notion_access_token = integration_token self._notion_access_token = integration_token
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
self.update_last_edited_time(self._document_model) self.update_last_edited_time(self._document_model)

View File

@ -5,6 +5,7 @@ import io
import logging import logging
import uuid import uuid
from collections.abc import Iterator from collections.abc import Iterator
from typing import override
import pypdfium2 import pypdfium2
import pypdfium2.raw as pdfium_c import pypdfium2.raw as pdfium_c
@ -55,6 +56,7 @@ class PdfExtractor(BaseExtractor):
self._user_id = user_id self._user_id = user_id
self._file_cache_key = file_cache_key self._file_cache_key = file_cache_key
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
plaintext_file_exists = False plaintext_file_exists = False
if self._file_cache_key: if self._file_cache_key:

View File

@ -1,6 +1,7 @@
"""Abstract interface for document loader implementations.""" """Abstract interface for document loader implementations."""
from pathlib import Path from pathlib import Path
from typing import override
from core.rag.extractor.extractor_base import BaseExtractor from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.extractor.helpers import detect_file_encodings from core.rag.extractor.helpers import detect_file_encodings
@ -21,6 +22,7 @@ class TextExtractor(BaseExtractor):
self._encoding = encoding self._encoding = encoding
self._autodetect_encoding = autodetect_encoding self._autodetect_encoding = autodetect_encoding
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
"""Load from file path.""" """Load from file path."""
text = "" text = ""

View File

@ -1,5 +1,6 @@
import logging import logging
import os import os
from typing import override
from configs import dify_config from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor from core.rag.extractor.extractor_base import BaseExtractor
@ -17,6 +18,7 @@ class UnstructuredWordExtractor(BaseExtractor):
self._api_url = api_url self._api_url = api_url
self._api_key = api_key self._api_key = api_key
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
from unstructured.__version__ import __version__ as __unstructured_version__ from unstructured.__version__ import __version__ as __unstructured_version__
from unstructured.file_utils.filetype import ( from unstructured.file_utils.filetype import (

View File

@ -1,6 +1,7 @@
import base64 import base64
import contextlib import contextlib
import logging import logging
from typing import override
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -23,6 +24,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
self._api_url = api_url self._api_url = api_url
self._api_key = api_key self._api_key = api_key
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
if self._api_url: if self._api_url:
from unstructured.partition.api import partition_via_api from unstructured.partition.api import partition_via_api

View File

@ -1,4 +1,5 @@
import logging import logging
from typing import override
import pypandoc # type: ignore import pypandoc # type: ignore
@ -28,6 +29,7 @@ class UnstructuredEpubExtractor(BaseExtractor):
self._api_url = api_url self._api_url = api_url
self._api_key = api_key self._api_key = api_key
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
if self._api_url: if self._api_url:
from unstructured.partition.api import partition_via_api from unstructured.partition.api import partition_via_api

View File

@ -1,4 +1,5 @@
import logging import logging
from typing import override
from configs import dify_config from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor from core.rag.extractor.extractor_base import BaseExtractor
@ -22,6 +23,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
self._api_url = api_url self._api_url = api_url
self._api_key = api_key self._api_key = api_key
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
if self._api_url: if self._api_url:
from unstructured.partition.api import partition_via_api from unstructured.partition.api import partition_via_api

View File

@ -1,4 +1,5 @@
import logging import logging
from typing import override
from configs import dify_config from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor from core.rag.extractor.extractor_base import BaseExtractor
@ -21,6 +22,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
self._api_url = api_url self._api_url = api_url
self._api_key = api_key self._api_key = api_key
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
if self._api_url: if self._api_url:
from unstructured.partition.api import partition_via_api from unstructured.partition.api import partition_via_api

View File

@ -1,4 +1,5 @@
import logging import logging
from typing import override
from core.rag.extractor.extractor_base import BaseExtractor from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document from core.rag.models.document import Document
@ -20,6 +21,7 @@ class UnstructuredPPTExtractor(BaseExtractor):
self._api_url = api_url self._api_url = api_url
self._api_key = api_key self._api_key = api_key
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
if self._api_url: if self._api_url:
from unstructured.partition.api import partition_via_api from unstructured.partition.api import partition_via_api

View File

@ -1,4 +1,5 @@
import logging import logging
from typing import override
from core.rag.extractor.extractor_base import BaseExtractor from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document from core.rag.models.document import Document
@ -20,6 +21,7 @@ class UnstructuredPPTXExtractor(BaseExtractor):
self._api_url = api_url self._api_url = api_url
self._api_key = api_key self._api_key = api_key
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
if self._api_url: if self._api_url:
from unstructured.partition.api import partition_via_api from unstructured.partition.api import partition_via_api

View File

@ -1,4 +1,5 @@
import logging import logging
from typing import override
from configs import dify_config from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor from core.rag.extractor.extractor_base import BaseExtractor
@ -21,6 +22,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
self._api_url = api_url self._api_url = api_url
self._api_key = api_key self._api_key = api_key
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
if self._api_url: if self._api_url:
from unstructured.partition.api import partition_via_api from unstructured.partition.api import partition_via_api

View File

@ -1,4 +1,5 @@
import json import json
from typing import override
class WaterCrawlError(Exception): class WaterCrawlError(Exception):
@ -18,15 +19,18 @@ class WaterCrawlBadRequestError(WaterCrawlError):
def flat_errors(self): def flat_errors(self):
return json.dumps(self.errors) return json.dumps(self.errors)
@override
def __str__(self): def __str__(self):
return f"WaterCrawlBadRequestError: {self.message} \n {self.flat_errors}" return f"WaterCrawlBadRequestError: {self.message} \n {self.flat_errors}"
class WaterCrawlPermissionError(WaterCrawlBadRequestError): class WaterCrawlPermissionError(WaterCrawlBadRequestError):
@override
def __str__(self): def __str__(self):
return f"You are exceeding your WaterCrawl API limits. {self.message}" return f"You are exceeding your WaterCrawl API limits. {self.message}"
class WaterCrawlAuthenticationError(WaterCrawlBadRequestError): class WaterCrawlAuthenticationError(WaterCrawlBadRequestError):
@override
def __str__(self): def __str__(self):
return "WaterCrawl API key is invalid or expired. Please check your API key and try again." return "WaterCrawl API key is invalid or expired. Please check your API key and try again."

View File

@ -1,3 +1,5 @@
from typing import override
from core.rag.extractor.extractor_base import BaseExtractor from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document from core.rag.models.document import Document
from services.website_service import WebsiteService from services.website_service import WebsiteService
@ -31,6 +33,7 @@ class WaterCrawlWebExtractor(BaseExtractor):
self.mode = mode self.mode = mode
self.only_main_content = only_main_content self.only_main_content = only_main_content
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
"""Extract content from the URL.""" """Extract content from the URL."""
documents = [] documents = []

View File

@ -10,6 +10,7 @@ import os
import re import re
import tempfile import tempfile
import uuid import uuid
from typing import override
from urllib.parse import urlparse from urllib.parse import urlparse
from docx import Document as DocxDocument from docx import Document as DocxDocument
@ -91,6 +92,7 @@ class WordExtractor(BaseExtractor):
def __del__(self): def __del__(self):
self.close() self.close()
@override
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
"""Load given path as single page.""" """Load given path as single page."""
content = self.parse_docx(self.file_path) content = self.parse_docx(self.file_path)