chore: add missing @override decorator to api/core/rag/extractor (#37013)

Co-authored-by: mac <mac@1234.local>
This commit is contained in:
Siyu/Audrey Xiao 2026-06-03 20:34:10 +08:00 committed by GitHub
parent bb3c9929f9
commit e14cb209a4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
21 changed files with 48 additions and 4 deletions

View File

@ -12,7 +12,7 @@ import mimetypes
from collections.abc import Generator, Mapping
from io import BufferedReader, BytesIO
from pathlib import Path, PurePath
from typing import Any
from typing import Any, override
from pydantic import BaseModel, ConfigDict, model_validator
@ -139,6 +139,7 @@ class Blob(BaseModel):
"""
return cls(data=data, mimetype=mime_type, encoding=encoding, path=path)
@override
def __repr__(self) -> str:
"""Define the blob representation."""
str_repr = f"Blob {id(self)}"

View File

@ -1,7 +1,7 @@
"""Abstract interface for document loader implementations."""
import csv
from typing import Any
from typing import Any, override
import pandas as pd
@ -33,6 +33,7 @@ class CSVExtractor(BaseExtractor):
self.source_column = source_column
self.csv_args = csv_args or {}
@override
def extract(self) -> list[Document]:
"""Load data into document objects."""
docs = []

View File

@ -1,7 +1,7 @@
"""Abstract interface for document loader implementations."""
import os
from typing import TypedDict
from typing import TypedDict, override
import pandas as pd
from openpyxl import load_workbook
@ -30,6 +30,7 @@ class ExcelExtractor(BaseExtractor):
self._encoding = encoding
self._autodetect_encoding = autodetect_encoding
@override
def extract(self) -> list[Document]:
"""Load from Excel file in xls or xlsx format using Pandas and openpyxl."""
documents = []

View File

@ -1,3 +1,5 @@
from typing import override
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService
@ -30,6 +32,7 @@ class FirecrawlWebExtractor(BaseExtractor):
self.mode = mode
self.only_main_content = only_main_content
@override
def extract(self) -> list[Document]:
"""Extract content from the URL."""
documents = []

View File

@ -1,5 +1,7 @@
"""Abstract interface for document loader implementations."""
from typing import override
from bs4 import BeautifulSoup
from core.rag.extractor.extractor_base import BaseExtractor
@ -19,6 +21,7 @@ class HtmlExtractor(BaseExtractor):
"""Initialize with file path."""
self._file_path = file_path
@override
def extract(self) -> list[Document]:
    """Return a one-element list holding the loaded text as a Document.

    The text comes from ``self._load_as_text()`` and is stored in the
    document's ``page_content`` field.
    """
    text = self._load_as_text()
    document = Document(page_content=text)
    return [document]

View File

@ -1,3 +1,5 @@
from typing import override
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService
@ -23,6 +25,7 @@ class JinaReaderWebExtractor(BaseExtractor):
self.mode = mode
self.only_main_content = only_main_content
@override
def extract(self) -> list[Document]:
"""Extract content from the URL."""
documents = []

View File

@ -2,6 +2,7 @@
import re
from pathlib import Path
from typing import override
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.extractor.helpers import detect_file_encodings
@ -31,6 +32,7 @@ class MarkdownExtractor(BaseExtractor):
self._encoding = encoding
self._autodetect_encoding = autodetect_encoding
@override
def extract(self) -> list[Document]:
"""Load from file path."""
tups = self.parse_tups(self._file_path)

View File

@ -1,7 +1,7 @@
import json
import logging
import operator
from typing import Any, cast
from typing import Any, cast, override
import httpx
from sqlalchemy import update
@ -67,6 +67,7 @@ class NotionExtractor(BaseExtractor):
self._notion_access_token = integration_token
@override
def extract(self) -> list[Document]:
self.update_last_edited_time(self._document_model)

View File

@ -5,6 +5,7 @@ import io
import logging
import uuid
from collections.abc import Iterator
from typing import override
import pypdfium2
import pypdfium2.raw as pdfium_c
@ -55,6 +56,7 @@ class PdfExtractor(BaseExtractor):
self._user_id = user_id
self._file_cache_key = file_cache_key
@override
def extract(self) -> list[Document]:
plaintext_file_exists = False
if self._file_cache_key:

View File

@ -1,6 +1,7 @@
"""Abstract interface for document loader implementations."""
from pathlib import Path
from typing import override
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.extractor.helpers import detect_file_encodings
@ -21,6 +22,7 @@ class TextExtractor(BaseExtractor):
self._encoding = encoding
self._autodetect_encoding = autodetect_encoding
@override
def extract(self) -> list[Document]:
"""Load from file path."""
text = ""

View File

@ -1,5 +1,6 @@
import logging
import os
from typing import override
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
@ -17,6 +18,7 @@ class UnstructuredWordExtractor(BaseExtractor):
self._api_url = api_url
self._api_key = api_key
@override
def extract(self) -> list[Document]:
from unstructured.__version__ import __version__ as __unstructured_version__
from unstructured.file_utils.filetype import (

View File

@ -1,6 +1,7 @@
import base64
import contextlib
import logging
from typing import override
from bs4 import BeautifulSoup
@ -23,6 +24,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
self._api_url = api_url
self._api_key = api_key
@override
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api

View File

@ -1,4 +1,5 @@
import logging
from typing import override
import pypandoc # type: ignore
@ -28,6 +29,7 @@ class UnstructuredEpubExtractor(BaseExtractor):
self._api_url = api_url
self._api_key = api_key
@override
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api

View File

@ -1,4 +1,5 @@
import logging
from typing import override
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
@ -22,6 +23,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
self._api_url = api_url
self._api_key = api_key
@override
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api

View File

@ -1,4 +1,5 @@
import logging
from typing import override
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
@ -21,6 +22,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
self._api_url = api_url
self._api_key = api_key
@override
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api

View File

@ -1,4 +1,5 @@
import logging
from typing import override
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
@ -20,6 +21,7 @@ class UnstructuredPPTExtractor(BaseExtractor):
self._api_url = api_url
self._api_key = api_key
@override
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api

View File

@ -1,4 +1,5 @@
import logging
from typing import override
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
@ -20,6 +21,7 @@ class UnstructuredPPTXExtractor(BaseExtractor):
self._api_url = api_url
self._api_key = api_key
@override
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api

View File

@ -1,4 +1,5 @@
import logging
from typing import override
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
@ -21,6 +22,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
self._api_url = api_url
self._api_key = api_key
@override
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api

View File

@ -1,4 +1,5 @@
import json
from typing import override
class WaterCrawlError(Exception):
@ -18,15 +19,18 @@ class WaterCrawlBadRequestError(WaterCrawlError):
def flat_errors(self):
return json.dumps(self.errors)
@override
def __str__(self):
    """Render the error as a label, the message, and the flattened errors.

    The result is the literal prefix ``WaterCrawlBadRequestError:``, then
    ``self.message``, then a newline (with a space on either side), then
    ``self.flat_errors``.
    """
    return f"WaterCrawlBadRequestError: {self.message} \n {self.flat_errors}"
class WaterCrawlPermissionError(WaterCrawlBadRequestError):
    """Bad-request error whose string form reports exceeded API limits."""

    @override
    def __str__(self):
        """Return the limit-exceeded notice followed by ``self.message``."""
        return f"You are exceeding your WaterCrawl API limits. {self.message}"
class WaterCrawlAuthenticationError(WaterCrawlBadRequestError):
    """Bad-request error whose string form reports an invalid or expired key."""

    @override
    def __str__(self):
        """Return a fixed notice; no instance state is interpolated."""
        return "WaterCrawl API key is invalid or expired. Please check your API key and try again."

View File

@ -1,3 +1,5 @@
from typing import override
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService
@ -31,6 +33,7 @@ class WaterCrawlWebExtractor(BaseExtractor):
self.mode = mode
self.only_main_content = only_main_content
@override
def extract(self) -> list[Document]:
"""Extract content from the URL."""
documents = []

View File

@ -10,6 +10,7 @@ import os
import re
import tempfile
import uuid
from typing import override
from urllib.parse import urlparse
from docx import Document as DocxDocument
@ -91,6 +92,7 @@ class WordExtractor(BaseExtractor):
def __del__(self):
self.close()
@override
def extract(self) -> list[Document]:
"""Load given path as single page."""
content = self.parse_docx(self.file_path)