From e14cb209a4ce6e2b99e5fbe3f2ecdf0e1f807a04 Mon Sep 17 00:00:00 2001 From: Siyu/Audrey Xiao Date: Wed, 3 Jun 2026 20:34:10 +0800 Subject: [PATCH] chore: add missing @override decorator to api/core/rag/extractor (#37013) Co-authored-by: mac --- api/core/rag/extractor/blob/blob.py | 3 ++- api/core/rag/extractor/csv_extractor.py | 3 ++- api/core/rag/extractor/excel_extractor.py | 3 ++- api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py | 3 +++ api/core/rag/extractor/html_extractor.py | 3 +++ api/core/rag/extractor/jina_reader_extractor.py | 3 +++ api/core/rag/extractor/markdown_extractor.py | 2 ++ api/core/rag/extractor/notion_extractor.py | 3 ++- api/core/rag/extractor/pdf_extractor.py | 2 ++ api/core/rag/extractor/text_extractor.py | 2 ++ .../rag/extractor/unstructured/unstructured_doc_extractor.py | 2 ++ .../rag/extractor/unstructured/unstructured_eml_extractor.py | 2 ++ .../rag/extractor/unstructured/unstructured_epub_extractor.py | 2 ++ .../extractor/unstructured/unstructured_markdown_extractor.py | 2 ++ .../rag/extractor/unstructured/unstructured_msg_extractor.py | 2 ++ .../rag/extractor/unstructured/unstructured_ppt_extractor.py | 2 ++ .../rag/extractor/unstructured/unstructured_pptx_extractor.py | 2 ++ .../rag/extractor/unstructured/unstructured_xml_extractor.py | 2 ++ api/core/rag/extractor/watercrawl/exceptions.py | 4 ++++ api/core/rag/extractor/watercrawl/extractor.py | 3 +++ api/core/rag/extractor/word_extractor.py | 2 ++ 21 files changed, 48 insertions(+), 4 deletions(-) diff --git a/api/core/rag/extractor/blob/blob.py b/api/core/rag/extractor/blob/blob.py index 4537c1b537..86d71cbafe 100644 --- a/api/core/rag/extractor/blob/blob.py +++ b/api/core/rag/extractor/blob/blob.py @@ -12,7 +12,7 @@ import mimetypes from collections.abc import Generator, Mapping from io import BufferedReader, BytesIO from pathlib import Path, PurePath -from typing import Any +from typing import Any, override from pydantic import BaseModel, ConfigDict, model_validator @@ -139,6 +139,7 @@ class Blob(BaseModel): """ return cls(data=data, mimetype=mime_type, encoding=encoding, path=path) + @override def __repr__(self) -> str: """Define the blob representation.""" str_repr = f"Blob {id(self)}" diff --git a/api/core/rag/extractor/csv_extractor.py b/api/core/rag/extractor/csv_extractor.py index 19bc9cec84..778dbd40c6 100644 --- a/api/core/rag/extractor/csv_extractor.py +++ b/api/core/rag/extractor/csv_extractor.py @@ -1,7 +1,7 @@ """Abstract interface for document loader implementations.""" import csv -from typing import Any +from typing import Any, override import pandas as pd @@ -33,6 +33,7 @@ class CSVExtractor(BaseExtractor): self.source_column = source_column self.csv_args = csv_args or {} + @override def extract(self) -> list[Document]: """Load data into document objects.""" docs = [] diff --git a/api/core/rag/extractor/excel_extractor.py b/api/core/rag/extractor/excel_extractor.py index 875bfd1439..6aa379e501 100644 --- a/api/core/rag/extractor/excel_extractor.py +++ b/api/core/rag/extractor/excel_extractor.py @@ -1,7 +1,7 @@ """Abstract interface for document loader implementations.""" import os -from typing import TypedDict +from typing import TypedDict, override import pandas as pd from openpyxl import load_workbook @@ -30,6 +30,7 @@ class ExcelExtractor(BaseExtractor): self._encoding = encoding self._autodetect_encoding = autodetect_encoding + @override def extract(self) -> list[Document]: """Load from Excel file in xls or xlsx format using Pandas and openpyxl.""" documents = [] diff --git a/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py b/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py index 38a2ffc4aa..740527a32d 100644 --- a/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py +++ b/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py @@ -1,3 +1,5 @@ +from typing import override + from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document from services.website_service import WebsiteService @@ -30,6 +32,7 @@ class FirecrawlWebExtractor(BaseExtractor): self.mode = mode self.only_main_content = only_main_content + @override def extract(self) -> list[Document]: """Extract content from the URL.""" documents = [] diff --git a/api/core/rag/extractor/html_extractor.py b/api/core/rag/extractor/html_extractor.py index 9ff1dfa1bd..858bfb18f3 100644 --- a/api/core/rag/extractor/html_extractor.py +++ b/api/core/rag/extractor/html_extractor.py @@ -1,5 +1,7 @@ """Abstract interface for document loader implementations.""" +from typing import override + from bs4 import BeautifulSoup from core.rag.extractor.extractor_base import BaseExtractor @@ -19,6 +21,7 @@ class HtmlExtractor(BaseExtractor): """Initialize with file path.""" self._file_path = file_path + @override def extract(self) -> list[Document]: return [Document(page_content=self._load_as_text())] diff --git a/api/core/rag/extractor/jina_reader_extractor.py b/api/core/rag/extractor/jina_reader_extractor.py index 67e9a3c60a..b184e8fac6 100644 --- a/api/core/rag/extractor/jina_reader_extractor.py +++ b/api/core/rag/extractor/jina_reader_extractor.py @@ -1,3 +1,5 @@ +from typing import override + from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document from services.website_service import WebsiteService @@ -23,6 +25,7 @@ class JinaReaderWebExtractor(BaseExtractor): self.mode = mode self.only_main_content = only_main_content + @override def extract(self) -> list[Document]: """Extract content from the URL.""" documents = [] diff --git a/api/core/rag/extractor/markdown_extractor.py b/api/core/rag/extractor/markdown_extractor.py index 79d6ae2dac..662da9db27 100644 --- a/api/core/rag/extractor/markdown_extractor.py +++ b/api/core/rag/extractor/markdown_extractor.py @@ -2,6 +2,7 @@ import re from pathlib import Path +from typing import override from core.rag.extractor.extractor_base import BaseExtractor from core.rag.extractor.helpers import detect_file_encodings @@ -31,6 +32,7 @@ class MarkdownExtractor(BaseExtractor): self._encoding = encoding self._autodetect_encoding = autodetect_encoding + @override def extract(self) -> list[Document]: """Load from file path.""" tups = self.parse_tups(self._file_path) diff --git a/api/core/rag/extractor/notion_extractor.py b/api/core/rag/extractor/notion_extractor.py index aa36160711..568ccb1912 100644 --- a/api/core/rag/extractor/notion_extractor.py +++ b/api/core/rag/extractor/notion_extractor.py @@ -1,7 +1,7 @@ import json import logging import operator -from typing import Any, cast +from typing import Any, cast, override import httpx from sqlalchemy import update @@ -67,6 +67,7 @@ class NotionExtractor(BaseExtractor): self._notion_access_token = integration_token + @override def extract(self) -> list[Document]: self.update_last_edited_time(self._document_model) diff --git a/api/core/rag/extractor/pdf_extractor.py b/api/core/rag/extractor/pdf_extractor.py index 25f6fe3e2a..a79854a735 100644 --- a/api/core/rag/extractor/pdf_extractor.py +++ b/api/core/rag/extractor/pdf_extractor.py @@ -5,6 +5,7 @@ import io import logging import uuid from collections.abc import Iterator +from typing import override import pypdfium2 import pypdfium2.raw as pdfium_c @@ -55,6 +56,7 @@ class PdfExtractor(BaseExtractor): self._user_id = user_id self._file_cache_key = file_cache_key + @override def extract(self) -> list[Document]: plaintext_file_exists = False if self._file_cache_key: diff --git a/api/core/rag/extractor/text_extractor.py b/api/core/rag/extractor/text_extractor.py index 93f301ceff..b6fcd29c9d 100644 --- a/api/core/rag/extractor/text_extractor.py +++ b/api/core/rag/extractor/text_extractor.py @@ -1,6 +1,7 @@ """Abstract interface for document loader implementations.""" from pathlib import Path +from typing import override from core.rag.extractor.extractor_base import BaseExtractor from core.rag.extractor.helpers import detect_file_encodings @@ -21,6 +22,7 @@ class TextExtractor(BaseExtractor): self._encoding = encoding self._autodetect_encoding = autodetect_encoding + @override def extract(self) -> list[Document]: """Load from file path.""" text = "" diff --git a/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py b/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py index da370f70e7..dec0065e56 100644 --- a/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py @@ -1,5 +1,6 @@ import logging import os +from typing import override from configs import dify_config from core.rag.extractor.extractor_base import BaseExtractor @@ -17,6 +18,7 @@ class UnstructuredWordExtractor(BaseExtractor): self._api_url = api_url self._api_key = api_key + @override def extract(self) -> list[Document]: from unstructured.__version__ import __version__ as __unstructured_version__ from unstructured.file_utils.filetype import ( diff --git a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py index d97d4c3a48..1b87298587 100644 --- a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py @@ -1,6 +1,7 @@ import base64 import contextlib import logging +from typing import override from bs4 import BeautifulSoup @@ -23,6 +24,7 @@ class UnstructuredEmailExtractor(BaseExtractor): self._api_url = api_url self._api_key = api_key + @override def extract(self) -> list[Document]: if self._api_url: from unstructured.partition.api import partition_via_api diff --git a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py index 3061d957ac..74c2c36689 100644 --- a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py @@ -1,4 +1,5 @@ import logging +from typing import override import pypandoc # type: ignore @@ -28,6 +29,7 @@ class UnstructuredEpubExtractor(BaseExtractor): self._api_url = api_url self._api_key = api_key + @override def extract(self) -> list[Document]: if self._api_url: from unstructured.partition.api import partition_via_api diff --git a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py index b6d8c47111..f71afbbae4 100644 --- a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py @@ -1,4 +1,5 @@ import logging +from typing import override from configs import dify_config from core.rag.extractor.extractor_base import BaseExtractor @@ -22,6 +23,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor): self._api_url = api_url self._api_key = api_key + @override def extract(self) -> list[Document]: if self._api_url: from unstructured.partition.api import partition_via_api diff --git a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py index ae60fc7981..b73c383ef2 100644 --- a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py @@ -1,4 +1,5 @@ import logging +from typing import override from configs import dify_config from core.rag.extractor.extractor_base import BaseExtractor @@ -21,6 +22,7 @@ class UnstructuredMsgExtractor(BaseExtractor): self._api_url = api_url self._api_key = api_key + @override def extract(self) -> list[Document]: if self._api_url: from unstructured.partition.api import partition_via_api diff --git a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py index c12a55ee4b..bb4bf8f9ba 100644 --- a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py @@ -1,4 +1,5 @@ import logging +from typing import override from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document @@ -20,6 +21,7 @@ class UnstructuredPPTExtractor(BaseExtractor): self._api_url = api_url self._api_key = api_key + @override def extract(self) -> list[Document]: if self._api_url: from unstructured.partition.api import partition_via_api diff --git a/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py b/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py index 99e3eec501..9de2056e63 100644 --- a/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py @@ -1,4 +1,5 @@ import logging +from typing import override from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document @@ -20,6 +21,7 @@ class UnstructuredPPTXExtractor(BaseExtractor): self._api_url = api_url self._api_key = api_key + @override def extract(self) -> list[Document]: if self._api_url: from unstructured.partition.api import partition_via_api diff --git a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py index 2d4846d85e..2f92662877 100644 --- a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py @@ -1,4 +1,5 @@ import logging +from typing import override from configs import dify_config from core.rag.extractor.extractor_base import BaseExtractor @@ -21,6 +22,7 @@ class UnstructuredXmlExtractor(BaseExtractor): self._api_url = api_url self._api_key = api_key + @override def extract(self) -> list[Document]: if self._api_url: from unstructured.partition.api import partition_via_api diff --git a/api/core/rag/extractor/watercrawl/exceptions.py b/api/core/rag/extractor/watercrawl/exceptions.py index e407a594e0..fc457697a2 100644 --- a/api/core/rag/extractor/watercrawl/exceptions.py +++ b/api/core/rag/extractor/watercrawl/exceptions.py @@ -1,4 +1,5 @@ import json +from typing import override class WaterCrawlError(Exception): @@ -18,15 +19,18 @@ class WaterCrawlBadRequestError(WaterCrawlError): def flat_errors(self): return json.dumps(self.errors) + @override def __str__(self): return f"WaterCrawlBadRequestError: {self.message} \n {self.flat_errors}" class WaterCrawlPermissionError(WaterCrawlBadRequestError): + @override def __str__(self): return f"You are exceeding your WaterCrawl API limits. {self.message}" class WaterCrawlAuthenticationError(WaterCrawlBadRequestError): + @override def __str__(self): return "WaterCrawl API key is invalid or expired. Please check your API key and try again." diff --git a/api/core/rag/extractor/watercrawl/extractor.py b/api/core/rag/extractor/watercrawl/extractor.py index 51a432d879..fdfe75b677 100644 --- a/api/core/rag/extractor/watercrawl/extractor.py +++ b/api/core/rag/extractor/watercrawl/extractor.py @@ -1,3 +1,5 @@ +from typing import override + from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document from services.website_service import WebsiteService @@ -31,6 +33,7 @@ class WaterCrawlWebExtractor(BaseExtractor): self.mode = mode self.only_main_content = only_main_content + @override def extract(self) -> list[Document]: """Extract content from the URL.""" documents = [] diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py index ee89ee3724..db38990c14 100644 --- a/api/core/rag/extractor/word_extractor.py +++ b/api/core/rag/extractor/word_extractor.py @@ -10,6 +10,7 @@ import os import re import tempfile import uuid +from typing import override from urllib.parse import urlparse from docx import Document as DocxDocument @@ -91,6 +92,7 @@ class WordExtractor(BaseExtractor): def __del__(self): self.close() + @override def extract(self) -> list[Document]: """Load given path as single page.""" content = self.parse_docx(self.file_path)