From e14cb209a4ce6e2b99e5fbe3f2ecdf0e1f807a04 Mon Sep 17 00:00:00 2001
From: Siyu/Audrey Xiao <siyux1927@proton.me>
Date: Wed, 3 Jun 2026 20:34:10 +0800
Subject: [PATCH] chore: add missing @override decorator to
 api/core/rag/extractor (#37013)

Co-authored-by: mac <mac@1234.local>
---
 api/core/rag/extractor/blob/blob.py                           | 3 ++-
 api/core/rag/extractor/csv_extractor.py                       | 3 ++-
 api/core/rag/extractor/excel_extractor.py                     | 3 ++-
 api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py   | 3 +++
 api/core/rag/extractor/html_extractor.py                      | 3 +++
 api/core/rag/extractor/jina_reader_extractor.py               | 3 +++
 api/core/rag/extractor/markdown_extractor.py                  | 2 ++
 api/core/rag/extractor/notion_extractor.py                    | 3 ++-
 api/core/rag/extractor/pdf_extractor.py                       | 2 ++
 api/core/rag/extractor/text_extractor.py                      | 2 ++
 .../rag/extractor/unstructured/unstructured_doc_extractor.py  | 2 ++
 .../rag/extractor/unstructured/unstructured_eml_extractor.py  | 2 ++
 .../rag/extractor/unstructured/unstructured_epub_extractor.py | 2 ++
 .../extractor/unstructured/unstructured_markdown_extractor.py | 2 ++
 .../rag/extractor/unstructured/unstructured_msg_extractor.py  | 2 ++
 .../rag/extractor/unstructured/unstructured_ppt_extractor.py  | 2 ++
 .../rag/extractor/unstructured/unstructured_pptx_extractor.py | 2 ++
 .../rag/extractor/unstructured/unstructured_xml_extractor.py  | 2 ++
 api/core/rag/extractor/watercrawl/exceptions.py               | 4 ++++
 api/core/rag/extractor/watercrawl/extractor.py                | 3 +++
 api/core/rag/extractor/word_extractor.py                      | 2 ++
 21 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/api/core/rag/extractor/blob/blob.py b/api/core/rag/extractor/blob/blob.py
index 4537c1b537..86d71cbafe 100644
--- a/api/core/rag/extractor/blob/blob.py
+++ b/api/core/rag/extractor/blob/blob.py
@@ -12,7 +12,7 @@ import mimetypes
 from collections.abc import Generator, Mapping
 from io import BufferedReader, BytesIO
 from pathlib import Path, PurePath
-from typing import Any
+from typing import Any, override
 
 from pydantic import BaseModel, ConfigDict, model_validator
 
@@ -139,6 +139,7 @@ class Blob(BaseModel):
         """
         return cls(data=data, mimetype=mime_type, encoding=encoding, path=path)
 
+    @override
     def __repr__(self) -> str:
         """Define the blob representation."""
         str_repr = f"Blob {id(self)}"
diff --git a/api/core/rag/extractor/csv_extractor.py b/api/core/rag/extractor/csv_extractor.py
index 19bc9cec84..778dbd40c6 100644
--- a/api/core/rag/extractor/csv_extractor.py
+++ b/api/core/rag/extractor/csv_extractor.py
@@ -1,7 +1,7 @@
 """Abstract interface for document loader implementations."""
 
 import csv
-from typing import Any
+from typing import Any, override
 
 import pandas as pd
 
@@ -33,6 +33,7 @@ class CSVExtractor(BaseExtractor):
         self.source_column = source_column
         self.csv_args = csv_args or {}
 
+    @override
     def extract(self) -> list[Document]:
         """Load data into document objects."""
         docs = []
diff --git a/api/core/rag/extractor/excel_extractor.py b/api/core/rag/extractor/excel_extractor.py
index 875bfd1439..6aa379e501 100644
--- a/api/core/rag/extractor/excel_extractor.py
+++ b/api/core/rag/extractor/excel_extractor.py
@@ -1,7 +1,7 @@
 """Abstract interface for document loader implementations."""
 
 import os
-from typing import TypedDict
+from typing import TypedDict, override
 
 import pandas as pd
 from openpyxl import load_workbook
@@ -30,6 +30,7 @@ class ExcelExtractor(BaseExtractor):
         self._encoding = encoding
         self._autodetect_encoding = autodetect_encoding
 
+    @override
     def extract(self) -> list[Document]:
         """Load from Excel file in xls or xlsx format using Pandas and openpyxl."""
         documents = []
diff --git a/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py b/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py
index 38a2ffc4aa..740527a32d 100644
--- a/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py
+++ b/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py
@@ -1,3 +1,5 @@
+from typing import override
+
 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
 from services.website_service import WebsiteService
@@ -30,6 +32,7 @@ class FirecrawlWebExtractor(BaseExtractor):
         self.mode = mode
         self.only_main_content = only_main_content
 
+    @override
     def extract(self) -> list[Document]:
         """Extract content from the URL."""
         documents = []
diff --git a/api/core/rag/extractor/html_extractor.py b/api/core/rag/extractor/html_extractor.py
index 9ff1dfa1bd..858bfb18f3 100644
--- a/api/core/rag/extractor/html_extractor.py
+++ b/api/core/rag/extractor/html_extractor.py
@@ -1,5 +1,7 @@
 """Abstract interface for document loader implementations."""
 
+from typing import override
+
 from bs4 import BeautifulSoup
 
 from core.rag.extractor.extractor_base import BaseExtractor
@@ -19,6 +21,7 @@ class HtmlExtractor(BaseExtractor):
         """Initialize with file path."""
         self._file_path = file_path
 
+    @override
     def extract(self) -> list[Document]:
         return [Document(page_content=self._load_as_text())]
 
diff --git a/api/core/rag/extractor/jina_reader_extractor.py b/api/core/rag/extractor/jina_reader_extractor.py
index 67e9a3c60a..b184e8fac6 100644
--- a/api/core/rag/extractor/jina_reader_extractor.py
+++ b/api/core/rag/extractor/jina_reader_extractor.py
@@ -1,3 +1,5 @@
+from typing import override
+
 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
 from services.website_service import WebsiteService
@@ -23,6 +25,7 @@ class JinaReaderWebExtractor(BaseExtractor):
         self.mode = mode
         self.only_main_content = only_main_content
 
+    @override
     def extract(self) -> list[Document]:
         """Extract content from the URL."""
         documents = []
diff --git a/api/core/rag/extractor/markdown_extractor.py b/api/core/rag/extractor/markdown_extractor.py
index 79d6ae2dac..662da9db27 100644
--- a/api/core/rag/extractor/markdown_extractor.py
+++ b/api/core/rag/extractor/markdown_extractor.py
@@ -2,6 +2,7 @@
 
 import re
 from pathlib import Path
+from typing import override
 
 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.extractor.helpers import detect_file_encodings
@@ -31,6 +32,7 @@ class MarkdownExtractor(BaseExtractor):
         self._encoding = encoding
         self._autodetect_encoding = autodetect_encoding
 
+    @override
     def extract(self) -> list[Document]:
         """Load from file path."""
         tups = self.parse_tups(self._file_path)
diff --git a/api/core/rag/extractor/notion_extractor.py b/api/core/rag/extractor/notion_extractor.py
index aa36160711..568ccb1912 100644
--- a/api/core/rag/extractor/notion_extractor.py
+++ b/api/core/rag/extractor/notion_extractor.py
@@ -1,7 +1,7 @@
 import json
 import logging
 import operator
-from typing import Any, cast
+from typing import Any, cast, override
 
 import httpx
 from sqlalchemy import update
@@ -67,6 +67,7 @@ class NotionExtractor(BaseExtractor):
 
                 self._notion_access_token = integration_token
 
+    @override
     def extract(self) -> list[Document]:
         self.update_last_edited_time(self._document_model)
 
diff --git a/api/core/rag/extractor/pdf_extractor.py b/api/core/rag/extractor/pdf_extractor.py
index 25f6fe3e2a..a79854a735 100644
--- a/api/core/rag/extractor/pdf_extractor.py
+++ b/api/core/rag/extractor/pdf_extractor.py
@@ -5,6 +5,7 @@ import io
 import logging
 import uuid
 from collections.abc import Iterator
+from typing import override
 
 import pypdfium2
 import pypdfium2.raw as pdfium_c
@@ -55,6 +56,7 @@ class PdfExtractor(BaseExtractor):
         self._user_id = user_id
         self._file_cache_key = file_cache_key
 
+    @override
     def extract(self) -> list[Document]:
         plaintext_file_exists = False
         if self._file_cache_key:
diff --git a/api/core/rag/extractor/text_extractor.py b/api/core/rag/extractor/text_extractor.py
index 93f301ceff..b6fcd29c9d 100644
--- a/api/core/rag/extractor/text_extractor.py
+++ b/api/core/rag/extractor/text_extractor.py
@@ -1,6 +1,7 @@
 """Abstract interface for document loader implementations."""
 
 from pathlib import Path
+from typing import override
 
 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.extractor.helpers import detect_file_encodings
@@ -21,6 +22,7 @@ class TextExtractor(BaseExtractor):
         self._encoding = encoding
         self._autodetect_encoding = autodetect_encoding
 
+    @override
     def extract(self) -> list[Document]:
         """Load from file path."""
         text = ""
diff --git a/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py b/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py
index da370f70e7..dec0065e56 100644
--- a/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py
@@ -1,5 +1,6 @@
 import logging
 import os
+from typing import override
 
 from configs import dify_config
 from core.rag.extractor.extractor_base import BaseExtractor
@@ -17,6 +18,7 @@ class UnstructuredWordExtractor(BaseExtractor):
         self._api_url = api_url
         self._api_key = api_key
 
+    @override
     def extract(self) -> list[Document]:
         from unstructured.__version__ import __version__ as __unstructured_version__
         from unstructured.file_utils.filetype import (
diff --git a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
index d97d4c3a48..1b87298587 100644
--- a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
@@ -1,6 +1,7 @@
 import base64
 import contextlib
 import logging
+from typing import override
 
 from bs4 import BeautifulSoup
 
@@ -23,6 +24,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
         self._api_url = api_url
         self._api_key = api_key
 
+    @override
     def extract(self) -> list[Document]:
         if self._api_url:
             from unstructured.partition.api import partition_via_api
diff --git a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
index 3061d957ac..74c2c36689 100644
--- a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
@@ -1,4 +1,5 @@
 import logging
+from typing import override
 
 import pypandoc  # type: ignore
 
@@ -28,6 +29,7 @@ class UnstructuredEpubExtractor(BaseExtractor):
         self._api_url = api_url
         self._api_key = api_key
 
+    @override
     def extract(self) -> list[Document]:
         if self._api_url:
             from unstructured.partition.api import partition_via_api
diff --git a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
index b6d8c47111..f71afbbae4 100644
--- a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
@@ -1,4 +1,5 @@
 import logging
+from typing import override
 
 from configs import dify_config
 from core.rag.extractor.extractor_base import BaseExtractor
@@ -22,6 +23,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
         self._api_url = api_url
         self._api_key = api_key
 
+    @override
     def extract(self) -> list[Document]:
         if self._api_url:
             from unstructured.partition.api import partition_via_api
diff --git a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
index ae60fc7981..b73c383ef2 100644
--- a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
@@ -1,4 +1,5 @@
 import logging
+from typing import override
 
 from configs import dify_config
 from core.rag.extractor.extractor_base import BaseExtractor
@@ -21,6 +22,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
         self._api_url = api_url
         self._api_key = api_key
 
+    @override
     def extract(self) -> list[Document]:
         if self._api_url:
             from unstructured.partition.api import partition_via_api
diff --git a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
index c12a55ee4b..bb4bf8f9ba 100644
--- a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
@@ -1,4 +1,5 @@
 import logging
+from typing import override
 
 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@@ -20,6 +21,7 @@ class UnstructuredPPTExtractor(BaseExtractor):
         self._api_url = api_url
         self._api_key = api_key
 
+    @override
     def extract(self) -> list[Document]:
         if self._api_url:
             from unstructured.partition.api import partition_via_api
diff --git a/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py b/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
index 99e3eec501..9de2056e63 100644
--- a/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
@@ -1,4 +1,5 @@
 import logging
+from typing import override
 
 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@@ -20,6 +21,7 @@ class UnstructuredPPTXExtractor(BaseExtractor):
         self._api_url = api_url
         self._api_key = api_key
 
+    @override
     def extract(self) -> list[Document]:
         if self._api_url:
             from unstructured.partition.api import partition_via_api
diff --git a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
index 2d4846d85e..2f92662877 100644
--- a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
@@ -1,4 +1,5 @@
 import logging
+from typing import override
 
 from configs import dify_config
 from core.rag.extractor.extractor_base import BaseExtractor
@@ -21,6 +22,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
         self._api_url = api_url
         self._api_key = api_key
 
+    @override
     def extract(self) -> list[Document]:
         if self._api_url:
             from unstructured.partition.api import partition_via_api
diff --git a/api/core/rag/extractor/watercrawl/exceptions.py b/api/core/rag/extractor/watercrawl/exceptions.py
index e407a594e0..fc457697a2 100644
--- a/api/core/rag/extractor/watercrawl/exceptions.py
+++ b/api/core/rag/extractor/watercrawl/exceptions.py
@@ -1,4 +1,5 @@
 import json
+from typing import override
 
 
 class WaterCrawlError(Exception):
@@ -18,15 +19,18 @@ class WaterCrawlBadRequestError(WaterCrawlError):
     def flat_errors(self):
         return json.dumps(self.errors)
 
+    @override
     def __str__(self):
         return f"WaterCrawlBadRequestError: {self.message} \n {self.flat_errors}"
 
 
 class WaterCrawlPermissionError(WaterCrawlBadRequestError):
+    @override
     def __str__(self):
         return f"You are exceeding your WaterCrawl API limits. {self.message}"
 
 
 class WaterCrawlAuthenticationError(WaterCrawlBadRequestError):
+    @override
     def __str__(self):
         return "WaterCrawl API key is invalid or expired. Please check your API key and try again."
diff --git a/api/core/rag/extractor/watercrawl/extractor.py b/api/core/rag/extractor/watercrawl/extractor.py
index 51a432d879..fdfe75b677 100644
--- a/api/core/rag/extractor/watercrawl/extractor.py
+++ b/api/core/rag/extractor/watercrawl/extractor.py
@@ -1,3 +1,5 @@
+from typing import override
+
 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
 from services.website_service import WebsiteService
@@ -31,6 +33,7 @@ class WaterCrawlWebExtractor(BaseExtractor):
         self.mode = mode
         self.only_main_content = only_main_content
 
+    @override
     def extract(self) -> list[Document]:
         """Extract content from the URL."""
         documents = []
diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py
index ee89ee3724..db38990c14 100644
--- a/api/core/rag/extractor/word_extractor.py
+++ b/api/core/rag/extractor/word_extractor.py
@@ -10,6 +10,7 @@ import os
 import re
 import tempfile
 import uuid
+from typing import override
 from urllib.parse import urlparse
 
 from docx import Document as DocxDocument
@@ -91,6 +92,7 @@ class WordExtractor(BaseExtractor):
     def __del__(self):
         self.close()
 
+    @override
     def extract(self) -> list[Document]:
         """Load given path as single page."""
         content = self.parse_docx(self.file_path)