mirror of
https://github.com/langgenius/dify.git
synced 2026-06-07 16:32:01 +08:00
chore: add missing @override decorator to api/core/rag/extractor (#37013)
Co-authored-by: mac <mac@1234.local>
This commit is contained in:
parent
bb3c9929f9
commit
e14cb209a4
@ -12,7 +12,7 @@ import mimetypes
|
|||||||
from collections.abc import Generator, Mapping
|
from collections.abc import Generator, Mapping
|
||||||
from io import BufferedReader, BytesIO
|
from io import BufferedReader, BytesIO
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
from typing import Any
|
from typing import Any, override
|
||||||
|
|
||||||
from pydantic import BaseModel, ConfigDict, model_validator
|
from pydantic import BaseModel, ConfigDict, model_validator
|
||||||
|
|
||||||
@ -139,6 +139,7 @@ class Blob(BaseModel):
|
|||||||
"""
|
"""
|
||||||
return cls(data=data, mimetype=mime_type, encoding=encoding, path=path)
|
return cls(data=data, mimetype=mime_type, encoding=encoding, path=path)
|
||||||
|
|
||||||
|
@override
|
||||||
def __repr__(self) -> str:
|
def __repr__(self) -> str:
|
||||||
"""Define the blob representation."""
|
"""Define the blob representation."""
|
||||||
str_repr = f"Blob {id(self)}"
|
str_repr = f"Blob {id(self)}"
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
"""Abstract interface for document loader implementations."""
|
"""Abstract interface for document loader implementations."""
|
||||||
|
|
||||||
import csv
|
import csv
|
||||||
from typing import Any
|
from typing import Any, override
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
@ -33,6 +33,7 @@ class CSVExtractor(BaseExtractor):
|
|||||||
self.source_column = source_column
|
self.source_column = source_column
|
||||||
self.csv_args = csv_args or {}
|
self.csv_args = csv_args or {}
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
"""Load data into document objects."""
|
"""Load data into document objects."""
|
||||||
docs = []
|
docs = []
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
"""Abstract interface for document loader implementations."""
|
"""Abstract interface for document loader implementations."""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from typing import TypedDict
|
from typing import TypedDict, override
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from openpyxl import load_workbook
|
from openpyxl import load_workbook
|
||||||
@ -30,6 +30,7 @@ class ExcelExtractor(BaseExtractor):
|
|||||||
self._encoding = encoding
|
self._encoding = encoding
|
||||||
self._autodetect_encoding = autodetect_encoding
|
self._autodetect_encoding = autodetect_encoding
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
"""Load from Excel file in xls or xlsx format using Pandas and openpyxl."""
|
"""Load from Excel file in xls or xlsx format using Pandas and openpyxl."""
|
||||||
documents = []
|
documents = []
|
||||||
|
|||||||
@ -1,3 +1,5 @@
|
|||||||
|
from typing import override
|
||||||
|
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
from core.rag.models.document import Document
|
from core.rag.models.document import Document
|
||||||
from services.website_service import WebsiteService
|
from services.website_service import WebsiteService
|
||||||
@ -30,6 +32,7 @@ class FirecrawlWebExtractor(BaseExtractor):
|
|||||||
self.mode = mode
|
self.mode = mode
|
||||||
self.only_main_content = only_main_content
|
self.only_main_content = only_main_content
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
"""Extract content from the URL."""
|
"""Extract content from the URL."""
|
||||||
documents = []
|
documents = []
|
||||||
|
|||||||
@ -1,5 +1,7 @@
|
|||||||
"""Abstract interface for document loader implementations."""
|
"""Abstract interface for document loader implementations."""
|
||||||
|
|
||||||
|
from typing import override
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
@ -19,6 +21,7 @@ class HtmlExtractor(BaseExtractor):
|
|||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
self._file_path = file_path
|
self._file_path = file_path
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
return [Document(page_content=self._load_as_text())]
|
return [Document(page_content=self._load_as_text())]
|
||||||
|
|
||||||
|
|||||||
@ -1,3 +1,5 @@
|
|||||||
|
from typing import override
|
||||||
|
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
from core.rag.models.document import Document
|
from core.rag.models.document import Document
|
||||||
from services.website_service import WebsiteService
|
from services.website_service import WebsiteService
|
||||||
@ -23,6 +25,7 @@ class JinaReaderWebExtractor(BaseExtractor):
|
|||||||
self.mode = mode
|
self.mode = mode
|
||||||
self.only_main_content = only_main_content
|
self.only_main_content = only_main_content
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
"""Extract content from the URL."""
|
"""Extract content from the URL."""
|
||||||
documents = []
|
documents = []
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import override
|
||||||
|
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
from core.rag.extractor.helpers import detect_file_encodings
|
from core.rag.extractor.helpers import detect_file_encodings
|
||||||
@ -31,6 +32,7 @@ class MarkdownExtractor(BaseExtractor):
|
|||||||
self._encoding = encoding
|
self._encoding = encoding
|
||||||
self._autodetect_encoding = autodetect_encoding
|
self._autodetect_encoding = autodetect_encoding
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
"""Load from file path."""
|
"""Load from file path."""
|
||||||
tups = self.parse_tups(self._file_path)
|
tups = self.parse_tups(self._file_path)
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import operator
|
import operator
|
||||||
from typing import Any, cast
|
from typing import Any, cast, override
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from sqlalchemy import update
|
from sqlalchemy import update
|
||||||
@ -67,6 +67,7 @@ class NotionExtractor(BaseExtractor):
|
|||||||
|
|
||||||
self._notion_access_token = integration_token
|
self._notion_access_token = integration_token
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
self.update_last_edited_time(self._document_model)
|
self.update_last_edited_time(self._document_model)
|
||||||
|
|
||||||
|
|||||||
@ -5,6 +5,7 @@ import io
|
|||||||
import logging
|
import logging
|
||||||
import uuid
|
import uuid
|
||||||
from collections.abc import Iterator
|
from collections.abc import Iterator
|
||||||
|
from typing import override
|
||||||
|
|
||||||
import pypdfium2
|
import pypdfium2
|
||||||
import pypdfium2.raw as pdfium_c
|
import pypdfium2.raw as pdfium_c
|
||||||
@ -55,6 +56,7 @@ class PdfExtractor(BaseExtractor):
|
|||||||
self._user_id = user_id
|
self._user_id = user_id
|
||||||
self._file_cache_key = file_cache_key
|
self._file_cache_key = file_cache_key
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
plaintext_file_exists = False
|
plaintext_file_exists = False
|
||||||
if self._file_cache_key:
|
if self._file_cache_key:
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
"""Abstract interface for document loader implementations."""
|
"""Abstract interface for document loader implementations."""
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import override
|
||||||
|
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
from core.rag.extractor.helpers import detect_file_encodings
|
from core.rag.extractor.helpers import detect_file_encodings
|
||||||
@ -21,6 +22,7 @@ class TextExtractor(BaseExtractor):
|
|||||||
self._encoding = encoding
|
self._encoding = encoding
|
||||||
self._autodetect_encoding = autodetect_encoding
|
self._autodetect_encoding = autodetect_encoding
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
"""Load from file path."""
|
"""Load from file path."""
|
||||||
text = ""
|
text = ""
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
from typing import override
|
||||||
|
|
||||||
from configs import dify_config
|
from configs import dify_config
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
@ -17,6 +18,7 @@ class UnstructuredWordExtractor(BaseExtractor):
|
|||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
self._api_key = api_key
|
self._api_key = api_key
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
from unstructured.__version__ import __version__ as __unstructured_version__
|
from unstructured.__version__ import __version__ as __unstructured_version__
|
||||||
from unstructured.file_utils.filetype import (
|
from unstructured.file_utils.filetype import (
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
import base64
|
import base64
|
||||||
import contextlib
|
import contextlib
|
||||||
import logging
|
import logging
|
||||||
|
from typing import override
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
@ -23,6 +24,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
|
|||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
self._api_key = api_key
|
self._api_key = api_key
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
if self._api_url:
|
if self._api_url:
|
||||||
from unstructured.partition.api import partition_via_api
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from typing import override
|
||||||
|
|
||||||
import pypandoc # type: ignore
|
import pypandoc # type: ignore
|
||||||
|
|
||||||
@ -28,6 +29,7 @@ class UnstructuredEpubExtractor(BaseExtractor):
|
|||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
self._api_key = api_key
|
self._api_key = api_key
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
if self._api_url:
|
if self._api_url:
|
||||||
from unstructured.partition.api import partition_via_api
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from typing import override
|
||||||
|
|
||||||
from configs import dify_config
|
from configs import dify_config
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
@ -22,6 +23,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
|
|||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
self._api_key = api_key
|
self._api_key = api_key
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
if self._api_url:
|
if self._api_url:
|
||||||
from unstructured.partition.api import partition_via_api
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from typing import override
|
||||||
|
|
||||||
from configs import dify_config
|
from configs import dify_config
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
@ -21,6 +22,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
|
|||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
self._api_key = api_key
|
self._api_key = api_key
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
if self._api_url:
|
if self._api_url:
|
||||||
from unstructured.partition.api import partition_via_api
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from typing import override
|
||||||
|
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
from core.rag.models.document import Document
|
from core.rag.models.document import Document
|
||||||
@ -20,6 +21,7 @@ class UnstructuredPPTExtractor(BaseExtractor):
|
|||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
self._api_key = api_key
|
self._api_key = api_key
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
if self._api_url:
|
if self._api_url:
|
||||||
from unstructured.partition.api import partition_via_api
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from typing import override
|
||||||
|
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
from core.rag.models.document import Document
|
from core.rag.models.document import Document
|
||||||
@ -20,6 +21,7 @@ class UnstructuredPPTXExtractor(BaseExtractor):
|
|||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
self._api_key = api_key
|
self._api_key = api_key
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
if self._api_url:
|
if self._api_url:
|
||||||
from unstructured.partition.api import partition_via_api
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from typing import override
|
||||||
|
|
||||||
from configs import dify_config
|
from configs import dify_config
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
@ -21,6 +22,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
|
|||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
self._api_key = api_key
|
self._api_key = api_key
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
if self._api_url:
|
if self._api_url:
|
||||||
from unstructured.partition.api import partition_via_api
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
import json
|
import json
|
||||||
|
from typing import override
|
||||||
|
|
||||||
|
|
||||||
class WaterCrawlError(Exception):
|
class WaterCrawlError(Exception):
|
||||||
@ -18,15 +19,18 @@ class WaterCrawlBadRequestError(WaterCrawlError):
|
|||||||
def flat_errors(self):
|
def flat_errors(self):
|
||||||
return json.dumps(self.errors)
|
return json.dumps(self.errors)
|
||||||
|
|
||||||
|
@override
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return f"WaterCrawlBadRequestError: {self.message} \n {self.flat_errors}"
|
return f"WaterCrawlBadRequestError: {self.message} \n {self.flat_errors}"
|
||||||
|
|
||||||
|
|
||||||
class WaterCrawlPermissionError(WaterCrawlBadRequestError):
|
class WaterCrawlPermissionError(WaterCrawlBadRequestError):
|
||||||
|
@override
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return f"You are exceeding your WaterCrawl API limits. {self.message}"
|
return f"You are exceeding your WaterCrawl API limits. {self.message}"
|
||||||
|
|
||||||
|
|
||||||
class WaterCrawlAuthenticationError(WaterCrawlBadRequestError):
|
class WaterCrawlAuthenticationError(WaterCrawlBadRequestError):
|
||||||
|
@override
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return "WaterCrawl API key is invalid or expired. Please check your API key and try again."
|
return "WaterCrawl API key is invalid or expired. Please check your API key and try again."
|
||||||
|
|||||||
@ -1,3 +1,5 @@
|
|||||||
|
from typing import override
|
||||||
|
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
from core.rag.models.document import Document
|
from core.rag.models.document import Document
|
||||||
from services.website_service import WebsiteService
|
from services.website_service import WebsiteService
|
||||||
@ -31,6 +33,7 @@ class WaterCrawlWebExtractor(BaseExtractor):
|
|||||||
self.mode = mode
|
self.mode = mode
|
||||||
self.only_main_content = only_main_content
|
self.only_main_content = only_main_content
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
"""Extract content from the URL."""
|
"""Extract content from the URL."""
|
||||||
documents = []
|
documents = []
|
||||||
|
|||||||
@ -10,6 +10,7 @@ import os
|
|||||||
import re
|
import re
|
||||||
import tempfile
|
import tempfile
|
||||||
import uuid
|
import uuid
|
||||||
|
from typing import override
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from docx import Document as DocxDocument
|
from docx import Document as DocxDocument
|
||||||
@ -91,6 +92,7 @@ class WordExtractor(BaseExtractor):
|
|||||||
def __del__(self):
|
def __del__(self):
|
||||||
self.close()
|
self.close()
|
||||||
|
|
||||||
|
@override
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
"""Load given path as single page."""
|
"""Load given path as single page."""
|
||||||
content = self.parse_docx(self.file_path)
|
content = self.parse_docx(self.file_path)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user