From 624db69f1223254b8ce8ba1aa6fa30adf3bf29b2 Mon Sep 17 00:00:00 2001 From: corevibe555 <45244658+corevibe555@users.noreply.github.com> Date: Wed, 8 Apr 2026 02:36:59 +0300 Subject: [PATCH] refactor(api): remove duplicated RAG entities from services layer (#34689) --- api/controllers/console/app/app.py | 4 +-- .../service_api/dataset/document.py | 4 +-- .../processor/paragraph_index_processor.py | 2 +- .../processor/parent_child_index_processor.py | 2 +- .../processor/qa_index_processor.py | 2 +- api/models/dataset.py | 2 +- .../knowledge_entities/knowledge_entities.py | 25 +------------------ .../rag_pipeline_entities.py | 19 +------------- api/services/hit_testing_service.py | 2 +- api/services/vector_service.py | 2 +- .../test_parent_child_index_processor.py | 2 +- .../services/dataset_service_test_helpers.py | 4 +-- .../services/document_service_validation.py | 4 +-- 13 files changed, 13 insertions(+), 61 deletions(-) diff --git a/api/controllers/console/app/app.py b/api/controllers/console/app/app.py index 32a5edbab1..c4b9bf6540 100644 --- a/api/controllers/console/app/app.py +++ b/api/controllers/console/app/app.py @@ -26,6 +26,7 @@ from controllers.console.wraps import ( setup_required, ) from core.ops.ops_trace_manager import OpsTraceManager +from core.rag.entities import PreProcessingRule, Rule, Segmentation from core.rag.retrieval.retrieval_methods import RetrievalMethod from core.trigger.constants import TRIGGER_NODE_TYPES from extensions.ext_database import db @@ -42,10 +43,7 @@ from services.entities.knowledge_entities.knowledge_entities import ( NotionIcon, NotionInfo, NotionPage, - PreProcessingRule, RerankingModel, - Rule, - Segmentation, WebsiteInfo, WeightKeywordSetting, WeightModel, diff --git a/api/controllers/service_api/dataset/document.py b/api/controllers/service_api/dataset/document.py index 2c094aa3e6..9f1ce17ed9 100644 --- a/api/controllers/service_api/dataset/document.py +++ b/api/controllers/service_api/dataset/document.py @@ -31,6 +31,7 @@ from controllers.service_api.wraps import ( cloud_edition_billing_resource_check, ) from core.errors.error import ProviderTokenNotInitError +from core.rag.entities import PreProcessingRule, Rule, Segmentation from core.rag.retrieval.retrieval_methods import RetrievalMethod from extensions.ext_database import db from fields.document_fields import document_fields, document_status_fields @@ -40,11 +41,8 @@ from models.enums import SegmentStatus from services.dataset_service import DatasetService, DocumentService from services.entities.knowledge_entities.knowledge_entities import ( KnowledgeConfig, - PreProcessingRule, ProcessRule, RetrievalModel, - Rule, - Segmentation, ) from services.file_service import FileService from services.summary_index_service import SummaryIndexService diff --git a/api/core/rag/index_processor/processor/paragraph_index_processor.py b/api/core/rag/index_processor/processor/paragraph_index_processor.py index 22ab492cbf..4a731bf277 100644 --- a/api/core/rag/index_processor/processor/paragraph_index_processor.py +++ b/api/core/rag/index_processor/processor/paragraph_index_processor.py @@ -32,6 +32,7 @@ from core.rag.datasource.keyword.keyword_factory import Keyword from core.rag.datasource.retrieval_service import RetrievalService from core.rag.datasource.vdb.vector_factory import Vector from core.rag.docstore.dataset_docstore import DatasetDocumentStore +from core.rag.entities import Rule from core.rag.extractor.entity.extract_setting import ExtractSetting from core.rag.extractor.extract_processor import ExtractProcessor from core.rag.index_processor.constant.doc_type import DocType @@ -49,7 +50,6 @@ from models.account import Account from models.dataset import Dataset, DatasetProcessRule, DocumentSegment, SegmentAttachmentBinding from models.dataset import Document as DatasetDocument from services.account_service import AccountService -from services.entities.knowledge_entities.knowledge_entities import Rule from services.summary_index_service import SummaryIndexService _file_access_controller = DatabaseFileAccessController() diff --git a/api/core/rag/index_processor/processor/parent_child_index_processor.py b/api/core/rag/index_processor/processor/parent_child_index_processor.py index 1c5e02e9c8..53596b5de8 100644 --- a/api/core/rag/index_processor/processor/parent_child_index_processor.py +++ b/api/core/rag/index_processor/processor/parent_child_index_processor.py @@ -17,6 +17,7 @@ from core.rag.data_post_processor.data_post_processor import RerankingModelDict from core.rag.datasource.retrieval_service import RetrievalService from core.rag.datasource.vdb.vector_factory import Vector from core.rag.docstore.dataset_docstore import DatasetDocumentStore +from core.rag.entities import ParentMode, Rule from core.rag.extractor.entity.extract_setting import ExtractSetting from core.rag.extractor.extract_processor import ExtractProcessor from core.rag.index_processor.constant.doc_type import DocType @@ -30,7 +31,6 @@ from models import Account from models.dataset import ChildChunk, Dataset, DatasetProcessRule, DocumentSegment from models.dataset import Document as DatasetDocument from services.account_service import AccountService -from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule from services.summary_index_service import SummaryIndexService logger = logging.getLogger(__name__) diff --git a/api/core/rag/index_processor/processor/qa_index_processor.py b/api/core/rag/index_processor/processor/qa_index_processor.py index 6874603a83..273ea0f852 100644 --- a/api/core/rag/index_processor/processor/qa_index_processor.py +++ b/api/core/rag/index_processor/processor/qa_index_processor.py @@ -19,6 +19,7 @@ from core.rag.data_post_processor.data_post_processor import RerankingModelDict from core.rag.datasource.retrieval_service import RetrievalService from core.rag.datasource.vdb.vector_factory import Vector from core.rag.docstore.dataset_docstore import DatasetDocumentStore +from core.rag.entities import Rule from core.rag.extractor.entity.extract_setting import ExtractSetting from core.rag.extractor.extract_processor import ExtractProcessor from core.rag.index_processor.constant.index_type import IndexStructureType, IndexTechniqueType @@ -30,7 +31,6 @@ from libs import helper from models.account import Account from models.dataset import Dataset, DocumentSegment from models.dataset import Document as DatasetDocument -from services.entities.knowledge_entities.knowledge_entities import Rule from services.summary_index_service import SummaryIndexService logger = logging.getLogger(__name__) diff --git a/api/models/dataset.py b/api/models/dataset.py index e323ccfd7f..97604848af 100644 --- a/api/models/dataset.py +++ b/api/models/dataset.py @@ -19,6 +19,7 @@ from sqlalchemy import DateTime, String, func, select from sqlalchemy.orm import Mapped, Session, mapped_column from configs import dify_config +from core.rag.entities import ParentMode, Rule from core.rag.index_processor.constant.built_in_field import BuiltInField, MetadataDataSource from core.rag.index_processor.constant.index_type import IndexStructureType, IndexTechniqueType from core.rag.index_processor.constant.query_type import QueryType @@ -26,7 +27,6 @@ from core.rag.retrieval.retrieval_methods import RetrievalMethod from core.tools.signature import sign_upload_file from extensions.ext_storage import storage from libs.uuid_utils import uuidv7 -from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule from .account import Account from .base import Base, TypeBase diff --git a/api/services/entities/knowledge_entities/knowledge_entities.py b/api/services/entities/knowledge_entities/knowledge_entities.py index 66309f0e59..cb38104e8c 100644 --- a/api/services/entities/knowledge_entities/knowledge_entities.py +++ b/api/services/entities/knowledge_entities/knowledge_entities.py @@ -1,17 +1,12 @@ -from enum import StrEnum from typing import Literal from pydantic import BaseModel, field_validator +from core.rag.entities import Rule from core.rag.index_processor.constant.index_type import IndexStructureType from core.rag.retrieval.retrieval_methods import RetrievalMethod -class ParentMode(StrEnum): - FULL_DOC = "full-doc" - PARAGRAPH = "paragraph" - - class NotionIcon(BaseModel): type: str url: str | None = None @@ -53,24 +48,6 @@ class DataSource(BaseModel): info_list: InfoList -class PreProcessingRule(BaseModel): - id: str - enabled: bool - - -class Segmentation(BaseModel): - separator: str = "\n" - max_tokens: int - chunk_overlap: int = 0 - - -class Rule(BaseModel): - pre_processing_rules: list[PreProcessingRule] | None = None - segmentation: Segmentation | None = None - parent_mode: Literal["full-doc", "paragraph"] | None = None - subchunk_segmentation: Segmentation | None = None - - class ProcessRule(BaseModel): mode: Literal["automatic", "custom", "hierarchical"] rules: Rule | None = None diff --git a/api/services/entities/knowledge_entities/rag_pipeline_entities.py b/api/services/entities/knowledge_entities/rag_pipeline_entities.py index 041ae4edba..07fbe963d6 100644 --- a/api/services/entities/knowledge_entities/rag_pipeline_entities.py +++ b/api/services/entities/knowledge_entities/rag_pipeline_entities.py @@ -2,6 +2,7 @@ from typing import Literal from pydantic import BaseModel, field_validator +from core.rag.entities import KeywordSetting, VectorSetting from core.rag.retrieval.retrieval_methods import RetrievalMethod @@ -36,24 +37,6 @@ class RerankingModelConfig(BaseModel): reranking_model_name: str | None = "" -class VectorSetting(BaseModel): - """ - Vector Setting. - """ - - vector_weight: float - embedding_provider_name: str - embedding_model_name: str - - -class KeywordSetting(BaseModel): - """ - Keyword Setting. - """ - - keyword_weight: float - - class WeightedScoreConfig(BaseModel): """ Weighted score Config. diff --git a/api/services/hit_testing_service.py b/api/services/hit_testing_service.py index fa7b0a533b..7e0100212a 100644 --- a/api/services/hit_testing_service.py +++ b/api/services/hit_testing_service.py @@ -60,7 +60,7 @@ class HitTestingService: if metadata_filtering_conditions and query: dataset_retrieval = DatasetRetrieval() - from core.app.app_config.entities import MetadataFilteringCondition + from core.rag.entities import MetadataFilteringCondition metadata_filtering_conditions = MetadataFilteringCondition.model_validate(metadata_filtering_conditions) diff --git a/api/services/vector_service.py b/api/services/vector_service.py index e7266cb8e9..9827c8dfbc 100644 --- a/api/services/vector_service.py +++ b/api/services/vector_service.py @@ -6,6 +6,7 @@ from sqlalchemy import delete, select from core.model_manager import ModelInstance, ModelManager from core.rag.datasource.keyword.keyword_factory import Keyword from core.rag.datasource.vdb.vector_factory import Vector +from core.rag.entities import ParentMode from core.rag.index_processor.constant.doc_type import DocType from core.rag.index_processor.constant.index_type import IndexStructureType, IndexTechniqueType from core.rag.index_processor.index_processor_base import BaseIndexProcessor @@ -15,7 +16,6 @@ from extensions.ext_database import db from models import UploadFile from models.dataset import ChildChunk, Dataset, DatasetProcessRule, DocumentSegment, SegmentAttachmentBinding from models.dataset import Document as DatasetDocument -from services.entities.knowledge_entities.knowledge_entities import ParentMode logger = logging.getLogger(__name__) diff --git a/api/tests/unit_tests/core/rag/indexing/processor/test_parent_child_index_processor.py b/api/tests/unit_tests/core/rag/indexing/processor/test_parent_child_index_processor.py index d363a0804d..c241b44d52 100644 --- a/api/tests/unit_tests/core/rag/indexing/processor/test_parent_child_index_processor.py +++ b/api/tests/unit_tests/core/rag/indexing/processor/test_parent_child_index_processor.py @@ -4,10 +4,10 @@ from unittest.mock import MagicMock, Mock, patch import pytest from core.entities.knowledge_entities import PreviewDetail +from core.rag.entities import ParentMode from core.rag.index_processor.constant.index_type import IndexTechniqueType from core.rag.index_processor.processor.parent_child_index_processor import ParentChildIndexProcessor from core.rag.models.document import AttachmentDocument, ChildDocument, Document -from services.entities.knowledge_entities.knowledge_entities import ParentMode class TestParentChildIndexProcessor: diff --git a/api/tests/unit_tests/services/dataset_service_test_helpers.py b/api/tests/unit_tests/services/dataset_service_test_helpers.py index ef73bc0e01..da557de8a4 100644 --- a/api/tests/unit_tests/services/dataset_service_test_helpers.py +++ b/api/tests/unit_tests/services/dataset_service_test_helpers.py @@ -14,6 +14,7 @@ from graphon.model_runtime.entities.model_entities import ModelFeature, ModelTyp from werkzeug.exceptions import Forbidden, NotFound from core.errors.error import LLMBadRequestError, ProviderTokenNotInitError +from core.rag.entities import PreProcessingRule, Rule, Segmentation from core.rag.index_processor.constant.built_in_field import BuiltInField from core.rag.index_processor.constant.index_type import IndexStructureType from core.rag.retrieval.retrieval_methods import RetrievalMethod @@ -44,12 +45,9 @@ from services.entities.knowledge_entities.knowledge_entities import ( NotionIcon, NotionInfo, NotionPage, - PreProcessingRule, ProcessRule, RerankingModel, RetrievalModel, - Rule, - Segmentation, SegmentUpdateArgs, WebsiteInfo, ) diff --git a/api/tests/unit_tests/services/document_service_validation.py b/api/tests/unit_tests/services/document_service_validation.py index 7c36e9d960..6903c47a24 100644 --- a/api/tests/unit_tests/services/document_service_validation.py +++ b/api/tests/unit_tests/services/document_service_validation.py @@ -112,6 +112,7 @@ import pytest from graphon.model_runtime.entities.model_entities import ModelType from core.errors.error import LLMBadRequestError, ProviderTokenNotInitError +from core.rag.entities import PreProcessingRule, Rule, Segmentation from core.rag.index_processor.constant.index_type import IndexStructureType, IndexTechniqueType from models.dataset import Dataset, DatasetProcessRule, Document from services.dataset_service import DatasetService, DocumentService @@ -122,10 +123,7 @@ from services.entities.knowledge_entities.knowledge_entities import ( KnowledgeConfig, NotionInfo, NotionPage, - PreProcessingRule, ProcessRule, - Rule, - Segmentation, WebsiteInfo, )