dify/api/core/rag/entities/processing_entities.py
Stephen Zhou c52eafe2ca
docs: enrich generated service API descriptions (#37615)
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
2026-06-18 08:43:39 +00:00

48 lines
1.4 KiB
Python

from enum import StrEnum
from typing import Annotated, Literal
from pydantic import BaseModel, Field, WithJsonSchema
class ParentMode(StrEnum):
FULL_DOC = "full-doc"
PARAGRAPH = "paragraph"
# Type alias for a pre-processing rule identifier.
#
# The underlying Python type is plain `str`; the attached `WithJsonSchema`
# metadata supplies a JSON schema that advertises the three known rule ids as
# a string enum.
# NOTE(review): this appears to affect the generated schema only -- the
# annotated type is still `str`, so confirm whether unknown ids are rejected
# somewhere else.
PreProcessingRuleID = Annotated[
    str,
    WithJsonSchema(
        {
            "enum": [
                "remove_stopwords",
                "remove_extra_spaces",
                "remove_urls_emails",
            ],
            "type": "string",
        }
    ),
]
# One pre-processing rule entry: a rule identifier plus an on/off flag.
# Neither field declares a default.
# NOTE(review): kept as `#` comments instead of a class docstring, since
# pydantic may copy a model docstring into the generated JSON schema -- confirm
# before converting.
class PreProcessingRule(BaseModel):
    # Identifier of the rule, typed as `PreProcessingRuleID`.
    id: PreProcessingRuleID = Field(
        description="Rule identifier.",
    )

    # Toggle for this rule.
    enabled: bool = Field(
        description="Whether this preprocessing rule is enabled.",
    )
# Chunking settings: the separator to split on, the per-chunk token limit,
# and the token overlap between chunks.
# NOTE(review): kept as `#` comments instead of a class docstring, since
# pydantic may copy a model docstring into the generated JSON schema -- confirm
# before converting.
class Segmentation(BaseModel):
    # Separator string; defaults to a single newline.
    separator: str = Field(
        description="Custom separator for splitting text.",
        default="\n",
    )

    # Token limit per chunk; no default is declared for this field.
    max_tokens: int = Field(
        description="Maximum token count per chunk.",
    )

    # Overlap between chunks, in tokens; defaults to 0.
    chunk_overlap: int = Field(
        description="Token overlap between chunks.",
        default=0,
    )
# Processing rule bundle: optional pre-processing rules, optional parent and
# child segmentation settings, and an optional parent mode. Every field
# defaults to None.
# NOTE(review): kept as `#` comments instead of a class docstring, since
# pydantic may copy a model docstring into the generated JSON schema -- confirm
# before converting.
class Rule(BaseModel):
    # Rules to run before segmentation, or None.
    pre_processing_rules: list[PreProcessingRule] | None = Field(
        description="Pre-processing rules to apply before segmentation.",
        default=None,
    )

    # Segmentation settings for parent chunks, or None.
    segmentation: Segmentation | None = Field(
        description="Parent chunk segmentation settings.",
        default=None,
    )

    # Either "full-doc" or "paragraph" (the same strings as the ParentMode
    # enum values), or None.
    parent_mode: Literal["full-doc", "paragraph"] | None = Field(
        description="Parent-child segmentation mode.",
        default=None,
    )

    # Segmentation settings for child chunks, or None.
    subchunk_segmentation: Segmentation | None = Field(
        description="Child chunk segmentation settings.",
        default=None,
    )