dify/web/app/components/datasets/create/step-two/hooks/use-segmentation-state.ts
qiuqiua 9ef6b90843
feat: sync main branch (#31938)
Signed-off-by: majiayu000 <1835304752@qq.com>
Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: NeatGuyCoding <15627489+NeatGuyCoding@users.noreply.github.com>
Signed-off-by: -LAN- <laipz8200@outlook.com>
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com>
Co-authored-by: 盐粒 Yanli <yanli@dify.ai>
Co-authored-by: wangxiaolei <fatelei@gmail.com>
Co-authored-by: Stephen Zhou <38493346+hyoban@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cursx <33718736+Cursx@users.noreply.github.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: lif <1835304752@qq.com>
Co-authored-by: 非法操作 <hjlarry@163.com>
Co-authored-by: Asuka Minato <i@asukaminato.eu.org>
Co-authored-by: fenglin <790872612@qq.com>
Co-authored-by: qiaofenglin <qiaofenglin@baidu.com>
Co-authored-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: TomoOkuyama <49631611+TomoOkuyama@users.noreply.github.com>
Co-authored-by: Tomo Okuyama <tomo.okuyama@intersystems.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: zyssyz123 <916125788@qq.com>
Co-authored-by: hj24 <mambahj24@gmail.com>
Co-authored-by: Coding On Star <447357187@qq.com>
Co-authored-by: CodingOnStar <hanxujiang@dify.ai>
Co-authored-by: yyh <92089059+lyzno1@users.noreply.github.com>
Co-authored-by: Xiangxuan Qu <fghpdf@outlook.com>
Co-authored-by: fghpdf <fghpdf@users.noreply.github.com>
Co-authored-by: coopercoder <whitetiger0127@163.com>
Co-authored-by: zhaiguangpeng <zhaiguangpeng@didiglobal.com>
Co-authored-by: Junyan Qin (Chin) <rockchinq@gmail.com>
Co-authored-by: E.G <146701565+GlobalStar117@users.noreply.github.com>
Co-authored-by: GlobalStar117 <GlobalStar117@users.noreply.github.com>
Co-authored-by: Claude Haiku 4.5 <noreply@anthropic.com>
Co-authored-by: CodingOnStar <hanxujiang@dify.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: heyszt <270985384@qq.com>
Co-authored-by: NeatGuyCoding <15627489+NeatGuyCoding@users.noreply.github.com>
Co-authored-by: Yeuoly <45712896+Yeuoly@users.noreply.github.com>
Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Co-authored-by: moonpanda <chuanzegao@163.com>
Co-authored-by: warlocgao <warlocgao@tencent.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com>
Co-authored-by: KVOJJJin <jzongcode@gmail.com>
Co-authored-by: eux <euxx@users.noreply.github.com>
Co-authored-by: bangjiehan <bangjiehan@gmail.com>
Co-authored-by: FFXN <31929997+FFXN@users.noreply.github.com>
Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com>
Co-authored-by: Nie Ronghua <nieronghua@sf-express.com>
Co-authored-by: JQSevenMiao <141806521+JQSevenMiao@users.noreply.github.com>
Co-authored-by: jiasiqi <jiasiqi3@tal.com>
Co-authored-by: Seokrin Taron Sung <sungsjade@gmail.com>
Co-authored-by: CrabSAMA <40541269+CrabSAMA@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: yihong <zouzou0208@gmail.com>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com>
Co-authored-by: yessenia <yessenia.contact@gmail.com>
Co-authored-by: Jax <anobaka@qq.com>
Co-authored-by: niveshdandyan <155956228+niveshdandyan@users.noreply.github.com>
Co-authored-by: OSS Contributor <oss-contributor@example.com>
Co-authored-by: niveshdandyan <niveshdandyan@users.noreply.github.com>
Co-authored-by: Sean Kenneth Doherty <Smaster7772@gmail.com>
2026-02-04 19:04:24 +08:00

237 lines
7.4 KiB
TypeScript

import type { ParentMode, PreProcessingRule, ProcessRule, Rules, SummaryIndexSetting as SummaryIndexSettingType } from '@/models/datasets'
import { useCallback, useRef, useState } from 'react'
import { ChunkingMode, ProcessMode } from '@/models/datasets'
import escape from './escape'
import unescape from './unescape'
// Constants
export const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
export const DEFAULT_MAXIMUM_CHUNK_LENGTH = 1024
export const DEFAULT_OVERLAP = 50
export const MAXIMUM_CHUNK_TOKEN_LENGTH = Number.parseInt(
globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000',
10,
)
export type ParentChildConfig = {
chunkForContext: ParentMode
parent: {
delimiter: string
maxLength: number
}
child: {
delimiter: string
maxLength: number
}
}
export const defaultParentChildConfig: ParentChildConfig = {
chunkForContext: 'paragraph',
parent: {
delimiter: '\\n\\n',
maxLength: 1024,
},
child: {
delimiter: '\\n',
maxLength: 512,
},
}
export type UseSegmentationStateOptions = {
initialSegmentationType?: ProcessMode
initialSummaryIndexSetting?: SummaryIndexSettingType
}
export const useSegmentationState = (options: UseSegmentationStateOptions = {}) => {
const { initialSegmentationType, initialSummaryIndexSetting } = options
// Segmentation type (general or parent-child)
const [segmentationType, setSegmentationType] = useState<ProcessMode>(
initialSegmentationType ?? ProcessMode.general,
)
// General chunking settings
const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH)
const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(MAXIMUM_CHUNK_TOKEN_LENGTH)
const [overlap, setOverlap] = useState(DEFAULT_OVERLAP)
// Pre-processing rules
const [rules, setRules] = useState<PreProcessingRule[]>([])
const [defaultConfig, setDefaultConfig] = useState<Rules>()
const [summaryIndexSetting, setSummaryIndexSetting] = useState<SummaryIndexSettingType | undefined>(initialSummaryIndexSetting)
const summaryIndexSettingRef = useRef<SummaryIndexSettingType | undefined>(initialSummaryIndexSetting)
const handleSummaryIndexSettingChange = useCallback((payload: SummaryIndexSettingType) => {
setSummaryIndexSetting((prev) => {
const newSetting = { ...prev, ...payload }
summaryIndexSettingRef.current = newSetting
return newSetting
})
}, [])
// Parent-child config
const [parentChildConfig, setParentChildConfig] = useState<ParentChildConfig>(defaultParentChildConfig)
// Escaped segment identifier setter
const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => {
if (value) {
doSetSegmentIdentifier(escape(value))
}
else {
doSetSegmentIdentifier(canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER)
}
}, [])
// Rule toggle handler
const toggleRule = useCallback((id: string) => {
setRules(prev => prev.map(rule =>
rule.id === id ? { ...rule, enabled: !rule.enabled } : rule,
))
}, [])
// Reset to defaults
const resetToDefaults = useCallback(() => {
if (defaultConfig) {
setSegmentIdentifier(defaultConfig.segmentation.separator)
setMaxChunkLength(defaultConfig.segmentation.max_tokens)
setOverlap(defaultConfig.segmentation.chunk_overlap!)
setRules(defaultConfig.pre_processing_rules)
}
setParentChildConfig(defaultParentChildConfig)
}, [defaultConfig, setSegmentIdentifier])
// Apply config from document detail
const applyConfigFromRules = useCallback((rulesConfig: Rules, isHierarchical: boolean) => {
const separator = rulesConfig.segmentation.separator
const max = rulesConfig.segmentation.max_tokens
const chunkOverlap = rulesConfig.segmentation.chunk_overlap
setSegmentIdentifier(separator)
setMaxChunkLength(max)
setOverlap(chunkOverlap!)
setRules(rulesConfig.pre_processing_rules)
setDefaultConfig(rulesConfig)
if (isHierarchical) {
setParentChildConfig({
chunkForContext: rulesConfig.parent_mode || 'paragraph',
parent: {
delimiter: escape(rulesConfig.segmentation.separator),
maxLength: rulesConfig.segmentation.max_tokens,
},
child: {
delimiter: escape(rulesConfig.subchunk_segmentation!.separator),
maxLength: rulesConfig.subchunk_segmentation!.max_tokens,
},
})
}
}, [setSegmentIdentifier])
// Get process rule for API
const getProcessRule = useCallback((docForm: ChunkingMode): ProcessRule => {
if (docForm === ChunkingMode.parentChild) {
return {
rules: {
pre_processing_rules: rules,
segmentation: {
separator: unescape(parentChildConfig.parent.delimiter),
max_tokens: parentChildConfig.parent.maxLength,
},
parent_mode: parentChildConfig.chunkForContext,
subchunk_segmentation: {
separator: unescape(parentChildConfig.child.delimiter),
max_tokens: parentChildConfig.child.maxLength,
},
},
mode: 'hierarchical',
summary_index_setting: summaryIndexSettingRef.current,
} as ProcessRule
}
return {
rules: {
pre_processing_rules: rules,
segmentation: {
separator: unescape(segmentIdentifier),
max_tokens: maxChunkLength,
chunk_overlap: overlap,
},
},
mode: segmentationType,
summary_index_setting: summaryIndexSettingRef.current,
} as ProcessRule
}, [rules, parentChildConfig, segmentIdentifier, maxChunkLength, overlap, segmentationType])
// Update parent config field
const updateParentConfig = useCallback((field: 'delimiter' | 'maxLength', value: string | number) => {
setParentChildConfig((prev) => {
let newValue: string | number
if (field === 'delimiter')
newValue = value ? escape(value as string) : ''
else
newValue = value
return {
...prev,
parent: { ...prev.parent, [field]: newValue },
}
})
}, [])
// Update child config field
const updateChildConfig = useCallback((field: 'delimiter' | 'maxLength', value: string | number) => {
setParentChildConfig((prev) => {
let newValue: string | number
if (field === 'delimiter')
newValue = value ? escape(value as string) : ''
else
newValue = value
return {
...prev,
child: { ...prev.child, [field]: newValue },
}
})
}, [])
// Set chunk for context mode
const setChunkForContext = useCallback((mode: ParentMode) => {
setParentChildConfig(prev => ({ ...prev, chunkForContext: mode }))
}, [])
return {
// General chunking state
segmentationType,
setSegmentationType,
segmentIdentifier,
setSegmentIdentifier,
maxChunkLength,
setMaxChunkLength,
limitMaxChunkLength,
setLimitMaxChunkLength,
overlap,
setOverlap,
// Rules
rules,
setRules,
defaultConfig,
setDefaultConfig,
toggleRule,
summaryIndexSetting,
handleSummaryIndexSettingChange,
// Parent-child config
parentChildConfig,
setParentChildConfig,
updateParentConfig,
updateChildConfig,
setChunkForContext,
// Actions
resetToDefaults,
applyConfigFromRules,
getProcessRule,
}
}
export type SegmentationState = ReturnType<typeof useSegmentationState>