mirror of
https://github.com/langgenius/dify.git
synced 2026-05-13 08:57:28 +08:00
Signed-off-by: majiayu000 <1835304752@qq.com> Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: NeatGuyCoding <15627489+NeatGuyCoding@users.noreply.github.com> Signed-off-by: -LAN- <laipz8200@outlook.com> Signed-off-by: yihong0618 <zouzou0208@gmail.com> Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com> Co-authored-by: 盐粒 Yanli <yanli@dify.ai> Co-authored-by: wangxiaolei <fatelei@gmail.com> Co-authored-by: Stephen Zhou <38493346+hyoban@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cursx <33718736+Cursx@users.noreply.github.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: lif <1835304752@qq.com> Co-authored-by: 非法操作 <hjlarry@163.com> Co-authored-by: Asuka Minato <i@asukaminato.eu.org> Co-authored-by: fenglin <790872612@qq.com> Co-authored-by: qiaofenglin <qiaofenglin@baidu.com> Co-authored-by: -LAN- <laipz8200@outlook.com> Co-authored-by: TomoOkuyama <49631611+TomoOkuyama@users.noreply.github.com> Co-authored-by: Tomo Okuyama <tomo.okuyama@intersystems.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: zyssyz123 <916125788@qq.com> Co-authored-by: hj24 <mambahj24@gmail.com> Co-authored-by: Coding On Star <447357187@qq.com> Co-authored-by: CodingOnStar <hanxujiang@dify.ai> Co-authored-by: yyh <92089059+lyzno1@users.noreply.github.com> Co-authored-by: Xiangxuan Qu <fghpdf@outlook.com> Co-authored-by: fghpdf <fghpdf@users.noreply.github.com> Co-authored-by: coopercoder <whitetiger0127@163.com> Co-authored-by: zhaiguangpeng <zhaiguangpeng@didiglobal.com> Co-authored-by: Junyan Qin (Chin) <rockchinq@gmail.com> Co-authored-by: E.G <146701565+GlobalStar117@users.noreply.github.com> Co-authored-by: GlobalStar117 <GlobalStar117@users.noreply.github.com> Co-authored-by: Claude Haiku 4.5 <noreply@anthropic.com> Co-authored-by: CodingOnStar 
<hanxujiang@dify.com> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com> Co-authored-by: heyszt <270985384@qq.com> Co-authored-by: NeatGuyCoding <15627489+NeatGuyCoding@users.noreply.github.com> Co-authored-by: Yeuoly <45712896+Yeuoly@users.noreply.github.com> Co-authored-by: zxhlyh <jasonapring2015@outlook.com> Co-authored-by: moonpanda <chuanzegao@163.com> Co-authored-by: warlocgao <warlocgao@tencent.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com> Co-authored-by: KVOJJJin <jzongcode@gmail.com> Co-authored-by: eux <euxx@users.noreply.github.com> Co-authored-by: bangjiehan <bangjiehan@gmail.com> Co-authored-by: FFXN <31929997+FFXN@users.noreply.github.com> Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com> Co-authored-by: Nie Ronghua <nieronghua@sf-express.com> Co-authored-by: JQSevenMiao <141806521+JQSevenMiao@users.noreply.github.com> Co-authored-by: jiasiqi <jiasiqi3@tal.com> Co-authored-by: Seokrin Taron Sung <sungsjade@gmail.com> Co-authored-by: CrabSAMA <40541269+CrabSAMA@users.noreply.github.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: yihong <zouzou0208@gmail.com> Co-authored-by: Joel <iamjoel007@gmail.com> Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com> Co-authored-by: yessenia <yessenia.contact@gmail.com> Co-authored-by: Jax <anobaka@qq.com> Co-authored-by: niveshdandyan <155956228+niveshdandyan@users.noreply.github.com> Co-authored-by: OSS Contributor <oss-contributor@example.com> Co-authored-by: niveshdandyan <niveshdandyan@users.noreply.github.com> Co-authored-by: Sean Kenneth Doherty <Smaster7772@gmail.com>
237 lines
7.4 KiB
TypeScript
237 lines
7.4 KiB
TypeScript
import type { ParentMode, PreProcessingRule, ProcessRule, Rules, SummaryIndexSetting as SummaryIndexSettingType } from '@/models/datasets'
|
|
import { useCallback, useRef, useState } from 'react'
|
|
import { ChunkingMode, ProcessMode } from '@/models/datasets'
|
|
import escape from './escape'
|
|
import unescape from './unescape'
|
|
|
|
// Constants

/** Initial value of the general segment identifier state (stored in its escaped textual form). */
export const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'

/** Initial value of the general "maximum chunk length" state. */
export const DEFAULT_MAXIMUM_CHUNK_LENGTH = 1024

/** Initial value of the general chunk overlap state. */
export const DEFAULT_OVERLAP = 50

/** Limit used when the page does not provide a usable override. */
const FALLBACK_MAXIMUM_CHUNK_TOKEN_LENGTH = 4000

/**
 * Parses the raw attribute value that configures the maximum chunk length limit.
 *
 * @param raw - Attribute text, or `null`/`undefined` when the attribute (or the DOM) is absent.
 * @returns The parsed base-10 integer, or the fallback (4000) when the value is
 * missing, empty, not a number, or not positive.
 */
const parseMaximumChunkTokenLength = (raw: string | null | undefined): number => {
  // `||` (not `??`) on purpose: an empty attribute must also use the fallback.
  const parsed = Number.parseInt(raw || String(FALLBACK_MAXIMUM_CHUNK_TOKEN_LENGTH), 10)
  // A NaN or non-positive limit would make every chunk length invalid downstream.
  if (Number.isNaN(parsed) || parsed <= 0)
    return FALLBACK_MAXIMUM_CHUNK_TOKEN_LENGTH
  return parsed
}

/**
 * Upper limit for the chunk length, read once at module load from the
 * `data-public-indexing-max-segmentation-tokens-length` attribute on `<body>`.
 * Optional chaining keeps this safe where `document` is not defined.
 */
export const MAXIMUM_CHUNK_TOKEN_LENGTH = parseMaximumChunkTokenLength(
  globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length'),
)
|
|
|
|
/** Delimiter and maximum length for one level (parent or child) of the hierarchy. */
type ChunkLevelSettings = {
  delimiter: string
  maxLength: number
}

/**
 * Settings for parent-child chunking: the parent mode used for context plus
 * one delimiter/length pair for each of the two levels.
 */
export type ParentChildConfig = {
  chunkForContext: ParentMode
  parent: ChunkLevelSettings
  child: ChunkLevelSettings
}
|
|
|
|
/**
 * Parent-child configuration used as the initial hook state and restored by
 * `resetToDefaults`. Delimiters are kept in their escaped textual form.
 */
export const defaultParentChildConfig: ParentChildConfig = {
  chunkForContext: 'paragraph',
  // Parent level
  parent: { delimiter: '\\n\\n', maxLength: 1024 },
  // Child level
  child: { delimiter: '\\n', maxLength: 512 },
}
|
|
|
|
/** Optional seed values accepted by `useSegmentationState`. */
export type UseSegmentationStateOptions = {
  /** Initial segmentation type; the hook uses `ProcessMode.general` when omitted. */
  initialSegmentationType?: ProcessMode
  /** Initial summary index setting; seeds both the state and the ref kept by the hook. */
  initialSummaryIndexSetting?: SummaryIndexSettingType
}
|
|
|
|
/**
 * Normalises a value before it is stored in one level of the parent-child config.
 * Delimiters are stored escaped (an empty delimiter stays empty); `maxLength`
 * values are stored unchanged.
 */
const toStoredChunkConfigValue = (field: 'delimiter' | 'maxLength', value: string | number): string | number => {
  if (field !== 'delimiter')
    return value
  return value ? escape(value as string) : ''
}

/**
 * React hook holding the segmentation (chunking) form state: segmentation type,
 * general chunking settings, pre-processing rules, summary index setting and the
 * parent-child configuration, together with the actions that modify them.
 *
 * @param options - Optional initial segmentation type and summary index setting.
 * @returns The state values, their setters, and the derived actions
 * (`resetToDefaults`, `applyConfigFromRules`, `getProcessRule`, ...).
 */
export const useSegmentationState = (options: UseSegmentationStateOptions = {}) => {
  const { initialSegmentationType, initialSummaryIndexSetting } = options

  // Segmentation type (general or parent-child)
  const [segmentationType, setSegmentationType] = useState<ProcessMode>(
    initialSegmentationType ?? ProcessMode.general,
  )

  // General chunking settings
  const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
  const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH)
  const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(MAXIMUM_CHUNK_TOKEN_LENGTH)
  const [overlap, setOverlap] = useState(DEFAULT_OVERLAP)

  // Pre-processing rules
  const [rules, setRules] = useState<PreProcessingRule[]>([])
  const [defaultConfig, setDefaultConfig] = useState<Rules>()

  // Summary index setting: state drives rendering, the ref lets `getProcessRule`
  // read the latest value without listing it as a dependency.
  const [summaryIndexSetting, setSummaryIndexSetting] = useState<SummaryIndexSettingType | undefined>(initialSummaryIndexSetting)
  const summaryIndexSettingRef = useRef<SummaryIndexSettingType | undefined>(initialSummaryIndexSetting)

  /** Merges `payload` into the current summary index setting. */
  const handleSummaryIndexSettingChange = useCallback((payload: SummaryIndexSettingType) => {
    // Merge against the ref and update it synchronously: state updater functions
    // must stay pure, and React may run them later (or twice in Strict Mode), so
    // mutating the ref inside an updater could leave it stale for a
    // `getProcessRule` call made in the same tick.
    const newSetting = { ...summaryIndexSettingRef.current, ...payload }
    summaryIndexSettingRef.current = newSetting
    setSummaryIndexSetting(newSetting)
  }, [])

  // Parent-child config
  const [parentChildConfig, setParentChildConfig] = useState<ParentChildConfig>(defaultParentChildConfig)

  /**
   * Escaped segment identifier setter. An empty `value` resets the identifier
   * to the default unless `canEmpty` is set, in which case it is stored as ''.
   */
  const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => {
    if (value) {
      doSetSegmentIdentifier(escape(value))
    }
    else {
      doSetSegmentIdentifier(canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER)
    }
  }, [])

  /** Flips the `enabled` flag of the pre-processing rule with the given id. */
  const toggleRule = useCallback((id: string) => {
    setRules(prev => prev.map(rule =>
      rule.id === id ? { ...rule, enabled: !rule.enabled } : rule,
    ))
  }, [])

  /**
   * Restores the general settings from `defaultConfig` (when one has been stored)
   * and always restores the default parent-child configuration.
   */
  const resetToDefaults = useCallback(() => {
    if (defaultConfig) {
      const { segmentation } = defaultConfig
      setSegmentIdentifier(segmentation.separator)
      setMaxChunkLength(segmentation.max_tokens)
      // `chunk_overlap` may be absent; never store `undefined` in number state.
      setOverlap(segmentation.chunk_overlap ?? DEFAULT_OVERLAP)
      setRules(defaultConfig.pre_processing_rules)
    }
    setParentChildConfig(defaultParentChildConfig)
  }, [defaultConfig, setSegmentIdentifier])

  /**
   * Applies a rules object (e.g. from document detail) to the state and stores
   * it as the new `defaultConfig`.
   *
   * @param rulesConfig - Rules to apply.
   * @param isHierarchical - When true, the parent-child config is also populated.
   */
  const applyConfigFromRules = useCallback((rulesConfig: Rules, isHierarchical: boolean) => {
    const { segmentation, subchunk_segmentation: subchunkSegmentation } = rulesConfig

    setSegmentIdentifier(segmentation.separator)
    setMaxChunkLength(segmentation.max_tokens)
    // `chunk_overlap` may be absent; fall back instead of storing `undefined`.
    setOverlap(segmentation.chunk_overlap ?? DEFAULT_OVERLAP)
    setRules(rulesConfig.pre_processing_rules)
    setDefaultConfig(rulesConfig)

    if (isHierarchical) {
      setParentChildConfig({
        chunkForContext: rulesConfig.parent_mode || 'paragraph',
        parent: {
          delimiter: escape(segmentation.separator),
          maxLength: segmentation.max_tokens,
        },
        // Guard against rules without child segmentation instead of throwing.
        child: subchunkSegmentation
          ? {
              delimiter: escape(subchunkSegmentation.separator),
              maxLength: subchunkSegmentation.max_tokens,
            }
          : { ...defaultParentChildConfig.child },
      })
    }
  }, [setSegmentIdentifier])

  /**
   * Builds the process rule payload for the API from the current state.
   *
   * @param docForm - Chunking mode; `ChunkingMode.parentChild` produces a
   * 'hierarchical' rule, anything else uses the general settings.
   */
  const getProcessRule = useCallback((docForm: ChunkingMode): ProcessRule => {
    if (docForm === ChunkingMode.parentChild) {
      return {
        rules: {
          pre_processing_rules: rules,
          segmentation: {
            separator: unescape(parentChildConfig.parent.delimiter),
            max_tokens: parentChildConfig.parent.maxLength,
          },
          parent_mode: parentChildConfig.chunkForContext,
          subchunk_segmentation: {
            separator: unescape(parentChildConfig.child.delimiter),
            max_tokens: parentChildConfig.child.maxLength,
          },
        },
        mode: 'hierarchical',
        summary_index_setting: summaryIndexSettingRef.current,
      } as ProcessRule
    }

    return {
      rules: {
        pre_processing_rules: rules,
        segmentation: {
          separator: unescape(segmentIdentifier),
          max_tokens: maxChunkLength,
          chunk_overlap: overlap,
        },
      },
      mode: segmentationType,
      summary_index_setting: summaryIndexSettingRef.current,
    } as ProcessRule
  }, [rules, parentChildConfig, segmentIdentifier, maxChunkLength, overlap, segmentationType])

  /** Updates one field of the parent level; delimiters are stored escaped. */
  const updateParentConfig = useCallback((field: 'delimiter' | 'maxLength', value: string | number) => {
    setParentChildConfig(prev => ({
      ...prev,
      parent: { ...prev.parent, [field]: toStoredChunkConfigValue(field, value) },
    }))
  }, [])

  /** Updates one field of the child level; delimiters are stored escaped. */
  const updateChildConfig = useCallback((field: 'delimiter' | 'maxLength', value: string | number) => {
    setParentChildConfig(prev => ({
      ...prev,
      child: { ...prev.child, [field]: toStoredChunkConfigValue(field, value) },
    }))
  }, [])

  /** Sets the parent mode used for context. */
  const setChunkForContext = useCallback((mode: ParentMode) => {
    setParentChildConfig(prev => ({ ...prev, chunkForContext: mode }))
  }, [])

  return {
    // General chunking state
    segmentationType,
    setSegmentationType,
    segmentIdentifier,
    setSegmentIdentifier,
    maxChunkLength,
    setMaxChunkLength,
    limitMaxChunkLength,
    setLimitMaxChunkLength,
    overlap,
    setOverlap,

    // Rules
    rules,
    setRules,
    defaultConfig,
    setDefaultConfig,
    toggleRule,
    summaryIndexSetting,
    handleSummaryIndexSettingChange,

    // Parent-child config
    parentChildConfig,
    setParentChildConfig,
    updateParentConfig,
    updateChildConfig,
    setChunkForContext,

    // Actions
    resetToDefaults,
    applyConfigFromRules,
    getProcessRule,
  }
}
|
|
|
|
/** Shape of the object returned by `useSegmentationState` (state values, setters and actions). */
export type SegmentationState = ReturnType<typeof useSegmentationState>
|