'use client' import type { FC } from 'react' import type { StepTwoProps } from './types' import { useCallback, useEffect, useState } from 'react' import { useTranslation } from 'react-i18next' import Divider from '@/app/components/base/divider' import Toast from '@/app/components/base/toast' import { useDatasetDetailContextWithSelector } from '@/context/dataset-detail' import { useLocale } from '@/context/i18n' import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints' import { LanguagesSupported } from '@/i18n-config/language' import { DataSourceProvider } from '@/models/common' import { ChunkingMode, ProcessMode } from '@/models/datasets' import { useFetchDefaultProcessRule } from '@/service/knowledge/use-create-dataset' import { cn } from '@/utils/classnames' import { GeneralChunkingOptions, IndexingModeSection, ParentChildOptions, PreviewPanel, StepTwoFooter } from './components' import { IndexingType, MAXIMUM_CHUNK_TOKEN_LENGTH, useDocumentCreation, useIndexingConfig, useIndexingEstimate, usePreviewState, useSegmentationState } from './hooks' export { IndexingType } const StepTwo: FC = ({ isSetting, documentDetail, isAPIKeySet, datasetId, indexingType: propsIndexingType, dataSourceType: inCreatePageDataSourceType, files, notionPages = [], notionCredentialId, websitePages = [], crawlOptions, websiteCrawlProvider = DataSourceProvider.jinaReader, websiteCrawlJobId = '', onStepChange, updateIndexingTypeCache, updateResultCache, onSave, onCancel, updateRetrievalMethodCache, }) => { const { t } = useTranslation() const locale = useLocale() const isMobile = useBreakpoints() === MediaType.mobile const currentDataset = useDatasetDetailContextWithSelector(s => s.dataset) const mutateDatasetRes = useDatasetDetailContextWithSelector(s => s.mutateDatasetRes) // Computed flags const isInUpload = Boolean(currentDataset) const isUploadInEmptyDataset = isInUpload && !currentDataset?.doc_form const isNotUploadInEmptyDataset = !isUploadInEmptyDataset const isInInit = !isInUpload && !isSetting const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type) const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : (currentDataset?.data_source_type ?? inCreatePageDataSourceType) const hasSetIndexType = !!propsIndexingType const isModelAndRetrievalConfigDisabled = !!datasetId && !!currentDataset?.data_source_type // Document form state const [docForm, setDocForm] = useState((datasetId && documentDetail) ? documentDetail.doc_form as ChunkingMode : ChunkingMode.text) const [docLanguage, setDocLanguage] = useState(() => (datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese Simplified')) const [isQAConfirmDialogOpen, setIsQAConfirmDialogOpen] = useState(false) const currentDocForm = currentDataset?.doc_form || docForm // Custom hooks const segmentation = useSegmentationState({ initialSegmentationType: currentDataset?.doc_form === ChunkingMode.parentChild ? ProcessMode.parentChild : ProcessMode.general, initialSummaryIndexSetting: currentDataset?.summary_index_setting, }) const showSummaryIndexSetting = !currentDataset const indexing = useIndexingConfig({ initialIndexType: propsIndexingType, initialEmbeddingModel: currentDataset?.embedding_model ? { provider: currentDataset.embedding_model_provider, model: currentDataset.embedding_model } : undefined, initialRetrievalConfig: currentDataset?.retrieval_model_dict, isAPIKeySet, hasSetIndexType, }) const preview = usePreviewState({ dataSourceType, files, notionPages, websitePages, documentDetail, datasetId }) const creation = useDocumentCreation({ datasetId, isSetting, documentDetail, dataSourceType, files, notionPages, notionCredentialId, websitePages, crawlOptions, websiteCrawlProvider, websiteCrawlJobId, onStepChange, updateIndexingTypeCache, updateResultCache, updateRetrievalMethodCache, onSave, mutateDatasetRes, }) const estimateHook = useIndexingEstimate({ dataSourceType, datasetId, currentDocForm, docLanguage, files, previewFileName: preview.previewFile?.name, previewNotionPage: preview.previewNotionPage, notionCredentialId, previewWebsitePage: preview.previewWebsitePage, crawlOptions, websiteCrawlProvider, websiteCrawlJobId, indexingTechnique: indexing.getIndexingTechnique() as IndexingType, processRule: segmentation.getProcessRule(currentDocForm), }) // Fetch default process rule const fetchDefaultProcessRuleMutation = useFetchDefaultProcessRule({ onSuccess(data) { segmentation.setSegmentIdentifier(data.rules.segmentation.separator) segmentation.setMaxChunkLength(data.rules.segmentation.max_tokens) segmentation.setOverlap(data.rules.segmentation.chunk_overlap!) segmentation.setRules(data.rules.pre_processing_rules) segmentation.setDefaultConfig(data.rules) segmentation.setLimitMaxChunkLength(data.limits.indexing_max_segmentation_tokens_length) }, }) // Event handlers const handleDocFormChange = useCallback((value: ChunkingMode) => { if (value === ChunkingMode.qa && indexing.indexType === IndexingType.ECONOMICAL) { setIsQAConfirmDialogOpen(true) return } if (value === ChunkingMode.parentChild && indexing.indexType === IndexingType.ECONOMICAL) indexing.setIndexType(IndexingType.QUALIFIED) setDocForm(value) segmentation.setSegmentationType(value === ChunkingMode.parentChild ? ProcessMode.parentChild : ProcessMode.general) estimateHook.reset() }, [indexing, segmentation, estimateHook]) const updatePreview = useCallback(() => { if (segmentation.segmentationType === ProcessMode.general && segmentation.maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) { Toast.notify({ type: 'error', message: t('stepTwo.maxLengthCheck', { ns: 'datasetCreation', limit: MAXIMUM_CHUNK_TOKEN_LENGTH }) }) return } estimateHook.fetchEstimate() }, [segmentation, t, estimateHook]) const handleCreate = useCallback(async () => { const isValid = creation.validateParams({ segmentationType: segmentation.segmentationType, maxChunkLength: segmentation.maxChunkLength, limitMaxChunkLength: segmentation.limitMaxChunkLength, overlap: segmentation.overlap, indexType: indexing.indexType, embeddingModel: indexing.embeddingModel, rerankModelList: indexing.rerankModelList, retrievalConfig: indexing.retrievalConfig, }) if (!isValid) return const params = creation.buildCreationParams(currentDocForm, docLanguage, segmentation.getProcessRule(currentDocForm), indexing.retrievalConfig, indexing.embeddingModel, indexing.getIndexingTechnique(), segmentation.summaryIndexSetting) if (!params) return await creation.executeCreation(params, indexing.indexType, indexing.retrievalConfig) }, [creation, segmentation, indexing, currentDocForm, docLanguage]) const handlePickerChange = useCallback((selected: { id: string, name: string }) => { estimateHook.reset() preview.handlePreviewChange(selected) estimateHook.fetchEstimate() }, [estimateHook, preview]) const handleQAConfirm = useCallback(() => { setIsQAConfirmDialogOpen(false) indexing.setIndexType(IndexingType.QUALIFIED) setDocForm(ChunkingMode.qa) }, [indexing]) // Initialize rules useEffect(() => { if (!isSetting) { fetchDefaultProcessRuleMutation.mutate('/datasets/process-rule') } else if (documentDetail) { const rules = documentDetail.dataset_process_rule.rules const isHierarchical = documentDetail.doc_form === ChunkingMode.parentChild || Boolean(rules.parent_mode && rules.subchunk_segmentation) segmentation.applyConfigFromRules(rules, isHierarchical) segmentation.setSegmentationType(documentDetail.dataset_process_rule.mode) } // eslint-disable-next-line react-hooks/exhaustive-deps }, []) // Show options conditions const showGeneralOption = (isInUpload && [ChunkingMode.text, ChunkingMode.qa].includes(currentDataset!.doc_form)) || isUploadInEmptyDataset || isInInit const showParentChildOption = (isInUpload && currentDataset!.doc_form === ChunkingMode.parentChild) || isUploadInEmptyDataset || isInInit return (
{t('stepTwo.segmentation', { ns: 'datasetCreation' })}
{showGeneralOption && ( segmentation.setSegmentIdentifier(value, true)} onMaxChunkLengthChange={segmentation.setMaxChunkLength} onOverlapChange={segmentation.setOverlap} onRuleToggle={segmentation.toggleRule} onDocFormChange={handleDocFormChange} onDocLanguageChange={setDocLanguage} onPreview={updatePreview} onReset={segmentation.resetToDefaults} locale={locale} showSummaryIndexSetting={showSummaryIndexSetting} summaryIndexSetting={segmentation.summaryIndexSetting} onSummaryIndexSettingChange={segmentation.handleSummaryIndexSettingChange} /> )} {showParentChildOption && ( segmentation.updateParentConfig('delimiter', v)} onParentMaxLengthChange={v => segmentation.updateParentConfig('maxLength', v)} onChildDelimiterChange={v => segmentation.updateChildConfig('delimiter', v)} onChildMaxLengthChange={v => segmentation.updateChildConfig('maxLength', v)} onRuleToggle={segmentation.toggleRule} onPreview={updatePreview} onReset={segmentation.resetToDefaults} showSummaryIndexSetting={showSummaryIndexSetting} summaryIndexSetting={segmentation.summaryIndexSetting} onSummaryIndexSettingChange={segmentation.handleSummaryIndexSettingChange} /> )} setIsQAConfirmDialogOpen(false)} onQAConfirmDialogConfirm={handleQAConfirm} /> onStepChange?.(-1)} onCreate={handleCreate} onCancel={onCancel} />
} pickerValue={preview.getPreviewPickerValue()} isIdle={estimateHook.isIdle} isPending={estimateHook.isPending} onPickerChange={handlePickerChange} />
) } export default StepTwo