diff --git a/web/app/components/datasets/create/step-two/index.tsx b/web/app/components/datasets/create/step-two/index.tsx index 9ecd885c51..5e50218de6 100644 --- a/web/app/components/datasets/create/step-two/index.tsx +++ b/web/app/components/datasets/create/step-two/index.tsx @@ -27,7 +27,7 @@ import { OptionCard } from './option-card' import LanguageSelect from './language-select' import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs' import cn from '@/utils/classnames' -import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FullDocumentDetail, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets' +import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FullDocumentDetail, ParentMode, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets' import Button from '@/app/components/base/button' import FloatRightContainer from '@/app/components/base/float-right-container' @@ -38,7 +38,7 @@ import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/componen import Toast from '@/app/components/base/toast' import type { NotionPage } from '@/models/common' import { DataSourceProvider } from '@/models/common' -import { DataSourceType, DocForm } from '@/models/datasets' +import { ChuckingMode, DataSourceType } from '@/models/datasets' import { useDatasetDetailContext } from '@/context/dataset-detail' import I18n from '@/context/i18n' import { RETRIEVE_METHOD } from '@/types/app' @@ -96,7 +96,7 @@ export enum IndexingType { const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n' type ParentChildConfig = { - chunkForContext: 'paragraph' | 'full_doc' + chunkForContext: ParentMode parent: { delimiter: string maxLength: number @@ -168,8 +168,8 @@ const StepTwo = ({ // QA Related const [isLanguageSelectDisabled, setIsLanguageSelectDisabled] = useState(false) - const [docForm, setDocForm] = useState( - (datasetId && documentDetail) ? documentDetail.doc_form : DocForm.TEXT, + const [docForm, setDocForm] = useState( + (datasetId && documentDetail) ? documentDetail.doc_form as ChuckingMode : ChuckingMode.text, ) const [docLanguage, setDocLanguage] = useState( @@ -181,27 +181,28 @@ const StepTwo = ({ const getIndexing_technique = () => indexingType || indexType const getProcessRule = () => { - const processRule: ProcessRule = { - rules: {} as any, // api will check this. It will be removed after api refactored. - mode: segmentationType, - } - if (segmentationType === SegmentType.CUSTOM) { - const ruleObj = { + return { + rules: { pre_processing_rules: rules, segmentation: { separator: unescape(segmentIdentifier), max_tokens: maxChunkLength, chunk_overlap: overlap, }, - } - // @ts-expect-error will be removed after api refactored. - processRule.rules = ruleObj - } - return processRule + parent_mode: parentChildConfig.chunkForContext, + subchunk_segmentation: { + separator: parentChildConfig.child.delimiter, + max_tokens: parentChildConfig.child.maxLength, + }, + }, // api will check this. It will be removed after api refactored. + mode: docForm === ChuckingMode.parentChild + ? 'hierarchical' + : segmentationType, + } as ProcessRule } const fileIndexingEstimateQuery = useFetchFileIndexingEstimateForFile({ - docForm: docForm as DocForm, + docForm, docLanguage, dataSourceType: DataSourceType.FILE, files, @@ -210,7 +211,7 @@ const StepTwo = ({ dataset_id: datasetId!, }) const notionIndexingEstimateQuery = useFetchFileIndexingEstimateForNotion({ - docForm: docForm as DocForm, + docForm, docLanguage, dataSourceType: DataSourceType.NOTION, notionPages, @@ -220,7 +221,7 @@ const StepTwo = ({ }) const websiteIndexingEstimateQuery = useFetchFileIndexingEstimateForWeb({ - docForm: docForm as DocForm, + docForm, docLanguage, dataSourceType: DataSourceType.WEB, websitePages, @@ -481,29 +482,10 @@ const StepTwo = ({ isSetting && onSave && onSave() } - const handleDocformSwitch = (isQAMode: boolean) => { - if (isQAMode) - setDocForm(DocForm.QA) - else - setDocForm(DocForm.TEXT) - } - - const previewSwitch = () => { - setIsLanguageSelectDisabled(true) - fetchEstimate() - } - - const handleSelect = (language: string) => { - setDocLanguage(language) - // Switch language, re-cutter - if (docForm === DocForm.QA) - previewSwitch() - } - const changeToEconomicalType = () => { if (!hasSetIndexType) { setIndexType(IndexingType.ECONOMICAL) - setDocForm(DocForm.TEXT) + setDocForm(ChuckingMode.text) } } @@ -520,8 +502,8 @@ const StepTwo = ({ }, []) useEffect(() => { - if (indexingType === IndexingType.ECONOMICAL && docForm === DocForm.QA) - setDocForm(DocForm.TEXT) + if (indexingType === IndexingType.ECONOMICAL && docForm === ChuckingMode.qa) + setDocForm(ChuckingMode.text) }, [indexingType, docForm]) useEffect(() => { @@ -557,8 +539,8 @@ const StepTwo = ({ icon={{t('datasetCreation.stepTwo.general')}} activeHeaderClassName='bg-gradient-to-r from-[#EFF0F9] to-[#F9FAFB]' description={t('datasetCreation.stepTwo.generalTip')} - isActive={SegmentType.AUTO === segmentationType} - onClick={() => setSegmentationType(SegmentType.AUTO)} + isActive={docForm === ChuckingMode.qa || docForm === ChuckingMode.text} + onClick={() => setDocForm(ChuckingMode.text)} actions={ <>