diff --git a/web/app/components/datasets/create/step-two/components/general-chunking-options.tsx b/web/app/components/datasets/create/step-two/components/general-chunking-options.tsx new file mode 100644 index 0000000000..5140c902f5 --- /dev/null +++ b/web/app/components/datasets/create/step-two/components/general-chunking-options.tsx @@ -0,0 +1,199 @@ +'use client' + +import type { FC } from 'react' +import type { PreProcessingRule } from '@/models/datasets' +import { + RiAlertFill, + RiSearchEyeLine, +} from '@remixicon/react' +import Image from 'next/image' +import { useTranslation } from 'react-i18next' +import Button from '@/app/components/base/button' +import Checkbox from '@/app/components/base/checkbox' +import Divider from '@/app/components/base/divider' +import Tooltip from '@/app/components/base/tooltip' +import { IS_CE_EDITION } from '@/config' +import { ChunkingMode } from '@/models/datasets' +import SettingCog from '../../assets/setting-gear-mod.svg' +import s from '../index.module.css' +import LanguageSelect from '../language-select' +import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs' +import { OptionCard } from './option-card' + +type TextLabelProps = { + children: React.ReactNode +} + +const TextLabel: FC = ({ children }) => { + return +} + +type GeneralChunkingOptionsProps = { + // State + segmentIdentifier: string + maxChunkLength: number + overlap: number + rules: PreProcessingRule[] + currentDocForm: ChunkingMode + docLanguage: string + // Flags + isActive: boolean + isInUpload: boolean + isNotUploadInEmptyDataset: boolean + hasCurrentDatasetDocForm: boolean + // Actions + onSegmentIdentifierChange: (value: string) => void + onMaxChunkLengthChange: (value: number) => void + onOverlapChange: (value: number) => void + onRuleToggle: (id: string) => void + onDocFormChange: (form: ChunkingMode) => void + onDocLanguageChange: (lang: string) => void + onPreview: () => void + onReset: () => void + // Locale + locale: string +} + +export const GeneralChunkingOptions: FC = ({ + segmentIdentifier, + maxChunkLength, + overlap, + rules, + currentDocForm, + docLanguage, + isActive, + isInUpload, + isNotUploadInEmptyDataset, + hasCurrentDatasetDocForm, + onSegmentIdentifierChange, + onMaxChunkLengthChange, + onOverlapChange, + onRuleToggle, + onDocFormChange, + onDocLanguageChange, + onPreview, + onReset, + locale, +}) => { + const { t } = useTranslation() + + const getRuleName = (key: string): string => { + const ruleNameMap: Record = { + remove_extra_spaces: t('stepTwo.removeExtraSpaces', { ns: 'datasetCreation' }), + remove_urls_emails: t('stepTwo.removeUrlEmails', { ns: 'datasetCreation' }), + remove_stopwords: t('stepTwo.removeStopwords', { ns: 'datasetCreation' }), + } + return ruleNameMap[key] ?? key + } + + return ( + } + activeHeaderClassName="bg-dataset-option-card-blue-gradient" + description={t('stepTwo.generalTip', { ns: 'datasetCreation' })} + isActive={isActive} + onSwitched={() => onDocFormChange(ChunkingMode.text)} + actions={( + <> + + + + )} + noHighlight={isInUpload && isNotUploadInEmptyDataset} + > +
+
+ onSegmentIdentifierChange(e.target.value)} + /> + + +
+
+
+
+ {t('stepTwo.rules', { ns: 'datasetCreation' })} +
+ +
+
+ {rules.map(rule => ( +
onRuleToggle(rule.id)} + > + + +
+ ))} + {IS_CE_EDITION && ( + <> + +
+
{ + if (hasCurrentDatasetDocForm) + return + if (currentDocForm === ChunkingMode.qa) + onDocFormChange(ChunkingMode.text) + else + onDocFormChange(ChunkingMode.qa) + }} + > + + +
+ + +
+ {currentDocForm === ChunkingMode.qa && ( +
+ + + {t('stepTwo.QATip', { ns: 'datasetCreation' })} + +
+ )} + + )} +
+
+
+
+ ) +} diff --git a/web/app/components/datasets/create/step-two/components/index.ts b/web/app/components/datasets/create/step-two/components/index.ts new file mode 100644 index 0000000000..d5382e0c4b --- /dev/null +++ b/web/app/components/datasets/create/step-two/components/index.ts @@ -0,0 +1,5 @@ +export { GeneralChunkingOptions } from './general-chunking-options' +export { IndexingModeSection } from './indexing-mode-section' +export { ParentChildOptions } from './parent-child-options' +export { PreviewPanel } from './preview-panel' +export { StepTwoFooter } from './step-two-footer' diff --git a/web/app/components/datasets/create/step-two/components/indexing-mode-section.tsx b/web/app/components/datasets/create/step-two/components/indexing-mode-section.tsx new file mode 100644 index 0000000000..ee49f42903 --- /dev/null +++ b/web/app/components/datasets/create/step-two/components/indexing-mode-section.tsx @@ -0,0 +1,253 @@ +'use client' + +import type { FC } from 'react' +import type { DefaultModel, Model } from '@/app/components/header/account-setting/model-provider-page/declarations' +import type { RetrievalConfig } from '@/types/app' +import Image from 'next/image' +import Link from 'next/link' +import { useTranslation } from 'react-i18next' +import Badge from '@/app/components/base/badge' +import Button from '@/app/components/base/button' +import CustomDialog from '@/app/components/base/dialog' +import Divider from '@/app/components/base/divider' +import { AlertTriangle } from '@/app/components/base/icons/src/vender/solid/alertsAndFeedback' +import Tooltip from '@/app/components/base/tooltip' +import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config' +import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config' +import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector' +import { useDocLink } from '@/context/i18n' +import { ChunkingMode } from '@/models/datasets' +import { cn } from '@/utils/classnames' +import { indexMethodIcon } from '../../icons' +import { IndexingType } from '../hooks' +import s from '../index.module.css' +import { OptionCard } from './option-card' + +type IndexingModeSectionProps = { + // State + indexType: IndexingType + hasSetIndexType: boolean + docForm: ChunkingMode + embeddingModel: DefaultModel + embeddingModelList?: Model[] + retrievalConfig: RetrievalConfig + showMultiModalTip: boolean + // Flags + isModelAndRetrievalConfigDisabled: boolean + datasetId?: string + // Modal state + isQAConfirmDialogOpen: boolean + // Actions + onIndexTypeChange: (type: IndexingType) => void + onEmbeddingModelChange: (model: DefaultModel) => void + onRetrievalConfigChange: (config: RetrievalConfig) => void + onQAConfirmDialogClose: () => void + onQAConfirmDialogConfirm: () => void +} + +export const IndexingModeSection: FC = ({ + indexType, + hasSetIndexType, + docForm, + embeddingModel, + embeddingModelList, + retrievalConfig, + showMultiModalTip, + isModelAndRetrievalConfigDisabled, + datasetId, + isQAConfirmDialogOpen, + onIndexTypeChange, + onEmbeddingModelChange, + onRetrievalConfigChange, + onQAConfirmDialogClose, + onQAConfirmDialogConfirm, +}) => { + const { t } = useTranslation() + const docLink = useDocLink() + + const getIndexingTechnique = () => indexType + + return ( + <> + {/* Index Mode */} +
+ {t('stepTwo.indexMode', { ns: 'datasetCreation' })} +
+
+ {/* Qualified option */} + {(!hasSetIndexType || (hasSetIndexType && indexType === IndexingType.QUALIFIED)) && ( + + {t('stepTwo.qualified', { ns: 'datasetCreation' })} + + {t('stepTwo.recommend', { ns: 'datasetCreation' })} + + + {!hasSetIndexType && } + +
+ )} + description={t('stepTwo.qualifiedTip', { ns: 'datasetCreation' })} + icon={} + isActive={!hasSetIndexType && indexType === IndexingType.QUALIFIED} + disabled={hasSetIndexType} + onSwitched={() => onIndexTypeChange(IndexingType.QUALIFIED)} + /> + )} + + {/* Economical option */} + {(!hasSetIndexType || (hasSetIndexType && indexType === IndexingType.ECONOMICAL)) && ( + <> + +
+

+ {t('stepTwo.qaSwitchHighQualityTipTitle', { ns: 'datasetCreation' })} +

+

+ {t('stepTwo.qaSwitchHighQualityTipContent', { ns: 'datasetCreation' })} +

+
+
+ + +
+
+ + {docForm === ChunkingMode.qa + ? t('stepTwo.notAvailableForQA', { ns: 'datasetCreation' }) + : t('stepTwo.notAvailableForParentChild', { ns: 'datasetCreation' })} + + )} + noDecoration + position="top" + asChild={false} + triggerClassName="flex-1 self-stretch" + > + } + isActive={!hasSetIndexType && indexType === IndexingType.ECONOMICAL} + disabled={hasSetIndexType || docForm !== ChunkingMode.text} + onSwitched={() => onIndexTypeChange(IndexingType.ECONOMICAL)} + /> + + + )} + + + {/* High quality tip */} + {!hasSetIndexType && indexType === IndexingType.QUALIFIED && ( +
+
+
+ +
+ + {t('stepTwo.highQualityTip', { ns: 'datasetCreation' })} + +
+ )} + + {/* Economical index setting tip */} + {hasSetIndexType && indexType === IndexingType.ECONOMICAL && ( +
+ {t('stepTwo.indexSettingTip', { ns: 'datasetCreation' })} + + {t('stepTwo.datasetSettingLink', { ns: 'datasetCreation' })} + +
+ )} + + {/* Embedding model */} + {indexType === IndexingType.QUALIFIED && ( +
+
+ {t('form.embeddingModel', { ns: 'datasetSettings' })} +
+ + {isModelAndRetrievalConfigDisabled && ( +
+ {t('stepTwo.indexSettingTip', { ns: 'datasetCreation' })} + + {t('stepTwo.datasetSettingLink', { ns: 'datasetCreation' })} + +
+ )} +
+ )} + + + + {/* Retrieval Method Config */} +
+ {!isModelAndRetrievalConfigDisabled + ? ( +
+
+ {t('form.retrievalSetting.title', { ns: 'datasetSettings' })} +
+
+ + {t('form.retrievalSetting.learnMore', { ns: 'datasetSettings' })} + + {t('form.retrievalSetting.longDescription', { ns: 'datasetSettings' })} +
+
+ ) + : ( +
+
{t('form.retrievalSetting.title', { ns: 'datasetSettings' })}
+
+ )} + +
+ {getIndexingTechnique() === IndexingType.QUALIFIED + ? ( + + ) + : ( + + )} +
+
+ + ) +} diff --git a/web/app/components/datasets/create/step-two/inputs.tsx b/web/app/components/datasets/create/step-two/components/inputs.tsx similarity index 100% rename from web/app/components/datasets/create/step-two/inputs.tsx rename to web/app/components/datasets/create/step-two/components/inputs.tsx diff --git a/web/app/components/datasets/create/step-two/option-card.tsx b/web/app/components/datasets/create/step-two/components/option-card.tsx similarity index 100% rename from web/app/components/datasets/create/step-two/option-card.tsx rename to web/app/components/datasets/create/step-two/components/option-card.tsx diff --git a/web/app/components/datasets/create/step-two/components/parent-child-options.tsx b/web/app/components/datasets/create/step-two/components/parent-child-options.tsx new file mode 100644 index 0000000000..e46aa5817b --- /dev/null +++ b/web/app/components/datasets/create/step-two/components/parent-child-options.tsx @@ -0,0 +1,191 @@ +'use client' + +import type { FC } from 'react' +import type { ParentChildConfig } from '../hooks' +import type { ParentMode, PreProcessingRule } from '@/models/datasets' +import { RiSearchEyeLine } from '@remixicon/react' +import Image from 'next/image' +import { useTranslation } from 'react-i18next' +import Button from '@/app/components/base/button' +import Checkbox from '@/app/components/base/checkbox' +import Divider from '@/app/components/base/divider' +import { ParentChildChunk } from '@/app/components/base/icons/src/vender/knowledge' +import RadioCard from '@/app/components/base/radio-card' +import { ChunkingMode } from '@/models/datasets' +import FileList from '../../assets/file-list-3-fill.svg' +import Note from '../../assets/note-mod.svg' +import BlueEffect from '../../assets/option-card-effect-blue.svg' +import s from '../index.module.css' +import { DelimiterInput, MaxLengthInput } from './inputs' +import { OptionCard } from './option-card' + +type TextLabelProps = { + children: React.ReactNode +} + +const TextLabel: FC = ({ children }) => { + return +} + +type ParentChildOptionsProps = { + // State + parentChildConfig: ParentChildConfig + rules: PreProcessingRule[] + currentDocForm: ChunkingMode + // Flags + isActive: boolean + isInUpload: boolean + isNotUploadInEmptyDataset: boolean + // Actions + onDocFormChange: (form: ChunkingMode) => void + onChunkForContextChange: (mode: ParentMode) => void + onParentDelimiterChange: (value: string) => void + onParentMaxLengthChange: (value: number) => void + onChildDelimiterChange: (value: string) => void + onChildMaxLengthChange: (value: number) => void + onRuleToggle: (id: string) => void + onPreview: () => void + onReset: () => void +} + +export const ParentChildOptions: FC = ({ + parentChildConfig, + rules, + currentDocForm: _currentDocForm, + isActive, + isInUpload, + isNotUploadInEmptyDataset, + onDocFormChange, + onChunkForContextChange, + onParentDelimiterChange, + onParentMaxLengthChange, + onChildDelimiterChange, + onChildMaxLengthChange, + onRuleToggle, + onPreview, + onReset, +}) => { + const { t } = useTranslation() + + const getRuleName = (key: string): string => { + const ruleNameMap: Record = { + remove_extra_spaces: t('stepTwo.removeExtraSpaces', { ns: 'datasetCreation' }), + remove_urls_emails: t('stepTwo.removeUrlEmails', { ns: 'datasetCreation' }), + remove_stopwords: t('stepTwo.removeStopwords', { ns: 'datasetCreation' }), + } + return ruleNameMap[key] ?? key + } + + return ( + } + effectImg={BlueEffect.src} + className="text-util-colors-blue-light-blue-light-500" + activeHeaderClassName="bg-dataset-option-card-blue-gradient" + description={t('stepTwo.parentChildTip', { ns: 'datasetCreation' })} + isActive={isActive} + onSwitched={() => onDocFormChange(ChunkingMode.parentChild)} + actions={( + <> + + + + )} + noHighlight={isInUpload && isNotUploadInEmptyDataset} + > +
+ {/* Parent chunk for context */} +
+
+
+ {t('stepTwo.parentChunkForContext', { ns: 'datasetCreation' })} +
+ +
+ } + title={t('stepTwo.paragraph', { ns: 'datasetCreation' })} + description={t('stepTwo.paragraphTip', { ns: 'datasetCreation' })} + isChosen={parentChildConfig.chunkForContext === 'paragraph'} + onChosen={() => onChunkForContextChange('paragraph')} + chosenConfig={( +
+ onParentDelimiterChange(e.target.value)} + /> + +
+ )} + /> + } + title={t('stepTwo.fullDoc', { ns: 'datasetCreation' })} + description={t('stepTwo.fullDocTip', { ns: 'datasetCreation' })} + onChosen={() => onChunkForContextChange('full-doc')} + isChosen={parentChildConfig.chunkForContext === 'full-doc'} + /> +
+ + {/* Child chunk for retrieval */} +
+
+
+ {t('stepTwo.childChunkForRetrieval', { ns: 'datasetCreation' })} +
+ +
+
+ onChildDelimiterChange(e.target.value)} + /> + +
+
+ + {/* Rules */} +
+
+
+ {t('stepTwo.rules', { ns: 'datasetCreation' })} +
+ +
+
+ {rules.map(rule => ( +
onRuleToggle(rule.id)} + > + + +
+ ))} +
+
+
+
+ ) +} diff --git a/web/app/components/datasets/create/step-two/components/preview-panel.tsx b/web/app/components/datasets/create/step-two/components/preview-panel.tsx new file mode 100644 index 0000000000..4f25cee5bd --- /dev/null +++ b/web/app/components/datasets/create/step-two/components/preview-panel.tsx @@ -0,0 +1,171 @@ +'use client' + +import type { FC } from 'react' +import type { ParentChildConfig } from '../hooks' +import type { DataSourceType, FileIndexingEstimateResponse } from '@/models/datasets' +import { RiSearchEyeLine } from '@remixicon/react' +import { noop } from 'es-toolkit/function' +import { useTranslation } from 'react-i18next' +import Badge from '@/app/components/base/badge' +import FloatRightContainer from '@/app/components/base/float-right-container' +import { SkeletonContainer, SkeletonPoint, SkeletonRectangle, SkeletonRow } from '@/app/components/base/skeleton' +import { FULL_DOC_PREVIEW_LENGTH } from '@/config' +import { ChunkingMode } from '@/models/datasets' +import { cn } from '@/utils/classnames' +import { ChunkContainer, QAPreview } from '../../../chunk' +import PreviewDocumentPicker from '../../../common/document-picker/preview-document-picker' +import { PreviewSlice } from '../../../formatted-text/flavours/preview-slice' +import { FormattedText } from '../../../formatted-text/formatted' +import PreviewContainer from '../../../preview/container' +import { PreviewHeader } from '../../../preview/header' + +type PreviewPanelProps = { + // State + isMobile: boolean + dataSourceType: DataSourceType + currentDocForm: ChunkingMode + estimate?: FileIndexingEstimateResponse + parentChildConfig: ParentChildConfig + isSetting?: boolean + // Picker + pickerFiles: Array<{ id: string, name: string, extension: string }> + pickerValue: { id: string, name: string, extension: string } + // Mutation state + isIdle: boolean + isPending: boolean + // Actions + onPickerChange: (selected: { id: string, name: string }) => void +} + +export const PreviewPanel: FC = ({ + isMobile, + dataSourceType: _dataSourceType, + currentDocForm, + estimate, + parentChildConfig, + isSetting, + pickerFiles, + pickerValue, + isIdle, + isPending, + onPickerChange, +}) => { + const { t } = useTranslation() + + return ( + + +
+ >} + onChange={onPickerChange} + value={isSetting ? pickerFiles[0] : pickerValue} + /> + {currentDocForm !== ChunkingMode.qa && ( + + )} +
+ + )} + className={cn('relative flex h-full w-1/2 shrink-0 p-4 pr-0', isMobile && 'w-full max-w-[524px]')} + mainClassName="space-y-6" + > + {/* QA Preview */} + {currentDocForm === ChunkingMode.qa && estimate?.qa_preview && ( + estimate.qa_preview.map((item, index) => ( + + + + )) + )} + + {/* Text Preview */} + {currentDocForm === ChunkingMode.text && estimate?.preview && ( + estimate.preview.map((item, index) => ( + + {item.content} + + )) + )} + + {/* Parent-Child Preview */} + {currentDocForm === ChunkingMode.parentChild && estimate?.preview && ( + estimate.preview.map((item, index) => { + const indexForLabel = index + 1 + const childChunks = parentChildConfig.chunkForContext === 'full-doc' + ? item.child_chunks.slice(0, FULL_DOC_PREVIEW_LENGTH) + : item.child_chunks + return ( + + + {childChunks.map((child, childIndex) => { + const childIndexForLabel = childIndex + 1 + return ( + + ) + })} + + + ) + }) + )} + + {/* Idle State */} + {isIdle && ( +
+
+ +

+ {t('stepTwo.previewChunkTip', { ns: 'datasetCreation' })} +

+
+
+ )} + + {/* Loading State */} + {isPending && ( +
+ {Array.from({ length: 10 }, (_, i) => ( + + + + + + + + + + + ))} +
+ )} +
+
+ ) +} diff --git a/web/app/components/datasets/create/step-two/components/step-two-footer.tsx b/web/app/components/datasets/create/step-two/components/step-two-footer.tsx new file mode 100644 index 0000000000..a22be64a75 --- /dev/null +++ b/web/app/components/datasets/create/step-two/components/step-two-footer.tsx @@ -0,0 +1,58 @@ +'use client' + +import type { FC } from 'react' +import { RiArrowLeftLine } from '@remixicon/react' +import { useTranslation } from 'react-i18next' +import Button from '@/app/components/base/button' + +type StepTwoFooterProps = { + isSetting?: boolean + isCreating: boolean + onPrevious: () => void + onCreate: () => void + onCancel?: () => void +} + +export const StepTwoFooter: FC = ({ + isSetting, + isCreating, + onPrevious, + onCreate, + onCancel, +}) => { + const { t } = useTranslation() + + if (!isSetting) { + return ( +
+ + +
+ ) + } + + return ( +
+ + +
+ ) +} diff --git a/web/app/components/datasets/create/step-two/escape.ts b/web/app/components/datasets/create/step-two/hooks/escape.ts similarity index 100% rename from web/app/components/datasets/create/step-two/escape.ts rename to web/app/components/datasets/create/step-two/hooks/escape.ts diff --git a/web/app/components/datasets/create/step-two/hooks/index.ts b/web/app/components/datasets/create/step-two/hooks/index.ts new file mode 100644 index 0000000000..f16daaaea5 --- /dev/null +++ b/web/app/components/datasets/create/step-two/hooks/index.ts @@ -0,0 +1,14 @@ +export { useDocumentCreation } from './use-document-creation' +export type { DocumentCreation, ValidationParams } from './use-document-creation' + +export { IndexingType, useIndexingConfig } from './use-indexing-config' +export type { IndexingConfig } from './use-indexing-config' + +export { useIndexingEstimate } from './use-indexing-estimate' +export type { IndexingEstimate } from './use-indexing-estimate' + +export { usePreviewState } from './use-preview-state' +export type { PreviewState } from './use-preview-state' + +export { DEFAULT_MAXIMUM_CHUNK_LENGTH, DEFAULT_OVERLAP, DEFAULT_SEGMENT_IDENTIFIER, defaultParentChildConfig, MAXIMUM_CHUNK_TOKEN_LENGTH, useSegmentationState } from './use-segmentation-state' +export type { ParentChildConfig, SegmentationState } from './use-segmentation-state' diff --git a/web/app/components/datasets/create/step-two/unescape.ts b/web/app/components/datasets/create/step-two/hooks/unescape.ts similarity index 100% rename from web/app/components/datasets/create/step-two/unescape.ts rename to web/app/components/datasets/create/step-two/hooks/unescape.ts diff --git a/web/app/components/datasets/create/step-two/hooks/use-document-creation.ts b/web/app/components/datasets/create/step-two/hooks/use-document-creation.ts new file mode 100644 index 0000000000..fd132b38ef --- /dev/null +++ b/web/app/components/datasets/create/step-two/hooks/use-document-creation.ts @@ -0,0 +1,279 @@ +import type { DefaultModel, Model } from '@/app/components/header/account-setting/model-provider-page/declarations' +import type { NotionPage } from '@/models/common' +import type { + ChunkingMode, + CrawlOptions, + CrawlResultItem, + CreateDocumentReq, + createDocumentResponse, + CustomFile, + FullDocumentDetail, + ProcessRule, +} from '@/models/datasets' +import type { RetrievalConfig, RETRIEVE_METHOD } from '@/types/app' +import { useCallback } from 'react' +import { useTranslation } from 'react-i18next' +import { trackEvent } from '@/app/components/base/amplitude' +import Toast from '@/app/components/base/toast' +import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model' +import { DataSourceProvider } from '@/models/common' +import { + DataSourceType, +} from '@/models/datasets' +import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument } from '@/service/knowledge/use-create-dataset' +import { useInvalidDatasetList } from '@/service/knowledge/use-dataset' +import { IndexingType } from './use-indexing-config' +import { MAXIMUM_CHUNK_TOKEN_LENGTH } from './use-segmentation-state' + +export type UseDocumentCreationOptions = { + datasetId?: string + isSetting?: boolean + documentDetail?: FullDocumentDetail + dataSourceType: DataSourceType + files: CustomFile[] + notionPages: NotionPage[] + notionCredentialId: string + websitePages: CrawlResultItem[] + crawlOptions?: CrawlOptions + websiteCrawlProvider?: DataSourceProvider + websiteCrawlJobId?: string + // Callbacks + onStepChange?: (delta: number) => void + updateIndexingTypeCache?: (type: string) => void + updateResultCache?: (res: createDocumentResponse) => void + updateRetrievalMethodCache?: (method: RETRIEVE_METHOD | '') => void + onSave?: () => void + mutateDatasetRes?: () => void +} + +export type ValidationParams = { + segmentationType: string + maxChunkLength: number + limitMaxChunkLength: number + overlap: number + indexType: IndexingType + embeddingModel: DefaultModel + rerankModelList: Model[] + retrievalConfig: RetrievalConfig +} + +export const useDocumentCreation = (options: UseDocumentCreationOptions) => { + const { t } = useTranslation() + const { + datasetId, + isSetting, + documentDetail, + dataSourceType, + files, + notionPages, + notionCredentialId, + websitePages, + crawlOptions, + websiteCrawlProvider = DataSourceProvider.jinaReader, + websiteCrawlJobId = '', + onStepChange, + updateIndexingTypeCache, + updateResultCache, + updateRetrievalMethodCache, + onSave, + mutateDatasetRes, + } = options + + const createFirstDocumentMutation = useCreateFirstDocument() + const createDocumentMutation = useCreateDocument(datasetId!) + const invalidDatasetList = useInvalidDatasetList() + + const isCreating = createFirstDocumentMutation.isPending || createDocumentMutation.isPending + + // Validate creation params + const validateParams = useCallback((params: ValidationParams): boolean => { + const { + segmentationType, + maxChunkLength, + limitMaxChunkLength, + overlap, + indexType, + embeddingModel, + rerankModelList, + retrievalConfig, + } = params + + if (segmentationType === 'general' && overlap > maxChunkLength) { + Toast.notify({ type: 'error', message: t('stepTwo.overlapCheck', { ns: 'datasetCreation' }) }) + return false + } + + if (segmentationType === 'general' && maxChunkLength > limitMaxChunkLength) { + Toast.notify({ + type: 'error', + message: t('stepTwo.maxLengthCheck', { ns: 'datasetCreation', limit: limitMaxChunkLength }), + }) + return false + } + + if (!isSetting) { + if (indexType === IndexingType.QUALIFIED && (!embeddingModel.model || !embeddingModel.provider)) { + Toast.notify({ + type: 'error', + message: t('datasetConfig.embeddingModelRequired', { ns: 'appDebug' }), + }) + return false + } + + if (!isReRankModelSelected({ + rerankModelList, + retrievalConfig, + indexMethod: indexType, + })) { + Toast.notify({ type: 'error', message: t('datasetConfig.rerankModelRequired', { ns: 'appDebug' }) }) + return false + } + } + + return true + }, [t, isSetting]) + + // Build creation params + const buildCreationParams = useCallback(( + currentDocForm: ChunkingMode, + docLanguage: string, + processRule: ProcessRule, + retrievalConfig: RetrievalConfig, + embeddingModel: DefaultModel, + indexingTechnique: string, + ): CreateDocumentReq | null => { + if (isSetting) { + return { + original_document_id: documentDetail?.id, + doc_form: currentDocForm, + doc_language: docLanguage, + process_rule: processRule, + retrieval_model: retrievalConfig, + embedding_model: embeddingModel.model, + embedding_model_provider: embeddingModel.provider, + indexing_technique: indexingTechnique, + } as CreateDocumentReq + } + + const params: CreateDocumentReq = { + data_source: { + type: dataSourceType, + info_list: { + data_source_type: dataSourceType, + }, + }, + indexing_technique: indexingTechnique, + process_rule: processRule, + doc_form: currentDocForm, + doc_language: docLanguage, + retrieval_model: retrievalConfig, + embedding_model: embeddingModel.model, + embedding_model_provider: embeddingModel.provider, + } as CreateDocumentReq + + // Add data source specific info + if (dataSourceType === DataSourceType.FILE) { + params.data_source!.info_list.file_info_list = { + file_ids: files.map(file => file.id || '').filter(Boolean), + } + } + if (dataSourceType === DataSourceType.NOTION) + params.data_source!.info_list.notion_info_list = getNotionInfo(notionPages, notionCredentialId) + + if (dataSourceType === DataSourceType.WEB) { + params.data_source!.info_list.website_info_list = getWebsiteInfo({ + websiteCrawlProvider, + websiteCrawlJobId, + websitePages, + crawlOptions, + }) + } + + return params + }, [ + isSetting, + documentDetail, + dataSourceType, + files, + notionPages, + notionCredentialId, + websitePages, + websiteCrawlProvider, + websiteCrawlJobId, + crawlOptions, + ]) + + // Execute creation + const executeCreation = useCallback(async ( + params: CreateDocumentReq, + indexType: IndexingType, + retrievalConfig: RetrievalConfig, + ) => { + if (!datasetId) { + await createFirstDocumentMutation.mutateAsync(params, { + onSuccess(data) { + updateIndexingTypeCache?.(indexType) + updateResultCache?.(data) + updateRetrievalMethodCache?.(retrievalConfig.search_method as RETRIEVE_METHOD) + }, + }) + } + else { + await createDocumentMutation.mutateAsync(params, { + onSuccess(data) { + updateIndexingTypeCache?.(indexType) + updateResultCache?.(data) + updateRetrievalMethodCache?.(retrievalConfig.search_method as RETRIEVE_METHOD) + }, + }) + } + + mutateDatasetRes?.() + invalidDatasetList() + + trackEvent('create_datasets', { + data_source_type: dataSourceType, + indexing_technique: indexType, + }) + + onStepChange?.(+1) + + if (isSetting) + onSave?.() + }, [ + datasetId, + createFirstDocumentMutation, + createDocumentMutation, + updateIndexingTypeCache, + updateResultCache, + updateRetrievalMethodCache, + mutateDatasetRes, + invalidDatasetList, + dataSourceType, + onStepChange, + isSetting, + onSave, + ]) + + // Validate preview params + const validatePreviewParams = useCallback((maxChunkLength: number): boolean => { + if (maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) { + Toast.notify({ + type: 'error', + message: t('stepTwo.maxLengthCheck', { ns: 'datasetCreation', limit: MAXIMUM_CHUNK_TOKEN_LENGTH }), + }) + return false + } + return true + }, [t]) + + return { + isCreating, + validateParams, + buildCreationParams, + executeCreation, + validatePreviewParams, + } +} + +export type DocumentCreation = ReturnType diff --git a/web/app/components/datasets/create/step-two/hooks/use-indexing-config.ts b/web/app/components/datasets/create/step-two/hooks/use-indexing-config.ts new file mode 100644 index 0000000000..97fc9c260f --- /dev/null +++ b/web/app/components/datasets/create/step-two/hooks/use-indexing-config.ts @@ -0,0 +1,143 @@ +import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations' +import type { RetrievalConfig } from '@/types/app' +import { useEffect, useMemo, useState } from 'react' +import { checkShowMultiModalTip } from '@/app/components/datasets/settings/utils' +import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations' +import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks' +import { RETRIEVE_METHOD } from '@/types/app' + +export enum IndexingType { + QUALIFIED = 'high_quality', + ECONOMICAL = 'economy', +} + +const DEFAULT_RETRIEVAL_CONFIG: RetrievalConfig = { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { + reranking_provider_name: '', + reranking_model_name: '', + }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, +} + +export type UseIndexingConfigOptions = { + initialIndexType?: IndexingType + initialEmbeddingModel?: DefaultModel + initialRetrievalConfig?: RetrievalConfig + isAPIKeySet: boolean + hasSetIndexType: boolean +} + +export const useIndexingConfig = (options: UseIndexingConfigOptions) => { + const { + initialIndexType, + initialEmbeddingModel, + initialRetrievalConfig, + isAPIKeySet, + hasSetIndexType, + } = options + + // Rerank model + const { + modelList: rerankModelList, + defaultModel: rerankDefaultModel, + currentModel: isRerankDefaultModelValid, + } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank) + + // Embedding model list + const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding) + const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding) + + // Index type state + const [indexType, setIndexType] = useState(() => { + if (initialIndexType) + return initialIndexType + return isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL + }) + + // Embedding model state + const [embeddingModel, setEmbeddingModel] = useState( + initialEmbeddingModel ?? { + provider: defaultEmbeddingModel?.provider.provider || '', + model: defaultEmbeddingModel?.model || '', + }, + ) + + // Retrieval config state + const [retrievalConfig, setRetrievalConfig] = useState( + initialRetrievalConfig ?? DEFAULT_RETRIEVAL_CONFIG, + ) + + // Sync retrieval config with rerank model when available + useEffect(() => { + if (initialRetrievalConfig) + return + + setRetrievalConfig({ + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: !!isRerankDefaultModelValid, + reranking_model: { + reranking_provider_name: isRerankDefaultModelValid ? rerankDefaultModel?.provider.provider ?? '' : '', + reranking_model_name: isRerankDefaultModelValid ? rerankDefaultModel?.model ?? '' : '', + }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }) + }, [rerankDefaultModel, isRerankDefaultModelValid, initialRetrievalConfig]) + + // Sync index type with props + useEffect(() => { + if (initialIndexType) + setIndexType(initialIndexType) + else + setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL) + }, [isAPIKeySet, initialIndexType]) + + // Show multimodal tip + const showMultiModalTip = useMemo(() => { + return checkShowMultiModalTip({ + embeddingModel, + rerankingEnable: retrievalConfig.reranking_enable, + rerankModel: { + rerankingProviderName: retrievalConfig.reranking_model.reranking_provider_name, + rerankingModelName: retrievalConfig.reranking_model.reranking_model_name, + }, + indexMethod: indexType, + embeddingModelList, + rerankModelList, + }) + }, [embeddingModel, retrievalConfig, indexType, embeddingModelList, rerankModelList]) + + // Get effective indexing technique + const getIndexingTechnique = () => initialIndexType || indexType + + return { + // Index type + indexType, + setIndexType, + hasSetIndexType, + getIndexingTechnique, + + // Embedding model + embeddingModel, + setEmbeddingModel, + embeddingModelList, + defaultEmbeddingModel, + + // Retrieval config + retrievalConfig, + setRetrievalConfig, + rerankModelList, + rerankDefaultModel, + isRerankDefaultModelValid, + + // Computed + showMultiModalTip, + } +} + +export type IndexingConfig = ReturnType diff --git a/web/app/components/datasets/create/step-two/hooks/use-indexing-estimate.ts b/web/app/components/datasets/create/step-two/hooks/use-indexing-estimate.ts new file mode 100644 index 0000000000..cc5a2bcf33 --- /dev/null +++ b/web/app/components/datasets/create/step-two/hooks/use-indexing-estimate.ts @@ -0,0 +1,123 @@ +import type { IndexingType } from './use-indexing-config' +import type { NotionPage } from '@/models/common' +import type { ChunkingMode, CrawlOptions, CrawlResultItem, CustomFile, ProcessRule } from '@/models/datasets' +import { useCallback } from 'react' +import { DataSourceProvider } from '@/models/common' +import { DataSourceType } from '@/models/datasets' +import { + useFetchFileIndexingEstimateForFile, + useFetchFileIndexingEstimateForNotion, + useFetchFileIndexingEstimateForWeb, +} from '@/service/knowledge/use-create-dataset' + +export type UseIndexingEstimateOptions = { + dataSourceType: DataSourceType + datasetId?: string + // Document settings + currentDocForm: ChunkingMode + docLanguage: string + // File data source + files: CustomFile[] + previewFileName?: string + // Notion data source + previewNotionPage: NotionPage + notionCredentialId: string + // Website data source + previewWebsitePage: CrawlResultItem + crawlOptions?: CrawlOptions + websiteCrawlProvider?: DataSourceProvider + websiteCrawlJobId?: string + // Processing + indexingTechnique: IndexingType + processRule: ProcessRule +} + +export const useIndexingEstimate = (options: UseIndexingEstimateOptions) => { + const { + dataSourceType, + datasetId, + currentDocForm, + docLanguage, + files, + previewFileName, + previewNotionPage, + notionCredentialId, + previewWebsitePage, + crawlOptions, + websiteCrawlProvider, + websiteCrawlJobId, + indexingTechnique, + processRule, + } = options + + // File indexing estimate + const fileQuery = useFetchFileIndexingEstimateForFile({ + docForm: currentDocForm, + docLanguage, + dataSourceType: DataSourceType.FILE, + files: previewFileName + ? [files.find(file => file.name === previewFileName)!] + : files, + indexingTechnique, + processRule, + dataset_id: datasetId!, + }) + + // Notion indexing estimate + const notionQuery = useFetchFileIndexingEstimateForNotion({ + docForm: currentDocForm, + docLanguage, + dataSourceType: DataSourceType.NOTION, + notionPages: [previewNotionPage], + indexingTechnique, + processRule, + dataset_id: datasetId || '', + credential_id: notionCredentialId, + }) + + // Website indexing estimate + const websiteQuery = useFetchFileIndexingEstimateForWeb({ + docForm: currentDocForm, + docLanguage, + dataSourceType: DataSourceType.WEB, + websitePages: [previewWebsitePage], + crawlOptions, + websiteCrawlProvider: websiteCrawlProvider ?? DataSourceProvider.jinaReader, + websiteCrawlJobId: websiteCrawlJobId ?? '', + indexingTechnique, + processRule, + dataset_id: datasetId || '', + }) + + // Get current mutation based on data source type + const getCurrentMutation = useCallback(() => { + if (dataSourceType === DataSourceType.FILE) + return fileQuery + if (dataSourceType === DataSourceType.NOTION) + return notionQuery + return websiteQuery + }, [dataSourceType, fileQuery, notionQuery, websiteQuery]) + + const currentMutation = getCurrentMutation() + + // Trigger estimate fetch + const fetchEstimate = useCallback(() => { + if (dataSourceType === DataSourceType.FILE) + fileQuery.mutate() + else if (dataSourceType === DataSourceType.NOTION) + notionQuery.mutate() + else + websiteQuery.mutate() + }, [dataSourceType, fileQuery, notionQuery, websiteQuery]) + + return { + currentMutation, + estimate: currentMutation.data, + isIdle: currentMutation.isIdle, + isPending: currentMutation.isPending, + fetchEstimate, + reset: currentMutation.reset, + } +} + +export type IndexingEstimate = ReturnType diff --git a/web/app/components/datasets/create/step-two/hooks/use-preview-state.ts b/web/app/components/datasets/create/step-two/hooks/use-preview-state.ts new file mode 100644 index 0000000000..94171c5947 --- /dev/null +++ b/web/app/components/datasets/create/step-two/hooks/use-preview-state.ts @@ -0,0 +1,127 @@ +import type { NotionPage } from '@/models/common' +import type { CrawlResultItem, CustomFile, DocumentItem, FullDocumentDetail } from '@/models/datasets' +import { useCallback, useState } from 'react' +import { DataSourceType } from '@/models/datasets' + +export type UsePreviewStateOptions = { + dataSourceType: DataSourceType + files: CustomFile[] + notionPages: NotionPage[] + websitePages: CrawlResultItem[] + documentDetail?: FullDocumentDetail + datasetId?: string +} + +export const usePreviewState = (options: UsePreviewStateOptions) => { + const { + dataSourceType, + files, + notionPages, + websitePages, + documentDetail, + datasetId, + } = options + + // File preview state + const [previewFile, setPreviewFile] = useState( + (datasetId && documentDetail) + ? documentDetail.file + : files[0], + ) + + // Notion page preview state + const [previewNotionPage, setPreviewNotionPage] = useState( + (datasetId && documentDetail) + ? documentDetail.notion_page + : notionPages[0], + ) + + // Website page preview state + const [previewWebsitePage, setPreviewWebsitePage] = useState( + (datasetId && documentDetail) + ? documentDetail.website_page + : websitePages[0], + ) + + // Get preview items for document picker based on data source type + const getPreviewPickerItems = useCallback(() => { + if (dataSourceType === DataSourceType.FILE) { + return files as Array> + } + if (dataSourceType === DataSourceType.NOTION) { + return notionPages.map(page => ({ + id: page.page_id, + name: page.page_name, + extension: 'md', + })) + } + if (dataSourceType === DataSourceType.WEB) { + return websitePages.map(page => ({ + id: page.source_url, + name: page.title, + extension: 'md', + })) + } + return [] + }, [dataSourceType, files, notionPages, websitePages]) + + // Get current preview value for picker + const getPreviewPickerValue = useCallback(() => { + if (dataSourceType === DataSourceType.FILE) { + return previewFile as Required + } + if (dataSourceType === DataSourceType.NOTION) { + return { + id: previewNotionPage?.page_id || '', + name: previewNotionPage?.page_name || '', + extension: 'md', + } + } + if (dataSourceType === DataSourceType.WEB) { + return { + id: previewWebsitePage?.source_url || '', + name: previewWebsitePage?.title || '', + extension: 'md', + } + } + return { id: '', name: '', extension: '' } + }, [dataSourceType, previewFile, previewNotionPage, previewWebsitePage]) + + // Handle preview change + const handlePreviewChange = useCallback((selected: { id: string, name: string }) => { + if (dataSourceType === DataSourceType.FILE) { + setPreviewFile(selected as DocumentItem) + } + else if (dataSourceType === DataSourceType.NOTION) { + const selectedPage = notionPages.find(page => page.page_id === selected.id) + if (selectedPage) + setPreviewNotionPage(selectedPage) + } + else if (dataSourceType === DataSourceType.WEB) { + const selectedPage = websitePages.find(page => page.source_url === selected.id) + if (selectedPage) + setPreviewWebsitePage(selectedPage) + } + }, [dataSourceType, notionPages, websitePages]) + + return { + // File preview + previewFile, + setPreviewFile, + + // Notion preview + previewNotionPage, + setPreviewNotionPage, + + // Website preview + previewWebsitePage, + setPreviewWebsitePage, + + // Picker helpers + getPreviewPickerItems, + getPreviewPickerValue, + handlePreviewChange, + } +} + +export type PreviewState = ReturnType diff --git a/web/app/components/datasets/create/step-two/hooks/use-segmentation-state.ts b/web/app/components/datasets/create/step-two/hooks/use-segmentation-state.ts new file mode 100644 index 0000000000..69cc089b4f --- /dev/null +++ b/web/app/components/datasets/create/step-two/hooks/use-segmentation-state.ts @@ -0,0 +1,222 @@ +import type { ParentMode, PreProcessingRule, ProcessRule, Rules } from '@/models/datasets' +import { useCallback, useState } from 'react' +import { ChunkingMode, ProcessMode } from '@/models/datasets' +import escape from './escape' +import unescape from './unescape' + +// Constants +export const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n' +export const DEFAULT_MAXIMUM_CHUNK_LENGTH = 1024 +export const DEFAULT_OVERLAP = 50 +export const MAXIMUM_CHUNK_TOKEN_LENGTH = Number.parseInt( + globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', + 10, +) + +export type ParentChildConfig = { + chunkForContext: ParentMode + parent: { + delimiter: string + maxLength: number + } + child: { + delimiter: string + maxLength: number + } +} + +export const defaultParentChildConfig: ParentChildConfig = { + chunkForContext: 'paragraph', + parent: { + delimiter: '\\n\\n', + maxLength: 1024, + }, + child: { + delimiter: '\\n', + maxLength: 512, + }, +} + +export type UseSegmentationStateOptions = { + initialSegmentationType?: ProcessMode +} + +export const useSegmentationState = (options: UseSegmentationStateOptions = {}) => { + const { initialSegmentationType } = options + + // Segmentation type (general or parent-child) + const [segmentationType, setSegmentationType] = useState( + initialSegmentationType ?? ProcessMode.general, + ) + + // General chunking settings + const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER) + const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) + const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(MAXIMUM_CHUNK_TOKEN_LENGTH) + const [overlap, setOverlap] = useState(DEFAULT_OVERLAP) + + // Pre-processing rules + const [rules, setRules] = useState([]) + const [defaultConfig, setDefaultConfig] = useState() + + // Parent-child config + const [parentChildConfig, setParentChildConfig] = useState(defaultParentChildConfig) + + // Escaped segment identifier setter + const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => { + if (value) { + doSetSegmentIdentifier(escape(value)) + } + else { + doSetSegmentIdentifier(canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER) + } + }, []) + + // Rule toggle handler + const toggleRule = useCallback((id: string) => { + setRules(prev => prev.map(rule => + rule.id === id ? { ...rule, enabled: !rule.enabled } : rule, + )) + }, []) + + // Reset to defaults + const resetToDefaults = useCallback(() => { + if (defaultConfig) { + setSegmentIdentifier(defaultConfig.segmentation.separator) + setMaxChunkLength(defaultConfig.segmentation.max_tokens) + setOverlap(defaultConfig.segmentation.chunk_overlap!) + setRules(defaultConfig.pre_processing_rules) + } + setParentChildConfig(defaultParentChildConfig) + }, [defaultConfig, setSegmentIdentifier]) + + // Apply config from document detail + const applyConfigFromRules = useCallback((rulesConfig: Rules, isHierarchical: boolean) => { + const separator = rulesConfig.segmentation.separator + const max = rulesConfig.segmentation.max_tokens + const chunkOverlap = rulesConfig.segmentation.chunk_overlap + + setSegmentIdentifier(separator) + setMaxChunkLength(max) + setOverlap(chunkOverlap!) + setRules(rulesConfig.pre_processing_rules) + setDefaultConfig(rulesConfig) + + if (isHierarchical) { + setParentChildConfig({ + chunkForContext: rulesConfig.parent_mode || 'paragraph', + parent: { + delimiter: escape(rulesConfig.segmentation.separator), + maxLength: rulesConfig.segmentation.max_tokens, + }, + child: { + delimiter: escape(rulesConfig.subchunk_segmentation!.separator), + maxLength: rulesConfig.subchunk_segmentation!.max_tokens, + }, + }) + } + }, [setSegmentIdentifier]) + + // Get process rule for API + const getProcessRule = useCallback((docForm: ChunkingMode): ProcessRule => { + if (docForm === ChunkingMode.parentChild) { + return { + rules: { + pre_processing_rules: rules, + segmentation: { + separator: unescape(parentChildConfig.parent.delimiter), + max_tokens: parentChildConfig.parent.maxLength, + }, + parent_mode: parentChildConfig.chunkForContext, + subchunk_segmentation: { + separator: unescape(parentChildConfig.child.delimiter), + max_tokens: parentChildConfig.child.maxLength, + }, + }, + mode: 'hierarchical', + } as ProcessRule + } + + return { + rules: { + pre_processing_rules: rules, + segmentation: { + separator: unescape(segmentIdentifier), + max_tokens: maxChunkLength, + chunk_overlap: overlap, + }, + }, + mode: segmentationType, + } as ProcessRule + }, [rules, parentChildConfig, segmentIdentifier, maxChunkLength, overlap, segmentationType]) + + // Update parent config field + const updateParentConfig = useCallback((field: 'delimiter' | 'maxLength', value: string | number) => { + setParentChildConfig((prev) => { + let newValue: string | number + if (field === 'delimiter') + newValue = value ? escape(value as string) : '' + else + newValue = value + return { + ...prev, + parent: { ...prev.parent, [field]: newValue }, + } + }) + }, []) + + // Update child config field + const updateChildConfig = useCallback((field: 'delimiter' | 'maxLength', value: string | number) => { + setParentChildConfig((prev) => { + let newValue: string | number + if (field === 'delimiter') + newValue = value ? escape(value as string) : '' + else + newValue = value + return { + ...prev, + child: { ...prev.child, [field]: newValue }, + } + }) + }, []) + + // Set chunk for context mode + const setChunkForContext = useCallback((mode: ParentMode) => { + setParentChildConfig(prev => ({ ...prev, chunkForContext: mode })) + }, []) + + return { + // General chunking state + segmentationType, + setSegmentationType, + segmentIdentifier, + setSegmentIdentifier, + maxChunkLength, + setMaxChunkLength, + limitMaxChunkLength, + setLimitMaxChunkLength, + overlap, + setOverlap, + + // Rules + rules, + setRules, + defaultConfig, + setDefaultConfig, + toggleRule, + + // Parent-child config + parentChildConfig, + setParentChildConfig, + updateParentConfig, + updateChildConfig, + setChunkForContext, + + // Actions + resetToDefaults, + applyConfigFromRules, + getProcessRule, + } +} + +export type SegmentationState = ReturnType diff --git a/web/app/components/datasets/create/step-two/index.spec.tsx b/web/app/components/datasets/create/step-two/index.spec.tsx new file mode 100644 index 0000000000..7145920f60 --- /dev/null +++ b/web/app/components/datasets/create/step-two/index.spec.tsx @@ -0,0 +1,2197 @@ +import type { Model } from '@/app/components/header/account-setting/model-provider-page/declarations' +import type { DataSourceProvider, NotionPage } from '@/models/common' +import type { + CrawlOptions, + CrawlResultItem, + CustomFile, + FileIndexingEstimateResponse, + FullDocumentDetail, + PreProcessingRule, + Rules, +} from '@/models/datasets' +import type { RetrievalConfig } from '@/types/app' +import { act, fireEvent, render, renderHook, screen } from '@testing-library/react' +import { ConfigurationMethodEnum, ModelStatusEnum, ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations' +import { ChunkingMode, DataSourceType, ProcessMode } from '@/models/datasets' +import { RETRIEVE_METHOD } from '@/types/app' +import { PreviewPanel } from './components/preview-panel' +import { StepTwoFooter } from './components/step-two-footer' +import { + DEFAULT_MAXIMUM_CHUNK_LENGTH, + DEFAULT_OVERLAP, + DEFAULT_SEGMENT_IDENTIFIER, + defaultParentChildConfig, + IndexingType, + useDocumentCreation, + useIndexingConfig, + useIndexingEstimate, + usePreviewState, + useSegmentationState, +} from './hooks' +import escape from './hooks/escape' +import unescape from './hooks/unescape' + +// ============================================ +// Mock external dependencies +// ============================================ + +// Mock dataset detail context +const mockDataset = { + id: 'test-dataset-id', + doc_form: ChunkingMode.text, + data_source_type: DataSourceType.FILE, + embedding_model: 'text-embedding-ada-002', + embedding_model_provider: 'openai', + retrieval_model_dict: { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + } as RetrievalConfig, +} + +let mockCurrentDataset: typeof mockDataset | null = null +const mockMutateDatasetRes = vi.fn() + +vi.mock('@/context/dataset-detail', () => ({ + useDatasetDetailContextWithSelector: (selector: (state: { dataset: typeof mockDataset | null, mutateDatasetRes: () => void }) => unknown) => + selector({ dataset: mockCurrentDataset, mutateDatasetRes: mockMutateDatasetRes }), +})) + +// Note: @/context/i18n is globally mocked in vitest.setup.ts, no need to mock here +// Note: @/hooks/use-breakpoints uses real import + +// Mock model hooks +const mockEmbeddingModelList = [ + { provider: 'openai', model: 'text-embedding-ada-002' }, + { provider: 'cohere', model: 'embed-english-v3.0' }, +] +const mockDefaultEmbeddingModel = { provider: { provider: 'openai' }, model: 'text-embedding-ada-002' } +// Model[] type structure for rerank model list (simplified mock) +const mockRerankModelList: Model[] = [{ + provider: 'cohere', + icon_small: { en_US: 'cohere-icon', zh_Hans: 'cohere-icon' }, + label: { en_US: 'Cohere', zh_Hans: 'Cohere' }, + models: [{ + model: 'rerank-english-v3.0', + label: { en_US: 'Rerank English v3.0', zh_Hans: 'Rerank English v3.0' }, + model_type: ModelTypeEnum.rerank, + features: [], + fetch_from: ConfigurationMethodEnum.predefinedModel, + status: ModelStatusEnum.active, + model_properties: {}, + load_balancing_enabled: false, + }], + status: ModelStatusEnum.active, +}] +const mockRerankDefaultModel = { provider: { provider: 'cohere' }, model: 'rerank-english-v3.0' } +let mockIsRerankDefaultModelValid = true + +vi.mock('@/app/components/header/account-setting/model-provider-page/hooks', () => ({ + useModelListAndDefaultModelAndCurrentProviderAndModel: () => ({ + modelList: mockRerankModelList, + defaultModel: mockRerankDefaultModel, + currentModel: mockIsRerankDefaultModelValid, + }), + useModelList: () => ({ data: mockEmbeddingModelList }), + useDefaultModel: () => ({ data: mockDefaultEmbeddingModel }), +})) + +// Mock service hooks +const mockFetchDefaultProcessRuleMutate = vi.fn() +vi.mock('@/service/knowledge/use-create-dataset', () => ({ + useFetchDefaultProcessRule: ({ onSuccess }: { onSuccess: (data: { rules: Rules, limits: { indexing_max_segmentation_tokens_length: number } }) => void }) => ({ + mutate: (url: string) => { + mockFetchDefaultProcessRuleMutate(url) + onSuccess({ + rules: { + segmentation: { separator: '\\n', max_tokens: 500, chunk_overlap: 50 }, + pre_processing_rules: [ + { id: 'remove_extra_spaces', enabled: true }, + { id: 'remove_urls_emails', enabled: false }, + ], + parent_mode: 'paragraph', + subchunk_segmentation: { separator: '\\n', max_tokens: 256 }, + }, + limits: { indexing_max_segmentation_tokens_length: 4000 }, + }) + }, + isPending: false, + }), + useFetchFileIndexingEstimateForFile: () => ({ + mutate: vi.fn(), + data: undefined, + isIdle: true, + isPending: false, + reset: vi.fn(), + }), + useFetchFileIndexingEstimateForNotion: () => ({ + mutate: vi.fn(), + data: undefined, + isIdle: true, + isPending: false, + reset: vi.fn(), + }), + useFetchFileIndexingEstimateForWeb: () => ({ + mutate: vi.fn(), + data: undefined, + isIdle: true, + isPending: false, + reset: vi.fn(), + }), + useCreateFirstDocument: () => ({ + mutateAsync: vi.fn().mockImplementation(async (params: unknown, options?: { onSuccess?: (data: unknown) => void }) => { + const data = { dataset: { id: 'new-dataset-id' } } + options?.onSuccess?.(data) + return data + }), + isPending: false, + }), + useCreateDocument: () => ({ + mutateAsync: vi.fn().mockImplementation(async (params: unknown, options?: { onSuccess?: (data: unknown) => void }) => { + const data = { document: { id: 'new-doc-id' } } + options?.onSuccess?.(data) + return data + }), + isPending: false, + }), + getNotionInfo: vi.fn().mockReturnValue([{ workspace_id: 'ws-1', pages: [{ page_id: 'page-1' }] }]), + getWebsiteInfo: vi.fn().mockReturnValue({ provider: 'jinaReader', job_id: 'job-123', urls: ['https://test.com'] }), +})) + +vi.mock('@/service/knowledge/use-dataset', () => ({ + useInvalidDatasetList: () => vi.fn(), +})) + +// Mock amplitude tracking (external service) +vi.mock('@/app/components/base/amplitude', () => ({ + trackEvent: vi.fn(), +})) + +// Note: @/app/components/base/toast - uses real import (base component) +// Note: @/app/components/datasets/common/check-rerank-model - uses real import +// Note: @/app/components/base/float-right-container - uses real import (base component) + +// Mock checkShowMultiModalTip - requires complex model list structure +vi.mock('@/app/components/datasets/settings/utils', () => ({ + checkShowMultiModalTip: () => false, +})) + +// ============================================ +// Test data factories +// ============================================ + +const createMockFile = (overrides?: Partial): CustomFile => ({ + id: 'file-1', + name: 'test-file.pdf', + extension: 'pdf', + size: 1024, + type: 'application/pdf', + lastModified: Date.now(), + ...overrides, +} as CustomFile) + +const createMockNotionPage = (overrides?: Partial): NotionPage => ({ + page_id: 'notion-page-1', + page_name: 'Test Notion Page', + page_icon: null, + type: 'page', + ...overrides, +} as NotionPage) + +const createMockWebsitePage = (overrides?: Partial): CrawlResultItem => ({ + source_url: 'https://example.com/page1', + title: 'Test Website Page', + description: 'Test description', + markdown: '# Test Content', + ...overrides, +} as CrawlResultItem) + +const createMockDocumentDetail = (overrides?: Partial): FullDocumentDetail => ({ + id: 'doc-1', + doc_form: ChunkingMode.text, + doc_language: 'English', + file: { id: 'file-1', name: 'test.pdf', extension: 'pdf' }, + notion_page: createMockNotionPage(), + website_page: createMockWebsitePage(), + dataset_process_rule: { + mode: ProcessMode.general, + rules: { + segmentation: { separator: '\\n\\n', max_tokens: 1024, chunk_overlap: 50 }, + pre_processing_rules: [{ id: 'remove_extra_spaces', enabled: true }], + }, + }, + ...overrides, +} as FullDocumentDetail) + +const createMockRules = (overrides?: Partial): Rules => ({ + segmentation: { separator: '\\n\\n', max_tokens: 1024, chunk_overlap: 50 }, + pre_processing_rules: [ + { id: 'remove_extra_spaces', enabled: true }, + { id: 'remove_urls_emails', enabled: false }, + ], + parent_mode: 'paragraph', + subchunk_segmentation: { separator: '\\n', max_tokens: 512 }, + ...overrides, +}) + +const createMockEstimate = (overrides?: Partial): FileIndexingEstimateResponse => ({ + total_segments: 10, + total_nodes: 10, + tokens: 5000, + total_price: 0.01, + currency: 'USD', + qa_preview: [{ question: 'Q1', answer: 'A1' }], + preview: [{ content: 'Chunk 1 content', child_chunks: ['Child 1', 'Child 2'] }], + ...overrides, +}) + +// ============================================ +// Utility Functions Tests (escape/unescape) +// ============================================ + +describe('escape utility', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + // Tests for escape function + describe('escape function', () => { + it('should return empty string for null/undefined input', () => { + expect(escape(null as unknown as string)).toBe('') + expect(escape(undefined as unknown as string)).toBe('') + expect(escape('')).toBe('') + }) + + it('should escape newline characters', () => { + expect(escape('\n')).toBe('\\n') + expect(escape('\r')).toBe('\\r') + expect(escape('\n\r')).toBe('\\n\\r') + }) + + it('should escape tab characters', () => { + expect(escape('\t')).toBe('\\t') + }) + + it('should escape other special characters', () => { + expect(escape('\0')).toBe('\\0') + expect(escape('\b')).toBe('\\b') + expect(escape('\f')).toBe('\\f') + expect(escape('\v')).toBe('\\v') + }) + + it('should escape single quotes', () => { + expect(escape('\'')).toBe('\\\'') + }) + + it('should handle mixed content', () => { + expect(escape('Hello\nWorld\t!')).toBe('Hello\\nWorld\\t!') + }) + + it('should not escape regular characters', () => { + expect(escape('Hello World')).toBe('Hello World') + expect(escape('abc123')).toBe('abc123') + }) + + it('should return empty string for non-string input', () => { + expect(escape(123 as unknown as string)).toBe('') + expect(escape({} as unknown as string)).toBe('') + }) + }) +}) + +describe('unescape utility', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + // Tests for unescape function + describe('unescape function', () => { + it('should unescape newline characters', () => { + expect(unescape('\\n')).toBe('\n') + expect(unescape('\\r')).toBe('\r') + }) + + it('should unescape tab characters', () => { + expect(unescape('\\t')).toBe('\t') + }) + + it('should unescape other special characters', () => { + expect(unescape('\\0')).toBe('\0') + expect(unescape('\\b')).toBe('\b') + expect(unescape('\\f')).toBe('\f') + expect(unescape('\\v')).toBe('\v') + }) + + it('should unescape single and double quotes', () => { + expect(unescape('\\\'')).toBe('\'') + expect(unescape('\\"')).toBe('"') + }) + + it('should unescape backslash', () => { + expect(unescape('\\\\')).toBe('\\') + }) + + it('should unescape hex sequences', () => { + expect(unescape('\\x41')).toBe('A') // 0x41 = 65 = 'A' + expect(unescape('\\x5A')).toBe('Z') // 0x5A = 90 = 'Z' + }) + + it('should unescape short hex (2-digit) sequences', () => { + // Short hex format: \xNN (2 hexadecimal digits) + expect(unescape('\\xA5')).toBe('¥') // Yen sign + expect(unescape('\\x7F')).toBe('\x7F') // Delete character + expect(unescape('\\x00')).toBe('\x00') // Null character via hex + }) + + it('should unescape octal sequences', () => { + expect(unescape('\\101')).toBe('A') // Octal 101 = 65 = 'A' + expect(unescape('\\132')).toBe('Z') // Octal 132 = 90 = 'Z' + expect(unescape('\\7')).toBe('\x07') // Single digit octal + }) + + it('should unescape unicode sequences', () => { + expect(unescape('\\u0041')).toBe('A') + expect(unescape('\\u{41}')).toBe('A') + }) + + it('should unescape Python-style unicode', () => { + expect(unescape('\\U00000041')).toBe('A') + }) + + it('should handle mixed content', () => { + expect(unescape('Hello\\nWorld\\t!')).toBe('Hello\nWorld\t!') + }) + + it('should not modify regular text', () => { + expect(unescape('Hello World')).toBe('Hello World') + }) + }) +}) + +// ============================================ +// useSegmentationState Hook Tests +// ============================================ + +describe('useSegmentationState', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + // Tests for initial state + describe('Initial State', () => { + it('should initialize with default values', () => { + const { result } = renderHook(() => useSegmentationState()) + + expect(result.current.segmentationType).toBe(ProcessMode.general) + expect(result.current.segmentIdentifier).toBe(DEFAULT_SEGMENT_IDENTIFIER) + expect(result.current.maxChunkLength).toBe(DEFAULT_MAXIMUM_CHUNK_LENGTH) + expect(result.current.overlap).toBe(DEFAULT_OVERLAP) + expect(result.current.rules).toEqual([]) + expect(result.current.parentChildConfig).toEqual(defaultParentChildConfig) + }) + + it('should initialize with custom segmentation type', () => { + const { result } = renderHook(() => + useSegmentationState({ initialSegmentationType: ProcessMode.parentChild }), + ) + + expect(result.current.segmentationType).toBe(ProcessMode.parentChild) + }) + }) + + // Tests for state setters + describe('State Management', () => { + it('should update segmentation type', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.setSegmentationType(ProcessMode.parentChild) + }) + + expect(result.current.segmentationType).toBe(ProcessMode.parentChild) + }) + + it('should update max chunk length', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.setMaxChunkLength(2048) + }) + + expect(result.current.maxChunkLength).toBe(2048) + }) + + it('should update overlap', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.setOverlap(100) + }) + + expect(result.current.overlap).toBe(100) + }) + + it('should update rules', () => { + const { result } = renderHook(() => useSegmentationState()) + const newRules: PreProcessingRule[] = [{ id: 'test', enabled: true }] + + act(() => { + result.current.setRules(newRules) + }) + + expect(result.current.rules).toEqual(newRules) + }) + }) + + // Tests for setSegmentIdentifier with escape + describe('setSegmentIdentifier', () => { + it('should escape special characters', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.setSegmentIdentifier('\n\n') + }) + + expect(result.current.segmentIdentifier).toBe('\\n\\n') + }) + + it('should use default when empty and canEmpty is false', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.setSegmentIdentifier('') + }) + + expect(result.current.segmentIdentifier).toBe(DEFAULT_SEGMENT_IDENTIFIER) + }) + + it('should allow empty when canEmpty is true', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.setSegmentIdentifier('', true) + }) + + expect(result.current.segmentIdentifier).toBe('') + }) + }) + + // Tests for toggleRule + describe('toggleRule', () => { + it('should toggle rule enabled state', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.setRules([ + { id: 'rule1', enabled: true }, + { id: 'rule2', enabled: false }, + ]) + }) + + act(() => { + result.current.toggleRule('rule1') + }) + + expect(result.current.rules.find(r => r.id === 'rule1')?.enabled).toBe(false) + expect(result.current.rules.find(r => r.id === 'rule2')?.enabled).toBe(false) + }) + + it('should not affect other rules', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.setRules([ + { id: 'rule1', enabled: true }, + { id: 'rule2', enabled: false }, + ]) + }) + + act(() => { + result.current.toggleRule('rule2') + }) + + expect(result.current.rules.find(r => r.id === 'rule1')?.enabled).toBe(true) + expect(result.current.rules.find(r => r.id === 'rule2')?.enabled).toBe(true) + }) + }) + + // Tests for parent-child config + describe('Parent-Child Configuration', () => { + it('should update parent config delimiter with truthy value', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.updateParentConfig('delimiter', '\n\n\n') + }) + + expect(result.current.parentChildConfig.parent.delimiter).toBe('\\n\\n\\n') + }) + + it('should update parent config delimiter with empty value', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.updateParentConfig('delimiter', '') + }) + + expect(result.current.parentChildConfig.parent.delimiter).toBe('') + }) + + it('should update parent config maxLength', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.updateParentConfig('maxLength', 2048) + }) + + expect(result.current.parentChildConfig.parent.maxLength).toBe(2048) + }) + + it('should update child config delimiter with truthy value', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.updateChildConfig('delimiter', '\n') + }) + + expect(result.current.parentChildConfig.child.delimiter).toBe('\\n') + }) + + it('should update child config delimiter with empty value', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.updateChildConfig('delimiter', '') + }) + + expect(result.current.parentChildConfig.child.delimiter).toBe('') + }) + + it('should update child config maxLength', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.updateChildConfig('maxLength', 256) + }) + + expect(result.current.parentChildConfig.child.maxLength).toBe(256) + }) + + it('should set chunk for context mode', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.setChunkForContext('full-doc') + }) + + expect(result.current.parentChildConfig.chunkForContext).toBe('full-doc') + }) + }) + + // Tests for resetToDefaults + describe('resetToDefaults', () => { + it('should reset to default config when available', () => { + const { result } = renderHook(() => useSegmentationState()) + + // Set non-default values and default config + act(() => { + result.current.setMaxChunkLength(2048) + result.current.setOverlap(100) + result.current.setDefaultConfig(createMockRules()) + }) + + // Reset - should use default config values + act(() => { + result.current.resetToDefaults() + }) + + expect(result.current.maxChunkLength).toBe(1024) + expect(result.current.overlap).toBe(50) + expect(result.current.parentChildConfig).toEqual(defaultParentChildConfig) + }) + + it('should only reset parentChildConfig when no default config', () => { + const { result } = renderHook(() => useSegmentationState()) + + // Set non-default values without setting defaultConfig + act(() => { + result.current.setMaxChunkLength(2048) + result.current.setOverlap(100) + result.current.setChunkForContext('full-doc') + }) + + // Reset - should only reset parentChildConfig since no default config + act(() => { + result.current.resetToDefaults() + }) + + // Values stay the same since no defaultConfig + expect(result.current.maxChunkLength).toBe(2048) + expect(result.current.overlap).toBe(100) + // But parentChildConfig is always reset + expect(result.current.parentChildConfig).toEqual(defaultParentChildConfig) + }) + }) + + // Tests for applyConfigFromRules + describe('applyConfigFromRules', () => { + it('should apply general config from rules', () => { + const { result } = renderHook(() => useSegmentationState()) + const rules = createMockRules({ + segmentation: { separator: '---', max_tokens: 512, chunk_overlap: 25 }, + }) + + act(() => { + result.current.applyConfigFromRules(rules, false) + }) + + expect(result.current.maxChunkLength).toBe(512) + expect(result.current.overlap).toBe(25) + }) + + it('should apply hierarchical config from rules', () => { + const { result } = renderHook(() => useSegmentationState()) + const rules = createMockRules({ + parent_mode: 'paragraph', + subchunk_segmentation: { separator: '\n', max_tokens: 256 }, + }) + + act(() => { + result.current.applyConfigFromRules(rules, true) + }) + + expect(result.current.parentChildConfig.chunkForContext).toBe('paragraph') + expect(result.current.parentChildConfig.child.maxLength).toBe(256) + }) + + it('should apply full hierarchical parent-child config from rules', () => { + const { result } = renderHook(() => useSegmentationState()) + const rules = createMockRules({ + segmentation: { separator: '\n\n', max_tokens: 1024, chunk_overlap: 50 }, + parent_mode: 'full-doc', + subchunk_segmentation: { separator: '\n', max_tokens: 128 }, + }) + + act(() => { + result.current.applyConfigFromRules(rules, true) + }) + + // Should set parent config from segmentation + expect(result.current.parentChildConfig.parent.delimiter).toBe('\\n\\n') + expect(result.current.parentChildConfig.parent.maxLength).toBe(1024) + // Should set child config from subchunk_segmentation + expect(result.current.parentChildConfig.child.delimiter).toBe('\\n') + expect(result.current.parentChildConfig.child.maxLength).toBe(128) + // Should set chunkForContext + expect(result.current.parentChildConfig.chunkForContext).toBe('full-doc') + }) + }) + + // Tests for getProcessRule + describe('getProcessRule', () => { + it('should return general process rule', () => { + const { result } = renderHook(() => useSegmentationState()) + + const processRule = result.current.getProcessRule(ChunkingMode.text) + + expect(processRule.mode).toBe(ProcessMode.general) + expect(processRule.rules.segmentation.max_tokens).toBe(DEFAULT_MAXIMUM_CHUNK_LENGTH) + }) + + it('should return hierarchical process rule for parent-child', () => { + const { result } = renderHook(() => useSegmentationState()) + + const processRule = result.current.getProcessRule(ChunkingMode.parentChild) + + expect(processRule.mode).toBe('hierarchical') + expect(processRule.rules.parent_mode).toBe('paragraph') + expect(processRule.rules.subchunk_segmentation).toBeDefined() + }) + }) +}) + +// ============================================ +// useIndexingConfig Hook Tests +// ============================================ + +describe('useIndexingConfig', () => { + beforeEach(() => { + vi.clearAllMocks() + mockIsRerankDefaultModelValid = true + }) + + // Tests for initial state + // Note: Hook has useEffect that syncs state, so we test the state after effects settle + describe('Initial State', () => { + it('should initialize with QUALIFIED when API key is set', async () => { + const { result } = renderHook(() => + useIndexingConfig({ isAPIKeySet: true, hasSetIndexType: false }), + ) + + // After effects settle, indexType should be QUALIFIED + await vi.waitFor(() => { + expect(result.current.indexType).toBe(IndexingType.QUALIFIED) + }) + }) + + it('should initialize with ECONOMICAL when API key is not set', async () => { + const { result } = renderHook(() => + useIndexingConfig({ isAPIKeySet: false, hasSetIndexType: false }), + ) + + await vi.waitFor(() => { + expect(result.current.indexType).toBe(IndexingType.ECONOMICAL) + }) + }) + + it('should use initial index type when provided', async () => { + const { result } = renderHook(() => + useIndexingConfig({ + isAPIKeySet: false, + hasSetIndexType: true, + initialIndexType: IndexingType.QUALIFIED, + }), + ) + + await vi.waitFor(() => { + expect(result.current.indexType).toBe(IndexingType.QUALIFIED) + }) + }) + }) + + // Tests for state setters + describe('State Management', () => { + it('should update index type', async () => { + const { result } = renderHook(() => + useIndexingConfig({ isAPIKeySet: true, hasSetIndexType: false }), + ) + + // Wait for initial effects to settle + await vi.waitFor(() => { + expect(result.current.indexType).toBeDefined() + }) + + act(() => { + result.current.setIndexType(IndexingType.ECONOMICAL) + }) + + expect(result.current.indexType).toBe(IndexingType.ECONOMICAL) + }) + + it('should update embedding model', async () => { + const { result } = renderHook(() => + useIndexingConfig({ isAPIKeySet: true, hasSetIndexType: false }), + ) + + await vi.waitFor(() => { + expect(result.current.embeddingModel).toBeDefined() + }) + + act(() => { + result.current.setEmbeddingModel({ provider: 'cohere', model: 'embed-v3' }) + }) + + expect(result.current.embeddingModel).toEqual({ provider: 'cohere', model: 'embed-v3' }) + }) + + it('should update retrieval config', async () => { + const { result } = renderHook(() => + useIndexingConfig({ isAPIKeySet: true, hasSetIndexType: false }), + ) + + await vi.waitFor(() => { + expect(result.current.retrievalConfig).toBeDefined() + }) + + const newConfig: RetrievalConfig = { + search_method: RETRIEVE_METHOD.hybrid, + reranking_enable: true, + reranking_model: { reranking_provider_name: 'cohere', reranking_model_name: 'rerank-v3' }, + top_k: 5, + score_threshold_enabled: true, + score_threshold: 0.7, + } + + act(() => { + result.current.setRetrievalConfig(newConfig) + }) + + expect(result.current.retrievalConfig).toEqual(newConfig) + }) + }) + + // Tests for getIndexingTechnique + describe('getIndexingTechnique', () => { + it('should return initial type when set', async () => { + const { result } = renderHook(() => + useIndexingConfig({ + isAPIKeySet: true, + hasSetIndexType: true, + initialIndexType: IndexingType.ECONOMICAL, + }), + ) + + await vi.waitFor(() => { + expect(result.current.getIndexingTechnique()).toBe(IndexingType.ECONOMICAL) + }) + }) + + it('should return current type when no initial type', async () => { + const { result } = renderHook(() => + useIndexingConfig({ isAPIKeySet: true, hasSetIndexType: false }), + ) + + await vi.waitFor(() => { + expect(result.current.indexType).toBeDefined() + }) + + act(() => { + result.current.setIndexType(IndexingType.ECONOMICAL) + }) + + expect(result.current.getIndexingTechnique()).toBe(IndexingType.ECONOMICAL) + }) + }) + + // Tests for initialRetrievalConfig handling + describe('initialRetrievalConfig', () => { + it('should skip retrieval config sync when initialRetrievalConfig is provided', async () => { + const customRetrievalConfig: RetrievalConfig = { + search_method: RETRIEVE_METHOD.hybrid, + reranking_enable: true, + reranking_model: { reranking_provider_name: 'custom', reranking_model_name: 'custom-model' }, + top_k: 10, + score_threshold_enabled: true, + score_threshold: 0.8, + } + + const { result } = renderHook(() => + useIndexingConfig({ + isAPIKeySet: true, + hasSetIndexType: false, + initialRetrievalConfig: customRetrievalConfig, + }), + ) + + await vi.waitFor(() => { + expect(result.current.retrievalConfig).toBeDefined() + }) + + // Should use the provided initial config, not the default synced one + expect(result.current.retrievalConfig.search_method).toBe(RETRIEVE_METHOD.hybrid) + expect(result.current.retrievalConfig.top_k).toBe(10) + }) + }) +}) + +// ============================================ +// usePreviewState Hook Tests +// ============================================ + +describe('usePreviewState', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + const defaultOptions = { + dataSourceType: DataSourceType.FILE, + files: [createMockFile()], + notionPages: [createMockNotionPage()], + websitePages: [createMockWebsitePage()], + } + + // Tests for initial state + describe('Initial State', () => { + it('should initialize with first file for FILE data source', () => { + const { result } = renderHook(() => usePreviewState(defaultOptions)) + + expect(result.current.previewFile).toEqual(defaultOptions.files[0]) + }) + + it('should initialize with first notion page for NOTION data source', () => { + const { result } = renderHook(() => + usePreviewState({ ...defaultOptions, dataSourceType: DataSourceType.NOTION }), + ) + + expect(result.current.previewNotionPage).toEqual(defaultOptions.notionPages[0]) + }) + + it('should initialize with document detail when provided', () => { + const documentDetail = createMockDocumentDetail() + const { result } = renderHook(() => + usePreviewState({ + ...defaultOptions, + documentDetail, + datasetId: 'test-id', + }), + ) + + expect(result.current.previewFile).toEqual(documentDetail.file) + }) + }) + + // Tests for getPreviewPickerItems + describe('getPreviewPickerItems', () => { + it('should return files for FILE data source', () => { + const { result } = renderHook(() => usePreviewState(defaultOptions)) + + const items = result.current.getPreviewPickerItems() + expect(items).toEqual(defaultOptions.files) + }) + + it('should return mapped notion pages for NOTION data source', () => { + const { result } = renderHook(() => + usePreviewState({ ...defaultOptions, dataSourceType: DataSourceType.NOTION }), + ) + + const items = result.current.getPreviewPickerItems() + expect(items[0]).toEqual({ + id: 'notion-page-1', + name: 'Test Notion Page', + extension: 'md', + }) + }) + + it('should return mapped website pages for WEB data source', () => { + const { result } = renderHook(() => + usePreviewState({ ...defaultOptions, dataSourceType: DataSourceType.WEB }), + ) + + const items = result.current.getPreviewPickerItems() + expect(items[0]).toEqual({ + id: 'https://example.com/page1', + name: 'Test Website Page', + extension: 'md', + }) + }) + + it('should return empty array for unknown data source', () => { + const { result } = renderHook(() => + usePreviewState({ ...defaultOptions, dataSourceType: 'unknown' as DataSourceType }), + ) + + const items = result.current.getPreviewPickerItems() + expect(items).toEqual([]) + }) + }) + + // Tests for getPreviewPickerValue + describe('getPreviewPickerValue', () => { + it('should return file value for FILE data source', () => { + const { result } = renderHook(() => usePreviewState(defaultOptions)) + + const value = result.current.getPreviewPickerValue() + expect(value).toEqual(defaultOptions.files[0]) + }) + + it('should return mapped notion page value for NOTION data source', () => { + const notionPage = createMockNotionPage({ page_id: 'page-123', page_name: 'My Page' }) + const { result } = renderHook(() => + usePreviewState({ + ...defaultOptions, + dataSourceType: DataSourceType.NOTION, + notionPages: [notionPage], + }), + ) + + const value = result.current.getPreviewPickerValue() + expect(value).toEqual({ + id: 'page-123', + name: 'My Page', + extension: 'md', + }) + }) + + it('should return mapped website page value for WEB data source', () => { + const websitePage = createMockWebsitePage({ source_url: 'https://test.com', title: 'Test Title' }) + const { result } = renderHook(() => + usePreviewState({ + ...defaultOptions, + dataSourceType: DataSourceType.WEB, + websitePages: [websitePage], + }), + ) + + const value = result.current.getPreviewPickerValue() + expect(value).toEqual({ + id: 'https://test.com', + name: 'Test Title', + extension: 'md', + }) + }) + + it('should return empty value for unknown data source', () => { + const { result } = renderHook(() => + usePreviewState({ ...defaultOptions, dataSourceType: 'unknown' as DataSourceType }), + ) + + const value = result.current.getPreviewPickerValue() + expect(value).toEqual({ id: '', name: '', extension: '' }) + }) + + it('should handle undefined notion page gracefully', () => { + const { result } = renderHook(() => + usePreviewState({ + ...defaultOptions, + dataSourceType: DataSourceType.NOTION, + notionPages: [], + }), + ) + + const value = result.current.getPreviewPickerValue() + expect(value).toEqual({ + id: '', + name: '', + extension: 'md', + }) + }) + + it('should handle undefined website page gracefully', () => { + const { result } = renderHook(() => + usePreviewState({ + ...defaultOptions, + dataSourceType: DataSourceType.WEB, + websitePages: [], + }), + ) + + const value = result.current.getPreviewPickerValue() + expect(value).toEqual({ + id: '', + name: '', + extension: 'md', + }) + }) + }) + + // Tests for handlePreviewChange + describe('handlePreviewChange', () => { + it('should update preview file for FILE data source', () => { + const files = [createMockFile(), createMockFile({ id: 'file-2', name: 'second.pdf' })] + const { result } = renderHook(() => + usePreviewState({ ...defaultOptions, files }), + ) + + act(() => { + result.current.handlePreviewChange({ id: 'file-2', name: 'second.pdf' }) + }) + + expect(result.current.previewFile).toEqual({ id: 'file-2', name: 'second.pdf' }) + }) + + it('should update preview notion page for NOTION data source', () => { + const notionPages = [ + createMockNotionPage(), + createMockNotionPage({ page_id: 'notion-page-2', page_name: 'Second Page' }), + ] + const { result } = renderHook(() => + usePreviewState({ ...defaultOptions, dataSourceType: DataSourceType.NOTION, notionPages }), + ) + + act(() => { + result.current.handlePreviewChange({ id: 'notion-page-2', name: 'Second Page' }) + }) + + expect(result.current.previewNotionPage?.page_id).toBe('notion-page-2') + }) + + it('should update preview website page for WEB data source', () => { + const websitePages = [ + createMockWebsitePage(), + createMockWebsitePage({ source_url: 'https://example.com/page2', title: 'Second Page' }), + ] + const { result } = renderHook(() => + usePreviewState({ ...defaultOptions, dataSourceType: DataSourceType.WEB, websitePages }), + ) + + act(() => { + result.current.handlePreviewChange({ id: 'https://example.com/page2', name: 'Second Page' }) + }) + + expect(result.current.previewWebsitePage?.source_url).toBe('https://example.com/page2') + }) + }) +}) + +// ============================================ +// useDocumentCreation Hook Tests +// ============================================ + +describe('useDocumentCreation', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + const defaultOptions = { + dataSourceType: DataSourceType.FILE, + files: [createMockFile()], + notionPages: [] as NotionPage[], + notionCredentialId: '', + websitePages: [] as CrawlResultItem[], + } + + // Tests for validateParams + describe('validateParams', () => { + it('should return false when overlap exceeds max chunk length', () => { + const { result } = renderHook(() => useDocumentCreation(defaultOptions)) + + const isValid = result.current.validateParams({ + segmentationType: 'general', + maxChunkLength: 100, + limitMaxChunkLength: 4000, + overlap: 200, + indexType: IndexingType.QUALIFIED, + embeddingModel: { provider: 'openai', model: 'text-embedding-ada-002' }, + rerankModelList: [], + retrievalConfig: { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }, + }) + + expect(isValid).toBe(false) + }) + + it('should return false when max chunk length exceeds limit', () => { + const { result } = renderHook(() => useDocumentCreation(defaultOptions)) + + const isValid = result.current.validateParams({ + segmentationType: 'general', + maxChunkLength: 5000, + limitMaxChunkLength: 4000, + overlap: 50, + indexType: IndexingType.QUALIFIED, + embeddingModel: { provider: 'openai', model: 'text-embedding-ada-002' }, + rerankModelList: [], + retrievalConfig: { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }, + }) + + expect(isValid).toBe(false) + }) + + it('should return true for valid params', () => { + const { result } = renderHook(() => useDocumentCreation(defaultOptions)) + + const isValid = result.current.validateParams({ + segmentationType: 'general', + maxChunkLength: 1000, + limitMaxChunkLength: 4000, + overlap: 50, + indexType: IndexingType.QUALIFIED, + embeddingModel: { provider: 'openai', model: 'text-embedding-ada-002' }, + rerankModelList: [], + retrievalConfig: { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }, + }) + + expect(isValid).toBe(true) + }) + }) + + // Tests for buildCreationParams + describe('buildCreationParams', () => { + it('should build params for file upload', () => { + const { result } = renderHook(() => useDocumentCreation(defaultOptions)) + + const params = result.current.buildCreationParams( + ChunkingMode.text, + 'English', + { mode: ProcessMode.general, rules: createMockRules() }, + { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }, + { provider: 'openai', model: 'text-embedding-ada-002' }, + IndexingType.QUALIFIED, + ) + + expect(params).toBeDefined() + expect(params?.doc_form).toBe(ChunkingMode.text) + expect(params?.doc_language).toBe('English') + expect(params?.data_source?.type).toBe(DataSourceType.FILE) + }) + + it('should build params for setting mode', () => { + const documentDetail = createMockDocumentDetail() + const { result } = renderHook(() => + useDocumentCreation({ + ...defaultOptions, + isSetting: true, + documentDetail, + }), + ) + + const params = result.current.buildCreationParams( + ChunkingMode.text, + 'English', + { mode: ProcessMode.general, rules: createMockRules() }, + { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }, + { provider: 'openai', model: 'text-embedding-ada-002' }, + IndexingType.QUALIFIED, + ) + + expect(params?.original_document_id).toBe(documentDetail.id) + }) + + it('should build params for notion_import data source', () => { + const { result } = renderHook(() => + useDocumentCreation({ + ...defaultOptions, + dataSourceType: DataSourceType.NOTION, + notionPages: [createMockNotionPage()], + notionCredentialId: 'notion-cred-123', + }), + ) + + const params = result.current.buildCreationParams( + ChunkingMode.text, + 'English', + { mode: ProcessMode.general, rules: createMockRules() }, + { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }, + { provider: 'openai', model: 'text-embedding-ada-002' }, + IndexingType.QUALIFIED, + ) + + expect(params).toBeDefined() + expect(params?.data_source?.type).toBe(DataSourceType.NOTION) + expect(params?.data_source?.info_list.notion_info_list).toBeDefined() + }) + + it('should build params for website_crawl data source', () => { + const { result } = renderHook(() => + useDocumentCreation({ + ...defaultOptions, + dataSourceType: DataSourceType.WEB, + websitePages: [createMockWebsitePage()], + websiteCrawlProvider: 'jinaReader' as DataSourceProvider, + websiteCrawlJobId: 'job-123', + crawlOptions: { max_depth: 2 } as CrawlOptions, + }), + ) + + const params = result.current.buildCreationParams( + ChunkingMode.text, + 'English', + { mode: ProcessMode.general, rules: createMockRules() }, + { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }, + { provider: 'openai', model: 'text-embedding-ada-002' }, + IndexingType.QUALIFIED, + ) + + expect(params).toBeDefined() + expect(params?.data_source?.type).toBe(DataSourceType.WEB) + expect(params?.data_source?.info_list.website_info_list).toBeDefined() + }) + }) + + // Tests for validateParams edge cases + describe('validateParams - additional cases', () => { + it('should return false when embedding model is missing for QUALIFIED index type', () => { + const { result } = renderHook(() => useDocumentCreation(defaultOptions)) + + const isValid = result.current.validateParams({ + segmentationType: 'general', + maxChunkLength: 500, + limitMaxChunkLength: 4000, + overlap: 50, + indexType: IndexingType.QUALIFIED, + embeddingModel: { provider: '', model: '' }, + rerankModelList: mockRerankModelList, + retrievalConfig: { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }, + }) + + expect(isValid).toBe(false) + }) + + it('should return false when rerank model is required but not selected', () => { + const { result } = renderHook(() => useDocumentCreation(defaultOptions)) + + // isReRankModelSelected returns false when: + // - indexMethod === 'high_quality' (IndexingType.QUALIFIED) + // - reranking_enable === true + // - rerankModelSelected === false (model not found in list) + const isValid = result.current.validateParams({ + segmentationType: 'general', + maxChunkLength: 500, + limitMaxChunkLength: 4000, + overlap: 50, + indexType: IndexingType.QUALIFIED, + embeddingModel: { provider: 'openai', model: 'text-embedding-ada-002' }, + rerankModelList: [], // Empty list means model won't be found + retrievalConfig: { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: true, // Reranking enabled + reranking_model: { + reranking_provider_name: 'nonexistent', + reranking_model_name: 'nonexistent-model', + }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }, + }) + + expect(isValid).toBe(false) + }) + }) + + // Tests for executeCreation + describe('executeCreation', () => { + it('should call createFirstDocumentMutation when datasetId is not provided', async () => { + const mockOnStepChange = vi.fn() + const mockUpdateIndexingTypeCache = vi.fn() + const mockUpdateResultCache = vi.fn() + const mockUpdateRetrievalMethodCache = vi.fn() + const mockOnSave = vi.fn() + + const { result } = renderHook(() => + useDocumentCreation({ + ...defaultOptions, + datasetId: undefined, + onStepChange: mockOnStepChange, + updateIndexingTypeCache: mockUpdateIndexingTypeCache, + updateResultCache: mockUpdateResultCache, + updateRetrievalMethodCache: mockUpdateRetrievalMethodCache, + onSave: mockOnSave, + }), + ) + + const params = result.current.buildCreationParams( + ChunkingMode.text, + 'English', + { mode: ProcessMode.general, rules: createMockRules() }, + { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }, + { provider: 'openai', model: 'text-embedding-ada-002' }, + IndexingType.QUALIFIED, + ) + + await act(async () => { + await result.current.executeCreation(params!, IndexingType.QUALIFIED, { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }) + }) + + expect(mockOnStepChange).toHaveBeenCalledWith(1) + }) + + it('should call createDocumentMutation when datasetId is provided', async () => { + const mockOnStepChange = vi.fn() + const { result } = renderHook(() => + useDocumentCreation({ + ...defaultOptions, + datasetId: 'existing-dataset-id', + onStepChange: mockOnStepChange, + }), + ) + + const params = result.current.buildCreationParams( + ChunkingMode.text, + 'English', + { mode: ProcessMode.general, rules: createMockRules() }, + { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }, + { provider: 'openai', model: 'text-embedding-ada-002' }, + IndexingType.QUALIFIED, + ) + + await act(async () => { + await result.current.executeCreation(params!, IndexingType.QUALIFIED, { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }) + }) + + expect(mockOnStepChange).toHaveBeenCalledWith(1) + }) + + it('should call onSave when in setting mode', async () => { + const mockOnSave = vi.fn() + const documentDetail = createMockDocumentDetail() + const { result } = renderHook(() => + useDocumentCreation({ + ...defaultOptions, + datasetId: 'existing-dataset-id', + isSetting: true, + documentDetail, + onSave: mockOnSave, + }), + ) + + const params = result.current.buildCreationParams( + ChunkingMode.text, + 'English', + { mode: ProcessMode.general, rules: createMockRules() }, + { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }, + { provider: 'openai', model: 'text-embedding-ada-002' }, + IndexingType.QUALIFIED, + ) + + await act(async () => { + await result.current.executeCreation(params!, IndexingType.QUALIFIED, { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }) + }) + + expect(mockOnSave).toHaveBeenCalled() + }) + }) + + // Tests for validatePreviewParams + describe('validatePreviewParams', () => { + it('should return true for valid max chunk length', () => { + const { result } = renderHook(() => useDocumentCreation(defaultOptions)) + + const isValid = result.current.validatePreviewParams(1000) + expect(isValid).toBe(true) + }) + + it('should return false when max chunk length exceeds maximum', () => { + const { result } = renderHook(() => useDocumentCreation(defaultOptions)) + + const isValid = result.current.validatePreviewParams(10000) + expect(isValid).toBe(false) + }) + }) +}) + +// ============================================ +// useIndexingEstimate Hook Tests +// ============================================ + +describe('useIndexingEstimate', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + const defaultOptions = { + dataSourceType: DataSourceType.FILE, + currentDocForm: ChunkingMode.text, + docLanguage: 'English', + files: [createMockFile()], + previewNotionPage: createMockNotionPage(), + notionCredentialId: '', + previewWebsitePage: createMockWebsitePage(), + indexingTechnique: IndexingType.QUALIFIED, + processRule: { mode: ProcessMode.general, rules: createMockRules() }, + } + + // Tests for initial state + describe('Initial State', () => { + it('should initialize with idle state', () => { + const { result } = renderHook(() => useIndexingEstimate(defaultOptions)) + + expect(result.current.isIdle).toBe(true) + expect(result.current.isPending).toBe(false) + expect(result.current.estimate).toBeUndefined() + }) + }) + + // Tests for fetchEstimate + describe('fetchEstimate', () => { + it('should have fetchEstimate function', () => { + const { result } = renderHook(() => useIndexingEstimate(defaultOptions)) + + expect(typeof result.current.fetchEstimate).toBe('function') + }) + + it('should have reset function', () => { + const { result } = renderHook(() => useIndexingEstimate(defaultOptions)) + + expect(typeof result.current.reset).toBe('function') + }) + + it('should call fetchEstimate for FILE data source', () => { + const { result } = renderHook(() => + useIndexingEstimate({ + ...defaultOptions, + dataSourceType: DataSourceType.FILE, + previewFileName: 'test-file.pdf', + }), + ) + + act(() => { + result.current.fetchEstimate() + }) + + // fetchEstimate should be callable without error + expect(result.current.fetchEstimate).toBeDefined() + }) + + it('should call fetchEstimate for NOTION data source', () => { + const { result } = renderHook(() => + useIndexingEstimate({ + ...defaultOptions, + dataSourceType: DataSourceType.NOTION, + previewNotionPage: createMockNotionPage(), + notionCredentialId: 'cred-123', + }), + ) + + act(() => { + result.current.fetchEstimate() + }) + + expect(result.current.fetchEstimate).toBeDefined() + }) + + it('should call fetchEstimate for WEB data source', () => { + const { result } = renderHook(() => + useIndexingEstimate({ + ...defaultOptions, + dataSourceType: DataSourceType.WEB, + previewWebsitePage: createMockWebsitePage(), + websiteCrawlProvider: 'jinaReader' as DataSourceProvider, + websiteCrawlJobId: 'job-123', + crawlOptions: { max_depth: 2 } as CrawlOptions, + }), + ) + + act(() => { + result.current.fetchEstimate() + }) + + expect(result.current.fetchEstimate).toBeDefined() + }) + }) + + // Tests for getCurrentMutation based on data source type + describe('Data Source Selection', () => { + it('should use file query for FILE data source', () => { + const { result } = renderHook(() => + useIndexingEstimate({ + ...defaultOptions, + dataSourceType: DataSourceType.FILE, + }), + ) + + expect(result.current.currentMutation).toBeDefined() + expect(result.current.isIdle).toBe(true) + }) + + it('should use notion query for NOTION data source', () => { + const { result } = renderHook(() => + useIndexingEstimate({ + ...defaultOptions, + dataSourceType: DataSourceType.NOTION, + }), + ) + + expect(result.current.currentMutation).toBeDefined() + expect(result.current.isIdle).toBe(true) + }) + + it('should use website query for WEB data source', () => { + const { result } = renderHook(() => + useIndexingEstimate({ + ...defaultOptions, + dataSourceType: DataSourceType.WEB, + websiteCrawlProvider: 'jinaReader' as DataSourceProvider, + websiteCrawlJobId: 'job-123', + }), + ) + + expect(result.current.currentMutation).toBeDefined() + expect(result.current.isIdle).toBe(true) + }) + }) +}) + +// ============================================ +// StepTwoFooter Component Tests +// ============================================ + +describe('StepTwoFooter', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + const defaultProps = { + isSetting: false, + isCreating: false, + onPrevious: vi.fn(), + onCreate: vi.fn(), + onCancel: vi.fn(), + } + + // Tests for rendering + describe('Rendering', () => { + it('should render without crashing', () => { + render() + + // Should render Previous and Next buttons with correct text + expect(screen.getByText(/previousStep/i)).toBeInTheDocument() + expect(screen.getByText(/nextStep/i)).toBeInTheDocument() + }) + + it('should render Previous and Next buttons when not in setting mode', () => { + render() + + expect(screen.getByText(/previousStep/i)).toBeInTheDocument() + expect(screen.getByText(/nextStep/i)).toBeInTheDocument() + }) + + it('should render Save and Cancel buttons when in setting mode', () => { + render() + + expect(screen.getByText(/save/i)).toBeInTheDocument() + expect(screen.getByText(/cancel/i)).toBeInTheDocument() + }) + }) + + // Tests for user interactions + describe('User Interactions', () => { + it('should call onPrevious when Previous button is clicked', () => { + const onPrevious = vi.fn() + render() + + fireEvent.click(screen.getByText(/previousStep/i)) + + expect(onPrevious).toHaveBeenCalledTimes(1) + }) + + it('should call onCreate when Next/Save button is clicked', () => { + const onCreate = vi.fn() + render() + + fireEvent.click(screen.getByText(/nextStep/i)) + + expect(onCreate).toHaveBeenCalledTimes(1) + }) + + it('should call onCancel when Cancel button is clicked in setting mode', () => { + const onCancel = vi.fn() + render() + + fireEvent.click(screen.getByText(/cancel/i)) + + expect(onCancel).toHaveBeenCalledTimes(1) + }) + }) + + // Tests for loading state + describe('Loading State', () => { + it('should show loading state on Next button when creating', () => { + render() + + const nextButton = screen.getByText(/nextStep/i).closest('button') + // Button has disabled:btn-disabled class which handles the loading state + expect(nextButton).toHaveClass('disabled:btn-disabled') + }) + + it('should show loading state on Save button when creating in setting mode', () => { + render() + + const saveButton = screen.getByText(/save/i).closest('button') + // Button has disabled:btn-disabled class which handles the loading state + expect(saveButton).toHaveClass('disabled:btn-disabled') + }) + }) +}) + +// ============================================ +// PreviewPanel Component Tests +// ============================================ + +describe('PreviewPanel', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + const defaultProps = { + isMobile: false, + dataSourceType: DataSourceType.FILE, + currentDocForm: ChunkingMode.text, + estimate: undefined as FileIndexingEstimateResponse | undefined, + parentChildConfig: defaultParentChildConfig, + isSetting: false, + pickerFiles: [{ id: 'file-1', name: 'test.pdf', extension: 'pdf' }], + pickerValue: { id: 'file-1', name: 'test.pdf', extension: 'pdf' }, + isIdle: true, + isPending: false, + onPickerChange: vi.fn(), + } + + // Tests for rendering + describe('Rendering', () => { + it('should render without crashing', () => { + render() + + // Check for the preview header title text + expect(screen.getByText('datasetCreation.stepTwo.preview')).toBeInTheDocument() + }) + + it('should render idle state when isIdle is true', () => { + render() + + expect(screen.getByText(/previewChunkTip/i)).toBeInTheDocument() + }) + + it('should render loading skeleton when isPending is true', () => { + render() + + // Should show skeleton containers + expect(screen.queryByText(/previewChunkTip/i)).not.toBeInTheDocument() + }) + }) + + // Tests for different doc forms + describe('Preview Content', () => { + it('should render text preview when docForm is text', () => { + const estimate = createMockEstimate() + render( + , + ) + + expect(screen.getByText('Chunk 1 content')).toBeInTheDocument() + }) + + it('should render QA preview when docForm is qa', () => { + const estimate = createMockEstimate() + render( + , + ) + + expect(screen.getByText('Q1')).toBeInTheDocument() + expect(screen.getByText('A1')).toBeInTheDocument() + }) + + it('should show chunk count badge for non-QA doc form', () => { + const estimate = createMockEstimate({ total_segments: 25 }) + render( + , + ) + + expect(screen.getByText(/25/)).toBeInTheDocument() + }) + + it('should render parent-child preview when docForm is parentChild', () => { + const estimate = createMockEstimate({ + preview: [ + { content: 'Parent chunk content', child_chunks: ['Child 1', 'Child 2', 'Child 3'] }, + ], + }) + render( + , + ) + + // Should render parent chunk label + expect(screen.getByText('Chunk-1')).toBeInTheDocument() + // Should render child chunks + expect(screen.getByText('Child 1')).toBeInTheDocument() + expect(screen.getByText('Child 2')).toBeInTheDocument() + expect(screen.getByText('Child 3')).toBeInTheDocument() + }) + + it('should limit child chunks when chunkForContext is full-doc', () => { + // FULL_DOC_PREVIEW_LENGTH is 50, so we need more than 50 chunks to test the limit + const manyChildChunks = Array.from({ length: 60 }, (_, i) => `ChildChunk${i + 1}`) + const estimate = createMockEstimate({ + preview: [{ content: 'Parent content', child_chunks: manyChildChunks }], + }) + render( + , + ) + + // Should render parent chunk + expect(screen.getByText('Chunk-1')).toBeInTheDocument() + // full-doc mode limits to FULL_DOC_PREVIEW_LENGTH (50) + expect(screen.getByText('ChildChunk1')).toBeInTheDocument() + expect(screen.getByText('ChildChunk50')).toBeInTheDocument() + // Should not render beyond the limit + expect(screen.queryByText('ChildChunk51')).not.toBeInTheDocument() + }) + + it('should render multiple parent chunks in parent-child mode', () => { + const estimate = createMockEstimate({ + preview: [ + { content: 'Parent 1', child_chunks: ['P1-C1'] }, + { content: 'Parent 2', child_chunks: ['P2-C1'] }, + ], + }) + render( + , + ) + + expect(screen.getByText('Chunk-1')).toBeInTheDocument() + expect(screen.getByText('Chunk-2')).toBeInTheDocument() + expect(screen.getByText('P1-C1')).toBeInTheDocument() + expect(screen.getByText('P2-C1')).toBeInTheDocument() + }) + }) + + // Tests for picker + describe('Document Picker', () => { + it('should call onPickerChange when document is selected', () => { + const onPickerChange = vi.fn() + render() + + // The picker interaction would be tested through the actual component + expect(onPickerChange).not.toHaveBeenCalled() + }) + }) +}) + +// ============================================ +// Edge Cases Tests +// ============================================ + +describe('Edge Cases', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + describe('Empty/Null Values', () => { + it('should handle empty files array in usePreviewState', () => { + const { result } = renderHook(() => + usePreviewState({ + dataSourceType: DataSourceType.FILE, + files: [], + notionPages: [], + websitePages: [], + }), + ) + + expect(result.current.previewFile).toBeUndefined() + }) + + it('should handle empty notion pages array', () => { + const { result } = renderHook(() => + usePreviewState({ + dataSourceType: DataSourceType.NOTION, + files: [], + notionPages: [], + websitePages: [], + }), + ) + + expect(result.current.previewNotionPage).toBeUndefined() + }) + + it('should handle empty website pages array', () => { + const { result } = renderHook(() => + usePreviewState({ + dataSourceType: DataSourceType.WEB, + files: [], + notionPages: [], + websitePages: [], + }), + ) + + expect(result.current.previewWebsitePage).toBeUndefined() + }) + }) + + describe('Boundary Conditions', () => { + it('should handle very large chunk length', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.setMaxChunkLength(999999) + }) + + expect(result.current.maxChunkLength).toBe(999999) + }) + + it('should handle zero overlap', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.setOverlap(0) + }) + + expect(result.current.overlap).toBe(0) + }) + + it('should handle special characters in segment identifier', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.setSegmentIdentifier('<<>>') + }) + + expect(result.current.segmentIdentifier).toBe('<<>>') + }) + }) + + describe('Callback Stability', () => { + it('should maintain stable setSegmentIdentifier reference', () => { + const { result, rerender } = renderHook(() => useSegmentationState()) + const initialSetter = result.current.setSegmentIdentifier + + rerender() + + expect(result.current.setSegmentIdentifier).toBe(initialSetter) + }) + + it('should maintain stable toggleRule reference', () => { + const { result, rerender } = renderHook(() => useSegmentationState()) + const initialToggle = result.current.toggleRule + + rerender() + + expect(result.current.toggleRule).toBe(initialToggle) + }) + + it('should maintain stable getProcessRule reference', () => { + const { result, rerender } = renderHook(() => useSegmentationState()) + + // Update some state to trigger re-render + act(() => { + result.current.setMaxChunkLength(2048) + }) + + rerender() + + // getProcessRule depends on state, so it may change but should remain a function + expect(typeof result.current.getProcessRule).toBe('function') + }) + }) +}) + +// ============================================ +// Integration Scenarios +// ============================================ + +describe('Integration Scenarios', () => { + beforeEach(() => { + vi.clearAllMocks() + mockCurrentDataset = null + }) + + describe('Document Creation Flow', () => { + it('should build and validate params for file upload workflow', () => { + const files = [createMockFile()] + + const { result: segResult } = renderHook(() => useSegmentationState()) + const { result: creationResult } = renderHook(() => + useDocumentCreation({ + dataSourceType: DataSourceType.FILE, + files, + notionPages: [], + notionCredentialId: '', + websitePages: [], + }), + ) + + // Build params + const params = creationResult.current.buildCreationParams( + ChunkingMode.text, + 'English', + segResult.current.getProcessRule(ChunkingMode.text), + { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }, + { provider: 'openai', model: 'text-embedding-ada-002' }, + IndexingType.QUALIFIED, + ) + + expect(params).toBeDefined() + expect(params?.data_source?.info_list.file_info_list?.file_ids).toContain('file-1') + }) + + it('should handle parent-child document form', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.setSegmentationType(ProcessMode.parentChild) + result.current.setChunkForContext('full-doc') + result.current.updateParentConfig('maxLength', 2048) + result.current.updateChildConfig('maxLength', 512) + }) + + const processRule = result.current.getProcessRule(ChunkingMode.parentChild) + + expect(processRule.mode).toBe('hierarchical') + expect(processRule.rules.parent_mode).toBe('full-doc') + expect(processRule.rules.segmentation.max_tokens).toBe(2048) + expect(processRule.rules.subchunk_segmentation?.max_tokens).toBe(512) + }) + }) + + describe('Preview Flow', () => { + it('should handle preview file change flow', () => { + const files = [ + createMockFile({ id: 'file-1', name: 'first.pdf' }), + createMockFile({ id: 'file-2', name: 'second.pdf' }), + ] + + const { result } = renderHook(() => + usePreviewState({ + dataSourceType: DataSourceType.FILE, + files, + notionPages: [], + websitePages: [], + }), + ) + + // Initial state + expect(result.current.getPreviewPickerValue().name).toBe('first.pdf') + + // Change preview + act(() => { + result.current.handlePreviewChange({ id: 'file-2', name: 'second.pdf' }) + }) + + expect(result.current.previewFile).toEqual({ id: 'file-2', name: 'second.pdf' }) + }) + }) + + describe('Escape/Unescape Round Trip', () => { + it('should preserve original string through escape/unescape', () => { + const original = '\n\n' + const escaped = escape(original) + const unescaped = unescape(escaped) + + expect(unescaped).toBe(original) + }) + + it('should handle complex strings without backslashes', () => { + // This string contains control characters but no literal backslashes. + const original = 'Hello\nWorld\t!\r\n' + const escaped = escape(original) + const unescaped = unescape(escaped) + expect(unescaped).toBe(original) + }) + + it('should document behavior for strings with existing backslashes', () => { + // When the original string already contains backslash sequences, + // escape/unescape are not perfectly symmetric because escape() + // does not escape backslashes. + const original = 'Hello\\nWorld' + const escaped = escape(original) + const unescaped = unescape(escaped) + // The unescaped value interprets "\n" as a newline, so it differs from the original. + expect(unescaped).toBe('Hello\nWorld') + expect(unescaped).not.toBe(original) + }) + }) +}) diff --git a/web/app/components/datasets/create/step-two/index.tsx b/web/app/components/datasets/create/step-two/index.tsx index 51b5c15178..b4d2c5f6e9 100644 --- a/web/app/components/datasets/create/step-two/index.tsx +++ b/web/app/components/datasets/create/step-two/index.tsx @@ -1,137 +1,30 @@ 'use client' -import type { FC, PropsWithChildren } from 'react' -import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations' -import type { NotionPage } from '@/models/common' -import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, createDocumentResponse, CustomFile, DocumentItem, FullDocumentDetail, ParentMode, PreProcessingRule, ProcessRule, Rules } from '@/models/datasets' -import type { RetrievalConfig } from '@/types/app' -import { - RiAlertFill, - RiArrowLeftLine, - RiSearchEyeLine, -} from '@remixicon/react' -import { noop } from 'es-toolkit/function' -import Image from 'next/image' -import Link from 'next/link' -import { useCallback, useEffect, useMemo, useState } from 'react' -import { useTranslation } from 'react-i18next' -import { trackEvent } from '@/app/components/base/amplitude' -import Badge from '@/app/components/base/badge' -import Button from '@/app/components/base/button' -import Checkbox from '@/app/components/base/checkbox' -import CustomDialog from '@/app/components/base/dialog' -import Divider from '@/app/components/base/divider' -import FloatRightContainer from '@/app/components/base/float-right-container' -import { ParentChildChunk } from '@/app/components/base/icons/src/vender/knowledge' -import { AlertTriangle } from '@/app/components/base/icons/src/vender/solid/alertsAndFeedback' -import RadioCard from '@/app/components/base/radio-card' -import { SkeletonContainer, SkeletonPoint, SkeletonRectangle, SkeletonRow } from '@/app/components/base/skeleton' -import Toast from '@/app/components/base/toast' -import Tooltip from '@/app/components/base/tooltip' -import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model' -import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config' -import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config' -import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations' -import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks' -import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector' -import { FULL_DOC_PREVIEW_LENGTH, IS_CE_EDITION } from '@/config' +import type { FC } from 'react' +import type { StepTwoProps } from './types' +import { useCallback, useEffect, useState } from 'react' +import { useTranslation } from 'react-i18next' +import Divider from '@/app/components/base/divider' +import Toast from '@/app/components/base/toast' import { useDatasetDetailContextWithSelector } from '@/context/dataset-detail' -import { useDocLink, useLocale } from '@/context/i18n' +import { useLocale } from '@/context/i18n' import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints' import { LanguagesSupported } from '@/i18n-config/language' import { DataSourceProvider } from '@/models/common' -import { ChunkingMode, DataSourceType, ProcessMode } from '@/models/datasets' -import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument, useFetchDefaultProcessRule, useFetchFileIndexingEstimateForFile, useFetchFileIndexingEstimateForNotion, useFetchFileIndexingEstimateForWeb } from '@/service/knowledge/use-create-dataset' -import { useInvalidDatasetList } from '@/service/knowledge/use-dataset' -import { RETRIEVE_METHOD } from '@/types/app' +import { ChunkingMode, ProcessMode } from '@/models/datasets' +import { useFetchDefaultProcessRule } from '@/service/knowledge/use-create-dataset' import { cn } from '@/utils/classnames' -import { ChunkContainer, QAPreview } from '../../chunk' -import PreviewDocumentPicker from '../../common/document-picker/preview-document-picker' -import { PreviewSlice } from '../../formatted-text/flavours/preview-slice' -import { FormattedText } from '../../formatted-text/formatted' -import PreviewContainer from '../../preview/container' -import { PreviewHeader } from '../../preview/header' -import { checkShowMultiModalTip } from '../../settings/utils' -import FileList from '../assets/file-list-3-fill.svg' -import Note from '../assets/note-mod.svg' -import BlueEffect from '../assets/option-card-effect-blue.svg' -import SettingCog from '../assets/setting-gear-mod.svg' -import { indexMethodIcon } from '../icons' -import escape from './escape' -import s from './index.module.css' -import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs' -import LanguageSelect from './language-select' -import { OptionCard } from './option-card' -import unescape from './unescape' +import { GeneralChunkingOptions, IndexingModeSection, ParentChildOptions, PreviewPanel, StepTwoFooter } from './components' +import { IndexingType, MAXIMUM_CHUNK_TOKEN_LENGTH, useDocumentCreation, useIndexingConfig, useIndexingEstimate, usePreviewState, useSegmentationState } from './hooks' -const TextLabel: FC = (props) => { - return -} +export { IndexingType } -type StepTwoProps = { - isSetting?: boolean - documentDetail?: FullDocumentDetail - isAPIKeySet: boolean - onSetting: () => void - datasetId?: string - indexingType?: IndexingType - retrievalMethod?: string - dataSourceType: DataSourceType - files: CustomFile[] - notionPages?: NotionPage[] - notionCredentialId: string - websitePages?: CrawlResultItem[] - crawlOptions?: CrawlOptions - websiteCrawlProvider?: DataSourceProvider - websiteCrawlJobId?: string - onStepChange?: (delta: number) => void - updateIndexingTypeCache?: (type: string) => void - updateRetrievalMethodCache?: (method: RETRIEVE_METHOD | '') => void - updateResultCache?: (res: createDocumentResponse) => void - onSave?: () => void - onCancel?: () => void -} - -export enum IndexingType { - QUALIFIED = 'high_quality', - ECONOMICAL = 'economy', -} - -const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n' -const DEFAULT_MAXIMUM_CHUNK_LENGTH = 1024 -const DEFAULT_OVERLAP = 50 -const MAXIMUM_CHUNK_TOKEN_LENGTH = Number.parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10) - -type ParentChildConfig = { - chunkForContext: ParentMode - parent: { - delimiter: string - maxLength: number - } - child: { - delimiter: string - maxLength: number - } -} - -const defaultParentChildConfig: ParentChildConfig = { - chunkForContext: 'paragraph', - parent: { - delimiter: '\\n\\n', - maxLength: 1024, - }, - child: { - delimiter: '\\n', - maxLength: 512, - }, -} - -const StepTwo = ({ +const StepTwo: FC = ({ isSetting, documentDetail, isAPIKeySet, datasetId, - indexingType, + indexingType: propsIndexingType, dataSourceType: inCreatePageDataSourceType, files, notionPages = [], @@ -146,1099 +39,238 @@ const StepTwo = ({ onSave, onCancel, updateRetrievalMethodCache, -}: StepTwoProps) => { +}) => { const { t } = useTranslation() - const docLink = useDocLink() const locale = useLocale() - const media = useBreakpoints() - const isMobile = media === MediaType.mobile - - const currentDataset = useDatasetDetailContextWithSelector(state => state.dataset) - const mutateDatasetRes = useDatasetDetailContextWithSelector(state => state.mutateDatasetRes) + const isMobile = useBreakpoints() === MediaType.mobile + const currentDataset = useDatasetDetailContextWithSelector(s => s.dataset) + const mutateDatasetRes = useDatasetDetailContextWithSelector(s => s.mutateDatasetRes) + // Computed flags const isInUpload = Boolean(currentDataset) const isUploadInEmptyDataset = isInUpload && !currentDataset?.doc_form const isNotUploadInEmptyDataset = !isUploadInEmptyDataset const isInInit = !isInUpload && !isSetting - const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type) - const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type - const [segmentationType, setSegmentationType] = useState( - currentDataset?.doc_form === ChunkingMode.parentChild ? ProcessMode.parentChild : ProcessMode.general, - ) - const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER) - const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => { - doSetSegmentIdentifier(value ? escape(value) : (canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER)) - }, []) - const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) // default chunk length - const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(MAXIMUM_CHUNK_TOKEN_LENGTH) - const [overlap, setOverlap] = useState(DEFAULT_OVERLAP) - const [rules, setRules] = useState([]) - const [defaultConfig, setDefaultConfig] = useState() - const hasSetIndexType = !!indexingType - const [indexType, setIndexType] = useState(() => { - if (hasSetIndexType) - return indexingType - return isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL - }) + const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : (currentDataset?.data_source_type ?? inCreatePageDataSourceType) + const hasSetIndexType = !!propsIndexingType + const isModelAndRetrievalConfigDisabled = !!datasetId && !!currentDataset?.data_source_type - const [previewFile, setPreviewFile] = useState( - (datasetId && documentDetail) - ? documentDetail.file - : files[0], - ) - const [previewNotionPage, setPreviewNotionPage] = useState( - (datasetId && documentDetail) - ? documentDetail.notion_page - : notionPages[0], - ) - - const [previewWebsitePage, setPreviewWebsitePage] = useState( - (datasetId && documentDetail) - ? documentDetail.website_page - : websitePages[0], - ) - - // QA Related + // Document form state + const [docForm, setDocForm] = useState((datasetId && documentDetail) ? documentDetail.doc_form as ChunkingMode : ChunkingMode.text) + const [docLanguage, setDocLanguage] = useState(() => (datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese Simplified')) const [isQAConfirmDialogOpen, setIsQAConfirmDialogOpen] = useState(false) - const [docForm, setDocForm] = useState( - (datasetId && documentDetail) ? documentDetail.doc_form as ChunkingMode : ChunkingMode.text, - ) - const handleChangeDocform = (value: ChunkingMode) => { - if (value === ChunkingMode.qa && indexType === IndexingType.ECONOMICAL) { - setIsQAConfirmDialogOpen(true) - return - } - if (value === ChunkingMode.parentChild && indexType === IndexingType.ECONOMICAL) - setIndexType(IndexingType.QUALIFIED) - - setDocForm(value) - - if (value === ChunkingMode.parentChild) - setSegmentationType(ProcessMode.parentChild) - else - setSegmentationType(ProcessMode.general) - - // eslint-disable-next-line ts/no-use-before-define - currentEstimateMutation.reset() - } - - const [docLanguage, setDocLanguage] = useState( - (datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese Simplified'), - ) - - const [parentChildConfig, setParentChildConfig] = useState(defaultParentChildConfig) - - const getIndexing_technique = () => indexingType || indexType const currentDocForm = currentDataset?.doc_form || docForm - const getProcessRule = (): ProcessRule => { - if (currentDocForm === ChunkingMode.parentChild) { - return { - rules: { - pre_processing_rules: rules, - segmentation: { - separator: unescape( - parentChildConfig.parent.delimiter, - ), - max_tokens: parentChildConfig.parent.maxLength, - }, - parent_mode: parentChildConfig.chunkForContext, - subchunk_segmentation: { - separator: unescape(parentChildConfig.child.delimiter), - max_tokens: parentChildConfig.child.maxLength, - }, - }, - mode: 'hierarchical', - } as ProcessRule - } - return { - rules: { - pre_processing_rules: rules, - segmentation: { - separator: unescape(segmentIdentifier), - max_tokens: maxChunkLength, - chunk_overlap: overlap, - }, - }, // api will check this. It will be removed after api refactored. - mode: segmentationType, - } as ProcessRule - } - - const fileIndexingEstimateQuery = useFetchFileIndexingEstimateForFile({ - docForm: currentDocForm, - docLanguage, - dataSourceType: DataSourceType.FILE, - files: previewFile - ? [files.find(file => file.name === previewFile.name)!] - : files, - indexingTechnique: getIndexing_technique() as any, - processRule: getProcessRule(), - dataset_id: datasetId!, + // Custom hooks + const segmentation = useSegmentationState({ + initialSegmentationType: currentDataset?.doc_form === ChunkingMode.parentChild ? ProcessMode.parentChild : ProcessMode.general, }) - const notionIndexingEstimateQuery = useFetchFileIndexingEstimateForNotion({ - docForm: currentDocForm, - docLanguage, - dataSourceType: DataSourceType.NOTION, - notionPages: [previewNotionPage], - indexingTechnique: getIndexing_technique() as any, - processRule: getProcessRule(), - dataset_id: datasetId || '', - credential_id: notionCredentialId, + const indexing = useIndexingConfig({ + initialIndexType: propsIndexingType, + initialEmbeddingModel: currentDataset?.embedding_model ? { provider: currentDataset.embedding_model_provider, model: currentDataset.embedding_model } : undefined, + initialRetrievalConfig: currentDataset?.retrieval_model_dict, + isAPIKeySet, + hasSetIndexType, }) - - const websiteIndexingEstimateQuery = useFetchFileIndexingEstimateForWeb({ - docForm: currentDocForm, - docLanguage, - dataSourceType: DataSourceType.WEB, - websitePages: [previewWebsitePage], + const preview = usePreviewState({ dataSourceType, files, notionPages, websitePages, documentDetail, datasetId }) + const creation = useDocumentCreation({ + datasetId, + isSetting, + documentDetail, + dataSourceType, + files, + notionPages, + notionCredentialId, + websitePages, crawlOptions, websiteCrawlProvider, websiteCrawlJobId, - indexingTechnique: getIndexing_technique() as any, - processRule: getProcessRule(), - dataset_id: datasetId || '', + onStepChange, + updateIndexingTypeCache, + updateResultCache, + updateRetrievalMethodCache, + onSave, + mutateDatasetRes, + }) + const estimateHook = useIndexingEstimate({ + dataSourceType, + datasetId, + currentDocForm, + docLanguage, + files, + previewFileName: preview.previewFile?.name, + previewNotionPage: preview.previewNotionPage, + notionCredentialId, + previewWebsitePage: preview.previewWebsitePage, + crawlOptions, + websiteCrawlProvider, + websiteCrawlJobId, + indexingTechnique: indexing.getIndexingTechnique() as IndexingType, + processRule: segmentation.getProcessRule(currentDocForm), }) - const currentEstimateMutation = dataSourceType === DataSourceType.FILE - ? fileIndexingEstimateQuery - : dataSourceType === DataSourceType.NOTION - ? notionIndexingEstimateQuery - : websiteIndexingEstimateQuery + // Fetch default process rule + const fetchDefaultProcessRuleMutation = useFetchDefaultProcessRule({ + onSuccess(data) { + segmentation.setSegmentIdentifier(data.rules.segmentation.separator) + segmentation.setMaxChunkLength(data.rules.segmentation.max_tokens) + segmentation.setOverlap(data.rules.segmentation.chunk_overlap!) + segmentation.setRules(data.rules.pre_processing_rules) + segmentation.setDefaultConfig(data.rules) + segmentation.setLimitMaxChunkLength(data.limits.indexing_max_segmentation_tokens_length) + }, + }) - const fetchEstimate = useCallback(() => { - if (dataSourceType === DataSourceType.FILE) - fileIndexingEstimateQuery.mutate() - - if (dataSourceType === DataSourceType.NOTION) - notionIndexingEstimateQuery.mutate() - - if (dataSourceType === DataSourceType.WEB) - websiteIndexingEstimateQuery.mutate() - }, [dataSourceType, fileIndexingEstimateQuery, notionIndexingEstimateQuery, websiteIndexingEstimateQuery]) - - const estimate - = dataSourceType === DataSourceType.FILE - ? fileIndexingEstimateQuery.data - : dataSourceType === DataSourceType.NOTION - ? notionIndexingEstimateQuery.data - : websiteIndexingEstimateQuery.data - - const getRuleName = (key: string) => { - if (key === 'remove_extra_spaces') - return t('stepTwo.removeExtraSpaces', { ns: 'datasetCreation' }) - - if (key === 'remove_urls_emails') - return t('stepTwo.removeUrlEmails', { ns: 'datasetCreation' }) - - if (key === 'remove_stopwords') - return t('stepTwo.removeStopwords', { ns: 'datasetCreation' }) - } - const ruleChangeHandle = (id: string) => { - const newRules = rules.map((rule) => { - if (rule.id === id) { - return { - id: rule.id, - enabled: !rule.enabled, - } - } - return rule - }) - setRules(newRules) - } - const resetRules = () => { - if (defaultConfig) { - setSegmentIdentifier(defaultConfig.segmentation.separator) - setMaxChunkLength(defaultConfig.segmentation.max_tokens) - setOverlap(defaultConfig.segmentation.chunk_overlap!) - setRules(defaultConfig.pre_processing_rules) + // Event handlers + const handleDocFormChange = useCallback((value: ChunkingMode) => { + if (value === ChunkingMode.qa && indexing.indexType === IndexingType.ECONOMICAL) { + setIsQAConfirmDialogOpen(true) + return } - setParentChildConfig(defaultParentChildConfig) - } + if (value === ChunkingMode.parentChild && indexing.indexType === IndexingType.ECONOMICAL) + indexing.setIndexType(IndexingType.QUALIFIED) + setDocForm(value) + segmentation.setSegmentationType(value === ChunkingMode.parentChild ? ProcessMode.parentChild : ProcessMode.general) + estimateHook.reset() + }, [indexing, segmentation, estimateHook]) - const updatePreview = () => { - if (segmentationType === ProcessMode.general && maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) { + const updatePreview = useCallback(() => { + if (segmentation.segmentationType === ProcessMode.general && segmentation.maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) { Toast.notify({ type: 'error', message: t('stepTwo.maxLengthCheck', { ns: 'datasetCreation', limit: MAXIMUM_CHUNK_TOKEN_LENGTH }) }) return } - fetchEstimate() - } + estimateHook.fetchEstimate() + }, [segmentation, t, estimateHook]) - const { - modelList: rerankModelList, - defaultModel: rerankDefaultModel, - currentModel: isRerankDefaultModelValid, - } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank) - const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding) - const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding) - const [embeddingModel, setEmbeddingModel] = useState( - currentDataset?.embedding_model - ? { - provider: currentDataset.embedding_model_provider, - model: currentDataset.embedding_model, - } - : { - provider: defaultEmbeddingModel?.provider.provider || '', - model: defaultEmbeddingModel?.model || '', - }, - ) - const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || { - search_method: RETRIEVE_METHOD.semantic, - reranking_enable: false, - reranking_model: { - reranking_provider_name: '', - reranking_model_name: '', - }, - top_k: 3, - score_threshold_enabled: false, - score_threshold: 0.5, - } as RetrievalConfig) - - useEffect(() => { - if (currentDataset?.retrieval_model_dict) - return - setRetrievalConfig({ - search_method: RETRIEVE_METHOD.semantic, - reranking_enable: !!isRerankDefaultModelValid, - reranking_model: { - reranking_provider_name: isRerankDefaultModelValid ? rerankDefaultModel?.provider.provider ?? '' : '', - reranking_model_name: isRerankDefaultModelValid ? rerankDefaultModel?.model ?? '' : '', - }, - top_k: 3, - score_threshold_enabled: false, - score_threshold: 0.5, + const handleCreate = useCallback(async () => { + const isValid = creation.validateParams({ + segmentationType: segmentation.segmentationType, + maxChunkLength: segmentation.maxChunkLength, + limitMaxChunkLength: segmentation.limitMaxChunkLength, + overlap: segmentation.overlap, + indexType: indexing.indexType, + embeddingModel: indexing.embeddingModel, + rerankModelList: indexing.rerankModelList, + retrievalConfig: indexing.retrievalConfig, }) - }, [rerankDefaultModel, isRerankDefaultModelValid]) - - const getCreationParams = () => { - let params - if (segmentationType === ProcessMode.general && overlap > maxChunkLength) { - Toast.notify({ type: 'error', message: t('stepTwo.overlapCheck', { ns: 'datasetCreation' }) }) + if (!isValid) return - } - if (segmentationType === ProcessMode.general && maxChunkLength > limitMaxChunkLength) { - Toast.notify({ type: 'error', message: t('stepTwo.maxLengthCheck', { ns: 'datasetCreation', limit: limitMaxChunkLength }) }) - return - } - if (isSetting) { - params = { - original_document_id: documentDetail?.id, - doc_form: currentDocForm, - doc_language: docLanguage, - process_rule: getProcessRule(), - retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page. - embedding_model: embeddingModel.model, // Readonly - embedding_model_provider: embeddingModel.provider, // Readonly - indexing_technique: getIndexing_technique(), - } as CreateDocumentReq - } - else { // create - const indexMethod = getIndexing_technique() - if (indexMethod === IndexingType.QUALIFIED && (!embeddingModel.model || !embeddingModel.provider)) { - Toast.notify({ - type: 'error', - message: t('datasetConfig.embeddingModelRequired', { ns: 'appDebug' }), - }) - return - } - if ( - !isReRankModelSelected({ - rerankModelList, - retrievalConfig, - indexMethod: indexMethod as string, - }) - ) { - Toast.notify({ type: 'error', message: t('datasetConfig.rerankModelRequired', { ns: 'appDebug' }) }) - return - } - params = { - data_source: { - type: dataSourceType, - info_list: { - data_source_type: dataSourceType, - }, - }, - indexing_technique: getIndexing_technique(), - process_rule: getProcessRule(), - doc_form: currentDocForm, - doc_language: docLanguage, - retrieval_model: retrievalConfig, - embedding_model: embeddingModel.model, - embedding_model_provider: embeddingModel.provider, - } as CreateDocumentReq - if (dataSourceType === DataSourceType.FILE) { - params.data_source.info_list.file_info_list = { - file_ids: files.map(file => file.id || '').filter(Boolean), - } - } - if (dataSourceType === DataSourceType.NOTION) - params.data_source.info_list.notion_info_list = getNotionInfo(notionPages, notionCredentialId) - - if (dataSourceType === DataSourceType.WEB) { - params.data_source.info_list.website_info_list = getWebsiteInfo({ - websiteCrawlProvider, - websiteCrawlJobId, - websitePages, - }) - } - } - return params - } - - const fetchDefaultProcessRuleMutation = useFetchDefaultProcessRule({ - onSuccess(data) { - const separator = data.rules.segmentation.separator - setSegmentIdentifier(separator) - setMaxChunkLength(data.rules.segmentation.max_tokens) - setOverlap(data.rules.segmentation.chunk_overlap!) - setRules(data.rules.pre_processing_rules) - setDefaultConfig(data.rules) - setLimitMaxChunkLength(data.limits.indexing_max_segmentation_tokens_length) - }, - }) - - const getRulesFromDetail = () => { - if (documentDetail) { - const rules = documentDetail.dataset_process_rule.rules - const separator = rules.segmentation.separator - const max = rules.segmentation.max_tokens - const overlap = rules.segmentation.chunk_overlap - const isHierarchicalDocument = documentDetail.doc_form === ChunkingMode.parentChild - || (rules.parent_mode && rules.subchunk_segmentation) - setSegmentIdentifier(separator) - setMaxChunkLength(max) - setOverlap(overlap!) - setRules(rules.pre_processing_rules) - setDefaultConfig(rules) - - if (isHierarchicalDocument) { - setParentChildConfig({ - chunkForContext: rules.parent_mode || 'paragraph', - parent: { - delimiter: escape(rules.segmentation.separator), - maxLength: rules.segmentation.max_tokens, - }, - child: { - delimiter: escape(rules.subchunk_segmentation.separator), - maxLength: rules.subchunk_segmentation.max_tokens, - }, - }) - } - } - } - - const getDefaultMode = () => { - if (documentDetail) - setSegmentationType(documentDetail.dataset_process_rule.mode) - } - - const createFirstDocumentMutation = useCreateFirstDocument() - const createDocumentMutation = useCreateDocument(datasetId!) - - const isCreating = createFirstDocumentMutation.isPending || createDocumentMutation.isPending - const invalidDatasetList = useInvalidDatasetList() - - const createHandle = async () => { - const params = getCreationParams() + const params = creation.buildCreationParams(currentDocForm, docLanguage, segmentation.getProcessRule(currentDocForm), indexing.retrievalConfig, indexing.embeddingModel, indexing.getIndexingTechnique()) if (!params) - return false + return + await creation.executeCreation(params, indexing.indexType, indexing.retrievalConfig) + }, [creation, segmentation, indexing, currentDocForm, docLanguage]) - if (!datasetId) { - await createFirstDocumentMutation.mutateAsync( - params, - { - onSuccess(data) { - updateIndexingTypeCache?.(indexType as string) - updateResultCache?.(data) - updateRetrievalMethodCache?.(retrievalConfig.search_method as RETRIEVE_METHOD) - }, - }, - ) - } - else { - await createDocumentMutation.mutateAsync(params, { - onSuccess(data) { - updateIndexingTypeCache?.(indexType as string) - updateResultCache?.(data) - updateRetrievalMethodCache?.(retrievalConfig.search_method as RETRIEVE_METHOD) - }, - }) - } - if (mutateDatasetRes) - mutateDatasetRes() - invalidDatasetList() - trackEvent('create_datasets', { - data_source_type: dataSourceType, - indexing_technique: getIndexing_technique(), - }) - onStepChange?.(+1) - if (isSetting) - onSave?.() - } + const handlePickerChange = useCallback((selected: { id: string, name: string }) => { + estimateHook.reset() + preview.handlePreviewChange(selected) + estimateHook.fetchEstimate() + }, [estimateHook, preview]) + const handleQAConfirm = useCallback(() => { + setIsQAConfirmDialogOpen(false) + indexing.setIndexType(IndexingType.QUALIFIED) + setDocForm(ChunkingMode.qa) + }, [indexing]) + + // Initialize rules useEffect(() => { - // fetch rules if (!isSetting) { fetchDefaultProcessRuleMutation.mutate('/datasets/process-rule') } - else { - getRulesFromDetail() - getDefaultMode() + else if (documentDetail) { + const rules = documentDetail.dataset_process_rule.rules + const isHierarchical = documentDetail.doc_form === ChunkingMode.parentChild || Boolean(rules.parent_mode && rules.subchunk_segmentation) + segmentation.applyConfigFromRules(rules, isHierarchical) + segmentation.setSegmentationType(documentDetail.dataset_process_rule.mode) } + // eslint-disable-next-line react-hooks/exhaustive-deps }, []) - useEffect(() => { - // get indexing type by props - if (indexingType) - setIndexType(indexingType as IndexingType) - else - setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL) - }, [isAPIKeySet, indexingType, datasetId]) - - const isModelAndRetrievalConfigDisabled = !!datasetId && !!currentDataset?.data_source_type - - const showMultiModalTip = useMemo(() => { - return checkShowMultiModalTip({ - embeddingModel, - rerankingEnable: retrievalConfig.reranking_enable, - rerankModel: { - rerankingProviderName: retrievalConfig.reranking_model.reranking_provider_name, - rerankingModelName: retrievalConfig.reranking_model.reranking_model_name, - }, - indexMethod: indexType, - embeddingModelList, - rerankModelList, - }) - }, [embeddingModel, retrievalConfig.reranking_enable, retrievalConfig.reranking_model, indexType, embeddingModelList, rerankModelList]) + // Show options conditions + const showGeneralOption = (isInUpload && [ChunkingMode.text, ChunkingMode.qa].includes(currentDataset!.doc_form)) || isUploadInEmptyDataset || isInInit + const showParentChildOption = (isInUpload && currentDataset!.doc_form === ChunkingMode.parentChild) || isUploadInEmptyDataset || isInInit return (
{t('stepTwo.segmentation', { ns: 'datasetCreation' })}
- {((isInUpload && [ChunkingMode.text, ChunkingMode.qa].includes(currentDataset!.doc_form)) - || isUploadInEmptyDataset - || isInInit) - && ( - } - activeHeaderClassName="bg-dataset-option-card-blue-gradient" - description={t('stepTwo.generalTip', { ns: 'datasetCreation' })} - isActive={ - [ChunkingMode.text, ChunkingMode.qa].includes(currentDocForm) - } - onSwitched={() => - handleChangeDocform(ChunkingMode.text)} - actions={( - <> - - - - )} - noHighlight={isInUpload && isNotUploadInEmptyDataset} - > -
-
- setSegmentIdentifier(e.target.value, true)} - /> - - -
-
-
-
- {t('stepTwo.rules', { ns: 'datasetCreation' })} -
- -
-
- {rules.map(rule => ( -
{ - ruleChangeHandle(rule.id) - }} - > - - -
- ))} - {IS_CE_EDITION && ( - <> - -
-
{ - if (currentDataset?.doc_form) - return - if (docForm === ChunkingMode.qa) - handleChangeDocform(ChunkingMode.text) - else - handleChangeDocform(ChunkingMode.qa) - }} - > - - -
- - -
- {currentDocForm === ChunkingMode.qa && ( -
- - - {t('stepTwo.QATip', { ns: 'datasetCreation' })} - -
- )} - - )} -
-
-
-
+ {showGeneralOption && ( + segmentation.setSegmentIdentifier(value, true)} + onMaxChunkLengthChange={segmentation.setMaxChunkLength} + onOverlapChange={segmentation.setOverlap} + onRuleToggle={segmentation.toggleRule} + onDocFormChange={handleDocFormChange} + onDocLanguageChange={setDocLanguage} + onPreview={updatePreview} + onReset={segmentation.resetToDefaults} + locale={locale} + /> )} - { - ( - (isInUpload && currentDataset!.doc_form === ChunkingMode.parentChild) - || isUploadInEmptyDataset - || isInInit - ) - && ( - } - effectImg={BlueEffect.src} - className="text-util-colors-blue-light-blue-light-500" - activeHeaderClassName="bg-dataset-option-card-blue-gradient" - description={t('stepTwo.parentChildTip', { ns: 'datasetCreation' })} - isActive={currentDocForm === ChunkingMode.parentChild} - onSwitched={() => handleChangeDocform(ChunkingMode.parentChild)} - actions={( - <> - - - - )} - noHighlight={isInUpload && isNotUploadInEmptyDataset} - > -
-
-
-
- {t('stepTwo.parentChunkForContext', { ns: 'datasetCreation' })} -
- -
- } - title={t('stepTwo.paragraph', { ns: 'datasetCreation' })} - description={t('stepTwo.paragraphTip', { ns: 'datasetCreation' })} - isChosen={parentChildConfig.chunkForContext === 'paragraph'} - onChosen={() => setParentChildConfig( - { - ...parentChildConfig, - chunkForContext: 'paragraph', - }, - )} - chosenConfig={( -
- setParentChildConfig({ - ...parentChildConfig, - parent: { - ...parentChildConfig.parent, - delimiter: e.target.value ? escape(e.target.value) : '', - }, - })} - /> - setParentChildConfig({ - ...parentChildConfig, - parent: { - ...parentChildConfig.parent, - maxLength: value, - }, - })} - /> -
- )} - /> - } - title={t('stepTwo.fullDoc', { ns: 'datasetCreation' })} - description={t('stepTwo.fullDocTip', { ns: 'datasetCreation' })} - onChosen={() => setParentChildConfig( - { - ...parentChildConfig, - chunkForContext: 'full-doc', - }, - )} - isChosen={parentChildConfig.chunkForContext === 'full-doc'} - /> -
- -
-
-
- {t('stepTwo.childChunkForRetrieval', { ns: 'datasetCreation' })} -
- -
-
- setParentChildConfig({ - ...parentChildConfig, - child: { - ...parentChildConfig.child, - delimiter: e.target.value ? escape(e.target.value) : '', - }, - })} - /> - setParentChildConfig({ - ...parentChildConfig, - child: { - ...parentChildConfig.child, - maxLength: value, - }, - })} - /> -
-
-
-
-
- {t('stepTwo.rules', { ns: 'datasetCreation' })} -
- -
-
- {rules.map(rule => ( -
{ - ruleChangeHandle(rule.id) - }} - > - - -
- ))} -
-
-
-
- ) - } - -
{t('stepTwo.indexMode', { ns: 'datasetCreation' })}
-
- {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && ( - - {t('stepTwo.qualified', { ns: 'datasetCreation' })} - - {t('stepTwo.recommend', { ns: 'datasetCreation' })} - - - {!hasSetIndexType && } - -
- )} - description={t('stepTwo.qualifiedTip', { ns: 'datasetCreation' })} - icon={} - isActive={!hasSetIndexType && indexType === IndexingType.QUALIFIED} - disabled={hasSetIndexType} - onSwitched={() => { - setIndexType(IndexingType.QUALIFIED) - }} - /> - )} - - {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && ( - <> - setIsQAConfirmDialogOpen(false)} className="w-[432px]"> -
-

- {t('stepTwo.qaSwitchHighQualityTipTitle', { ns: 'datasetCreation' })} -

-

- {t('stepTwo.qaSwitchHighQualityTipContent', { ns: 'datasetCreation' })} -

-
-
- - -
-
- - { - docForm === ChunkingMode.qa - ? t('stepTwo.notAvailableForQA', { ns: 'datasetCreation' }) - : t('stepTwo.notAvailableForParentChild', { ns: 'datasetCreation' }) - } -
- )} - noDecoration - position="top" - asChild={false} - triggerClassName="flex-1 self-stretch" - > - } - isActive={!hasSetIndexType && indexType === IndexingType.ECONOMICAL} - disabled={hasSetIndexType || docForm !== ChunkingMode.text} - onSwitched={() => { - setIndexType(IndexingType.ECONOMICAL) - }} - /> - - - )} -
- {!hasSetIndexType && indexType === IndexingType.QUALIFIED && ( -
-
-
- -
- {t('stepTwo.highQualityTip', { ns: 'datasetCreation' })} -
- )} - {hasSetIndexType && indexType === IndexingType.ECONOMICAL && ( -
- {t('stepTwo.indexSettingTip', { ns: 'datasetCreation' })} - {t('stepTwo.datasetSettingLink', { ns: 'datasetCreation' })} -
- )} - {/* Embedding model */} - {indexType === IndexingType.QUALIFIED && ( -
-
{t('form.embeddingModel', { ns: 'datasetSettings' })}
- { - setEmbeddingModel(model) - }} - /> - {isModelAndRetrievalConfigDisabled && ( -
- {t('stepTwo.indexSettingTip', { ns: 'datasetCreation' })} - {t('stepTwo.datasetSettingLink', { ns: 'datasetCreation' })} -
- )} -
+ {showParentChildOption && ( + segmentation.updateParentConfig('delimiter', v)} + onParentMaxLengthChange={v => segmentation.updateParentConfig('maxLength', v)} + onChildDelimiterChange={v => segmentation.updateChildConfig('delimiter', v)} + onChildMaxLengthChange={v => segmentation.updateChildConfig('maxLength', v)} + onRuleToggle={segmentation.toggleRule} + onPreview={updatePreview} + onReset={segmentation.resetToDefaults} + /> )} - {/* Retrieval Method Config */} -
- {!isModelAndRetrievalConfigDisabled - ? ( -
-
{t('form.retrievalSetting.title', { ns: 'datasetSettings' })}
-
- - {t('form.retrievalSetting.learnMore', { ns: 'datasetSettings' })} - - {t('form.retrievalSetting.longDescription', { ns: 'datasetSettings' })} -
-
- ) - : ( -
-
{t('form.retrievalSetting.title', { ns: 'datasetSettings' })}
-
- )} - -
- { - getIndexing_technique() === IndexingType.QUALIFIED - ? ( - - ) - : ( - - ) - } -
-
- - {!isSetting - ? ( -
- - -
- ) - : ( -
- - -
- )} + setIsQAConfirmDialogOpen(false)} + onQAConfirmDialogConfirm={handleQAConfirm} + /> + onStepChange?.(-1)} onCreate={handleCreate} onCancel={onCancel} /> - - -
- {dataSourceType === DataSourceType.FILE - && ( - >} - onChange={(selected) => { - currentEstimateMutation.reset() - setPreviewFile(selected) - currentEstimateMutation.mutate() - }} - // when it is from setting, it just has one file - value={isSetting ? (files[0]! as Required) : previewFile} - /> - )} - {dataSourceType === DataSourceType.NOTION - && ( - ({ - id: page.page_id, - name: page.page_name, - extension: 'md', - })) - } - onChange={(selected) => { - currentEstimateMutation.reset() - const selectedPage = notionPages.find(page => page.page_id === selected.id) - setPreviewNotionPage(selectedPage!) - currentEstimateMutation.mutate() - }} - value={{ - id: previewNotionPage?.page_id || '', - name: previewNotionPage?.page_name || '', - extension: 'md', - }} - /> - )} - {dataSourceType === DataSourceType.WEB - && ( - ({ - id: page.source_url, - name: page.title, - extension: 'md', - })) - } - onChange={(selected) => { - currentEstimateMutation.reset() - const selectedPage = websitePages.find(page => page.source_url === selected.id) - setPreviewWebsitePage(selectedPage!) - currentEstimateMutation.mutate() - }} - value={ - { - id: previewWebsitePage?.source_url || '', - name: previewWebsitePage?.title || '', - extension: 'md', - } - } - /> - )} - { - currentDocForm !== ChunkingMode.qa - && ( - - ) - } -
- - )} - className={cn('relative flex h-full w-1/2 shrink-0 p-4 pr-0', isMobile && 'w-full max-w-[524px]')} - mainClassName="space-y-6" - > - {currentDocForm === ChunkingMode.qa && estimate?.qa_preview && ( - estimate?.qa_preview.map((item, index) => ( - - - - )) - )} - {currentDocForm === ChunkingMode.text && estimate?.preview && ( - estimate?.preview.map((item, index) => ( - - {item.content} - - )) - )} - {currentDocForm === ChunkingMode.parentChild && currentEstimateMutation.data?.preview && ( - estimate?.preview?.map((item, index) => { - const indexForLabel = index + 1 - const childChunks = parentChildConfig.chunkForContext === 'full-doc' - ? item.child_chunks.slice(0, FULL_DOC_PREVIEW_LENGTH) - : item.child_chunks - return ( - - - {childChunks.map((child, index) => { - const indexForLabel = index + 1 - return ( - - ) - })} - - - ) - }) - )} - {currentEstimateMutation.isIdle && ( -
-
- -

- {t('stepTwo.previewChunkTip', { ns: 'datasetCreation' })} -

-
-
- )} - {currentEstimateMutation.isPending && ( -
- {Array.from({ length: 10 }, (_, i) => ( - - - - - - - - - - - ))} -
- )} -
-
+ } + pickerValue={preview.getPreviewPickerValue()} + isIdle={estimateHook.isIdle} + isPending={estimateHook.isPending} + onPickerChange={handlePickerChange} + /> ) } diff --git a/web/app/components/datasets/create/step-two/types.ts b/web/app/components/datasets/create/step-two/types.ts new file mode 100644 index 0000000000..7f5291fb13 --- /dev/null +++ b/web/app/components/datasets/create/step-two/types.ts @@ -0,0 +1,28 @@ +import type { IndexingType } from './hooks' +import type { DataSourceProvider, NotionPage } from '@/models/common' +import type { CrawlOptions, CrawlResultItem, createDocumentResponse, CustomFile, DataSourceType, FullDocumentDetail } from '@/models/datasets' +import type { RETRIEVE_METHOD } from '@/types/app' + +export type StepTwoProps = { + isSetting?: boolean + documentDetail?: FullDocumentDetail + isAPIKeySet: boolean + onSetting: () => void + datasetId?: string + indexingType?: IndexingType + retrievalMethod?: string + dataSourceType: DataSourceType + files: CustomFile[] + notionPages?: NotionPage[] + notionCredentialId: string + websitePages?: CrawlResultItem[] + crawlOptions?: CrawlOptions + websiteCrawlProvider?: DataSourceProvider + websiteCrawlJobId?: string + onStepChange?: (delta: number) => void + updateIndexingTypeCache?: (type: string) => void + updateRetrievalMethodCache?: (method: RETRIEVE_METHOD | '') => void + updateResultCache?: (res: createDocumentResponse) => void + onSave?: () => void + onCancel?: () => void +}