dify/web/app/components/datasets/create/step-two/hooks/use-document-creation.ts

280 lines
8.0 KiB
TypeScript

import type { DefaultModel, Model } from '@/app/components/header/account-setting/model-provider-page/declarations'
import type { NotionPage } from '@/models/common'
import type {
ChunkingMode,
CrawlOptions,
CrawlResultItem,
CreateDocumentReq,
createDocumentResponse,
CustomFile,
FullDocumentDetail,
ProcessRule,
} from '@/models/datasets'
import type { RetrievalConfig, RETRIEVE_METHOD } from '@/types/app'
import { useCallback } from 'react'
import { useTranslation } from 'react-i18next'
import { trackEvent } from '@/app/components/base/amplitude'
import Toast from '@/app/components/base/toast'
import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
import { DataSourceProvider } from '@/models/common'
import {
DataSourceType,
} from '@/models/datasets'
import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument } from '@/service/knowledge/use-create-dataset'
import { useInvalidDatasetList } from '@/service/knowledge/use-dataset'
import { IndexingType } from './use-indexing-config'
import { MAXIMUM_CHUNK_TOKEN_LENGTH } from './use-segmentation-state'
export type UseDocumentCreationOptions = {
datasetId?: string
isSetting?: boolean
documentDetail?: FullDocumentDetail
dataSourceType: DataSourceType
files: CustomFile[]
notionPages: NotionPage[]
notionCredentialId: string
websitePages: CrawlResultItem[]
crawlOptions?: CrawlOptions
websiteCrawlProvider?: DataSourceProvider
websiteCrawlJobId?: string
// Callbacks
onStepChange?: (delta: number) => void
updateIndexingTypeCache?: (type: string) => void
updateResultCache?: (res: createDocumentResponse) => void
updateRetrievalMethodCache?: (method: RETRIEVE_METHOD | '') => void
onSave?: () => void
mutateDatasetRes?: () => void
}
export type ValidationParams = {
segmentationType: string
maxChunkLength: number
limitMaxChunkLength: number
overlap: number
indexType: IndexingType
embeddingModel: DefaultModel
rerankModelList: Model[]
retrievalConfig: RetrievalConfig
}
export const useDocumentCreation = (options: UseDocumentCreationOptions) => {
const { t } = useTranslation()
const {
datasetId,
isSetting,
documentDetail,
dataSourceType,
files,
notionPages,
notionCredentialId,
websitePages,
crawlOptions,
websiteCrawlProvider = DataSourceProvider.jinaReader,
websiteCrawlJobId = '',
onStepChange,
updateIndexingTypeCache,
updateResultCache,
updateRetrievalMethodCache,
onSave,
mutateDatasetRes,
} = options
const createFirstDocumentMutation = useCreateFirstDocument()
const createDocumentMutation = useCreateDocument(datasetId!)
const invalidDatasetList = useInvalidDatasetList()
const isCreating = createFirstDocumentMutation.isPending || createDocumentMutation.isPending
// Validate creation params
const validateParams = useCallback((params: ValidationParams): boolean => {
const {
segmentationType,
maxChunkLength,
limitMaxChunkLength,
overlap,
indexType,
embeddingModel,
rerankModelList,
retrievalConfig,
} = params
if (segmentationType === 'general' && overlap > maxChunkLength) {
Toast.notify({ type: 'error', message: t('stepTwo.overlapCheck', { ns: 'datasetCreation' }) })
return false
}
if (segmentationType === 'general' && maxChunkLength > limitMaxChunkLength) {
Toast.notify({
type: 'error',
message: t('stepTwo.maxLengthCheck', { ns: 'datasetCreation', limit: limitMaxChunkLength }),
})
return false
}
if (!isSetting) {
if (indexType === IndexingType.QUALIFIED && (!embeddingModel.model || !embeddingModel.provider)) {
Toast.notify({
type: 'error',
message: t('datasetConfig.embeddingModelRequired', { ns: 'appDebug' }),
})
return false
}
if (!isReRankModelSelected({
rerankModelList,
retrievalConfig,
indexMethod: indexType,
})) {
Toast.notify({ type: 'error', message: t('datasetConfig.rerankModelRequired', { ns: 'appDebug' }) })
return false
}
}
return true
}, [t, isSetting])
// Build creation params
const buildCreationParams = useCallback((
currentDocForm: ChunkingMode,
docLanguage: string,
processRule: ProcessRule,
retrievalConfig: RetrievalConfig,
embeddingModel: DefaultModel,
indexingTechnique: string,
): CreateDocumentReq | null => {
if (isSetting) {
return {
original_document_id: documentDetail?.id,
doc_form: currentDocForm,
doc_language: docLanguage,
process_rule: processRule,
retrieval_model: retrievalConfig,
embedding_model: embeddingModel.model,
embedding_model_provider: embeddingModel.provider,
indexing_technique: indexingTechnique,
} as CreateDocumentReq
}
const params: CreateDocumentReq = {
data_source: {
type: dataSourceType,
info_list: {
data_source_type: dataSourceType,
},
},
indexing_technique: indexingTechnique,
process_rule: processRule,
doc_form: currentDocForm,
doc_language: docLanguage,
retrieval_model: retrievalConfig,
embedding_model: embeddingModel.model,
embedding_model_provider: embeddingModel.provider,
} as CreateDocumentReq
// Add data source specific info
if (dataSourceType === DataSourceType.FILE) {
params.data_source!.info_list.file_info_list = {
file_ids: files.map(file => file.id || '').filter(Boolean),
}
}
if (dataSourceType === DataSourceType.NOTION)
params.data_source!.info_list.notion_info_list = getNotionInfo(notionPages, notionCredentialId)
if (dataSourceType === DataSourceType.WEB) {
params.data_source!.info_list.website_info_list = getWebsiteInfo({
websiteCrawlProvider,
websiteCrawlJobId,
websitePages,
crawlOptions,
})
}
return params
}, [
isSetting,
documentDetail,
dataSourceType,
files,
notionPages,
notionCredentialId,
websitePages,
websiteCrawlProvider,
websiteCrawlJobId,
crawlOptions,
])
// Execute creation
const executeCreation = useCallback(async (
params: CreateDocumentReq,
indexType: IndexingType,
retrievalConfig: RetrievalConfig,
) => {
if (!datasetId) {
await createFirstDocumentMutation.mutateAsync(params, {
onSuccess(data) {
updateIndexingTypeCache?.(indexType)
updateResultCache?.(data)
updateRetrievalMethodCache?.(retrievalConfig.search_method as RETRIEVE_METHOD)
},
})
}
else {
await createDocumentMutation.mutateAsync(params, {
onSuccess(data) {
updateIndexingTypeCache?.(indexType)
updateResultCache?.(data)
updateRetrievalMethodCache?.(retrievalConfig.search_method as RETRIEVE_METHOD)
},
})
}
mutateDatasetRes?.()
invalidDatasetList()
trackEvent('create_datasets', {
data_source_type: dataSourceType,
indexing_technique: indexType,
})
onStepChange?.(+1)
if (isSetting)
onSave?.()
}, [
datasetId,
createFirstDocumentMutation,
createDocumentMutation,
updateIndexingTypeCache,
updateResultCache,
updateRetrievalMethodCache,
mutateDatasetRes,
invalidDatasetList,
dataSourceType,
onStepChange,
isSetting,
onSave,
])
// Validate preview params
const validatePreviewParams = useCallback((maxChunkLength: number): boolean => {
if (maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) {
Toast.notify({
type: 'error',
message: t('stepTwo.maxLengthCheck', { ns: 'datasetCreation', limit: MAXIMUM_CHUNK_TOKEN_LENGTH }),
})
return false
}
return true
}, [t])
return {
isCreating,
validateParams,
buildCreationParams,
executeCreation,
validatePreviewParams,
}
}
export type DocumentCreation = ReturnType<typeof useDocumentCreation>