mirror of
https://github.com/langgenius/dify.git
synced 2026-05-13 08:57:28 +08:00
Signed-off-by: majiayu000 <1835304752@qq.com> Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: NeatGuyCoding <15627489+NeatGuyCoding@users.noreply.github.com> Signed-off-by: -LAN- <laipz8200@outlook.com> Signed-off-by: yihong0618 <zouzou0208@gmail.com> Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com> Co-authored-by: 盐粒 Yanli <yanli@dify.ai> Co-authored-by: wangxiaolei <fatelei@gmail.com> Co-authored-by: Stephen Zhou <38493346+hyoban@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cursx <33718736+Cursx@users.noreply.github.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: lif <1835304752@qq.com> Co-authored-by: 非法操作 <hjlarry@163.com> Co-authored-by: Asuka Minato <i@asukaminato.eu.org> Co-authored-by: fenglin <790872612@qq.com> Co-authored-by: qiaofenglin <qiaofenglin@baidu.com> Co-authored-by: -LAN- <laipz8200@outlook.com> Co-authored-by: TomoOkuyama <49631611+TomoOkuyama@users.noreply.github.com> Co-authored-by: Tomo Okuyama <tomo.okuyama@intersystems.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: zyssyz123 <916125788@qq.com> Co-authored-by: hj24 <mambahj24@gmail.com> Co-authored-by: Coding On Star <447357187@qq.com> Co-authored-by: CodingOnStar <hanxujiang@dify.ai> Co-authored-by: yyh <92089059+lyzno1@users.noreply.github.com> Co-authored-by: Xiangxuan Qu <fghpdf@outlook.com> Co-authored-by: fghpdf <fghpdf@users.noreply.github.com> Co-authored-by: coopercoder <whitetiger0127@163.com> Co-authored-by: zhaiguangpeng <zhaiguangpeng@didiglobal.com> Co-authored-by: Junyan Qin (Chin) <rockchinq@gmail.com> Co-authored-by: E.G <146701565+GlobalStar117@users.noreply.github.com> Co-authored-by: GlobalStar117 <GlobalStar117@users.noreply.github.com> Co-authored-by: Claude Haiku 4.5 <noreply@anthropic.com> Co-authored-by: CodingOnStar 
<hanxujiang@dify.com> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com> Co-authored-by: heyszt <270985384@qq.com> Co-authored-by: NeatGuyCoding <15627489+NeatGuyCoding@users.noreply.github.com> Co-authored-by: Yeuoly <45712896+Yeuoly@users.noreply.github.com> Co-authored-by: zxhlyh <jasonapring2015@outlook.com> Co-authored-by: moonpanda <chuanzegao@163.com> Co-authored-by: warlocgao <warlocgao@tencent.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com> Co-authored-by: KVOJJJin <jzongcode@gmail.com> Co-authored-by: eux <euxx@users.noreply.github.com> Co-authored-by: bangjiehan <bangjiehan@gmail.com> Co-authored-by: FFXN <31929997+FFXN@users.noreply.github.com> Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com> Co-authored-by: Nie Ronghua <nieronghua@sf-express.com> Co-authored-by: JQSevenMiao <141806521+JQSevenMiao@users.noreply.github.com> Co-authored-by: jiasiqi <jiasiqi3@tal.com> Co-authored-by: Seokrin Taron Sung <sungsjade@gmail.com> Co-authored-by: CrabSAMA <40541269+CrabSAMA@users.noreply.github.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: yihong <zouzou0208@gmail.com> Co-authored-by: Joel <iamjoel007@gmail.com> Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com> Co-authored-by: yessenia <yessenia.contact@gmail.com> Co-authored-by: Jax <anobaka@qq.com> Co-authored-by: niveshdandyan <155956228+niveshdandyan@users.noreply.github.com> Co-authored-by: OSS Contributor <oss-contributor@example.com> Co-authored-by: niveshdandyan <niveshdandyan@users.noreply.github.com> Co-authored-by: Sean Kenneth Doherty <Smaster7772@gmail.com>
284 lines
8.2 KiB
TypeScript
284 lines
8.2 KiB
TypeScript
import type { DefaultModel, Model } from '@/app/components/header/account-setting/model-provider-page/declarations'
|
|
import type { NotionPage } from '@/models/common'
|
|
import type {
|
|
ChunkingMode,
|
|
CrawlOptions,
|
|
CrawlResultItem,
|
|
CreateDocumentReq,
|
|
createDocumentResponse,
|
|
CustomFile,
|
|
FullDocumentDetail,
|
|
ProcessRule,
|
|
SummaryIndexSetting as SummaryIndexSettingType,
|
|
} from '@/models/datasets'
|
|
import type { RetrievalConfig, RETRIEVE_METHOD } from '@/types/app'
|
|
import { useCallback } from 'react'
|
|
import { useTranslation } from 'react-i18next'
|
|
import { trackEvent } from '@/app/components/base/amplitude'
|
|
import Toast from '@/app/components/base/toast'
|
|
import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
|
|
import { DataSourceProvider } from '@/models/common'
|
|
import {
|
|
DataSourceType,
|
|
} from '@/models/datasets'
|
|
import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument } from '@/service/knowledge/use-create-dataset'
|
|
import { useInvalidDatasetList } from '@/service/knowledge/use-dataset'
|
|
import { IndexingType } from './use-indexing-config'
|
|
import { MAXIMUM_CHUNK_TOKEN_LENGTH } from './use-segmentation-state'
|
|
|
|
/**
 * Options accepted by `useDocumentCreation`.
 *
 * Describes the selected data source and the optional callbacks the hook
 * invokes once a creation request has completed.
 */
export type UseDocumentCreationOptions = {
  /** Target dataset id. When falsy, `executeCreation` uses the create-first-document mutation instead. */
  datasetId?: string
  /**
   * When truthy, `buildCreationParams` emits a request keyed by
   * `original_document_id`, `validateParams` skips the embedding/rerank model
   * checks, and `executeCreation` calls `onSave` at the end.
   */
  isSetting?: boolean
  /** Supplies `original_document_id` (its `id`) when `isSetting` is truthy. */
  documentDetail?: FullDocumentDetail
  /** Sent as the data source type and selects which `info_list` entry is populated. */
  dataSourceType: DataSourceType
  /** Files whose non-empty `id`s are sent as `file_ids` when the data source is `DataSourceType.FILE`. */
  files: CustomFile[]
  /** Passed to `getNotionInfo` when the data source is `DataSourceType.NOTION`. */
  notionPages: NotionPage[]
  /** Passed to `getNotionInfo` together with `notionPages`. */
  notionCredentialId: string
  /** Passed to `getWebsiteInfo` when the data source is `DataSourceType.WEB`. */
  websitePages: CrawlResultItem[]
  /** Passed to `getWebsiteInfo` together with `websitePages`. */
  crawlOptions?: CrawlOptions
  /** Passed to `getWebsiteInfo`; the hook defaults it to `DataSourceProvider.jinaReader`. */
  websiteCrawlProvider?: DataSourceProvider
  /** Passed to `getWebsiteInfo`; the hook defaults it to `''`. */
  websiteCrawlJobId?: string
  // Callbacks
  /** Called with `+1` after the creation request has resolved. */
  onStepChange?: (delta: number) => void
  /** Called from the mutation's `onSuccess` with the index type passed to `executeCreation`. */
  updateIndexingTypeCache?: (type: string) => void
  /** Called from the mutation's `onSuccess` with the mutation result. */
  updateResultCache?: (res: createDocumentResponse) => void
  /** Called from the mutation's `onSuccess` with `retrievalConfig.search_method`. */
  updateRetrievalMethodCache?: (method: RETRIEVE_METHOD | '') => void
  /** Called at the end of `executeCreation`, only when `isSetting` is truthy. */
  onSave?: () => void
  /** Called after the creation request has resolved, before the dataset list is invalidated. */
  mutateDatasetRes?: () => void
}
|
|
|
|
/**
 * Input for the `validateParams` callback returned by `useDocumentCreation`.
 */
export type ValidationParams = {
  /** The chunk-length and overlap checks only run when this equals `'general'`. */
  segmentationType: string
  /** Requested chunk length; must not exceed `limitMaxChunkLength` in `'general'` mode. */
  maxChunkLength: number
  /** Upper bound for `maxChunkLength`; also interpolated into the error message as `limit`. */
  limitMaxChunkLength: number
  /** Chunk overlap; must not exceed `maxChunkLength` in `'general'` mode. */
  overlap: number
  /** When `IndexingType.QUALIFIED`, an embedding model and provider are required (unless editing settings). */
  indexType: IndexingType
  /** Embedding model selection; `model` and `provider` are checked for truthiness. */
  embeddingModel: DefaultModel
  /** Forwarded to `isReRankModelSelected`. */
  rerankModelList: Model[]
  /** Forwarded to `isReRankModelSelected`. */
  retrievalConfig: RetrievalConfig
}
|
|
|
|
/**
 * Hook bundling the validation, request-building and submission steps used
 * when creating a document or re-saving a document's settings.
 *
 * @param options - Data-source selection plus optional callbacks.
 * @returns `isCreating` and the callbacks `validateParams`,
 * `buildCreationParams`, `executeCreation` and `validatePreviewParams`.
 */
export const useDocumentCreation = (options: UseDocumentCreationOptions) => {
  const { t } = useTranslation()
  const {
    datasetId,
    isSetting,
    documentDetail,
    dataSourceType,
    files,
    notionPages,
    notionCredentialId,
    websitePages,
    crawlOptions,
    websiteCrawlProvider = DataSourceProvider.jinaReader,
    websiteCrawlJobId = '',
    onStepChange,
    updateIndexingTypeCache,
    updateResultCache,
    updateRetrievalMethodCache,
    onSave,
    mutateDatasetRes,
  } = options

  const createFirstDocumentMutation = useCreateFirstDocument()
  // Hooks must be called unconditionally, so this mutation is created even when
  // `datasetId` is undefined; its `mutateAsync` is only invoked when `datasetId` is truthy.
  const createDocumentMutation = useCreateDocument(datasetId!)
  const invalidDatasetList = useInvalidDatasetList()

  // True while either of the two mutations is pending.
  const isCreating = createFirstDocumentMutation.isPending || createDocumentMutation.isPending

  /**
   * Validate creation params.
   * Shows an error toast for the first failing check and returns `false`;
   * returns `true` when every applicable check passes.
   */
  const validateParams = useCallback((params: ValidationParams): boolean => {
    const {
      segmentationType,
      maxChunkLength,
      limitMaxChunkLength,
      overlap,
      indexType,
      embeddingModel,
      rerankModelList,
      retrievalConfig,
    } = params

    // Chunk-size constraints only apply to the 'general' segmentation type.
    const isGeneral = segmentationType === 'general'

    if (isGeneral && overlap > maxChunkLength) {
      Toast.notify({ type: 'error', message: t('stepTwo.overlapCheck', { ns: 'datasetCreation' }) })
      return false
    }

    if (isGeneral && maxChunkLength > limitMaxChunkLength) {
      Toast.notify({
        type: 'error',
        message: t('stepTwo.maxLengthCheck', { ns: 'datasetCreation', limit: limitMaxChunkLength }),
      })
      return false
    }

    // Model checks are skipped when editing the settings of an existing document.
    if (isSetting)
      return true

    if (indexType === IndexingType.QUALIFIED && (!embeddingModel.model || !embeddingModel.provider)) {
      Toast.notify({
        type: 'error',
        message: t('datasetConfig.embeddingModelRequired', { ns: 'appDebug' }),
      })
      return false
    }

    const rerankModelSelected = isReRankModelSelected({
      rerankModelList,
      retrievalConfig,
      indexMethod: indexType,
    })
    if (!rerankModelSelected) {
      Toast.notify({ type: 'error', message: t('datasetConfig.rerankModelRequired', { ns: 'appDebug' }) })
      return false
    }

    return true
  }, [t, isSetting])

  /**
   * Build creation params.
   * In settings mode the request references the existing document through
   * `original_document_id`; otherwise it carries a `data_source` section for
   * the selected data source type.
   *
   * The current implementation always returns a request object; `null` stays
   * in the signature so existing callers keep type-checking.
   */
  const buildCreationParams = useCallback((
    currentDocForm: ChunkingMode,
    docLanguage: string,
    processRule: ProcessRule,
    retrievalConfig: RetrievalConfig,
    embeddingModel: DefaultModel,
    indexingTechnique: string,
    summaryIndexSetting?: SummaryIndexSettingType,
  ): CreateDocumentReq | null => {
    if (isSetting) {
      return {
        original_document_id: documentDetail?.id,
        doc_form: currentDocForm,
        doc_language: docLanguage,
        process_rule: processRule,
        summary_index_setting: summaryIndexSetting,
        retrieval_model: retrievalConfig,
        embedding_model: embeddingModel.model,
        embedding_model_provider: embeddingModel.provider,
        indexing_technique: indexingTechnique,
      } as CreateDocumentReq
    }

    const params: CreateDocumentReq = {
      data_source: {
        type: dataSourceType,
        info_list: {
          data_source_type: dataSourceType,
        },
      },
      indexing_technique: indexingTechnique,
      process_rule: processRule,
      summary_index_setting: summaryIndexSetting,
      doc_form: currentDocForm,
      doc_language: docLanguage,
      retrieval_model: retrievalConfig,
      embedding_model: embeddingModel.model,
      embedding_model_provider: embeddingModel.provider,
    } as CreateDocumentReq

    // Add data source specific info
    if (dataSourceType === DataSourceType.FILE) {
      params.data_source!.info_list.file_info_list = {
        // Files without an id are dropped.
        file_ids: files.map(file => file.id || '').filter(Boolean),
      }
    }
    if (dataSourceType === DataSourceType.NOTION)
      params.data_source!.info_list.notion_info_list = getNotionInfo(notionPages, notionCredentialId)

    if (dataSourceType === DataSourceType.WEB) {
      params.data_source!.info_list.website_info_list = getWebsiteInfo({
        websiteCrawlProvider,
        websiteCrawlJobId,
        websitePages,
        crawlOptions,
      })
    }

    return params
  }, [
    isSetting,
    documentDetail,
    dataSourceType,
    files,
    notionPages,
    notionCredentialId,
    websitePages,
    websiteCrawlProvider,
    websiteCrawlJobId,
    crawlOptions,
  ])

  /**
   * Execute creation.
   * Submits `params` through the create-document mutation when `datasetId` is
   * set and through the create-first-document mutation otherwise. Once the
   * request resolves it refreshes dataset data, tracks the event and advances
   * one step. A rejected mutation propagates to the caller and skips those
   * follow-up actions.
   */
  const executeCreation = useCallback(async (
    params: CreateDocumentReq,
    indexType: IndexingType,
    retrievalConfig: RetrievalConfig,
  ) => {
    // Both mutations handle success identically, so they share one options object.
    const mutateOptions = {
      onSuccess(data: createDocumentResponse) {
        updateIndexingTypeCache?.(indexType)
        updateResultCache?.(data)
        updateRetrievalMethodCache?.(retrievalConfig.search_method as RETRIEVE_METHOD)
      },
    }

    if (datasetId)
      await createDocumentMutation.mutateAsync(params, mutateOptions)
    else
      await createFirstDocumentMutation.mutateAsync(params, mutateOptions)

    mutateDatasetRes?.()
    invalidDatasetList()

    trackEvent('create_datasets', {
      data_source_type: dataSourceType,
      indexing_technique: indexType,
    })

    onStepChange?.(+1)

    if (isSetting)
      onSave?.()
  }, [
    datasetId,
    createFirstDocumentMutation,
    createDocumentMutation,
    updateIndexingTypeCache,
    updateResultCache,
    updateRetrievalMethodCache,
    mutateDatasetRes,
    invalidDatasetList,
    dataSourceType,
    onStepChange,
    isSetting,
    onSave,
  ])

  /**
   * Validate preview params.
   * Shows an error toast and returns `false` when `maxChunkLength` is greater
   * than `MAXIMUM_CHUNK_TOKEN_LENGTH`; returns `true` otherwise.
   */
  const validatePreviewParams = useCallback((maxChunkLength: number): boolean => {
    if (maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) {
      Toast.notify({
        type: 'error',
        message: t('stepTwo.maxLengthCheck', { ns: 'datasetCreation', limit: MAXIMUM_CHUNK_TOKEN_LENGTH }),
      })
      return false
    }
    return true
  }, [t])

  return {
    isCreating,
    validateParams,
    buildCreationParams,
    executeCreation,
    validatePreviewParams,
  }
}
|
|
|
|
/** Shape of the object returned by `useDocumentCreation`, derived via `ReturnType` so it stays in sync with the hook. */
export type DocumentCreation = ReturnType<typeof useDocumentCreation>
|