From dfdc4ed3b118f1d4616f1805ac513fc25a598a9f Mon Sep 17 00:00:00 2001 From: AkaraChen Date: Tue, 3 Dec 2024 15:23:51 +0800 Subject: [PATCH] refactor: step 2 --- .../datasets/create/step-two/index.tsx | 255 +++++++----------- web/service/use-datasets.ts | 15 +- 2 files changed, 114 insertions(+), 156 deletions(-) diff --git a/web/app/components/datasets/create/step-two/index.tsx b/web/app/components/datasets/create/step-two/index.tsx index 7bcb0f96d3..c6e5e82194 100644 --- a/web/app/components/datasets/create/step-two/index.tsx +++ b/web/app/components/datasets/create/step-two/index.tsx @@ -9,7 +9,6 @@ import { RiSearchEyeLine, } from '@remixicon/react' import Link from 'next/link' -import { groupBy } from 'lodash-es' import Image from 'next/image' import SettingCog from '../assets/setting-gear-mod.svg' import OrangeEffect from '../assets/option-card-effect-orange.svg' @@ -17,23 +16,21 @@ import FamilyMod from '../assets/family-mod.svg' import Note from '../assets/note-mod.svg' import FileList from '../assets/file-list-3-fill.svg' import { indexMethodIcon } from '../icons' -import PreviewItem, { PreviewType } from './preview-item' import s from './index.module.css' import unescape from './unescape' import escape from './escape' import { OptionCard } from './option-card' import LanguageSelect from './language-select' import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs' +import PreviewItem, { PreviewType } from './preview-item' import cn from '@/utils/classnames' -import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets' +import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FullDocumentDetail, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets' import { createDocument, createFirstDocument, - fetchFileIndexingEstimate as didFetchFileIndexingEstimate, fetchDefaultProcessRule, } from '@/service/datasets' import Button from '@/app/components/base/button' -import Loading from '@/app/components/base/loading' import FloatRightContainer from '@/app/components/base/float-right-container' import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config' import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config' @@ -58,6 +55,8 @@ import { MessageChatSquare } from '@/app/components/base/icons/src/public/common import { IS_CE_EDITION } from '@/config' import Switch from '@/app/components/base/switch' import Divider from '@/app/components/base/divider' +import { getNotionInfo, getWebsiteInfo, useFetchFileIndexingEstimateForFile, useFetchFileIndexingEstimateForNotion, useFetchFileIndexingEstimateForWeb } from '@/service/use-datasets' +import Loading from '@/app/components/base/loading' const TextLabel: FC = (props) => { return @@ -87,7 +86,7 @@ type StepTwoProps = { onCancel?: () => void } -enum SegmentType { +export enum SegmentType { AUTO = 'automatic', CUSTOM = 'custom', } @@ -176,17 +175,92 @@ const StepTwo = ({ ) const [QATipHide, setQATipHide] = useState(false) const [previewSwitched, setPreviewSwitched] = useState(false) - const [customFileIndexingEstimate, setCustomFileIndexingEstimate] = useState(null) - const [automaticFileIndexingEstimate, setAutomaticFileIndexingEstimate] = useState(null) - - const fileIndexingEstimate = segmentationType === SegmentType.AUTO - ? automaticFileIndexingEstimate - : customFileIndexingEstimate - const [isCreating, setIsCreating] = useState(false) const [parentChildConfig, setParentChildConfig] = useState(defaultParentChildConfig) + const getIndexing_technique = () => indexingType || indexType + + const getProcessRule = () => { + const processRule: ProcessRule = { + rules: {} as any, // api will check this. It will be removed after api refactored. + mode: segmentationType, + } + if (segmentationType === SegmentType.CUSTOM) { + const ruleObj = { + pre_processing_rules: rules, + segmentation: { + separator: unescape(segmentIdentifier), + max_tokens: max, + chunk_overlap: overlap, + }, + } + processRule.rules = ruleObj + } + return processRule + } + + const fileIndexingEstimateQuery = useFetchFileIndexingEstimateForFile({ + docForm: docForm as DocForm, + docLanguage, + dataSourceType: DataSourceType.FILE, + files, + indexingTechnique: getIndexing_technique() as any, + processRule: getProcessRule(), + dataset_id: datasetId!, + }) + const notionIndexingEstimateQuery = useFetchFileIndexingEstimateForNotion({ + docForm: docForm as DocForm, + docLanguage, + dataSourceType: DataSourceType.NOTION, + notionPages, + indexingTechnique: getIndexing_technique() as any, + processRule: getProcessRule(), + dataset_id: datasetId || '', + }) + + const websiteIndexingEstimateQuery = useFetchFileIndexingEstimateForWeb({ + docForm: docForm as DocForm, + docLanguage, + dataSourceType: DataSourceType.WEB, + websitePages, + crawlOptions, + websiteCrawlProvider, + websiteCrawlJobId, + indexingTechnique: getIndexing_technique() as any, + processRule: getProcessRule(), + dataset_id: datasetId || '', + }) + + const fetchEstimate = useCallback(() => { + if (dataSourceType === DataSourceType.FILE) + fileIndexingEstimateQuery.mutate() + + if (dataSourceType === DataSourceType.NOTION) + notionIndexingEstimateQuery.mutate() + + if (dataSourceType === DataSourceType.WEB) + websiteIndexingEstimateQuery.mutate() + }, [dataSourceType, fileIndexingEstimateQuery, notionIndexingEstimateQuery, websiteIndexingEstimateQuery]) + + const estimate + = dataSourceType === DataSourceType.FILE + ? fileIndexingEstimateQuery.data + : dataSourceType === DataSourceType.NOTION + ? notionIndexingEstimateQuery.data + : websiteIndexingEstimateQuery.data + + const getIsEstimateReady = useCallback(() => { + if (dataSourceType === DataSourceType.FILE) + return fileIndexingEstimateQuery.isSuccess + + if (dataSourceType === DataSourceType.NOTION) + return notionIndexingEstimateQuery.isSuccess + + if (dataSourceType === DataSourceType.WEB) + return websiteIndexingEstimateQuery.isSuccess + }, [dataSourceType, fileIndexingEstimateQuery.isSuccess, notionIndexingEstimateQuery.isSuccess, websiteIndexingEstimateQuery.isSuccess]) + const getFileName = (name: string) => { const arr = name.split('.') return arr.slice(0, -1).join('.') @@ -224,122 +298,15 @@ const StepTwo = ({ setParentChildConfig(defaultParentChildConfig) } - const fetchFileIndexingEstimate = async (docForm = DocForm.TEXT, language?: string) => { - // eslint-disable-next-line @typescript-eslint/no-use-before-define - const res = await didFetchFileIndexingEstimate(getFileIndexingEstimateParams(docForm, language)!) - if (segmentationType === SegmentType.CUSTOM) - setCustomFileIndexingEstimate(res) - else - setAutomaticFileIndexingEstimate(res) - } - const updatePreview = () => { if (segmentationType === SegmentType.CUSTOM && max > 4000) { Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') }) return } - setCustomFileIndexingEstimate(null) - fetchFileIndexingEstimate() + fetchEstimate() setPreviewSwitched(false) } - const getIndexing_technique = () => indexingType || indexType - - const getProcessRule = () => { - const processRule: ProcessRule = { - rules: {} as any, // api will check this. It will be removed after api refactored. - mode: segmentationType, - } - if (segmentationType === SegmentType.CUSTOM) { - const ruleObj = { - pre_processing_rules: rules, - segmentation: { - separator: unescape(segmentIdentifier), - max_tokens: max, - chunk_overlap: overlap, - }, - } - processRule.rules = ruleObj - } - return processRule - } - - const getNotionInfo = () => { - const workspacesMap = groupBy(notionPages, 'workspace_id') - const workspaces = Object.keys(workspacesMap).map((workspaceId) => { - return { - workspaceId, - pages: workspacesMap[workspaceId], - } - }) - return workspaces.map((workspace) => { - return { - workspace_id: workspace.workspaceId, - pages: workspace.pages.map((page) => { - const { page_id, page_name, page_icon, type } = page - return { - page_id, - page_name, - page_icon, - type, - } - }), - } - }) as NotionInfo[] - } - - const getWebsiteInfo = () => { - return { - provider: websiteCrawlProvider, - job_id: websiteCrawlJobId, - urls: websitePages.map(page => page.source_url), - only_main_content: crawlOptions?.only_main_content, - } - } - - const getFileIndexingEstimateParams = (docForm: DocForm, language?: string): IndexingEstimateParams | undefined => { - if (dataSourceType === DataSourceType.FILE) { - return { - info_list: { - data_source_type: dataSourceType, - file_info_list: { - file_ids: files.map(file => file.id) as string[], - }, - }, - indexing_technique: getIndexing_technique() as string, - process_rule: getProcessRule(), - doc_form: docForm, - doc_language: language || docLanguage, - dataset_id: datasetId as string, - } - } - if (dataSourceType === DataSourceType.NOTION) { - return { - info_list: { - data_source_type: dataSourceType, - notion_info_list: getNotionInfo(), - }, - indexing_technique: getIndexing_technique() as string, - process_rule: getProcessRule(), - doc_form: docForm, - doc_language: language || docLanguage, - dataset_id: datasetId as string, - } - } - if (dataSourceType === DataSourceType.WEB) { - return { - info_list: { - data_source_type: dataSourceType, - website_info_list: getWebsiteInfo(), - }, - indexing_technique: getIndexing_technique() as string, - process_rule: getProcessRule(), - doc_form: docForm, - doc_language: language || docLanguage, - dataset_id: datasetId as string, - } - } - } const { modelList: rerankModelList, defaultModel: rerankDefaultModel, @@ -423,10 +390,15 @@ const StepTwo = ({ } } if (dataSourceType === DataSourceType.NOTION) - params.data_source.info_list.notion_info_list = getNotionInfo() + params.data_source.info_list.notion_info_list = getNotionInfo(notionPages) - if (dataSourceType === DataSourceType.WEB) - params.data_source.info_list.website_info_list = getWebsiteInfo() + if (dataSourceType === DataSourceType.WEB) { + params.data_source.info_list.website_info_list = getWebsiteInfo({ + websiteCrawlProvider, + websiteCrawlJobId, + websitePages, + }) + } } return params } @@ -519,16 +491,7 @@ const StepTwo = ({ const previewSwitch = async (language?: string) => { setPreviewSwitched(true) setIsLanguageSelectDisabled(true) - if (segmentationType === SegmentType.AUTO) - setAutomaticFileIndexingEstimate(null) - else - setCustomFileIndexingEstimate(null) - try { - await fetchFileIndexingEstimate(DocForm.QA, language) - } - finally { - setIsLanguageSelectDisabled(false) - } + fetchEstimate() } const handleSelect = (language: string) => { @@ -570,18 +533,6 @@ const StepTwo = ({ setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL) }, [isAPIKeySet, indexingType, datasetId]) - useEffect(() => { - if (segmentationType === SegmentType.AUTO) { - setAutomaticFileIndexingEstimate(null) - fetchFileIndexingEstimate() - setPreviewSwitched(false) - } - else { - setCustomFileIndexingEstimate(null) - setPreviewSwitched(false) - } - }, [segmentationType, indexType]) - const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || { search_method: RETRIEVE_METHOD.semantic, reranking_enable: false, @@ -971,26 +922,26 @@ const StepTwo = ({ )}
- {previewSwitched && docForm === DocForm.QA && fileIndexingEstimate?.qa_preview && ( + {previewSwitched && docForm === DocForm.QA && estimate?.qa_preview && ( <> - {fileIndexingEstimate?.qa_preview.map((item, index) => ( + {estimate?.qa_preview.map((item, index) => ( ))} )} - {(docForm === DocForm.TEXT || !previewSwitched) && fileIndexingEstimate?.preview && ( + {(docForm === DocForm.TEXT || !previewSwitched) && estimate?.preview && ( <> - {fileIndexingEstimate?.preview.map((item, index) => ( + {estimate?.preview.map((item, index) => ( ))} )} - {previewSwitched && docForm === DocForm.QA && !fileIndexingEstimate?.qa_preview && ( + {previewSwitched && docForm === DocForm.QA && !estimate?.qa_preview && (
)} - {!previewSwitched && !fileIndexingEstimate?.preview && ( + {!previewSwitched && !estimate?.preview && (
diff --git a/web/service/use-datasets.ts b/web/service/use-datasets.ts index 7ac9635cd4..221e258100 100644 --- a/web/service/use-datasets.ts +++ b/web/service/use-datasets.ts @@ -1,11 +1,12 @@ import groupBy from 'lodash-es/groupBy' +import type { MutationOptions } from '@tanstack/react-query' import { useMutation } from '@tanstack/react-query' import { fetchFileIndexingEstimate } from './datasets' -import type { IndexingType } from '@/app/components/datasets/create/step-two' -import type { CrawlOptions, CrawlResultItem, CustomFile, DataSourceType, DocForm, IndexingEstimateParams, NotionInfo, ProcessRule } from '@/models/datasets' +import { type IndexingType } from '@/app/components/datasets/create/step-two' +import type { CrawlOptions, CrawlResultItem, CustomFile, DataSourceType, DocForm, FileIndexingEstimateResponse, IndexingEstimateParams, NotionInfo, ProcessRule } from '@/models/datasets' import type { DataSourceProvider, NotionPage } from '@/models/common' -const getNotionInfo = ( +export const getNotionInfo = ( notionPages: NotionPage[], ) => { const workspacesMap = groupBy(notionPages, 'workspace_id') @@ -31,7 +32,7 @@ const getNotionInfo = ( }) as NotionInfo[] } -const getWebsiteInfo = ( +export const getWebsiteInfo = ( opts: { websiteCrawlProvider: DataSourceProvider websiteCrawlJobId: string @@ -152,30 +153,36 @@ const getFileIndexingEstimateParamsForWeb = ({ export const useFetchFileIndexingEstimateForFile = ( options: GetFileIndexingEstimateParamsOptionFile, + mutationOptions: MutationOptions = {}, ) => { return useMutation({ mutationFn: async () => { return fetchFileIndexingEstimate(getFileIndexingEstimateParamsForFile(options)) }, + ...mutationOptions, }) } export const useFetchFileIndexingEstimateForNotion = ( options: GetFileIndexingEstimateParamsOptionNotion, + mutationOptions: MutationOptions = {}, ) => { return useMutation({ mutationFn: async () => { return fetchFileIndexingEstimate(getFileIndexingEstimateParamsForNotion(options)) }, + ...mutationOptions, }) } export const useFetchFileIndexingEstimateForWeb = ( options: GetFileIndexingEstimateParamsOptionWeb, + mutationOptions: MutationOptions = {}, ) => { return useMutation({ mutationFn: async () => { return fetchFileIndexingEstimate(getFileIndexingEstimateParamsForWeb(options)) }, + ...mutationOptions, }) }