From 14c6bd2958ee03f40b3fb42c9c1a87c3a4de5f28 Mon Sep 17 00:00:00 2001 From: JzoNg Date: Sat, 20 May 2023 16:32:46 +0800 Subject: [PATCH] feat: support re-segmentation --- .../documents/[documentId]/settings/page.tsx | 16 ++++ .../[datasetId]/layout.tsx | 5 +- .../datasets/create/step-two/index.tsx | 96 ++++++++++++++----- .../documents/detail/embedding/index.tsx | 2 +- .../documents/detail/settings/index.tsx | 90 +++++++++++++++++ .../components/datasets/documents/list.tsx | 23 +++-- .../datasets/documents/style.module.css | 2 +- web/context/dataset-detail.ts | 3 +- web/i18n/lang/dataset-creation.en.ts | 2 + web/i18n/lang/dataset-creation.zh.ts | 2 + 10 files changed, 204 insertions(+), 37 deletions(-) create mode 100644 web/app/(commonLayout)/datasets/(datasetDetailLayout)/[datasetId]/documents/[documentId]/settings/page.tsx create mode 100644 web/app/components/datasets/documents/detail/settings/index.tsx diff --git a/web/app/(commonLayout)/datasets/(datasetDetailLayout)/[datasetId]/documents/[documentId]/settings/page.tsx b/web/app/(commonLayout)/datasets/(datasetDetailLayout)/[datasetId]/documents/[documentId]/settings/page.tsx new file mode 100644 index 0000000000..2194934ad1 --- /dev/null +++ b/web/app/(commonLayout)/datasets/(datasetDetailLayout)/[datasetId]/documents/[documentId]/settings/page.tsx @@ -0,0 +1,16 @@ +import React from 'react' +import Settings from '@/app/components/datasets/documents/detail/settings' + +export type IProps = { + params: { datasetId: string; documentId: string } +} + +const DocumentSettings = async ({ + params: { datasetId, documentId }, +}: IProps) => { + return ( + + ) +} + +export default DocumentSettings diff --git a/web/app/(commonLayout)/datasets/(datasetDetailLayout)/[datasetId]/layout.tsx b/web/app/(commonLayout)/datasets/(datasetDetailLayout)/[datasetId]/layout.tsx index 1dc6578977..8a69cbae0c 100644 --- a/web/app/(commonLayout)/datasets/(datasetDetailLayout)/[datasetId]/layout.tsx +++ b/web/app/(commonLayout)/datasets/(datasetDetailLayout)/[datasetId]/layout.tsx @@ -160,7 +160,10 @@ const DatasetDetailLayout: FC = (props) => { extraInfo={} iconType='dataset' />} - +
{children}
diff --git a/web/app/components/datasets/create/step-two/index.tsx b/web/app/components/datasets/create/step-two/index.tsx index b13a8ad656..6426691eae 100644 --- a/web/app/components/datasets/create/step-two/index.tsx +++ b/web/app/components/datasets/create/step-two/index.tsx @@ -9,7 +9,7 @@ import { createDocument, fetchFileIndexingEstimate as didFetchFileIndexingEstimate, } from '@/service/datasets' -import type { CreateDocumentReq, createDocumentResponse } from '@/models/datasets' +import type { CreateDocumentReq, createDocumentResponse, FullDocumentDetail } from '@/models/datasets' import Button from '@/app/components/base/button' import PreviewItem from './preview-item' import Loading from '@/app/components/base/loading' @@ -22,14 +22,18 @@ import Toast from '@/app/components/base/toast' import { formatNumber } from '@/utils/format' type StepTwoProps = { + isSetting?: boolean, + documentDetail?: FullDocumentDetail hasSetAPIKEY: boolean, onSetting: () => void, datasetId?: string, indexingType?: string, file?: File, - onStepChange: (delta: number) => void, - updateIndexingTypeCache: (type: string) => void, - updateResultCache: (res: createDocumentResponse) => void + onStepChange?: (delta: number) => void, + updateIndexingTypeCache?: (type: string) => void, + updateResultCache?: (res: createDocumentResponse) => void + onSave?: () => void + onCancel?: () => void } enum SegmentType { @@ -42,6 +46,8 @@ enum IndexingType { } const StepTwo = ({ + isSetting, + documentDetail, hasSetAPIKEY, onSetting, datasetId, @@ -50,6 +56,8 @@ const StepTwo = ({ onStepChange, updateIndexingTypeCache, updateResultCache, + onSave, + onCancel, }: StepTwoProps) => { const { t } = useTranslation() const scrollRef = useRef(null) @@ -171,15 +179,23 @@ const StepTwo = ({ } const getCreationParams = () => { - const params = { - data_source: { - type: 'upload_file', - info: file?.id, - name: file?.name, - }, - indexing_technique: getIndexing_technique(), - process_rule: getProcessRule(), - } as CreateDocumentReq + let params + if (isSetting) { + params = { + original_document_id: documentDetail?.id, + process_rule: getProcessRule(), + } as CreateDocumentReq + } else { + params = { + data_source: { + type: 'upload_file', + info: file?.id, + name: file?.name, + }, + indexing_technique: getIndexing_technique(), + process_rule: getProcessRule(), + } as CreateDocumentReq + } return params } @@ -196,6 +212,25 @@ const StepTwo = ({ console.log(err) } } + + const getRulesFromDetail = () => { + if (documentDetail) { + const rules = documentDetail.dataset_process_rule.rules + const separator = rules.segmentation.separator + const max = rules.segmentation.max_tokens + setSegmentIdentifier(separator === '\n' ? '\\n' : separator || '\\n') + setMax(max) + setRules(rules.pre_processing_rules) + setDefaultConfig(rules) + } + } + + const getDefaultMode = () => { + if (documentDetail) { + setSegmentationType(documentDetail.dataset_process_rule.mode) + } + } + const createHandle = async () => { try { let res; @@ -204,19 +239,20 @@ const StepTwo = ({ res = await createFirstDocument({ body: params }) - updateIndexingTypeCache(indexType) - updateResultCache(res) + updateIndexingTypeCache && updateIndexingTypeCache(indexType) + updateResultCache && updateResultCache(res) } else { res = await createDocument({ datasetId, body: params }) - updateIndexingTypeCache(indexType) - updateResultCache({ + updateIndexingTypeCache && updateIndexingTypeCache(indexType) + updateResultCache && updateResultCache({ document: res, }) } - onStepChange(+1) + onStepChange && onStepChange(+1) + isSetting && onSave && onSave() } catch (err) { Toast.notify({ @@ -228,7 +264,12 @@ const StepTwo = ({ useEffect(() => { // fetch rules - getRules() + if (!isSetting) { + getRules() + } else { + getRulesFromDetail() + getDefaultMode() + } }, []) useEffect(() => { @@ -444,11 +485,18 @@ const StepTwo = ({ -
- -
- -
+ {!isSetting ? ( +
+ +
+ +
+ ) : ( +
+ + +
+ )}
diff --git a/web/app/components/datasets/documents/detail/embedding/index.tsx b/web/app/components/datasets/documents/detail/embedding/index.tsx index 40ce2713e7..c84ee49be8 100644 --- a/web/app/components/datasets/documents/detail/embedding/index.tsx +++ b/web/app/components/datasets/documents/detail/embedding/index.tsx @@ -125,7 +125,7 @@ const EmbeddingDetail: FC = ({ detail, stopPosition = 'top', datasetId: d datasetId: localDatasetId, documentId: localDocumentId, }, apiParams => fetchIndexingStatus(omit(apiParams, 'action')), { - refreshInterval: 5000, + refreshInterval: 2500, revalidateOnFocus: false, }) diff --git a/web/app/components/datasets/documents/detail/settings/index.tsx b/web/app/components/datasets/documents/detail/settings/index.tsx new file mode 100644 index 0000000000..cdd0b1ddd8 --- /dev/null +++ b/web/app/components/datasets/documents/detail/settings/index.tsx @@ -0,0 +1,90 @@ +'use client' +import React, { useState, useCallback, useEffect } from 'react' +import { useTranslation } from 'react-i18next' +import { useBoolean } from 'ahooks' +import { useContext } from 'use-context-selector' +import { useRouter } from 'next/navigation' +import DatasetDetailContext from '@/context/dataset-detail' +import type { FullDocumentDetail } from '@/models/datasets' +import { fetchTenantInfo } from '@/service/common' +import { fetchDocumentDetail, MetadataType } from '@/service/datasets' + +import Loading from '@/app/components/base/loading' +import StepTwo from '@/app/components/datasets/create/step-two' +import AccountSetting from '@/app/components/header/account-setting' +import AppUnavailable from '@/app/components/base/app-unavailable' + +type DocumentSettingsProps = { + datasetId: string; + documentId: string; +} + +const DocumentSettings = ({ datasetId, documentId }: DocumentSettingsProps) => { + const { t } = useTranslation() + const router = useRouter() + const [hasSetAPIKEY, setHasSetAPIKEY] = useState(true) + const [isShowSetAPIKey, { setTrue: showSetAPIKey, setFalse: hideSetAPIkey }] = useBoolean() + const [hasError, setHasError] = useState(false) + const { indexingTechnique, dataset } = useContext(DatasetDetailContext) + + const saveHandler = () => router.push(`/datasets/${datasetId}/documents/${documentId}`) + + const cancelHandler = () => router.back() + + const checkAPIKey = async () => { + const data = await fetchTenantInfo({ url: '/info' }) + const hasSetKey = data.providers.some(({ is_valid }) => is_valid) + setHasSetAPIKEY(hasSetKey) + } + + useEffect(() => { + checkAPIKey() + }, []) + + const [documentDetail, setDocumentDetail] = useState(null) + useEffect(() => { + (async () => { + try { + const detail = await fetchDocumentDetail({ + datasetId, + documentId, + params: { metadata: 'without' as MetadataType } + }) + setDocumentDetail(detail) + } catch (e) { + setHasError(true) + } + })() + }, [datasetId, documentId]) + + if (hasError) { + return + } + + return ( +
+
+ {!documentDetail && } + {dataset && documentDetail && ( + + )} +
+ {isShowSetAPIKey && { + await checkAPIKey() + hideSetAPIkey() + }} />} +
+ ) +} + +export default DocumentSettings diff --git a/web/app/components/datasets/documents/list.tsx b/web/app/components/datasets/documents/list.tsx index f53c63f032..d8ffd3f5ca 100644 --- a/web/app/components/datasets/documents/list.tsx +++ b/web/app/components/datasets/documents/list.tsx @@ -94,6 +94,7 @@ export const OperationAction: FC<{ const [showModal, setShowModal] = useState(false) const { notify } = useContext(ToastContext) const { t } = useTranslation() + const router = useRouter() const isListScene = scene === 'list'; @@ -165,15 +166,19 @@ export const OperationAction: FC<{ } - {/*
- - {t('datasetDocuments.list.action.settings')} -
-
router.push(`/datasets/${datasetId}/documents/create`)}> - - {t('datasetDocuments.list.action.uploadFile')} -
- */} + {!archived && ( + <> +
router.push(`/datasets/${datasetId}/documents/${detail.id}/settings`)}> + + {t('datasetDocuments.list.action.settings')} +
+ {/*
router.push(`/datasets/${datasetId}/documents/create`)}> + + {t('datasetDocuments.list.action.uploadFile')} +
*/} + + + )} {!archived &&
onOperate('archive')}> {t('datasetDocuments.list.action.archive')} diff --git a/web/app/components/datasets/documents/style.module.css b/web/app/components/datasets/documents/style.module.css index 76327d83f6..d412b382bc 100644 --- a/web/app/components/datasets/documents/style.module.css +++ b/web/app/components/datasets/documents/style.module.css @@ -72,7 +72,7 @@ .txtIcon { background-image: url(./assets/txt.svg); } -.mdIcon { +.markdownIcon { background-image: url(./assets/md.svg); } .statusItemDetail { diff --git a/web/context/dataset-detail.ts b/web/context/dataset-detail.ts index b507fbcc4c..362b13535a 100644 --- a/web/context/dataset-detail.ts +++ b/web/context/dataset-detail.ts @@ -1,5 +1,6 @@ import { createContext } from 'use-context-selector' +import type { DataSet } from '@/models/datasets' -const DatasetDetailContext = createContext<{ indexingTechnique?: string; }>({}) +const DatasetDetailContext = createContext<{ indexingTechnique?: string; dataset?: DataSet }>({}) export default DatasetDetailContext diff --git a/web/i18n/lang/dataset-creation.en.ts b/web/i18n/lang/dataset-creation.en.ts index 2a0ecf574f..f937fad5b6 100644 --- a/web/i18n/lang/dataset-creation.en.ts +++ b/web/i18n/lang/dataset-creation.en.ts @@ -76,6 +76,8 @@ const translation = { fileName: 'Preprocess document', lastStep: 'Last step', nextStep: 'Save & Process', + save: 'Save & Process', + cancel: 'Cancel', sideTipTitle: 'Why segment and preprocess?', sideTipP1: 'When processing text data, segmentation and cleaning are two important preprocessing steps.', sideTipP2: 'Segmentation splits long text into paragraphs so models can understand better. This improves the quality and relevance of model results.', diff --git a/web/i18n/lang/dataset-creation.zh.ts b/web/i18n/lang/dataset-creation.zh.ts index 4fbaea9661..1ab2d642cb 100644 --- a/web/i18n/lang/dataset-creation.zh.ts +++ b/web/i18n/lang/dataset-creation.zh.ts @@ -76,6 +76,8 @@ const translation = { fileName: '预处理文档', lastStep: '上一步', nextStep: '保存并处理', + save: '保存并处理', + cancel: '取消', sideTipTitle: '为什么要分段和预处理?', sideTipP1: '在处理文本数据时,分段和清洗是两个重要的预处理步骤。', sideTipP2: '分段的目的是将长文本拆分成较小的段落,以便模型更有效地处理和理解。这有助于提高模型生成的结果的质量和相关性。',