feat: support re-segmentation

This commit is contained in:
JzoNg 2023-05-20 16:32:46 +08:00
parent ffa8e4ccd1
commit 14c6bd2958
10 changed files with 204 additions and 37 deletions

View File

@ -0,0 +1,16 @@
import React from 'react'
import Settings from '@/app/components/datasets/documents/detail/settings'
/** Props of the document settings page: the dataset and document ids carried in `params`. */
export type IProps = {
  params: { datasetId: string; documentId: string }
}

/**
 * Page component for a document's settings view.
 * Reads `datasetId` and `documentId` from `params` and hands them to `Settings`.
 */
const DocumentSettings = async (props: IProps) => {
  const { datasetId, documentId } = props.params
  return <Settings datasetId={datasetId} documentId={documentId} />
}

export default DocumentSettings

View File

@ -160,7 +160,10 @@ const DatasetDetailLayout: FC<IAppDetailLayoutProps> = (props) => {
extraInfo={<ExtraInfo />}
iconType='dataset'
/>}
<DatasetDetailContext.Provider value={{ indexingTechnique: datasetRes?.indexing_technique }}>
<DatasetDetailContext.Provider value={{
indexingTechnique: datasetRes?.indexing_technique,
dataset: datasetRes,
}}>
<div className="bg-white grow">{children}</div>
</DatasetDetailContext.Provider>
</div>

View File

@ -9,7 +9,7 @@ import {
createDocument,
fetchFileIndexingEstimate as didFetchFileIndexingEstimate,
} from '@/service/datasets'
import type { CreateDocumentReq, createDocumentResponse } from '@/models/datasets'
import type { CreateDocumentReq, createDocumentResponse, FullDocumentDetail } from '@/models/datasets'
import Button from '@/app/components/base/button'
import PreviewItem from './preview-item'
import Loading from '@/app/components/base/loading'
@ -22,14 +22,18 @@ import Toast from '@/app/components/base/toast'
import { formatNumber } from '@/utils/format'
type StepTwoProps = {
isSetting?: boolean,
documentDetail?: FullDocumentDetail
hasSetAPIKEY: boolean,
onSetting: () => void,
datasetId?: string,
indexingType?: string,
file?: File,
onStepChange: (delta: number) => void,
updateIndexingTypeCache: (type: string) => void,
updateResultCache: (res: createDocumentResponse) => void
onStepChange?: (delta: number) => void,
updateIndexingTypeCache?: (type: string) => void,
updateResultCache?: (res: createDocumentResponse) => void
onSave?: () => void
onCancel?: () => void
}
enum SegmentType {
@ -42,6 +46,8 @@ enum IndexingType {
}
const StepTwo = ({
isSetting,
documentDetail,
hasSetAPIKEY,
onSetting,
datasetId,
@ -50,6 +56,8 @@ const StepTwo = ({
onStepChange,
updateIndexingTypeCache,
updateResultCache,
onSave,
onCancel,
}: StepTwoProps) => {
const { t } = useTranslation()
const scrollRef = useRef<HTMLDivElement>(null)
@ -171,15 +179,23 @@ const StepTwo = ({
}
const getCreationParams = () => {
const params = {
data_source: {
type: 'upload_file',
info: file?.id,
name: file?.name,
},
indexing_technique: getIndexing_technique(),
process_rule: getProcessRule(),
} as CreateDocumentReq
let params
if (isSetting) {
params = {
original_document_id: documentDetail?.id,
process_rule: getProcessRule(),
} as CreateDocumentReq
} else {
params = {
data_source: {
type: 'upload_file',
info: file?.id,
name: file?.name,
},
indexing_technique: getIndexing_technique(),
process_rule: getProcessRule(),
} as CreateDocumentReq
}
return params
}
@ -196,6 +212,25 @@ const StepTwo = ({
console.log(err)
}
}
// Seeds the segmentation form state (separator, max tokens, pre-processing rules,
// default config) from the process rule stored on `documentDetail`.
// Does nothing when no document detail was provided.
const getRulesFromDetail = () => {
  if (!documentDetail)
    return

  const { rules } = documentDetail.dataset_process_rule
  const { separator, max_tokens: maxTokens } = rules.segmentation
  // A real newline separator is kept in state as the two-character text `\n`;
  // an empty/missing separator falls back to that same text.
  const identifier = separator === '\n' ? '\\n' : (separator || '\\n')

  setSegmentIdentifier(identifier)
  setMax(maxTokens)
  setRules(rules.pre_processing_rules)
  setDefaultConfig(rules)
}
// Sets the segmentation type to the mode of the document's process rule.
// Does nothing when no document detail was provided.
const getDefaultMode = () => {
  if (!documentDetail)
    return

  const { mode } = documentDetail.dataset_process_rule
  setSegmentationType(mode)
}
const createHandle = async () => {
try {
let res;
@ -204,19 +239,20 @@ const StepTwo = ({
res = await createFirstDocument({
body: params
})
updateIndexingTypeCache(indexType)
updateResultCache(res)
updateIndexingTypeCache && updateIndexingTypeCache(indexType)
updateResultCache && updateResultCache(res)
} else {
res = await createDocument({
datasetId,
body: params
})
updateIndexingTypeCache(indexType)
updateResultCache({
updateIndexingTypeCache && updateIndexingTypeCache(indexType)
updateResultCache && updateResultCache({
document: res,
})
}
onStepChange(+1)
onStepChange && onStepChange(+1)
isSetting && onSave && onSave()
}
catch (err) {
Toast.notify({
@ -228,7 +264,12 @@ const StepTwo = ({
useEffect(() => {
// fetch rules
getRules()
if (!isSetting) {
getRules()
} else {
getRulesFromDetail()
getDefaultMode()
}
}, [])
useEffect(() => {
@ -444,11 +485,18 @@ const StepTwo = ({
</div>
</div>
</div>
<div className='flex items-center mt-8 py-2'>
<Button onClick={() => onStepChange(-1)}>{t('datasetCreation.stepTwo.lastStep')}</Button>
<div className={s.divider} />
<Button type='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.nextStep')}</Button>
</div>
{!isSetting ? (
<div className='flex items-center mt-8 py-2'>
<Button onClick={() => onStepChange && onStepChange(-1)}>{t('datasetCreation.stepTwo.lastStep')}</Button>
<div className={s.divider} />
<Button type='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.nextStep')}</Button>
</div>
) : (
<div className='flex items-center mt-8 py-2'>
<Button type='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>
<Button className='ml-2' onClick={onCancel}>{t('datasetCreation.stepTwo.cancel')}</Button>
</div>
)}
</div>
</div>
</div>

View File

@ -125,7 +125,7 @@ const EmbeddingDetail: FC<Props> = ({ detail, stopPosition = 'top', datasetId: d
datasetId: localDatasetId,
documentId: localDocumentId,
}, apiParams => fetchIndexingStatus(omit(apiParams, 'action')), {
refreshInterval: 5000,
refreshInterval: 2500,
revalidateOnFocus: false,
})

View File

@ -0,0 +1,90 @@
'use client'
import React, { useState, useCallback, useEffect } from 'react'
import { useTranslation } from 'react-i18next'
import { useBoolean } from 'ahooks'
import { useContext } from 'use-context-selector'
import { useRouter } from 'next/navigation'
import DatasetDetailContext from '@/context/dataset-detail'
import type { FullDocumentDetail } from '@/models/datasets'
import { fetchTenantInfo } from '@/service/common'
import { fetchDocumentDetail, MetadataType } from '@/service/datasets'
import Loading from '@/app/components/base/loading'
import StepTwo from '@/app/components/datasets/create/step-two'
import AccountSetting from '@/app/components/header/account-setting'
import AppUnavailable from '@/app/components/base/app-unavailable'
/** Identifiers of the document whose segmentation settings are being edited. */
type DocumentSettingsProps = {
  datasetId: string;
  documentId: string;
}

/**
 * Settings screen for an existing document.
 *
 * Loads the document detail, checks whether any provider key is valid, and then
 * renders `StepTwo` in `isSetting` mode. Saving navigates to the document page;
 * cancelling goes back in history. A failed detail request renders `AppUnavailable`.
 */
const DocumentSettings = ({ datasetId, documentId }: DocumentSettingsProps) => {
  const { t } = useTranslation()
  const router = useRouter()
  const [hasSetAPIKEY, setHasSetAPIKEY] = useState(true)
  const [isShowSetAPIKey, { setTrue: showSetAPIKey, setFalse: hideSetAPIkey }] = useBoolean()
  const [hasError, setHasError] = useState(false)
  const [documentDetail, setDocumentDetail] = useState<FullDocumentDetail | null>(null)
  const { indexingTechnique, dataset } = useContext(DatasetDetailContext)

  const saveHandler = () => router.push(`/datasets/${datasetId}/documents/${documentId}`)

  const cancelHandler = () => router.back()

  // Refreshes `hasSetAPIKEY` from the tenant info. Failures are logged and the
  // previous value is kept, so callers can await this without handling a rejection.
  const checkAPIKey = async () => {
    try {
      const data = await fetchTenantInfo({ url: '/info' })
      const hasSetKey = data.providers.some(({ is_valid }) => is_valid)
      setHasSetAPIKEY(hasSetKey)
    }
    catch (e) {
      console.error(e)
    }
  }

  useEffect(() => {
    checkAPIKey()
  }, [])

  useEffect(() => {
    // Set by the cleanup so a response that arrives after the ids changed
    // (or after unmount) is ignored instead of overwriting newer state.
    let ignore = false

    const loadDetail = async () => {
      try {
        const detail = await fetchDocumentDetail({
          datasetId,
          documentId,
          params: { metadata: 'without' as MetadataType }
        })
        if (!ignore)
          setDocumentDetail(detail)
      }
      catch (e) {
        if (!ignore)
          setHasError(true)
      }
    }

    loadDetail()

    return () => {
      ignore = true
    }
  }, [datasetId, documentId])

  if (hasError) {
    return <AppUnavailable code={500} unknownReason={t('datasetCreation.error.unavailable') as string} />
  }

  return (
    <div className='flex' style={{ height: 'calc(100vh - 56px)' }}>
      <div className="grow bg-white">
        {/* Keep the loader visible until BOTH the dataset (from context) and the document detail are available. */}
        {dataset && documentDetail
          ? (
            <StepTwo
              hasSetAPIKEY={hasSetAPIKEY}
              onSetting={showSetAPIKey}
              datasetId={datasetId}
              indexingType={indexingTechnique || ''}
              isSetting
              documentDetail={documentDetail}
              file={documentDetail.data_source_info.upload_file}
              onSave={saveHandler}
              onCancel={cancelHandler}
            />
          )
          : <Loading type='app' />}
      </div>
      {isShowSetAPIKey && <AccountSetting activeTab="provider" onCancel={async () => {
        // checkAPIKey never rejects, so the modal is always closed afterwards.
        await checkAPIKey()
        hideSetAPIkey()
      }} />}
    </div>
  )
}

export default DocumentSettings

View File

@ -94,6 +94,7 @@ export const OperationAction: FC<{
const [showModal, setShowModal] = useState(false)
const { notify } = useContext(ToastContext)
const { t } = useTranslation()
const router = useRouter()
const isListScene = scene === 'list';
@ -165,15 +166,19 @@ export const OperationAction: FC<{
</div>
<Divider />
</>}
{/* <div className={s.actionItem}>
<SettingsIcon />
<span className={s.actionName}>{t('datasetDocuments.list.action.settings')}</span>
</div>
<div className={s.actionItem} onClick={() => router.push(`/datasets/${datasetId}/documents/create`)}>
<FilePlusIcon />
<span className={s.actionName}>{t('datasetDocuments.list.action.uploadFile')}</span>
</div>
<Divider className='my-1' /> */}
{!archived && (
<>
<div className={s.actionItem} onClick={() => router.push(`/datasets/${datasetId}/documents/${detail.id}/settings`)}>
<SettingsIcon />
<span className={s.actionName}>{t('datasetDocuments.list.action.settings')}</span>
</div>
{/* <div className={s.actionItem} onClick={() => router.push(`/datasets/${datasetId}/documents/create`)}>
<FilePlusIcon />
<span className={s.actionName}>{t('datasetDocuments.list.action.uploadFile')}</span>
</div> */}
<Divider className='my-1' />
</>
)}
{!archived && <div className={s.actionItem} onClick={() => onOperate('archive')}>
<ArchiveIcon />
<span className={s.actionName}>{t('datasetDocuments.list.action.archive')}</span>

View File

@ -72,7 +72,7 @@
.txtIcon {
background-image: url(./assets/txt.svg);
}
.mdIcon {
.markdownIcon {
background-image: url(./assets/md.svg);
}
.statusItemDetail {

View File

@ -1,5 +1,6 @@
import { createContext } from 'use-context-selector'
import type { DataSet } from '@/models/datasets'
const DatasetDetailContext = createContext<{ indexingTechnique?: string; }>({})
const DatasetDetailContext = createContext<{ indexingTechnique?: string; dataset?: DataSet }>({})
export default DatasetDetailContext

View File

@ -76,6 +76,8 @@ const translation = {
fileName: 'Preprocess document',
lastStep: 'Last step',
nextStep: 'Save & Process',
save: 'Save & Process',
cancel: 'Cancel',
sideTipTitle: 'Why segment and preprocess?',
sideTipP1: 'When processing text data, segmentation and cleaning are two important preprocessing steps.',
sideTipP2: 'Segmentation splits long text into paragraphs so models can understand better. This improves the quality and relevance of model results.',

View File

@ -76,6 +76,8 @@ const translation = {
fileName: '预处理文档',
lastStep: '上一步',
nextStep: '保存并处理',
save: '保存并处理',
cancel: '取消',
sideTipTitle: '为什么要分段和预处理?',
sideTipP1: '在处理文本数据时,分段和清洗是两个重要的预处理步骤。',
sideTipP2: '分段的目的是将长文本拆分成较小的段落,以便模型更有效地处理和理解。这有助于提高模型生成的结果的质量和相关性。',