wip: create datasets

This commit is contained in:
AkaraChen 2024-12-06 13:27:32 +08:00
parent f3cfcb757e
commit a893309b73
3 changed files with 45 additions and 62 deletions

View File

@ -27,7 +27,7 @@ import { OptionCard } from './option-card'
import LanguageSelect from './language-select'
import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs'
import cn from '@/utils/classnames'
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FullDocumentDetail, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FullDocumentDetail, ParentMode, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
import Button from '@/app/components/base/button'
import FloatRightContainer from '@/app/components/base/float-right-container'
@ -38,7 +38,7 @@ import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/componen
import Toast from '@/app/components/base/toast'
import type { NotionPage } from '@/models/common'
import { DataSourceProvider } from '@/models/common'
import { DataSourceType, DocForm } from '@/models/datasets'
import { ChuckingMode, DataSourceType } from '@/models/datasets'
import { useDatasetDetailContext } from '@/context/dataset-detail'
import I18n from '@/context/i18n'
import { RETRIEVE_METHOD } from '@/types/app'
@ -96,7 +96,7 @@ export enum IndexingType {
const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
type ParentChildConfig = {
chunkForContext: 'paragraph' | 'full_doc'
chunkForContext: ParentMode
parent: {
delimiter: string
maxLength: number
@ -168,8 +168,8 @@ const StepTwo = ({
// QA Related
const [isLanguageSelectDisabled, setIsLanguageSelectDisabled] = useState(false)
const [docForm, setDocForm] = useState<DocForm | string>(
(datasetId && documentDetail) ? documentDetail.doc_form : DocForm.TEXT,
const [docForm, setDocForm] = useState<ChuckingMode>(
(datasetId && documentDetail) ? documentDetail.doc_form as ChuckingMode : ChuckingMode.text,
)
const [docLanguage, setDocLanguage] = useState<string>(
@ -181,27 +181,28 @@ const StepTwo = ({
const getIndexing_technique = () => indexingType || indexType
const getProcessRule = () => {
const processRule: ProcessRule = {
rules: {} as any, // The API will check this. It will be removed after the API is refactored.
mode: segmentationType,
}
if (segmentationType === SegmentType.CUSTOM) {
const ruleObj = {
return {
rules: {
pre_processing_rules: rules,
segmentation: {
separator: unescape(segmentIdentifier),
max_tokens: maxChunkLength,
chunk_overlap: overlap,
},
}
// @ts-expect-error will be removed after api refactored.
processRule.rules = ruleObj
}
return processRule
parent_mode: parentChildConfig.chunkForContext,
subchunk_segmentation: {
separator: parentChildConfig.child.delimiter,
max_tokens: parentChildConfig.child.maxLength,
},
}, // The API will check this. It will be removed after the API is refactored.
mode: docForm === ChuckingMode.parentChild
? 'hierarchical'
: segmentationType,
} as ProcessRule
}
const fileIndexingEstimateQuery = useFetchFileIndexingEstimateForFile({
docForm: docForm as DocForm,
docForm,
docLanguage,
dataSourceType: DataSourceType.FILE,
files,
@ -210,7 +211,7 @@ const StepTwo = ({
dataset_id: datasetId!,
})
const notionIndexingEstimateQuery = useFetchFileIndexingEstimateForNotion({
docForm: docForm as DocForm,
docForm,
docLanguage,
dataSourceType: DataSourceType.NOTION,
notionPages,
@ -220,7 +221,7 @@ const StepTwo = ({
})
const websiteIndexingEstimateQuery = useFetchFileIndexingEstimateForWeb({
docForm: docForm as DocForm,
docForm,
docLanguage,
dataSourceType: DataSourceType.WEB,
websitePages,
@ -481,29 +482,10 @@ const StepTwo = ({
isSetting && onSave && onSave()
}
const handleDocformSwitch = (isQAMode: boolean) => {
if (isQAMode)
setDocForm(DocForm.QA)
else
setDocForm(DocForm.TEXT)
}
const previewSwitch = () => {
setIsLanguageSelectDisabled(true)
fetchEstimate()
}
const handleSelect = (language: string) => {
setDocLanguage(language)
// Switch language, re-cutter
if (docForm === DocForm.QA)
previewSwitch()
}
const changeToEconomicalType = () => {
if (!hasSetIndexType) {
setIndexType(IndexingType.ECONOMICAL)
setDocForm(DocForm.TEXT)
setDocForm(ChuckingMode.text)
}
}
@ -520,8 +502,8 @@ const StepTwo = ({
}, [])
useEffect(() => {
if (indexingType === IndexingType.ECONOMICAL && docForm === DocForm.QA)
setDocForm(DocForm.TEXT)
if (indexingType === IndexingType.ECONOMICAL && docForm === ChuckingMode.qa)
setDocForm(ChuckingMode.text)
}, [indexingType, docForm])
useEffect(() => {
@ -557,8 +539,8 @@ const StepTwo = ({
icon={<Image src={SettingCog} alt={t('datasetCreation.stepTwo.general')} />}
activeHeaderClassName='bg-gradient-to-r from-[#EFF0F9] to-[#F9FAFB]'
description={t('datasetCreation.stepTwo.generalTip')}
isActive={SegmentType.AUTO === segmentationType}
onClick={() => setSegmentationType(SegmentType.AUTO)}
isActive={docForm === ChuckingMode.qa || docForm === ChuckingMode.text}
onClick={() => setDocForm(ChuckingMode.text)}
actions={
<>
<Button variant={'secondary-accent'} onClick={() => updatePreview()}>
@ -607,12 +589,12 @@ const StepTwo = ({
{IS_CE_EDITION && <>
<div className='flex items-center'>
<Checkbox
checked={docForm === DocForm.QA}
checked={docForm === ChuckingMode.qa}
onCheck={() => {
if (docForm === DocForm.QA)
setDocForm(DocForm.TEXT)
if (docForm === ChuckingMode.qa)
setDocForm(ChuckingMode.text)
else
setDocForm(DocForm.QA)
setDocForm(ChuckingMode.qa)
}}
className='mr-2'
/>
@ -630,7 +612,7 @@ const StepTwo = ({
<Tooltip popupContent={t('datasetCreation.stepTwo.QATip')} />
</div>
</div>
{docForm === DocForm.QA && (
{docForm === ChuckingMode.qa && (
<div
style={{
background: 'linear-gradient(92deg, rgba(247, 144, 9, 0.1) 0%, rgba(255, 255, 255, 0.00) 100%)',
@ -652,8 +634,8 @@ const StepTwo = ({
effectImg={OrangeEffect.src}
activeHeaderClassName='bg-gradient-to-r from-[#F9F1EE] to-[#F9FAFB]'
description={t('datasetCreation.stepTwo.parentChildTip')}
isActive={SegmentType.CUSTOM === segmentationType}
onClick={() => setSegmentationType(SegmentType.CUSTOM)}
isActive={docForm === ChuckingMode.parentChild}
onClick={() => setDocForm(ChuckingMode.parentChild)}
actions={
<>
<Button variant={'secondary-accent'} onClick={() => updatePreview()}>
@ -714,10 +696,10 @@ const StepTwo = ({
onChosen={() => setParentChildConfig(
{
...parentChildConfig,
chunkForContext: 'full_doc',
chunkForContext: 'full-doc',
},
)}
isChosen={parentChildConfig.chunkForContext === 'full_doc'}
isChosen={parentChildConfig.chunkForContext === 'full-doc'}
/>
</div>
@ -926,19 +908,19 @@ const StepTwo = ({
</PreviewHeader>}
className={cn(s.previewWrap, isMobile && s.isMobile, 'relative h-full overflow-y-scroll space-y-4')}
>
{docForm === DocForm.QA && estimate?.qa_preview && (
{docForm === ChuckingMode.qa && estimate?.qa_preview && (
estimate?.qa_preview.map(item => (
<QAPreview key={item.question} qa={item} />
))
)}
{docForm === DocForm.TEXT && estimate?.preview && (
{docForm === ChuckingMode.text && estimate?.preview && (
estimate?.preview.map((item, index) => (
<ChunkContainer
key={item}
key={item.content}
label={`Chunk-${index + 1}`}
characterCount={item.length}
characterCount={item.content.length}
>
{item}
{item.content}
</ChunkContainer>
))
)}

View File

@ -151,7 +151,8 @@ export type IndexingEstimateResponse = {
total_price: number
currency: string
total_segments: number
preview: string[]
// TODO: replace this provisional preview item shape (in particular `child_chunks: any`) with a precise type.
preview: Array<{ content: string; child_chunks: any }>
qa_preview?: QA[]
}
@ -304,7 +305,7 @@ export type DocumentListResponse = {
export type DocumentReq = {
original_document_id?: string
indexing_technique?: string
doc_form: 'text_model' | 'qa_model'
doc_form: ChuckingMode
doc_language: string
process_rule: ProcessRule
}
@ -346,7 +347,7 @@ export type NotionPage = {
}
export type ProcessRule = {
mode: string
mode: ChildChunkType | 'hierarchical'
rules: Rules
}

View File

@ -3,7 +3,7 @@ import type { MutationOptions } from '@tanstack/react-query'
import { useMutation } from '@tanstack/react-query'
import { createDocument, createFirstDocument, fetchDefaultProcessRule, fetchFileIndexingEstimate } from '../datasets'
import { type IndexingType } from '@/app/components/datasets/create/step-two'
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, DataSourceType, DocForm, FileIndexingEstimateResponse, IndexingEstimateParams, NotionInfo, ProcessRule, ProcessRuleResponse, createDocumentResponse } from '@/models/datasets'
import type { ChuckingMode, CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, DataSourceType, FileIndexingEstimateResponse, IndexingEstimateParams, NotionInfo, ProcessRule, ProcessRuleResponse, createDocumentResponse } from '@/models/datasets'
import type { DataSourceProvider, NotionPage } from '@/models/common'
export const getNotionInfo = (
@ -50,7 +50,7 @@ export const getWebsiteInfo = (
}
type GetFileIndexingEstimateParamsOptionBase = {
docForm: DocForm
docForm: ChuckingMode
docLanguage: string
indexingTechnique: IndexingType
processRule: ProcessRule