mirror of https://github.com/langgenius/dify.git
wip: create datasets
This commit is contained in:
parent
f3cfcb757e
commit
a893309b73
|
|
@ -27,7 +27,7 @@ import { OptionCard } from './option-card'
|
|||
import LanguageSelect from './language-select'
|
||||
import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs'
|
||||
import cn from '@/utils/classnames'
|
||||
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FullDocumentDetail, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
|
||||
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FullDocumentDetail, ParentMode, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
|
||||
|
||||
import Button from '@/app/components/base/button'
|
||||
import FloatRightContainer from '@/app/components/base/float-right-container'
|
||||
|
|
@ -38,7 +38,7 @@ import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/componen
|
|||
import Toast from '@/app/components/base/toast'
|
||||
import type { NotionPage } from '@/models/common'
|
||||
import { DataSourceProvider } from '@/models/common'
|
||||
import { DataSourceType, DocForm } from '@/models/datasets'
|
||||
import { ChuckingMode, DataSourceType } from '@/models/datasets'
|
||||
import { useDatasetDetailContext } from '@/context/dataset-detail'
|
||||
import I18n from '@/context/i18n'
|
||||
import { RETRIEVE_METHOD } from '@/types/app'
|
||||
|
|
@ -96,7 +96,7 @@ export enum IndexingType {
|
|||
const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
|
||||
|
||||
type ParentChildConfig = {
|
||||
chunkForContext: 'paragraph' | 'full_doc'
|
||||
chunkForContext: ParentMode
|
||||
parent: {
|
||||
delimiter: string
|
||||
maxLength: number
|
||||
|
|
@ -168,8 +168,8 @@ const StepTwo = ({
|
|||
|
||||
// QA Related
|
||||
const [isLanguageSelectDisabled, setIsLanguageSelectDisabled] = useState(false)
|
||||
const [docForm, setDocForm] = useState<DocForm | string>(
|
||||
(datasetId && documentDetail) ? documentDetail.doc_form : DocForm.TEXT,
|
||||
const [docForm, setDocForm] = useState<ChuckingMode>(
|
||||
(datasetId && documentDetail) ? documentDetail.doc_form as ChuckingMode : ChuckingMode.text,
|
||||
)
|
||||
|
||||
const [docLanguage, setDocLanguage] = useState<string>(
|
||||
|
|
@ -181,27 +181,28 @@ const StepTwo = ({
|
|||
const getIndexing_technique = () => indexingType || indexType
|
||||
|
||||
const getProcessRule = () => {
|
||||
const processRule: ProcessRule = {
|
||||
rules: {} as any, // api will check this. It will be removed after api refactored.
|
||||
mode: segmentationType,
|
||||
}
|
||||
if (segmentationType === SegmentType.CUSTOM) {
|
||||
const ruleObj = {
|
||||
return {
|
||||
rules: {
|
||||
pre_processing_rules: rules,
|
||||
segmentation: {
|
||||
separator: unescape(segmentIdentifier),
|
||||
max_tokens: maxChunkLength,
|
||||
chunk_overlap: overlap,
|
||||
},
|
||||
}
|
||||
// @ts-expect-error will be removed after api refactored.
|
||||
processRule.rules = ruleObj
|
||||
}
|
||||
return processRule
|
||||
parent_mode: parentChildConfig.chunkForContext,
|
||||
subchunk_segmentation: {
|
||||
separator: parentChildConfig.child.delimiter,
|
||||
max_tokens: parentChildConfig.child.maxLength,
|
||||
},
|
||||
}, // api will check this. It will be removed after api refactored.
|
||||
mode: docForm === ChuckingMode.parentChild
|
||||
? 'hierarchical'
|
||||
: segmentationType,
|
||||
} as ProcessRule
|
||||
}
|
||||
|
||||
const fileIndexingEstimateQuery = useFetchFileIndexingEstimateForFile({
|
||||
docForm: docForm as DocForm,
|
||||
docForm,
|
||||
docLanguage,
|
||||
dataSourceType: DataSourceType.FILE,
|
||||
files,
|
||||
|
|
@ -210,7 +211,7 @@ const StepTwo = ({
|
|||
dataset_id: datasetId!,
|
||||
})
|
||||
const notionIndexingEstimateQuery = useFetchFileIndexingEstimateForNotion({
|
||||
docForm: docForm as DocForm,
|
||||
docForm,
|
||||
docLanguage,
|
||||
dataSourceType: DataSourceType.NOTION,
|
||||
notionPages,
|
||||
|
|
@ -220,7 +221,7 @@ const StepTwo = ({
|
|||
})
|
||||
|
||||
const websiteIndexingEstimateQuery = useFetchFileIndexingEstimateForWeb({
|
||||
docForm: docForm as DocForm,
|
||||
docForm,
|
||||
docLanguage,
|
||||
dataSourceType: DataSourceType.WEB,
|
||||
websitePages,
|
||||
|
|
@ -481,29 +482,10 @@ const StepTwo = ({
|
|||
isSetting && onSave && onSave()
|
||||
}
|
||||
|
||||
const handleDocformSwitch = (isQAMode: boolean) => {
|
||||
if (isQAMode)
|
||||
setDocForm(DocForm.QA)
|
||||
else
|
||||
setDocForm(DocForm.TEXT)
|
||||
}
|
||||
|
||||
const previewSwitch = () => {
|
||||
setIsLanguageSelectDisabled(true)
|
||||
fetchEstimate()
|
||||
}
|
||||
|
||||
const handleSelect = (language: string) => {
|
||||
setDocLanguage(language)
|
||||
// Switch language, re-cutter
|
||||
if (docForm === DocForm.QA)
|
||||
previewSwitch()
|
||||
}
|
||||
|
||||
const changeToEconomicalType = () => {
|
||||
if (!hasSetIndexType) {
|
||||
setIndexType(IndexingType.ECONOMICAL)
|
||||
setDocForm(DocForm.TEXT)
|
||||
setDocForm(ChuckingMode.text)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -520,8 +502,8 @@ const StepTwo = ({
|
|||
}, [])
|
||||
|
||||
useEffect(() => {
|
||||
if (indexingType === IndexingType.ECONOMICAL && docForm === DocForm.QA)
|
||||
setDocForm(DocForm.TEXT)
|
||||
if (indexingType === IndexingType.ECONOMICAL && docForm === ChuckingMode.qa)
|
||||
setDocForm(ChuckingMode.text)
|
||||
}, [indexingType, docForm])
|
||||
|
||||
useEffect(() => {
|
||||
|
|
@ -557,8 +539,8 @@ const StepTwo = ({
|
|||
icon={<Image src={SettingCog} alt={t('datasetCreation.stepTwo.general')} />}
|
||||
activeHeaderClassName='bg-gradient-to-r from-[#EFF0F9] to-[#F9FAFB]'
|
||||
description={t('datasetCreation.stepTwo.generalTip')}
|
||||
isActive={SegmentType.AUTO === segmentationType}
|
||||
onClick={() => setSegmentationType(SegmentType.AUTO)}
|
||||
isActive={docForm === ChuckingMode.qa || docForm === ChuckingMode.text}
|
||||
onClick={() => setDocForm(ChuckingMode.text)}
|
||||
actions={
|
||||
<>
|
||||
<Button variant={'secondary-accent'} onClick={() => updatePreview()}>
|
||||
|
|
@ -607,12 +589,12 @@ const StepTwo = ({
|
|||
{IS_CE_EDITION && <>
|
||||
<div className='flex items-center'>
|
||||
<Checkbox
|
||||
checked={docForm === DocForm.QA}
|
||||
checked={docForm === ChuckingMode.qa}
|
||||
onCheck={() => {
|
||||
if (docForm === DocForm.QA)
|
||||
setDocForm(DocForm.TEXT)
|
||||
if (docForm === ChuckingMode.qa)
|
||||
setDocForm(ChuckingMode.text)
|
||||
else
|
||||
setDocForm(DocForm.QA)
|
||||
setDocForm(ChuckingMode.qa)
|
||||
}}
|
||||
className='mr-2'
|
||||
/>
|
||||
|
|
@ -630,7 +612,7 @@ const StepTwo = ({
|
|||
<Tooltip popupContent={t('datasetCreation.stepTwo.QATip')} />
|
||||
</div>
|
||||
</div>
|
||||
{docForm === DocForm.QA && (
|
||||
{docForm === ChuckingMode.qa && (
|
||||
<div
|
||||
style={{
|
||||
background: 'linear-gradient(92deg, rgba(247, 144, 9, 0.1) 0%, rgba(255, 255, 255, 0.00) 100%)',
|
||||
|
|
@ -652,8 +634,8 @@ const StepTwo = ({
|
|||
effectImg={OrangeEffect.src}
|
||||
activeHeaderClassName='bg-gradient-to-r from-[#F9F1EE] to-[#F9FAFB]'
|
||||
description={t('datasetCreation.stepTwo.parentChildTip')}
|
||||
isActive={SegmentType.CUSTOM === segmentationType}
|
||||
onClick={() => setSegmentationType(SegmentType.CUSTOM)}
|
||||
isActive={docForm === ChuckingMode.parentChild}
|
||||
onClick={() => setDocForm(ChuckingMode.parentChild)}
|
||||
actions={
|
||||
<>
|
||||
<Button variant={'secondary-accent'} onClick={() => updatePreview()}>
|
||||
|
|
@ -714,10 +696,10 @@ const StepTwo = ({
|
|||
onChosen={() => setParentChildConfig(
|
||||
{
|
||||
...parentChildConfig,
|
||||
chunkForContext: 'full_doc',
|
||||
chunkForContext: 'full-doc',
|
||||
},
|
||||
)}
|
||||
isChosen={parentChildConfig.chunkForContext === 'full_doc'}
|
||||
isChosen={parentChildConfig.chunkForContext === 'full-doc'}
|
||||
/>
|
||||
</div>
|
||||
|
||||
|
|
@ -926,19 +908,19 @@ const StepTwo = ({
|
|||
</PreviewHeader>}
|
||||
className={cn(s.previewWrap, isMobile && s.isMobile, 'relative h-full overflow-y-scroll space-y-4')}
|
||||
>
|
||||
{docForm === DocForm.QA && estimate?.qa_preview && (
|
||||
{docForm === ChuckingMode.qa && estimate?.qa_preview && (
|
||||
estimate?.qa_preview.map(item => (
|
||||
<QAPreview key={item.question} qa={item} />
|
||||
))
|
||||
)}
|
||||
{docForm === DocForm.TEXT && estimate?.preview && (
|
||||
{docForm === ChuckingMode.text && estimate?.preview && (
|
||||
estimate?.preview.map((item, index) => (
|
||||
<ChunkContainer
|
||||
key={item}
|
||||
key={item.content}
|
||||
label={`Chunk-${index + 1}`}
|
||||
characterCount={item.length}
|
||||
characterCount={item.content.length}
|
||||
>
|
||||
{item}
|
||||
{item.content}
|
||||
</ChunkContainer>
|
||||
))
|
||||
)}
|
||||
|
|
|
|||
|
|
@ -151,7 +151,8 @@ export type IndexingEstimateResponse = {
|
|||
total_price: number
|
||||
currency: string
|
||||
total_segments: number
|
||||
preview: string[]
|
||||
// TODO: change it
|
||||
preview: Array<{ content: string; child_chunks: any }>
|
||||
qa_preview?: QA[]
|
||||
}
|
||||
|
||||
|
|
@ -304,7 +305,7 @@ export type DocumentListResponse = {
|
|||
export type DocumentReq = {
|
||||
original_document_id?: string
|
||||
indexing_technique?: string
|
||||
doc_form: 'text_model' | 'qa_model'
|
||||
doc_form: ChuckingMode
|
||||
doc_language: string
|
||||
process_rule: ProcessRule
|
||||
}
|
||||
|
|
@ -346,7 +347,7 @@ export type NotionPage = {
|
|||
}
|
||||
|
||||
export type ProcessRule = {
|
||||
mode: string
|
||||
mode: ChildChunkType | 'hierarchical'
|
||||
rules: Rules
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ import type { MutationOptions } from '@tanstack/react-query'
|
|||
import { useMutation } from '@tanstack/react-query'
|
||||
import { createDocument, createFirstDocument, fetchDefaultProcessRule, fetchFileIndexingEstimate } from '../datasets'
|
||||
import { type IndexingType } from '@/app/components/datasets/create/step-two'
|
||||
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, DataSourceType, DocForm, FileIndexingEstimateResponse, IndexingEstimateParams, NotionInfo, ProcessRule, ProcessRuleResponse, createDocumentResponse } from '@/models/datasets'
|
||||
import type { ChuckingMode, CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, DataSourceType, FileIndexingEstimateResponse, IndexingEstimateParams, NotionInfo, ProcessRule, ProcessRuleResponse, createDocumentResponse } from '@/models/datasets'
|
||||
import type { DataSourceProvider, NotionPage } from '@/models/common'
|
||||
|
||||
export const getNotionInfo = (
|
||||
|
|
@ -50,7 +50,7 @@ export const getWebsiteInfo = (
|
|||
}
|
||||
|
||||
type GetFileIndexingEstimateParamsOptionBase = {
|
||||
docForm: DocForm
|
||||
docForm: ChuckingMode
|
||||
docLanguage: string
|
||||
indexingTechnique: IndexingType
|
||||
processRule: ProcessRule
|
||||
|
|
|
|||
Loading…
Reference in New Issue