refactor: step 2

This commit is contained in:
AkaraChen 2024-12-03 14:34:18 +08:00
parent c960f78035
commit 94eb069a97
2 changed files with 132 additions and 1 deletion

View File

@ -91,7 +91,7 @@ enum SegmentType {
AUTO = 'automatic',
CUSTOM = 'custom',
}
enum IndexingType {
export enum IndexingType {
QUALIFIED = 'high_quality',
ECONOMICAL = 'economy',
}

131
web/service/use-datasets.ts Normal file
View File

@ -0,0 +1,131 @@
import groupBy from 'lodash-es/groupBy'
import type { IndexingType } from '@/app/components/datasets/create/step-two'
import type { CrawlOptions, CrawlResultItem, CustomFile, DocForm, IndexingEstimateParams, NotionInfo, ProcessRule } from '@/models/datasets'
import { DataSourceType } from '@/models/datasets'
import type { DataSourceProvider, NotionPage } from '@/models/common'
/**
 * Groups Notion pages by their `workspace_id` and shapes the result into
 * the `NotionInfo[]` payload expected by the indexing-estimate API.
 *
 * Improvement: replaces lodash-es `groupBy` + a second mapping pass with a
 * single-pass native `Map` grouping — one traversal, no third-party helper.
 * Insertion order of workspaces (first appearance in `notionPages`) is
 * preserved, matching the original `groupBy`/`Object.keys` ordering for the
 * non-numeric workspace ids Notion uses.
 */
const getNotionInfo = (
  notionPages: NotionPage[],
) => {
  // workspace_id -> pages in that workspace, in first-seen order.
  const pagesByWorkspace = new Map<string, NotionPage[]>()
  for (const page of notionPages) {
    const bucket = pagesByWorkspace.get(page.workspace_id)
    if (bucket)
      bucket.push(page)
    else
      pagesByWorkspace.set(page.workspace_id, [page])
  }
  return Array.from(pagesByWorkspace, ([workspaceId, pages]) => ({
    workspace_id: workspaceId,
    // Keep only the fields the API consumes; drop any extra page metadata.
    pages: pages.map(({ page_id, page_name, page_icon, type }) => ({
      page_id,
      page_name,
      page_icon,
      type,
    })),
  })) as NotionInfo[]
}
/**
 * Builds the `website_info_list` payload for an indexing-estimate request
 * from crawl-job metadata and the crawled pages' source URLs.
 */
const getWebsiteInfo = (
  opts: {
    websiteCrawlProvider: DataSourceProvider
    websiteCrawlJobId: string
    websitePages: CrawlResultItem[]
    crawlOptions?: CrawlOptions
  },
) => {
  // Only the source URL of each crawled page is sent to the API.
  const urls = opts.websitePages.map(({ source_url }) => source_url)
  return {
    provider: opts.websiteCrawlProvider,
    job_id: opts.websiteCrawlJobId,
    urls,
    only_main_content: opts.crawlOptions?.only_main_content,
  }
}
/**
 * Options accepted by `getFileIndexingEstimateParams`.
 *
 * `dataSourceType` selects which of the optional source-specific fields are
 * consumed: `files` for FILE, `notionPages` for NOTION, and the
 * `websitePages`/`crawlOptions`/`websiteCrawlProvider`/`websiteCrawlJobId`
 * group for WEB. The unused groups may be omitted.
 */
type GetFileIndexingEstimateParamsOption = {
  docForm: DocForm
  docLanguage: string
  dataSourceType: DataSourceType
  files: CustomFile[]
  indexingTechnique: IndexingType
  processRule: ProcessRule
  dataset_id: string
  notionPages?: NotionPage[]
  websitePages?: CrawlResultItem[]
  crawlOptions?: CrawlOptions
  websiteCrawlProvider?: DataSourceProvider
  websiteCrawlJobId?: string
}
/**
 * Assembles the `IndexingEstimateParams` request body for the given data
 * source (local file upload, Notion import, or website crawl).
 *
 * Returns `undefined` when `dataSourceType` matches none of the known
 * `DataSourceType` members — callers should handle that case.
 *
 * Improvement: the five request fields shared by all three variants were
 * duplicated verbatim in each branch; they are now built once and spread.
 */
const getFileIndexingEstimateParams = ({
  docForm,
  docLanguage,
  dataSourceType,
  files,
  indexingTechnique,
  processRule,
  dataset_id,
  notionPages,
  websitePages,
  crawlOptions,
  websiteCrawlProvider,
  websiteCrawlJobId,
}: GetFileIndexingEstimateParamsOption): IndexingEstimateParams | undefined => {
  // Fields common to every data-source variant; only info_list differs below.
  const common = {
    indexing_technique: indexingTechnique,
    process_rule: processRule,
    doc_form: docForm,
    doc_language: docLanguage,
    dataset_id,
  }
  if (dataSourceType === DataSourceType.FILE) {
    return {
      info_list: {
        data_source_type: dataSourceType,
        file_info_list: {
          file_ids: files.map(file => file.id) as string[],
        },
      },
      ...common,
    }
  }
  if (dataSourceType === DataSourceType.NOTION) {
    return {
      info_list: {
        data_source_type: dataSourceType,
        notion_info_list: getNotionInfo(
          notionPages as NotionPage[],
        ),
      },
      ...common,
    }
  }
  if (dataSourceType === DataSourceType.WEB) {
    return {
      info_list: {
        data_source_type: dataSourceType,
        website_info_list: getWebsiteInfo({
          websiteCrawlProvider: websiteCrawlProvider as DataSourceProvider,
          websiteCrawlJobId: websiteCrawlJobId as string,
          websitePages: websitePages as CrawlResultItem[],
          crawlOptions,
        }),
      },
      ...common,
    }
  }
}
// NOTE(review): empty stub — this hook was introduced by the "refactor:
// step 2" commit with no body yet. Presumably the fetch logic built on
// getFileIndexingEstimateParams lands in a later step; confirm before
// relying on this export.
export const useFetchFileIndexingEstimate = () => {
}