From 94eb069a97e572884e15dab05e99d15a213f542e Mon Sep 17 00:00:00 2001
From: AkaraChen
Date: Tue, 3 Dec 2024 14:34:18 +0800
Subject: [PATCH] refactor: step 2

---
 .../datasets/create/step-two/index.tsx        |   2 +-
 web/service/use-datasets.ts                   | 136 ++++++++++++++++++
 2 files changed, 137 insertions(+), 1 deletion(-)
 create mode 100644 web/service/use-datasets.ts

diff --git a/web/app/components/datasets/create/step-two/index.tsx b/web/app/components/datasets/create/step-two/index.tsx
index 8381125f2f..7bcb0f96d3 100644
--- a/web/app/components/datasets/create/step-two/index.tsx
+++ b/web/app/components/datasets/create/step-two/index.tsx
@@ -91,7 +91,7 @@ enum SegmentType {
   AUTO = 'automatic',
   CUSTOM = 'custom',
 }
-enum IndexingType {
+export enum IndexingType {
   QUALIFIED = 'high_quality',
   ECONOMICAL = 'economy',
 }
diff --git a/web/service/use-datasets.ts b/web/service/use-datasets.ts
new file mode 100644
index 0000000000..53ca309c72
--- /dev/null
+++ b/web/service/use-datasets.ts
@@ -0,0 +1,136 @@
+import groupBy from 'lodash-es/groupBy'
+import type { IndexingType } from '@/app/components/datasets/create/step-two'
+import type { CrawlOptions, CrawlResultItem, CustomFile, DocForm, IndexingEstimateParams, NotionInfo, ProcessRule } from '@/models/datasets'
+import { DataSourceType } from '@/models/datasets'
+import type { DataSourceProvider, NotionPage } from '@/models/common'
+
+/**
+ * Groups the given Notion pages by `workspace_id` and keeps only the
+ * `page_id`, `page_name`, `page_icon` and `type` fields of each page.
+ */
+const getNotionInfo = (
+  notionPages: NotionPage[],
+) => {
+  const workspacesMap = groupBy(notionPages, 'workspace_id')
+  return Object.entries(workspacesMap).map(([workspace_id, pages]) => {
+    return {
+      workspace_id,
+      pages: pages.map(({ page_id, page_name, page_icon, type }) => ({
+        page_id,
+        page_name,
+        page_icon,
+        type,
+      })),
+    }
+  }) as NotionInfo[]
+}
+
+/**
+ * Builds the website part of `info_list` from the crawl provider, the crawl
+ * job id, the `source_url` of every crawled page and `only_main_content`.
+ */
+const getWebsiteInfo = (
+  opts: {
+    websiteCrawlProvider: DataSourceProvider
+    websiteCrawlJobId: string
+    websitePages: CrawlResultItem[]
+    crawlOptions?: CrawlOptions
+  },
+) => {
+  const { websiteCrawlProvider, websiteCrawlJobId, websitePages, crawlOptions } = opts
+  return {
+    provider: websiteCrawlProvider,
+    job_id: websiteCrawlJobId,
+    urls: websitePages.map(page => page.source_url),
+    only_main_content: crawlOptions?.only_main_content,
+  }
+}
+
+type GetFileIndexingEstimateParamsOption = {
+  docForm: DocForm
+  docLanguage: string
+  dataSourceType: DataSourceType
+  files: CustomFile[]
+  indexingTechnique: IndexingType
+  processRule: ProcessRule
+  dataset_id: string
+  notionPages?: NotionPage[]
+  websitePages?: CrawlResultItem[]
+  crawlOptions?: CrawlOptions
+  websiteCrawlProvider?: DataSourceProvider
+  websiteCrawlJobId?: string
+}
+
+/**
+ * Builds the indexing-estimate request params for the given data source type.
+ *
+ * @returns the params for FILE, NOTION and WEB sources, or `undefined` for
+ * any other `dataSourceType`.
+ */
+const getFileIndexingEstimateParams = ({
+  docForm,
+  docLanguage,
+  dataSourceType,
+  files,
+  indexingTechnique,
+  processRule,
+  dataset_id,
+  notionPages,
+  websitePages,
+  crawlOptions,
+  websiteCrawlProvider,
+  websiteCrawlJobId,
+}: GetFileIndexingEstimateParamsOption): IndexingEstimateParams | undefined => {
+  // Fields that are identical for every data source type.
+  const commonParams = {
+    indexing_technique: indexingTechnique,
+    process_rule: processRule,
+    doc_form: docForm,
+    doc_language: docLanguage,
+    dataset_id,
+  }
+  if (dataSourceType === DataSourceType.FILE) {
+    return {
+      info_list: {
+        data_source_type: dataSourceType,
+        file_info_list: {
+          file_ids: files.map(file => file.id) as string[],
+        },
+      },
+      ...commonParams,
+    }
+  }
+  if (dataSourceType === DataSourceType.NOTION) {
+    return {
+      info_list: {
+        data_source_type: dataSourceType,
+        // The pages are optional in the options type; fall back to no pages.
+        notion_info_list: getNotionInfo(notionPages ?? []),
+      },
+      ...commonParams,
+    }
+  }
+  if (dataSourceType === DataSourceType.WEB) {
+    return {
+      info_list: {
+        data_source_type: dataSourceType,
+        website_info_list: getWebsiteInfo({
+          // NOTE(review): provider and job id are optional in the options type
+          // but are passed through unchecked here - confirm callers always set them.
+          websiteCrawlProvider: websiteCrawlProvider as DataSourceProvider,
+          websiteCrawlJobId: websiteCrawlJobId as string,
+          // Avoid calling `.map` on `undefined` when no pages were passed.
+          websitePages: websitePages ?? [],
+          crawlOptions,
+        }),
+      },
+      ...commonParams,
+    }
+  }
+  return undefined
+}
+
+// TODO: not implemented yet - the hook body is still empty.
+export const useFetchFileIndexingEstimate = () => {
+
+}