From 20343facad33e84bb56f8a4a6dc8105410b12bfa Mon Sep 17 00:00:00 2001 From: twwu Date: Wed, 21 May 2025 10:53:18 +0800 Subject: [PATCH] refactor: website data source components and hooks --- .../base/form/components/form/actions.tsx | 18 +- .../base/form/form-scenarios/base/field.tsx | 1 + .../workspace-selector/index.tsx | 2 +- .../datasets/create/website/base/header.tsx | 28 +-- .../data-source/website/base/crawler.tsx | 133 +++++++++++ .../data-source/website/base/options/hooks.ts | 50 ++++ .../base/{options.tsx => options/index.tsx} | 24 +- .../data-source/website/firecrawl/hooks.ts | 89 -------- .../data-source/website/firecrawl/index.tsx | 204 ++--------------- .../data-source/website/jina-reader/hooks.ts | 66 ------ .../data-source/website/jina-reader/index.tsx | 215 ++---------------- .../data-source/website/water-crawl/hooks.ts | 89 -------- .../data-source/website/water-crawl/index.tsx | 202 ++-------------- .../test-run/document-processing/hooks.ts | 2 +- .../components/panel/test-run/hooks.ts | 30 ++- .../components/panel/test-run/index.tsx | 17 +- .../components/panel/test-run/types.ts | 3 +- web/models/pipeline.ts | 27 +-- web/service/use-pipeline.ts | 8 +- 19 files changed, 322 insertions(+), 886 deletions(-) create mode 100644 web/app/components/rag-pipeline/components/panel/test-run/data-source/website/base/crawler.tsx create mode 100644 web/app/components/rag-pipeline/components/panel/test-run/data-source/website/base/options/hooks.ts rename web/app/components/rag-pipeline/components/panel/test-run/data-source/website/base/{options.tsx => options/index.tsx} (85%) delete mode 100644 web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/hooks.ts delete mode 100644 web/app/components/rag-pipeline/components/panel/test-run/data-source/website/jina-reader/hooks.ts delete mode 100644 web/app/components/rag-pipeline/components/panel/test-run/data-source/website/water-crawl/hooks.ts diff --git a/web/app/components/base/form/components/form/actions.tsx b/web/app/components/base/form/components/form/actions.tsx index bf7620d341..4bdd398ad0 100644 --- a/web/app/components/base/form/components/form/actions.tsx +++ b/web/app/components/base/form/components/form/actions.tsx @@ -23,16 +23,14 @@ const Actions = ({ return CustomActions(form) return ( -
- -
+ ) } diff --git a/web/app/components/base/form/form-scenarios/base/field.tsx b/web/app/components/base/form/form-scenarios/base/field.tsx index efaaa130ea..e36a2ecf2b 100644 --- a/web/app/components/base/form/form-scenarios/base/field.tsx +++ b/web/app/components/base/form/form-scenarios/base/field.tsx @@ -156,6 +156,7 @@ const BaseField = ({ allowed_file_extensions: allowedFileExtensions, allowed_file_types: allowedFileTypes, allowed_file_upload_methods: allowedFileUploadMethods, + number_limits: 1, }} /> )} diff --git a/web/app/components/base/notion-page-selector/workspace-selector/index.tsx b/web/app/components/base/notion-page-selector/workspace-selector/index.tsx index 433ad49061..1fb55598e3 100644 --- a/web/app/components/base/notion-page-selector/workspace-selector/index.tsx +++ b/web/app/components/base/notion-page-selector/workspace-selector/index.tsx @@ -45,7 +45,7 @@ export default function WorkspaceSelector({
{ diff --git a/web/app/components/datasets/create/website/base/header.tsx b/web/app/components/datasets/create/website/base/header.tsx index 6400e9475a..7710d8ada9 100644 --- a/web/app/components/datasets/create/website/base/header.tsx +++ b/web/app/components/datasets/create/website/base/header.tsx @@ -6,9 +6,9 @@ import { RiBookOpenLine, RiEqualizer2Line } from '@remixicon/react' type HeaderProps = { isInPipeline?: boolean - onClickConfiguration: () => void + onClickConfiguration?: () => void title: string - buttonText: string + buttonText?: string docTitle: string docLink: string } @@ -31,21 +31,21 @@ const Header = ({ {title}
- + + )} void + onJobIdChange: (jobId: string) => void +} + +enum Step { + init = 'init', + running = 'running', + finished = 'finished', +} + +const Crawler = ({ + nodeId, + variables, + checkedCrawlResult, + datasourceProvider, + onCheckedCrawlResultChange, + onJobIdChange, +}: CrawlerProps) => { + const { t } = useTranslation() + const [step, setStep] = useState(Step.init) + const [controlFoldOptions, setControlFoldOptions] = useState(0) + const pipelineId = useDatasetDetailContextWithSelector(s => s.dataset?.pipeline_id) + + const headerInfoMap = useWebCrawlerHeaderInfo() + + useEffect(() => { + if (step !== Step.init) + setControlFoldOptions(Date.now()) + }, [step]) + + const isInit = step === Step.init + const isCrawlFinished = step === Step.finished + const isRunning = step === Step.running + const [crawlResult, setCrawlResult] = useState<{ + result: CrawlResultItem[] + time_consuming: number | string + } | undefined>(undefined) + const [crawlErrorMessage, setCrawlErrorMessage] = useState('') + const showError = isCrawlFinished && crawlErrorMessage + + const { mutateAsync: runDatasourceNode } = useDatasourceNodeRun() + + const handleRun = useCallback(async (value: Record) => { + setStep(Step.running) + await runDatasourceNode({ + node_id: nodeId, + pipeline_id: pipelineId!, + inputs: value, + }, { + onSuccess: (res: any) => { + const jobId = res.job_id + onJobIdChange(jobId) + setCrawlResult(res) + onCheckedCrawlResultChange(res.result || []) // default select the crawl result + setCrawlErrorMessage('') + }, + onError: (error) => { + setCrawlErrorMessage(error.message || t(`${I18N_PREFIX}.unknownError`)) + }, + onSettled: () => { + setStep(Step.finished) + }, + }) + }, [runDatasourceNode, nodeId, pipelineId, onJobIdChange, onCheckedCrawlResultChange, t]) + + return ( +
+
+
+ { + handleRun(value) + }} + /> +
+ {!isInit && ( +
+ {isRunning && ( + + )} + {showError && ( + + )} + {isCrawlFinished && !showError && ( + + )} +
+ )} +
+ ) +} +export default React.memo(Crawler) diff --git a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/base/options/hooks.ts b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/base/options/hooks.ts new file mode 100644 index 0000000000..dc3a9292a2 --- /dev/null +++ b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/base/options/hooks.ts @@ -0,0 +1,50 @@ +import type { BaseConfiguration, BaseFieldType } from '@/app/components/base/form/form-scenarios/base/types' +import { PipelineInputVarType, type RAGPipelineVariables } from '@/models/pipeline' +import { useMemo } from 'react' + +export const useInitialData = (variables: RAGPipelineVariables) => { + const initialData = useMemo(() => { + const initialData: Record = {} + variables.forEach((item) => { + if ([PipelineInputVarType.textInput, PipelineInputVarType.paragraph, PipelineInputVarType.select].includes(item.type)) + initialData[item.variable] = item.default_value || '' + if (item.type === PipelineInputVarType.number) + initialData[item.variable] = item.default_value || 0 + if ([PipelineInputVarType.singleFile, PipelineInputVarType.multiFiles].includes(item.type)) + initialData[item.variable] = item.default_value || [] + if (item.type === PipelineInputVarType.checkbox) + initialData[item.variable] = item.default_value || true + }) + return initialData + }, [variables]) + + return initialData +} + +export const useConfigurations = (variables: RAGPipelineVariables) => { + const configurations = useMemo(() => { + const configurations: BaseConfiguration[] = [] + variables.forEach((item) => { + configurations.push({ + type: item.type as unknown as BaseFieldType, + variable: item.variable, + label: item.label, + required: item.required, + placeholder: item.placeholder, + tooltip: item.tooltips, + options: item.options?.map(option => ({ + label: option, + value: option, + })), + maxLength: item.max_length, + showConditions: [], + allowedFileUploadMethods: item.allowed_file_upload_methods, + allowedFileTypes: item.allowed_file_types, + allowedFileExtensions: item.allowed_file_extensions, + }) + }) + return configurations + }, [variables]) + + return configurations +} diff --git a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/base/options.tsx b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/base/options/index.tsx similarity index 85% rename from web/app/components/rag-pipeline/components/panel/test-run/data-source/website/base/options.tsx rename to web/app/components/rag-pipeline/components/panel/test-run/data-source/website/base/options/index.tsx index d1d2742fbb..dc1a0ba7c0 100644 --- a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/base/options.tsx +++ b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/base/options/index.tsx @@ -1,35 +1,39 @@ import Button from '@/app/components/base/button' import { useAppForm } from '@/app/components/base/form' import BaseField from '@/app/components/base/form/form-scenarios/base/field' -import type { BaseConfiguration } from '@/app/components/base/form/form-scenarios/base/types' import { ArrowDownRoundFill } from '@/app/components/base/icons/src/vender/solid/general' import cn from '@/utils/classnames' import { RiPlayLargeLine } from '@remixicon/react' import { useBoolean } from 'ahooks' -import { useEffect } from 'react' +import { useEffect, useMemo } from 'react' import { useTranslation } from 'react-i18next' import Toast from '@/app/components/base/toast' -import type { ZodSchema } from 'zod' +import type { RAGPipelineVariables } from '@/models/pipeline' +import { useConfigurations, useInitialData } from './hooks' +import { generateZodSchema } from '@/app/components/base/form/form-scenarios/base/utils' const I18N_PREFIX = 'datasetCreation.stepOne.website' type OptionsProps = { - initialData: Record - configurations: BaseConfiguration[] + variables: RAGPipelineVariables isRunning: boolean controlFoldOptions?: number - schema: ZodSchema onSubmit: (data: Record) => void } const Options = ({ - initialData, - configurations, + variables, isRunning, controlFoldOptions, - schema, onSubmit, }: OptionsProps) => { + const { t } = useTranslation() + const initialData = useInitialData(variables) + const configurations = useConfigurations(variables) + const schema = useMemo(() => { + return generateZodSchema(configurations) + }, [configurations]) + const form = useAppForm({ defaultValues: initialData, validators: { @@ -53,8 +57,6 @@ const Options = ({ }, }) - const { t } = useTranslation() - const [fold, { toggle: foldToggle, setTrue: foldHide, diff --git a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/hooks.ts b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/hooks.ts deleted file mode 100644 index 42f90ab75a..0000000000 --- a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/hooks.ts +++ /dev/null @@ -1,89 +0,0 @@ -import type { BaseConfiguration } from '@/app/components/base/form/form-scenarios/base/types' -import { BaseFieldType } from '@/app/components/base/form/form-scenarios/base/types' -import { useTranslation } from 'react-i18next' -import { z } from 'zod' - -const ERROR_I18N_PREFIX = 'common.errorMsg' -const I18N_PREFIX = 'datasetCreation.stepOne.website' - -export const useConfigurations = () => { - const { t } = useTranslation() - const configurations: BaseConfiguration[] = [ - { - type: BaseFieldType.textInput, - variable: 'url', - label: 'URL', - required: true, - showConditions: [], - placeholder: 'https://docs.dify.ai', - }, - { - type: BaseFieldType.numberInput, - variable: 'limit', - label: t(`${I18N_PREFIX}.limit`), - required: true, - showConditions: [], - }, - { - type: BaseFieldType.numberInput, - variable: 'max_depth', - label: t(`${I18N_PREFIX}.maxDepth`), - required: false, - showConditions: [], - tooltip: t(`${I18N_PREFIX}.maxDepthTooltip`), - }, - { - type: BaseFieldType.textInput, - variable: 'excludes', - label: t(`${I18N_PREFIX}.excludePaths`), - required: false, - showConditions: [], - placeholder: 'blog/*, /about/*', - }, - { - type: BaseFieldType.textInput, - variable: 'includes', - label: t(`${I18N_PREFIX}.includeOnlyPaths`), - required: false, - showConditions: [], - placeholder: 'articles/*', - }, - { - type: BaseFieldType.checkbox, - variable: 'crawl_sub_pages', - label: t(`${I18N_PREFIX}.crawlSubPage`), - required: false, - showConditions: [], - }, - { - type: BaseFieldType.checkbox, - variable: 'only_main_content', - label: t(`${I18N_PREFIX}.extractOnlyMainContent`), - required: false, - showConditions: [], - }, - ] - - return configurations -} - -export const useSchema = () => { - const { t } = useTranslation() - - const Schema = z.object({ - url: z.string().nonempty({ - message: t(`${ERROR_I18N_PREFIX}.fieldRequired`, { - field: 'url', - }), - }).regex(/^https?:\/\//, { - message: t(`${ERROR_I18N_PREFIX}.urlError`), - }), - limit: z.number().positive({ - message: t(`${ERROR_I18N_PREFIX}.fieldRequired`, { - field: t(`${I18N_PREFIX}.limit`), - }), - }).int(), - }).passthrough() - - return Schema -} diff --git a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/index.tsx b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/index.tsx index 4d295a3b80..c6bbcf851e 100644 --- a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/index.tsx +++ b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/firecrawl/index.tsx @@ -1,202 +1,34 @@ 'use client' -import React, { useCallback, useEffect, useState } from 'react' -import { useTranslation } from 'react-i18next' -import { useModalContextSelector } from '@/context/modal-context' -import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' -import { checkFirecrawlTaskStatus, createFirecrawlTask } from '@/service/datasets' -import { sleep } from '@/utils' -import Header from '@/app/components/datasets/create/website/base/header' -import Options from '../base/options' -import { useConfigurations, useSchema } from './hooks' -import Crawling from '../base/crawling' -import ErrorMessage from '../base/error-message' -import CrawledResult from '../base/crawled-result' - -const I18N_PREFIX = 'datasetCreation.stepOne.website' +import React from 'react' +import type { CrawlResultItem } from '@/models/datasets' +import type { RAGPipelineVariables } from '@/models/pipeline' +import Crawler from '../base/crawler' +import { DataSourceProvider } from '@/models/common' type FireCrawlProps = { + nodeId: string + variables: RAGPipelineVariables checkedCrawlResult: CrawlResultItem[] onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void onJobIdChange: (jobId: string) => void - crawlOptions: CrawlOptions - onCrawlOptionsChange: (payload: CrawlOptions) => void -} - -enum Step { - init = 'init', - running = 'running', - finished = 'finished', } const FireCrawl = ({ + nodeId, + variables, checkedCrawlResult, onCheckedCrawlResultChange, onJobIdChange, - crawlOptions, - onCrawlOptionsChange, }: FireCrawlProps) => { - const { t } = useTranslation() - const [step, setStep] = useState(Step.init) - const [controlFoldOptions, setControlFoldOptions] = useState(0) - const configurations = useConfigurations() - const schema = useSchema() - - useEffect(() => { - if (step !== Step.init) - setControlFoldOptions(Date.now()) - }, [step]) - - const setShowAccountSettingModal = useModalContextSelector(s => s.setShowAccountSettingModal) - const handleSetting = useCallback(() => { - setShowAccountSettingModal({ - payload: 'data-source', - }) - }, [setShowAccountSettingModal]) - - const isInit = step === Step.init - const isCrawlFinished = step === Step.finished - const isRunning = step === Step.running - const [crawlResult, setCrawlResult] = useState<{ - current: number - total: number - data: CrawlResultItem[] - time_consuming: number | string - } | undefined>(undefined) - const [crawlErrorMessage, setCrawlErrorMessage] = useState('') - const showError = isCrawlFinished && crawlErrorMessage - - const waitForCrawlFinished = useCallback(async (jobId: string) => { - try { - const res = await checkFirecrawlTaskStatus(jobId) as any - if (res.status === 'completed') { - return { - isError: false, - data: { - ...res, - total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)), - }, - } - } - if (res.status === 'error' || !res.status) { - // can't get the error message from the firecrawl api - return { - isError: true, - errorMessage: res.message, - data: { - data: [], - }, - } - } - // update the progress - setCrawlResult({ - ...res, - total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)), - }) - onCheckedCrawlResultChange(res.data || []) // default select the crawl result - await sleep(2500) - return await waitForCrawlFinished(jobId) - } - catch (e: any) { - const errorBody = await e.json() - return { - isError: true, - errorMessage: errorBody.message, - data: { - data: [], - }, - } - } - }, [crawlOptions.limit, onCheckedCrawlResultChange]) - - const handleRun = useCallback(async (value: Record) => { - const { url, ...crawlOptions } = value - onCrawlOptionsChange(crawlOptions as CrawlOptions) - setStep(Step.running) - try { - const passToServerCrawlOptions: any = { - ...crawlOptions, - } - if (crawlOptions.max_depth === '') - delete passToServerCrawlOptions.max_depth - - const res = await createFirecrawlTask({ - url, - options: passToServerCrawlOptions, - }) as any - const jobId = res.job_id - onJobIdChange(jobId) - const { isError, data, errorMessage } = await waitForCrawlFinished(jobId) - if (isError) { - setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`)) - } - else { - setCrawlResult(data) - onCheckedCrawlResultChange(data.data || []) // default select the crawl result - setCrawlErrorMessage('') - } - } - catch (e) { - setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!) - console.log(e) - } - finally { - setStep(Step.finished) - } - }, [onCrawlOptionsChange, onJobIdChange, t, waitForCrawlFinished, onCheckedCrawlResultChange]) - return ( -
-
-
- { - handleRun(value) - console.log('submit') - }} - /> -
- {!isInit && ( -
- {isRunning && ( - - )} - {showError && ( - - )} - {isCrawlFinished && !showError && ( - - )} -
- )} -
+ ) } -export default React.memo(FireCrawl) +export default FireCrawl diff --git a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/jina-reader/hooks.ts b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/jina-reader/hooks.ts deleted file mode 100644 index 9f57726c04..0000000000 --- a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/jina-reader/hooks.ts +++ /dev/null @@ -1,66 +0,0 @@ -import type { BaseConfiguration } from '@/app/components/base/form/form-scenarios/base/types' -import { BaseFieldType } from '@/app/components/base/form/form-scenarios/base/types' -import { useTranslation } from 'react-i18next' -import { z } from 'zod' - -const ERROR_I18N_PREFIX = 'common.errorMsg' -const I18N_PREFIX = 'datasetCreation.stepOne.website' - -export const useConfigurations = () => { - const { t } = useTranslation() - const configurations: BaseConfiguration[] = [ - { - type: BaseFieldType.textInput, - variable: 'url', - label: 'URL', - required: true, - showConditions: [], - placeholder: 'https://docs.dify.ai', - }, - { - type: BaseFieldType.numberInput, - variable: 'limit', - label: t(`${I18N_PREFIX}.limit`), - required: true, - showConditions: [], - }, - { - type: BaseFieldType.checkbox, - variable: 'crawl_sub_pages', - label: t(`${I18N_PREFIX}.crawlSubPage`), - required: false, - showConditions: [], - }, - { - type: BaseFieldType.checkbox, - variable: 'use_sitemap', - label: t(`${I18N_PREFIX}.useSitemap`), - tooltip: t(`${I18N_PREFIX}.useSitemapTooltip`), - required: false, - showConditions: [], - }, - ] - - return configurations -} - -export const useSchema = () => { - const { t } = useTranslation() - - const Schema = z.object({ - url: z.string().nonempty({ - message: t(`${ERROR_I18N_PREFIX}.fieldRequired`, { - field: 'url', - }), - }).regex(/^https?:\/\//, { - message: t(`${ERROR_I18N_PREFIX}.urlError`), - }), - limit: z.number().positive({ - message: t(`${ERROR_I18N_PREFIX}.fieldRequired`, { - field: t(`${I18N_PREFIX}.limit`), - }), - }).int(), - }).passthrough() - - return Schema -} diff --git a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/jina-reader/index.tsx b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/jina-reader/index.tsx index b26091dbbe..19cf3862a1 100644 --- a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/jina-reader/index.tsx +++ b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/jina-reader/index.tsx @@ -1,215 +1,34 @@ 'use client' -import React, { useCallback, useEffect, useState } from 'react' -import { useTranslation } from 'react-i18next' -import CrawledResult from '../base/crawled-result' -import Crawling from '../base/crawling' -import ErrorMessage from '../base/error-message' -import { useModalContextSelector } from '@/context/modal-context' -import { checkJinaReaderTaskStatus, createJinaReaderTask } from '@/service/datasets' -import { sleep } from '@/utils' -import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' -import Header from '@/app/components/datasets/create/website/base/header' -import Options from '../base/options' -import { useConfigurations, useSchema } from './hooks' - -const I18N_PREFIX = 'datasetCreation.stepOne.website' +import React from 'react' +import type { CrawlResultItem } from '@/models/datasets' +import type { RAGPipelineVariables } from '@/models/pipeline' +import Crawler from '../base/crawler' +import { DataSourceProvider } from '@/models/common' type JinaReaderProps = { + nodeId: string + variables: RAGPipelineVariables checkedCrawlResult: CrawlResultItem[] onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void onJobIdChange: (jobId: string) => void - crawlOptions: CrawlOptions - onCrawlOptionsChange: (payload: CrawlOptions) => void -} - -enum Step { - init = 'init', - running = 'running', - finished = 'finished', } const JinaReader = ({ + nodeId, + variables, checkedCrawlResult, onCheckedCrawlResultChange, onJobIdChange, - crawlOptions, - onCrawlOptionsChange, }: JinaReaderProps) => { - const { t } = useTranslation() - const [step, setStep] = useState(Step.init) - const [controlFoldOptions, setControlFoldOptions] = useState(0) - const configurations = useConfigurations() - const schema = useSchema() - - useEffect(() => { - if (step !== Step.init) - setControlFoldOptions(Date.now()) - }, [step]) - - const setShowAccountSettingModal = useModalContextSelector(state => state.setShowAccountSettingModal) - const handleSetting = useCallback(() => { - setShowAccountSettingModal({ - payload: 'data-source', - }) - }, [setShowAccountSettingModal]) - - const isInit = step === Step.init - const isCrawlFinished = step === Step.finished - const isRunning = step === Step.running - const [crawlResult, setCrawlResult] = useState<{ - current: number - total: number - data: CrawlResultItem[] - time_consuming: number | string - } | undefined>(undefined) - const [crawlErrorMessage, setCrawlErrorMessage] = useState('') - const showError = isCrawlFinished && crawlErrorMessage - - const waitForCrawlFinished = useCallback(async (jobId: string) => { - try { - const res = await checkJinaReaderTaskStatus(jobId) as any - if (res.status === 'completed') { - return { - isError: false, - data: { - ...res, - total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)), - }, - } - } - if (res.status === 'failed' || !res.status) { - return { - isError: true, - errorMessage: res.message, - data: { - data: [], - }, - } - } - // update the progress - setCrawlResult({ - ...res, - total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)), - }) - onCheckedCrawlResultChange(res.data || []) // default select the crawl result - await sleep(2500) - return await waitForCrawlFinished(jobId) - } - catch (e: any) { - const errorBody = await e.json() - return { - isError: true, - errorMessage: errorBody.message, - data: { - data: [], - }, - } - } - }, [crawlOptions.limit, onCheckedCrawlResultChange]) - - const handleRun = useCallback(async (value: Record) => { - const { url, ...crawlOptions } = value - onCrawlOptionsChange(crawlOptions as CrawlOptions) - setStep(Step.running) - try { - const startTime = Date.now() - const res = await createJinaReaderTask({ - url, - options: crawlOptions, - }) as any - - if (res.data) { - const data = { - current: 1, - total: 1, - data: [{ - title: res.data.title, - markdown: res.data.content, - description: res.data.description, - source_url: res.data.url, - }], - time_consuming: (Date.now() - startTime) / 1000, - } - setCrawlResult(data) - onCheckedCrawlResultChange(data.data || []) - setCrawlErrorMessage('') - } - else if (res.job_id) { - const jobId = res.job_id - onJobIdChange(jobId) - const { isError, data, errorMessage } = await waitForCrawlFinished(jobId) - if (isError) { - setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`)) - } - else { - setCrawlResult(data) - onCheckedCrawlResultChange(data.data || []) // default select the crawl result - setCrawlErrorMessage('') - } - } - } - catch (e) { - setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!) - console.log(e) - } - finally { - setStep(Step.finished) - } - }, [onCrawlOptionsChange, onCheckedCrawlResultChange, onJobIdChange, t, waitForCrawlFinished]) - return ( -
-
-
- { - handleRun(value) - console.log('submit') - }} - /> -
- {!isInit && ( -
- {isRunning && ( - - )} - {showError && ( - - )} - {isCrawlFinished && !showError && ( - - )} -
- )} -
+ ) } export default React.memo(JinaReader) diff --git a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/water-crawl/hooks.ts b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/water-crawl/hooks.ts deleted file mode 100644 index 42f90ab75a..0000000000 --- a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/water-crawl/hooks.ts +++ /dev/null @@ -1,89 +0,0 @@ -import type { BaseConfiguration } from '@/app/components/base/form/form-scenarios/base/types' -import { BaseFieldType } from '@/app/components/base/form/form-scenarios/base/types' -import { useTranslation } from 'react-i18next' -import { z } from 'zod' - -const ERROR_I18N_PREFIX = 'common.errorMsg' -const I18N_PREFIX = 'datasetCreation.stepOne.website' - -export const useConfigurations = () => { - const { t } = useTranslation() - const configurations: BaseConfiguration[] = [ - { - type: BaseFieldType.textInput, - variable: 'url', - label: 'URL', - required: true, - showConditions: [], - placeholder: 'https://docs.dify.ai', - }, - { - type: BaseFieldType.numberInput, - variable: 'limit', - label: t(`${I18N_PREFIX}.limit`), - required: true, - showConditions: [], - }, - { - type: BaseFieldType.numberInput, - variable: 'max_depth', - label: t(`${I18N_PREFIX}.maxDepth`), - required: false, - showConditions: [], - tooltip: t(`${I18N_PREFIX}.maxDepthTooltip`), - }, - { - type: BaseFieldType.textInput, - variable: 'excludes', - label: t(`${I18N_PREFIX}.excludePaths`), - required: false, - showConditions: [], - placeholder: 'blog/*, /about/*', - }, - { - type: BaseFieldType.textInput, - variable: 'includes', - label: t(`${I18N_PREFIX}.includeOnlyPaths`), - required: false, - showConditions: [], - placeholder: 'articles/*', - }, - { - type: BaseFieldType.checkbox, - variable: 'crawl_sub_pages', - label: t(`${I18N_PREFIX}.crawlSubPage`), - required: false, - showConditions: [], - }, - { - type: BaseFieldType.checkbox, - variable: 'only_main_content', - label: t(`${I18N_PREFIX}.extractOnlyMainContent`), - required: false, - showConditions: [], - }, - ] - - return configurations -} - -export const useSchema = () => { - const { t } = useTranslation() - - const Schema = z.object({ - url: z.string().nonempty({ - message: t(`${ERROR_I18N_PREFIX}.fieldRequired`, { - field: 'url', - }), - }).regex(/^https?:\/\//, { - message: t(`${ERROR_I18N_PREFIX}.urlError`), - }), - limit: z.number().positive({ - message: t(`${ERROR_I18N_PREFIX}.fieldRequired`, { - field: t(`${I18N_PREFIX}.limit`), - }), - }).int(), - }).passthrough() - - return Schema -} diff --git a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/water-crawl/index.tsx b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/water-crawl/index.tsx index be12c4a787..beb586c4dd 100644 --- a/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/water-crawl/index.tsx +++ b/web/app/components/rag-pipeline/components/panel/test-run/data-source/website/water-crawl/index.tsx @@ -1,202 +1,34 @@ 'use client' -import React, { useCallback, useEffect, useState } from 'react' -import { useTranslation } from 'react-i18next' -import { useModalContextSelector } from '@/context/modal-context' -import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' -import { checkWatercrawlTaskStatus, createWatercrawlTask } from '@/service/datasets' -import { sleep } from '@/utils' -import Header from '@/app/components/datasets/create/website/base/header' -import Options from '../base/options' -import { useConfigurations, useSchema } from './hooks' -import Crawling from '../base/crawling' -import ErrorMessage from '../base/error-message' -import CrawledResult from '../base/crawled-result' - -const I18N_PREFIX = 'datasetCreation.stepOne.website' +import React from 'react' +import type { CrawlResultItem } from '@/models/datasets' +import type { RAGPipelineVariables } from '@/models/pipeline' +import Crawler from '../base/crawler' +import { DataSourceProvider } from '@/models/common' type WaterCrawlProps = { + nodeId: string + variables: RAGPipelineVariables checkedCrawlResult: CrawlResultItem[] onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void onJobIdChange: (jobId: string) => void - crawlOptions: CrawlOptions - onCrawlOptionsChange: (payload: CrawlOptions) => void -} - -enum Step { - init = 'init', - running = 'running', - finished = 'finished', } const WaterCrawl = ({ + nodeId, + variables, checkedCrawlResult, onCheckedCrawlResultChange, onJobIdChange, - crawlOptions, - onCrawlOptionsChange, }: WaterCrawlProps) => { - const { t } = useTranslation() - const [step, setStep] = useState(Step.init) - const [controlFoldOptions, setControlFoldOptions] = useState(0) - const configurations = useConfigurations() - const schema = useSchema() - - useEffect(() => { - if (step !== Step.init) - setControlFoldOptions(Date.now()) - }, [step]) - - const setShowAccountSettingModal = useModalContextSelector(state => state.setShowAccountSettingModal) - const handleSetting = useCallback(() => { - setShowAccountSettingModal({ - payload: 'data-source', - }) - }, [setShowAccountSettingModal]) - - const isInit = step === Step.init - const isCrawlFinished = step === Step.finished - const isRunning = step === Step.running - const [crawlResult, setCrawlResult] = useState<{ - current: number - total: number - data: CrawlResultItem[] - time_consuming: number | string - } | undefined>(undefined) - const [crawlErrorMessage, setCrawlErrorMessage] = useState('') - const showError = isCrawlFinished && crawlErrorMessage - - const waitForCrawlFinished = useCallback(async (jobId: string): Promise => { - try { - const res = await checkWatercrawlTaskStatus(jobId) as any - if (res.status === 'completed') { - return { - isError: false, - data: { - ...res, - total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)), - }, - } - } - if (res.status === 'error' || !res.status) { - // can't get the error message from the watercrawl api - return { - isError: true, - errorMessage: res.message, - data: { - data: [], - }, - } - } - // update the progress - setCrawlResult({ - ...res, - total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)), - }) - onCheckedCrawlResultChange(res.data || []) // default select the crawl result - await sleep(2500) - return await waitForCrawlFinished(jobId) - } - catch (e: any) { - const errorBody = await e.json() - return { - isError: true, - errorMessage: errorBody.message, - data: { - data: [], - }, - } - } - }, [crawlOptions.limit, onCheckedCrawlResultChange]) - - const handleRun = useCallback(async (value: Record) => { - const { url, ...crawlOptions } = value - onCrawlOptionsChange(crawlOptions as CrawlOptions) - setStep(Step.running) - try { - const passToServerCrawlOptions: any = { - ...crawlOptions, - } - if (crawlOptions.max_depth === '') - delete passToServerCrawlOptions.max_depth - - const res = await createWatercrawlTask({ - url, - options: passToServerCrawlOptions, - }) as any - const jobId = res.job_id - onJobIdChange(jobId) - const { isError, data, errorMessage } = await waitForCrawlFinished(jobId) - if (isError) { - setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`)) - } - else { - setCrawlResult(data) - onCheckedCrawlResultChange(data.data || []) // default select the crawl result - setCrawlErrorMessage('') - } - } - catch (e) { - setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!) - console.log(e) - } - finally { - setStep(Step.finished) - } - }, [onCrawlOptionsChange, onCheckedCrawlResultChange, onJobIdChange, t, waitForCrawlFinished]) - return ( -
-
-
- { - handleRun(value) - console.log('submit') - }} - /> -
- {!isInit && ( -
- {isRunning && ( - - )} - {showError && ( - - )} - {isCrawlFinished && !showError && ( - - )} -
- )} -
+ ) } export default React.memo(WaterCrawl) diff --git a/web/app/components/rag-pipeline/components/panel/test-run/document-processing/hooks.ts b/web/app/components/rag-pipeline/components/panel/test-run/document-processing/hooks.ts index 81121bbfe4..30b2fc588a 100644 --- a/web/app/components/rag-pipeline/components/panel/test-run/document-processing/hooks.ts +++ b/web/app/components/rag-pipeline/components/panel/test-run/document-processing/hooks.ts @@ -49,7 +49,7 @@ export const useConfigurations = (datasourceNodeId: string) => { value: option, })), showConditions: [], - default: item.default, + default: item.default_value, })) return configs }, [paramsConfig]) diff --git a/web/app/components/rag-pipeline/components/panel/test-run/hooks.ts b/web/app/components/rag-pipeline/components/panel/test-run/hooks.ts index 6fca02c634..26805b34de 100644 --- a/web/app/components/rag-pipeline/components/panel/test-run/hooks.ts +++ b/web/app/components/rag-pipeline/components/panel/test-run/hooks.ts @@ -50,7 +50,7 @@ export const useDatasourceOptions = () => { return { nodeId: node.id, type, - config: {}, + variables: node.data.variables, } }) }, [nodes]) @@ -98,3 +98,31 @@ export const useDatasourceOptions = () => { }, [datasources, t]) return { datasources, options } } + +export const useWebCrawlerHeaderInfo = () => { + const { t } = useTranslation() + const I18N_PREFIX = 'datasetCreation.stepOne.website' + + const headerInfoMap: Record = { + [DataSourceProvider.fireCrawl]: { + title: t(`${I18N_PREFIX}.firecrawlTitle`), + docTitle: t(`${I18N_PREFIX}.firecrawlDoc`), + docLink: 'https://docs.firecrawl.dev/introduction', + }, + [DataSourceProvider.jinaReader]: { + title: t(`${I18N_PREFIX}.jinaReaderTitle`), + docTitle: t(`${I18N_PREFIX}.jinaReaderDoc`), + docLink: 'https://jina.ai/reader', + }, + [DataSourceProvider.waterCrawl]: { + title: t(`${I18N_PREFIX}.watercrawlTitle`), + docTitle: t(`${I18N_PREFIX}.watercrawlDoc`), + docLink: 'https://docs.watercrawl.dev/', + }, + } + return headerInfoMap +} diff --git a/web/app/components/rag-pipeline/components/panel/test-run/index.tsx b/web/app/components/rag-pipeline/components/panel/test-run/index.tsx index d263df9e01..fac5005b94 100644 --- a/web/app/components/rag-pipeline/components/panel/test-run/index.tsx +++ b/web/app/components/rag-pipeline/components/panel/test-run/index.tsx @@ -4,7 +4,7 @@ import { useCallback, useMemo, useState } from 'react' import StepIndicator from './step-indicator' import { useTestRunSteps } from './hooks' import DataSourceOptions from './data-source-options' -import type { CrawlOptions, CrawlResultItem, FileItem } from '@/models/datasets' +import type { CrawlResultItem, FileItem } from '@/models/datasets' import { DataSourceType } from '@/models/datasets' import LocalFile from './data-source/local-file' import produce from 'immer' @@ -12,7 +12,6 @@ import { useProviderContextSelector } from '@/context/provider-context' import { DataSourceProvider, type NotionPage } from '@/models/common' import Notion from './data-source/notion' import VectorSpaceFull from '@/app/components/billing/vector-space-full' -import { DEFAULT_CRAWL_OPTIONS } from './consts' import Firecrawl from './data-source/website/firecrawl' import JinaReader from './data-source/website/jina-reader' import WaterCrawl from './data-source/website/water-crawl' @@ -31,7 +30,6 @@ const TestRunPanel = () => { const [notionPages, setNotionPages] = useState([]) const [websitePages, setWebsitePages] = useState([]) const [websiteCrawlJobId, setWebsiteCrawlJobId] = useState('') - const [crawlOptions, setCrawlOptions] = useState(DEFAULT_CRAWL_OPTIONS) const plan = useProviderContextSelector(state => state.plan) const enableBilling = useProviderContextSelector(state => state.enableBilling) @@ -159,35 +157,36 @@ const TestRunPanel = () => { )} {datasource?.type === DataSourceType.NOTION && ( )} {datasource?.type === DataSourceProvider.fireCrawl && ( )} {datasource?.type === DataSourceProvider.jinaReader && ( )} {datasource?.type === DataSourceProvider.waterCrawl && ( )} {isShowVectorSpaceFull && ( diff --git a/web/app/components/rag-pipeline/components/panel/test-run/types.ts b/web/app/components/rag-pipeline/components/panel/test-run/types.ts index 990fbda47f..718e8a9593 100644 --- a/web/app/components/rag-pipeline/components/panel/test-run/types.ts +++ b/web/app/components/rag-pipeline/components/panel/test-run/types.ts @@ -1,5 +1,6 @@ import type { DataSourceProvider } from '@/models/common' import type { DataSourceType } from '@/models/datasets' +import type { RAGPipelineVariables } from '@/models/pipeline' export enum TestRunStep { dataSource = 'dataSource', @@ -15,5 +16,5 @@ export type DataSourceOption = { export type Datasource = { nodeId: string type: DataSourceType | DataSourceProvider - config: any + variables: RAGPipelineVariables } diff --git a/web/models/pipeline.ts b/web/models/pipeline.ts index 7f94ab0ab2..fe501a9216 100644 --- a/web/models/pipeline.ts +++ b/web/models/pipeline.ts @@ -98,12 +98,12 @@ export type PipelineCheckDependenciesResponse = { } export enum PipelineInputVarType { - textInput = 'text-input', + textInput = 'textInput', paragraph = 'paragraph', select = 'select', - number = 'number', + number = 'numberInput', singleFile = 'file', - multiFiles = 'file-list', + multiFiles = 'fileList', checkbox = 'checkbox', } @@ -142,23 +142,4 @@ export type PipelineDatasourceNodeRunRequest = { inputs: Record } -export type PipelineDatasourceNodeRunResponse = { - id: string - inputs: Record - process_data: Record - outputs: Record - status: string - error?: string - elapsed_time: number - execution_metadata: { - total_tokens: number - total_price: number - currency?: string - } - extras: { - icon: string | object - } - created_at: string - created_by: string - finished_at: string -} +export type PipelineDatasourceNodeRunResponse = Record diff --git a/web/service/use-pipeline.ts b/web/service/use-pipeline.ts index 8e9d7e879c..9c33e9c278 100644 --- a/web/service/use-pipeline.ts +++ b/web/service/use-pipeline.ts @@ -9,6 +9,7 @@ import type { ImportPipelineDSLResponse, PipelineCheckDependenciesResponse, PipelineDatasourceNodeRunRequest, + PipelineDatasourceNodeRunResponse, PipelineProcessingParamsRequest, PipelineProcessingParamsResponse, PipelineTemplateByIdResponse, @@ -115,15 +116,18 @@ export const useCheckPipelineDependencies = ( }) } -export const useDatasourceNodeRun = () => { +export const useDatasourceNodeRun = ( + mutationOptions: MutationOptions = {}, +) => { return useMutation({ mutationKey: [NAME_SPACE, 'datasource-node-run'], mutationFn: (request: PipelineDatasourceNodeRunRequest) => { const { pipeline_id, node_id, ...rest } = request - return post(`/rag/pipelines/${pipeline_id}/workflows/published/nodes/${node_id}/run`, { + return post(`/rag/pipelines/${pipeline_id}/workflows/published/nodes/${node_id}/run`, { body: rest, }) }, + ...mutationOptions, }) }