From 87c15062e6c8bb92de89252e9273338c8cb86f16 Mon Sep 17 00:00:00 2001
From: twwu
Date: Tue, 3 Jun 2025 17:42:40 +0800
Subject: [PATCH] feat: enhance document processing with embedding and rule
 detail components

---
 .../documents/create-from-pipeline/index.tsx  |  11 +-
 .../processing/embedding-process/index.tsx    | 247 ++++++++++++++++++
 .../embedding-process/rule-detail.tsx         | 128 +++++++++
 .../create-from-pipeline/processing/index.tsx |  14 +-
 web/models/pipeline.ts                        |  32 ++-
 web/service/use-pipeline.ts                   |   2 +-
 6 files changed, 422 insertions(+), 12 deletions(-)
 create mode 100644 web/app/components/datasets/documents/create-from-pipeline/processing/embedding-process/index.tsx
 create mode 100644 web/app/components/datasets/documents/create-from-pipeline/processing/embedding-process/rule-detail.tsx

diff --git a/web/app/components/datasets/documents/create-from-pipeline/index.tsx b/web/app/components/datasets/documents/create-from-pipeline/index.tsx
index 23952fcebd..8de9a436db 100644
--- a/web/app/components/datasets/documents/create-from-pipeline/index.tsx
+++ b/web/app/components/datasets/documents/create-from-pipeline/index.tsx
@@ -23,7 +23,7 @@ import WebsitePreview from './preview/web-preview'
 import ProcessDocuments from './process-documents'
 import ChunkPreview from './preview/chunk-preview'
 import Processing from './processing'
-import type { PublishedPipelineRunPreviewResponse } from '@/models/pipeline'
+import type { InitialDocumentDetail, PublishedPipelineRunPreviewResponse, PublishedPipelineRunResponse } from '@/models/pipeline'
 import { DatasourceType } from '@/models/pipeline'
 import { TransferMethod } from '@/types/app'
 import { useAddDocumentsSteps, useLocalFile, useNotionsPages, useWebsiteCrawl } from './hooks'
@@ -38,6 +38,8 @@ const CreateFormPipeline = () => {
   const retrievalMethod = useDatasetDetailContextWithSelector(s => s.dataset?.retrieval_model_dict.search_method)
   const [datasource, setDatasource] = useState()
   const [estimateData, setEstimateData] = useState(undefined)
+  const [batchId, setBatchId] = useState('')
+  const [documents, setDocuments] = useState<InitialDocumentDetail[]>([])
   const isPreview = useRef(false)
   const formRef = useRef(null)
@@ -189,7 +191,8 @@ const CreateFormPipeline = () => {
       is_preview: false,
     }, {
       onSuccess: (res) => {
-        console.log('🚀 ~ handleProcess ~ res:', res)
+        setBatchId((res as PublishedPipelineRunResponse).batch || '')
+        setDocuments((res as PublishedPipelineRunResponse).documents || [])
         handleNextStep()
       },
     })
@@ -306,8 +309,8 @@ const CreateFormPipeline = () => {
         currentStep === 3 && (

         )
diff --git a/web/app/components/datasets/documents/create-from-pipeline/processing/embedding-process/index.tsx b/web/app/components/datasets/documents/create-from-pipeline/processing/embedding-process/index.tsx
new file mode 100644
index 0000000000..47572515aa
--- /dev/null
+++ b/web/app/components/datasets/documents/create-from-pipeline/processing/embedding-process/index.tsx
@@ -0,0 +1,247 @@
+import React, { useEffect, useMemo, useRef, useState } from 'react'
+import useSWR from 'swr'
+import { useRouter } from 'next/navigation'
+import { useTranslation } from 'react-i18next'
+import { omit } from 'lodash-es'
+import { ArrowRightIcon } from '@heroicons/react/24/solid'
+import {
+  RiCheckboxCircleFill,
+  RiErrorWarningFill,
+  RiLoader2Fill,
+  RiTerminalBoxLine,
+} from '@remixicon/react'
+import cn from '@/utils/classnames'
+import Button from '@/app/components/base/button'
+import type { IndexingStatusResponse } from '@/models/datasets'
+import { fetchIndexingStatusBatch as doFetchIndexingStatus, fetchProcessRule } from '@/service/datasets'
+import NotionIcon from '@/app/components/base/notion-icon'
+import PriorityLabel from '@/app/components/billing/priority-label'
+import { Plan } from '@/app/components/billing/type'
+import { ZapFast } from '@/app/components/base/icons/src/vender/solid/general'
+import UpgradeBtn from '@/app/components/billing/upgrade-btn'
+import { useProviderContext } from '@/context/provider-context'
+import { sleep } from '@/utils'
+import Tooltip from '@/app/components/base/tooltip'
+import { useInvalidDocumentList } from '@/service/knowledge/use-document'
+import DocumentFileIcon from '@/app/components/datasets/common/document-file-icon'
+import RuleDetail from './rule-detail'
+import type { IndexingType } from '@/app/components/datasets/create/step-two'
+import type { RETRIEVE_METHOD } from '@/types/app'
+import { DatasourceType, type InitialDocumentDetail } from '@/models/pipeline'
+
+type EmbeddingProcessProps = {
+  datasetId: string
+  batchId: string
+  documents?: InitialDocumentDetail[]
+  indexingType?: IndexingType
+  retrievalMethod?: RETRIEVE_METHOD
+}
+
+const EmbeddingProcess = ({
+  datasetId,
+  batchId,
+  documents = [],
+  indexingType,
+  retrievalMethod,
+}: EmbeddingProcessProps) => {
+  const { t } = useTranslation()
+  const { enableBilling, plan } = useProviderContext()
+
+  const firstDocument = documents[0]
+
+  const [indexingStatusBatchDetail, setIndexingStatusDetail] = useState<IndexingStatusResponse[]>([])
+  const fetchIndexingStatus = async () => {
+    const status = await doFetchIndexingStatus({ datasetId, batchId })
+    setIndexingStatusDetail(status.data)
+    return status.data
+  }
+
+  const [isStopQuery, setIsStopQuery] = useState(false)
+  const isStopQueryRef = useRef(isStopQuery)
+  useEffect(() => {
+    isStopQueryRef.current = isStopQuery
+  }, [isStopQuery])
+  const stopQueryStatus = () => {
+    setIsStopQuery(true)
+  }
+
+  const startQueryStatus = async () => {
+    if (isStopQueryRef.current)
+      return
+
+    try {
+      const indexingStatusBatchDetail = await fetchIndexingStatus()
+      const isCompleted = indexingStatusBatchDetail.every(indexingStatusDetail => ['completed', 'error', 'paused'].includes(indexingStatusDetail.indexing_status))
+      if (isCompleted) {
+        stopQueryStatus()
+        return
+      }
+      await sleep(2500)
+      await startQueryStatus()
+    }
+    catch {
+      await sleep(2500)
+      await startQueryStatus()
+    }
+  }
+
+  useEffect(() => {
+    setIsStopQuery(false)
+    startQueryStatus()
+    return () => {
+      stopQueryStatus()
+    }
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [])
+
+  // get rule
+  const { data: ruleDetail } = useSWR({
+    action: 'fetchProcessRule',
+    params: { documentId: firstDocument.id },
+  }, apiParams => fetchProcessRule(omit(apiParams, 'action')), {
+    revalidateOnFocus: false,
+  })
+
+  const router = useRouter()
+  const invalidDocumentList = useInvalidDocumentList()
+  const navToDocumentList = () => {
+    invalidDocumentList()
+    router.push(`/datasets/${datasetId}/documents`)
+  }
+  const navToApiDocs = () => {
+    router.push('/datasets?category=api')
+  }
+
+  const isEmbedding = useMemo(() => {
+    return indexingStatusBatchDetail.some(indexingStatusDetail => ['indexing', 'splitting', 'parsing', 'cleaning'].includes(indexingStatusDetail?.indexing_status || ''))
+  }, [indexingStatusBatchDetail])
+  const isEmbeddingCompleted = useMemo(() => {
+    return indexingStatusBatchDetail.every(indexingStatusDetail => ['completed', 'error', 'paused'].includes(indexingStatusDetail?.indexing_status || ''))
+  }, [indexingStatusBatchDetail])
+
+  const getSourceName = (id: string) => {
+    const doc = documents.find(document => document.id === id)
+    return doc?.name
+  }
+  const getFileType = (name?: string) => name?.split('.').pop() || 'txt'
+  const getSourcePercent = (detail: IndexingStatusResponse) => {
+    const completedCount = detail.completed_segments || 0
+    const totalCount = detail.total_segments || 0
+    if (totalCount === 0)
+      return 0
+    const percent = Math.round(completedCount * 100 / totalCount)
+    return percent > 100 ? 100 : percent
+  }
+  const getSourceType = (id: string) => {
+    const doc = documents.find(document => document.id === id)
+    return doc?.data_source_type
+  }
+
+  const getIcon = (id: string) => {
+    const doc = documents.find(document => document.id === id)
+
+    return doc?.data_source_info.notion_page_icon
+  }
+  const isSourceEmbedding = (detail: IndexingStatusResponse) =>
+    ['indexing', 'splitting', 'parsing', 'cleaning', 'waiting'].includes(detail.indexing_status || '')
+
+  return (
+    <>
+
+ {isEmbedding &&
+ + {t('datasetDocuments.embedding.processing')} +
} + {isEmbeddingCompleted && t('datasetDocuments.embedding.completed')} +
+
+ { + enableBilling && plan.type !== Plan.team && ( +
+
+ +
+
+ {t('billing.plansCommon.documentProcessingPriorityUpgrade')} +
+ +
+ ) + } +
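+        {/* One row per document in the batch: source icon, name, progress percentage and indexing status */}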
+ {indexingStatusBatchDetail.map(indexingStatusDetail => ( +
+ {isSourceEmbedding(indexingStatusDetail) && ( +
+ )} +
+ {getSourceType(indexingStatusDetail.id) === DatasourceType.localFile && ( + + )} + {getSourceType(indexingStatusDetail.id) === DatasourceType.onlineDocument && ( + + )} +
+
+ {getSourceName(indexingStatusDetail.id)} +
+ { + enableBilling && ( + + ) + } +
+ {isSourceEmbedding(indexingStatusDetail) && ( +
{`${getSourcePercent(indexingStatusDetail)}%`}
+ )} + {indexingStatusDetail.indexing_status === 'error' && ( + + + + + + )} + {indexingStatusDetail.indexing_status === 'completed' && ( + + )} +
+
+ ))} +
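+      {/* Summary of the processing rule (RuleDetail) and navigation actions (document list, API docs) follow */}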
+
+ +
+ + +
+ + ) +} + +export default EmbeddingProcess diff --git a/web/app/components/datasets/documents/create-from-pipeline/processing/embedding-process/rule-detail.tsx b/web/app/components/datasets/documents/create-from-pipeline/processing/embedding-process/rule-detail.tsx new file mode 100644 index 0000000000..c14ea0abb5 --- /dev/null +++ b/web/app/components/datasets/documents/create-from-pipeline/processing/embedding-process/rule-detail.tsx @@ -0,0 +1,128 @@ +import React, { useCallback } from 'react' +import { IndexingType } from '@/app/components/datasets/create/step-two' +import { ProcessMode, type ProcessRuleResponse } from '@/models/datasets' +import { RETRIEVE_METHOD } from '@/types/app' +import { useTranslation } from 'react-i18next' +import { FieldInfo } from '@/app/components/datasets/documents/detail/metadata' +import Image from 'next/image' +import { indexMethodIcon, retrievalIcon } from '@/app/components/datasets/create/icons' + +type RuleDetailProps = { + sourceData?: ProcessRuleResponse + indexingType?: IndexingType + retrievalMethod?: RETRIEVE_METHOD +} + +const RuleDetail = ({ + sourceData, + indexingType, + retrievalMethod, +}: RuleDetailProps) => { + const { t } = useTranslation() + + const segmentationRuleMap = { + mode: t('datasetDocuments.embedding.mode'), + segmentLength: t('datasetDocuments.embedding.segmentLength'), + textCleaning: t('datasetDocuments.embedding.textCleaning'), + } + + const getRuleName = useCallback((key: string) => { + if (key === 'remove_extra_spaces') + return t('datasetCreation.stepTwo.removeExtraSpaces') + + if (key === 'remove_urls_emails') + return t('datasetCreation.stepTwo.removeUrlEmails') + + if (key === 'remove_stopwords') + return t('datasetCreation.stepTwo.removeStopwords') + }, [t]) + + const isNumber = useCallback((value: unknown) => { + return typeof value === 'number' + }, []) + + const getValue = useCallback((field: string) => { + let value: string | number | undefined = '-' + const maxTokens = isNumber(sourceData?.rules?.segmentation?.max_tokens) + ? sourceData.rules.segmentation.max_tokens + : value + const childMaxTokens = isNumber(sourceData?.rules?.subchunk_segmentation?.max_tokens) + ? sourceData.rules.subchunk_segmentation.max_tokens + : value + switch (field) { + case 'mode': + value = !sourceData?.mode + ? value + // eslint-disable-next-line sonarjs/no-nested-conditional + : sourceData.mode === ProcessMode.general + ? (t('datasetDocuments.embedding.custom') as string) + // eslint-disable-next-line sonarjs/no-nested-conditional + : `${t('datasetDocuments.embedding.hierarchical')} ยท ${sourceData?.rules?.parent_mode === 'paragraph' + ? t('dataset.parentMode.paragraph') + : t('dataset.parentMode.fullDoc')}` + break + case 'segmentLength': + value = !sourceData?.mode + ? value + // eslint-disable-next-line sonarjs/no-nested-conditional + : sourceData.mode === ProcessMode.general + ? maxTokens + : `${t('datasetDocuments.embedding.parentMaxTokens')} ${maxTokens}; ${t('datasetDocuments.embedding.childMaxTokens')} ${childMaxTokens}` + break + default: + value = !sourceData?.mode + ? value + : sourceData?.rules?.pre_processing_rules?.filter(rule => + rule.enabled).map(rule => getRuleName(rule.id)).join(',') + break + } + return value + }, [getRuleName, isNumber, sourceData, t]) + + return ( +
+ {Object.keys(segmentationRuleMap).map((field) => { + return + })} + + } + /> + + } + /> +
+  )
+}
+
+export default React.memo(RuleDetail)
diff --git a/web/app/components/datasets/documents/create-from-pipeline/processing/index.tsx b/web/app/components/datasets/documents/create-from-pipeline/processing/index.tsx
index 8095122339..d3b95587f4 100644
--- a/web/app/components/datasets/documents/create-from-pipeline/processing/index.tsx
+++ b/web/app/components/datasets/documents/create-from-pipeline/processing/index.tsx
@@ -2,14 +2,16 @@ import React from 'react'
 import { useTranslation } from 'react-i18next'
 import { RiBookOpenLine } from '@remixicon/react'
-import type { FullDocumentDetail, InitialDocumentDetail } from '@/models/datasets'
-import EmbeddingProcess from '../../../create/embedding-process'
 import { useGetDocLanguage } from '@/context/i18n'
+import EmbeddingProcess from './embedding-process'
+import type { IndexingType } from '../../../create/step-two'
+import type { RETRIEVE_METHOD } from '@/types/app'
+import type { InitialDocumentDetail } from '@/models/pipeline'
 
 type ProcessingProps = {
   datasetId: string
-  indexingType: string
-  retrievalMethod: string
+  indexingType: IndexingType
+  retrievalMethod: RETRIEVE_METHOD
   batchId: string
   documents: InitialDocumentDetail[]
 }
@@ -30,8 +32,8 @@ const Processing = ({
diff --git a/web/models/pipeline.ts b/web/models/pipeline.ts
index 5d08942055..fb1f29438b 100644
--- a/web/models/pipeline.ts
+++ b/web/models/pipeline.ts
@@ -1,6 +1,6 @@
 import type { Edge, EnvironmentVariable, Node, SupportUploadFileTypes } from '@/app/components/workflow/types'
 import type { DSLImportMode, DSLImportStatus } from './app'
-import type { ChunkingMode, DatasetPermission, FileIndexingEstimateResponse, IconInfo } from './datasets'
+import type { ChunkingMode, DatasetPermission, DocumentIndexingStatus, FileIndexingEstimateResponse, IconInfo } from './datasets'
 import type { Dependency } from '@/app/components/plugins/types'
 import type { AppIconSelection } from '@/app/components/base/app-icon-picker'
 import type { Viewport } from 'reactflow'
@@ -187,10 +187,40 @@ export type PublishedPipelineRunRequest = {
 }
 
 export type PublishedPipelineRunPreviewResponse = {
+  task_id: string
+  workflow_run_id: string
   data: {
+    id: string
+    status: string
+    created_at: number
+    elapsed_time: number
+    error: string
+    finished_at: number
     outputs: FileIndexingEstimateResponse
+    total_steps: number
+    total_tokens: number
+    workflow_id: string
   }
 }
 
 export type PublishedPipelineRunResponse = {
+  batch: string
+  dataset: {
+    chunk_structure: ChunkingMode
+    description: string
+    id: string
+    name: string
+  }
+  documents: InitialDocumentDetail[]
+}
+
+export type InitialDocumentDetail = {
+  data_source_info: Record<string, any>
+  data_source_type: DatasourceType
+  enable: boolean
+  error: string
+  id: string
+  indexing_status: DocumentIndexingStatus
+  name: string
+  position: number
 }
diff --git a/web/service/use-pipeline.ts b/web/service/use-pipeline.ts
index 5ef35d30ed..8a56b52ba9 100644
--- a/web/service/use-pipeline.ts
+++ b/web/service/use-pipeline.ts
@@ -195,7 +195,7 @@ export const useRunPublishedPipeline = (
     mutationKey: [NAME_SPACE, 'run-published-pipeline'],
     mutationFn: (request: PublishedPipelineRunRequest) => {
       const { pipeline_id: pipelineId, is_preview, ...rest } = request
-      return post<PublishedPipelineRunPreviewResponse>(`/rag/pipelines/${pipelineId}/workflows/published/run`, {
+      return post<PublishedPipelineRunPreviewResponse | PublishedPipelineRunResponse>(`/rag/pipelines/${pipelineId}/workflows/published/run`, {
         body: {
           ...rest,
           is_preview,
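
Usage note (not part of the patch): a minimal sketch of how the non-preview run response is expected to feed the new Processing step, based on the types and props introduced above. The Processing import path and its prop names come from the diffs; RunResult, RunResultProps and the runResult prop are illustrative names only.

import Processing from '@/app/components/datasets/documents/create-from-pipeline/processing'
import type { PublishedPipelineRunResponse } from '@/models/pipeline'
import type { IndexingType } from '@/app/components/datasets/create/step-two'
import type { RETRIEVE_METHOD } from '@/types/app'

type RunResultProps = {
  datasetId: string
  indexingType: IndexingType
  retrievalMethod: RETRIEVE_METHOD
  // Response of useRunPublishedPipeline when is_preview is false.
  runResult: PublishedPipelineRunResponse
}

// Illustrative wrapper only: it pulls the batch id and the initial documents out of the
// published-pipeline run response and hands them to Processing, which renders
// EmbeddingProcess and polls the indexing status of that batch.
const RunResult = ({ datasetId, indexingType, retrievalMethod, runResult }: RunResultProps) => {
  return (
    <Processing
      datasetId={datasetId}
      batchId={runResult.batch}
      documents={runResult.documents}
      indexingType={indexingType}
      retrievalMethod={retrievalMethod}
    />
  )
}

export default RunResult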