// NOTE(review): this copy of the file looks like an extraction of a .tsx source in which
// JSX elements and generic type arguments were dropped (e.g. `FC = (` and `useState([])`
// below, and `return (` bodies that contain only `{...}` expressions), and in which many
// statements share one physical line. The comments added here describe only the logic
// that is still visible; confirm against the original file before relying on them.
import type { FC } from 'react' import type { DataSourceInfo, FullDocumentDetail, IndexingStatusResponse, LegacyDataSourceInfo, ProcessRuleResponse, } from '@/models/datasets' import { RiArrowRightLine, RiCheckboxCircleFill, RiErrorWarningFill, RiLoader2Fill, RiTerminalBoxLine, } from '@remixicon/react' import Image from 'next/image' import Link from 'next/link' import { useRouter } from 'next/navigation' import * as React from 'react' import { useCallback, useEffect, useMemo, useRef, useState } from 'react' import { useTranslation } from 'react-i18next' import Button from '@/app/components/base/button' import Divider from '@/app/components/base/divider' import { ZapFast } from '@/app/components/base/icons/src/vender/solid/general' import NotionIcon from '@/app/components/base/notion-icon' import Tooltip from '@/app/components/base/tooltip' import PriorityLabel from '@/app/components/billing/priority-label' import { Plan } from '@/app/components/billing/type' import UpgradeBtn from '@/app/components/billing/upgrade-btn' import { FieldInfo } from '@/app/components/datasets/documents/detail/metadata' import { useProviderContext } from '@/context/provider-context' import { useDatasetApiAccessUrl } from '@/hooks/use-api-access-url' import { DataSourceType, ProcessMode } from '@/models/datasets' import { fetchIndexingStatusBatch as doFetchIndexingStatus } from '@/service/datasets' import { useProcessRule } from '@/service/knowledge/use-dataset' import { useInvalidDocumentList } from '@/service/knowledge/use-document' import { RETRIEVE_METHOD } from '@/types/app' import { sleep } from '@/utils' import { cn } from '@/utils/classnames' import DocumentFileIcon from '../../common/document-file-icon' import { indexMethodIcon, retrievalIcon } from '../icons' import { IndexingType } from '../step-two'
// Shape of the component props: the dataset and batch to poll, plus optional document
// list, indexing type and retrieval method.
// Presumably the props of EmbeddingProcess (its destructured parameters use exactly
// these field names) — the type reference itself is not visible in this copy.
type Props = { datasetId: string batchId: string documents?: FullDocumentDetail[] indexingType?: string retrievalMethod?: RETRIEVE_METHOD }
// RuleDetail: component that receives a process-rule response (`sourceData`) together
// with `indexingType` and `retrievalMethod`, and derives display strings for the
// segmentation rule fields. Its render markup is not visible in this copy, so how
// `indexingType` / `retrievalMethod` are displayed cannot be stated from here.
const RuleDetail: FC<{ sourceData?: 
ProcessRuleResponse indexingType?: string retrievalMethod?: RETRIEVE_METHOD }> = ({ sourceData, indexingType, retrievalMethod }) => { const { t } = useTranslation()
// Translated labels keyed by field name; the keys of this object are what the render
// section iterates over and passes to getValue.
const segmentationRuleMap = { mode: t('embedding.mode', { ns: 'datasetDocuments' }), segmentLength: t('embedding.segmentLength', { ns: 'datasetDocuments' }), textCleaning: t('embedding.textCleaning', { ns: 'datasetDocuments' }), }
// Maps a pre-processing rule id to its translated label. Only three ids are handled;
// any other id falls through and yields undefined.
const getRuleName = (key: string) => { if (key === 'remove_extra_spaces') return t('stepTwo.removeExtraSpaces', { ns: 'datasetCreation' }) if (key === 'remove_urls_emails') return t('stepTwo.removeUrlEmails', { ns: 'datasetCreation' }) if (key === 'remove_stopwords') return t('stepTwo.removeStopwords', { ns: 'datasetCreation' }) }
// True only for values whose typeof is 'number'.
const isNumber = (value: unknown) => { return typeof value === 'number' }
// Returns the display value for one field of segmentationRuleMap.
// '-' is the placeholder used whenever `sourceData.mode` is missing, and also the
// fallback for either max_tokens value when it is not a number.
// NOTE(review): the dependency list of this useCallback is `[sourceData]` only, while
// the body also uses `t` and `getRuleName` — confirm this is intentional.
const getValue = useCallback((field: string) => { let value: string | number | undefined = '-' const maxTokens = isNumber(sourceData?.rules?.segmentation?.max_tokens) ? sourceData.rules.segmentation.max_tokens : value const childMaxTokens = isNumber(sourceData?.rules?.subchunk_segmentation?.max_tokens) ? sourceData.rules.subchunk_segmentation.max_tokens : value switch (field) {
// 'mode': the "custom" label for ProcessMode.general; otherwise the "hierarchical"
// label followed by a separator and the parent-mode label (paragraph vs. full doc).
// NOTE(review): the separator characters in the template literal below look like a
// mis-decoded middle dot — confirm against the original file encoding.
case 'mode': value = !sourceData?.mode ? value : sourceData.mode === ProcessMode.general ? (t('embedding.custom', { ns: 'datasetDocuments' }) as string) : `${t('embedding.hierarchical', { ns: 'datasetDocuments' })} ยท ${sourceData?.rules?.parent_mode === 'paragraph' ? t('parentMode.paragraph', { ns: 'dataset' }) : t('parentMode.fullDoc', { ns: 'dataset' })}` break
// 'segmentLength': the segmentation max_tokens for ProcessMode.general; otherwise a
// string combining the parent and child max-token values with their labels.
case 'segmentLength': value = !sourceData?.mode ? value : sourceData.mode === ProcessMode.general ? maxTokens : `${t('embedding.parentMaxTokens', { ns: 'datasetDocuments' })} ${maxTokens}; ${t('embedding.childMaxTokens', { ns: 'datasetDocuments' })} ${childMaxTokens}` break
// Any other field (in practice 'textCleaning', the remaining key of the map): the
// comma-joined labels of the enabled pre-processing rules. Because of the optional
// chaining this is undefined when `pre_processing_rules` is absent.
default: value = !sourceData?.mode ? 
value : sourceData?.rules?.pre_processing_rules?.filter(rule => rule.enabled).map(rule => getRuleName(rule.id)).join(',') break } return value }, [sourceData]) return (
{Object.keys(segmentationRuleMap).map((field) => { return ( ) })} )} /> )} />
) }
// EmbeddingProcess: polls the indexing status of a document batch and exposes helpers
// used by the (not visible here) render markup to show per-document progress.
// `documents` defaults to an empty array when the prop is omitted.
const EmbeddingProcess: FC = ({ datasetId, batchId, documents = [], indexingType, retrievalMethod }) => { const { t } = useTranslation() const { enableBilling, plan } = useProviderContext()
// Despite the "get" prefix this is a value, not a function: the first document of the
// batch, or undefined when `documents` is empty.
const getFirstDocument = documents[0]
// Latest status list returned by the batch status request; starts empty.
const [indexingStatusBatchDetail, setIndexingStatusDetail] = useState([])
// Requests the batch status for (datasetId, batchId), stores `status.data` in state and
// also returns it so the poll loop can inspect the fresh value without waiting for a
// re-render.
const fetchIndexingStatus = async () => { const status = await doFetchIndexingStatus({ datasetId, batchId }) setIndexingStatusDetail(status.data) return status.data }
// Stop flag kept in state and mirrored into a ref by an effect; the poll loop reads the
// ref, not the state variable.
const [isStopQuery, setIsStopQuery] = useState(false) const isStopQueryRef = useRef(isStopQuery) useEffect(() => { isStopQueryRef.current = isStopQuery }, [isStopQuery])
// Requests that polling stop by setting the state flag. It does not write the ref
// directly; the ref changes only when the effect above runs.
const stopQueryStatus = () => { setIsStopQuery(true) }
// Poll loop: returns immediately if the stop ref is set; otherwise fetches the status,
// stops once every entry is 'completed', 'error' or 'paused' (note that `every` is also
// true for an empty list), and otherwise waits 2500 ms and calls itself again.
// A failed request is not reported: the catch branch waits 2500 ms and retries.
const startQueryStatus = async () => { if (isStopQueryRef.current) return try { const indexingStatusBatchDetail = await fetchIndexingStatus() const isCompleted = indexingStatusBatchDetail.every(indexingStatusDetail => ['completed', 'error', 'paused'].includes(indexingStatusDetail.indexing_status)) if (isCompleted) { stopQueryStatus() return } await sleep(2500) await startQueryStatus() } catch { await sleep(2500) await startQueryStatus() } }
// On mount: clear the stop flag and start polling; the cleanup requests a stop.
// NOTE(review): the cleanup only calls setIsStopQuery(true), and the ref read by the
// poll loop is updated solely by the [isStopQuery] effect. If that effect does not run
// after unmount, an in-flight loop would never observe the stop request — confirm, and
// consider setting isStopQueryRef.current directly.
useEffect(() => { setIsStopQuery(false) startQueryStatus() return () => { stopQueryStatus() } }, [])
// NOTE(review): in this single-line layout everything after the `// get rule` marker
// below sits on the same physical line as the marker; the line is left untouched.
// get rule const { data: ruleDetail } = useProcessRule(getFirstDocument?.id) const router = useRouter() const invalidDocumentList = useInvalidDocumentList() const navToDocumentList = () => { invalidDocumentList() router.push(`/datasets/${datasetId}/documents`) } const apiReferenceUrl = useDatasetApiAccessUrl() const isEmbedding = useMemo(() => { return indexingStatusBatchDetail.some(indexingStatusDetail => ['indexing', 'splitting', 'parsing', 'cleaning'].includes(indexingStatusDetail?.indexing_status || '')) }, [indexingStatusBatchDetail]) const isEmbeddingCompleted = useMemo(() => { return indexingStatusBatchDetail.every(indexingStatusDetail => ['completed', 'error', 'paused'].includes(indexingStatusDetail?.indexing_status || '')) }, 
[indexingStatusBatchDetail])
// Name of the document with the given id, or undefined when no such document is in
// `documents`.
const getSourceName = (id: string) => { const doc = documents.find(document => document.id === id) return doc?.name }
// Text after the last '.' of the name; 'txt' when the name is missing or that text is
// empty. A name without any '.' is returned whole, since split yields a single element.
const getFileType = (name?: string) => name?.split('.').pop() || 'txt'
// Completed segments as a rounded percentage of total segments: 0 when the total is
// missing or zero, and never more than 100.
const getSourcePercent = (detail: IndexingStatusResponse) => { const completedCount = detail.completed_segments || 0 const totalCount = detail.total_segments || 0 if (totalCount === 0) return 0 const percent = Math.round(completedCount * 100 / totalCount) return percent > 100 ? 100 : percent }
// data_source_type of the document with the given id.
// NOTE(review): the `as DataSourceType` cast hides that the value is undefined when the
// id is not found in `documents`.
const getSourceType = (id: string) => { const doc = documents.find(document => document.id === id) return doc?.data_source_type as DataSourceType }
// Type guard: treats `info` as LegacyDataSourceInfo when it is non-null and its
// `upload_file` has typeof 'object' (which also holds when `upload_file` is null).
const isLegacyDataSourceInfo = (info: DataSourceInfo): info is LegacyDataSourceInfo => { return info != null && typeof (info as LegacyDataSourceInfo).upload_file === 'object' }
// notion_page_icon of the document with the given id when its data_source_info passes
// the legacy guard; undefined otherwise (including when the document is not found).
const getIcon = (id: string) => { const doc = documents.find(document => document.id === id) const info = doc?.data_source_info if (info && isLegacyDataSourceInfo(info)) return info.notion_page_icon return undefined }
// True while a single document is still in progress. Unlike the batch-level
// `isEmbedding` check, this list also counts 'waiting'.
const isSourceEmbedding = (detail: IndexingStatusResponse) => ['indexing', 'splitting', 'parsing', 'cleaning', 'waiting'].includes(detail.indexing_status || '')
return ( <>
{isEmbedding && ( <> {t('embedding.processing', { ns: 'datasetDocuments' })} )} {isEmbeddingCompleted && t('embedding.completed', { ns: 'datasetDocuments' })}
{ enableBilling && plan.type !== Plan.team && (
{t('plansCommon.documentProcessingPriorityUpgrade', { ns: 'billing' })}
) }
{indexingStatusBatchDetail.map(indexingStatusDetail => (
{isSourceEmbedding(indexingStatusDetail) && (
)}
{getSourceType(indexingStatusDetail.id) === DataSourceType.FILE && ( )} {getSourceType(indexingStatusDetail.id) === DataSourceType.NOTION && ( )}
{getSourceName(indexingStatusDetail.id)}
{ enableBilling && ( ) }
{isSourceEmbedding(indexingStatusDetail) && (
{`${getSourcePercent(indexingStatusDetail)}%`}
)} {indexingStatusDetail.indexing_status === 'error' && ( )} {indexingStatusDetail.indexing_status === 'completed' && ( )}
))}
) } export default EmbeddingProcess