diff --git a/web/app/components/evaluation/__tests__/index.spec.tsx b/web/app/components/evaluation/__tests__/index.spec.tsx
new file mode 100644
index 0000000000..55edd6ceb2
--- /dev/null
+++ b/web/app/components/evaluation/__tests__/index.spec.tsx
@@ -0,0 +1,112 @@
+import { act, fireEvent, render, screen } from '@testing-library/react'
+import Evaluation from '..'
+import { getEvaluationMockConfig } from '../mock'
+import { useEvaluationStore } from '../store'
+// Stub useModelList so the component under test sees one provider ('openai') exposing one model ('gpt-4o-mini').
+vi.mock('@/app/components/header/account-setting/model-provider-page/hooks', () => ({
+ useModelList: () => ({
+ data: [{
+ provider: 'openai',
+ models: [{ model: 'gpt-4o-mini' }],
+ }],
+ }),
+}))
+// Replace ModelSelector with a stub that prints `provider:model` from defaultModel, or 'empty' when defaultModel is unset.
+vi.mock('@/app/components/header/account-setting/model-provider-page/model-selector', () => ({
+ default: ({ defaultModel }: { defaultModel?: { provider: string, model: string } }) => (
+
+ {defaultModel ? `${defaultModel.provider}:${defaultModel.model}` : 'empty'}
+
+ ),
+}))
+// Component-level tests for Evaluation; the store's `resources` map is reset to {} before each test.
+describe('Evaluation', () => {
+ beforeEach(() => {
+ useEvaluationStore.setState({ resources: {} })
+ })
+
+ it('should search, add metrics, and create a batch history record', async () => {
+ vi.useFakeTimers()
+ // NOTE(review): vi.useRealTimers() is only reached at the end of this body; a failing assertion skips it — consider an afterEach.
+ render(
)
+
+ expect(screen.getByTestId('evaluation-model-selector')).toHaveTextContent('openai:gpt-4o-mini')
+
+ fireEvent.click(screen.getByRole('button', { name: 'evaluation.metrics.add' }))
+ expect(screen.getByTestId('evaluation-metric-loading')).toBeInTheDocument()
+ // 200 ms exceeds the 180 ms timeout set in MetricSelector's triggerLoading, so the loading state has ended afterwards.
+ await act(async () => {
+ vi.advanceTimersByTime(200)
+ })
+
+ fireEvent.change(screen.getByPlaceholderText('evaluation.metrics.searchPlaceholder'), {
+ target: { value: 'does-not-exist' },
+ })
+
+ await act(async () => {
+ vi.advanceTimersByTime(200)
+ })
+
+ expect(screen.getByText('evaluation.metrics.noResults')).toBeInTheDocument()
+
+ fireEvent.change(screen.getByPlaceholderText('evaluation.metrics.searchPlaceholder'), {
+ target: { value: 'faith' },
+ })
+
+ await act(async () => {
+ vi.advanceTimersByTime(200)
+ })
+
+ fireEvent.click(screen.getByRole('button', { name: /Faithfulness/i }))
+ expect(screen.getAllByText('Faithfulness').length).toBeGreaterThan(0)
+
+ fireEvent.click(screen.getByRole('button', { name: 'evaluation.batch.run' }))
+ expect(screen.getByText('evaluation.batch.status.running')).toBeInTheDocument()
+ // presumably 1300 ms outlasts the store's simulated batch run; that duration is not visible here — TODO confirm.
+ await act(async () => {
+ vi.advanceTimersByTime(1300)
+ })
+
+ expect(screen.getByText('evaluation.batch.status.success')).toBeInTheDocument()
+ expect(screen.getByText('Workflow evaluation batch')).toBeInTheDocument()
+
+ vi.useRealTimers()
+ })
+
+ it('should render time placeholders and hide the value row for empty operators', () => {
+ const resourceType = 'workflow'
+ const resourceId = 'app-2'
+ const store = useEvaluationStore.getState()
+ const config = getEvaluationMockConfig(resourceType)
+
+ const timeField = config.fieldOptions.find(field => field.type === 'time')!
+ let groupId = ''
+ let itemId = ''
+
+ act(() => {
+ store.ensureResource(resourceType, resourceId)
+ store.setJudgeModel(resourceType, resourceId, 'openai::gpt-4o-mini')
+ // Ids are read from a fresh getState() call rather than from `store`, which was captured before ensureResource ran.
+ const group = useEvaluationStore.getState().resources['workflow:app-2'].conditions[0]
+ groupId = group.id
+ itemId = group.items[0].id
+
+ store.updateConditionField(resourceType, resourceId, groupId, itemId, timeField.id)
+ store.updateConditionOperator(resourceType, resourceId, groupId, itemId, 'before')
+ })
+
+ let rerender: ReturnType
['rerender']
+ act(() => {
+ ({ rerender } = render())
+ })
+
+ expect(screen.getByText('evaluation.conditions.selectTime')).toBeInTheDocument()
+ // Switching to 'is_empty' is expected to remove the time placeholder from the document.
+ act(() => {
+ store.updateConditionOperator(resourceType, resourceId, groupId, itemId, 'is_empty')
+ rerender()
+ })
+
+ expect(screen.queryByText('evaluation.conditions.selectTime')).not.toBeInTheDocument()
+ })
+})
diff --git a/web/app/components/evaluation/__tests__/store.spec.ts b/web/app/components/evaluation/__tests__/store.spec.ts
new file mode 100644
index 0000000000..d37952be61
--- /dev/null
+++ b/web/app/components/evaluation/__tests__/store.spec.ts
@@ -0,0 +1,96 @@
+import { getEvaluationMockConfig } from '../mock'
+import {
+ getAllowedOperators,
+ isCustomMetricConfigured,
+ requiresConditionValue,
+ useEvaluationStore,
+} from '../store'
+// Unit tests for the evaluation store actions and helper predicates; `resources` is reset to {} before each test.
+describe('evaluation store', () => {
+ beforeEach(() => {
+ useEvaluationStore.setState({ resources: {} })
+ })
+
+ it('should configure a custom metric mapping to a valid state', () => {
+ const resourceType = 'workflow'
+ const resourceId = 'app-1'
+ const store = useEvaluationStore.getState()
+ const config = getEvaluationMockConfig(resourceType)
+
+ store.ensureResource(resourceType, resourceId)
+ store.addCustomMetric(resourceType, resourceId)
+ // A freshly added custom metric is expected to be reported as not configured.
+ const initialMetric = useEvaluationStore.getState().resources['workflow:app-1'].metrics.find(metric => metric.kind === 'custom-workflow')
+ expect(initialMetric).toBeDefined()
+ expect(isCustomMetricConfigured(initialMetric!)).toBe(false)
+ // Choosing a workflow and filling the first mapping's source and target is expected to flip it to configured.
+ store.setCustomMetricWorkflow(resourceType, resourceId, initialMetric!.id, config.workflowOptions[0].id)
+ store.updateCustomMetricMapping(resourceType, resourceId, initialMetric!.id, initialMetric!.customConfig!.mappings[0].id, {
+ sourceFieldId: config.fieldOptions[0].id,
+ targetVariableId: config.workflowOptions[0].targetVariables[0].id,
+ })
+
+ const configuredMetric = useEvaluationStore.getState().resources['workflow:app-1'].metrics.find(metric => metric.id === initialMetric!.id)
+ expect(isCustomMetricConfigured(configuredMetric!)).toBe(true)
+ })
+
+ it('should add and remove builtin metrics', () => {
+ const resourceType = 'workflow'
+ const resourceId = 'app-2'
+ const store = useEvaluationStore.getState()
+ const config = getEvaluationMockConfig(resourceType)
+
+ store.ensureResource(resourceType, resourceId)
+ store.addBuiltinMetric(resourceType, resourceId, config.builtinMetrics[1].id)
+ // The added metric is located through its optionId, then removed through its own id.
+ const addedMetric = useEvaluationStore.getState().resources['workflow:app-2'].metrics.find(metric => metric.optionId === config.builtinMetrics[1].id)
+ expect(addedMetric).toBeDefined()
+
+ store.removeMetric(resourceType, resourceId, addedMetric!.id)
+
+ expect(useEvaluationStore.getState().resources['workflow:app-2'].metrics.some(metric => metric.id === addedMetric!.id)).toBe(false)
+ })
+
+ it('should update condition groups and adapt operators to field types', () => {
+ const resourceType = 'pipeline'
+ const resourceId = 'dataset-1'
+ const store = useEvaluationStore.getState()
+ const config = getEvaluationMockConfig(resourceType)
+
+ store.ensureResource(resourceType, resourceId)
+
+ const initialGroup = useEvaluationStore.getState().resources['pipeline:dataset-1'].conditions[0]
+ store.setConditionGroupOperator(resourceType, resourceId, initialGroup.id, 'or')
+ store.addConditionGroup(resourceType, resourceId)
+
+ const booleanField = config.fieldOptions.find(field => field.type === 'boolean')!
+ const currentItem = useEvaluationStore.getState().resources['pipeline:dataset-1'].conditions[0].items[0]
+ store.updateConditionField(resourceType, resourceId, initialGroup.id, currentItem.id, booleanField.id)
+ // Expect the 'or' group operator to persist and the item's operator to become 'is' after switching to a boolean field.
+ const updatedGroup = useEvaluationStore.getState().resources['pipeline:dataset-1'].conditions[0]
+ expect(updatedGroup.logicalOperator).toBe('or')
+ expect(updatedGroup.items[0].operator).toBe('is')
+ expect(getAllowedOperators(resourceType, booleanField.id)).toEqual(['is', 'is_not'])
+ })
+
+ it('should support time fields and clear values for empty operators', () => {
+ const resourceType = 'workflow'
+ const resourceId = 'app-3'
+ const store = useEvaluationStore.getState()
+ const config = getEvaluationMockConfig(resourceType)
+
+ store.ensureResource(resourceType, resourceId)
+
+ const timeField = config.fieldOptions.find(field => field.type === 'time')!
+ const item = useEvaluationStore.getState().resources['workflow:app-3'].conditions[0].items[0]
+
+ store.updateConditionField(resourceType, resourceId, useEvaluationStore.getState().resources['workflow:app-3'].conditions[0].id, item.id, timeField.id)
+ store.updateConditionOperator(resourceType, resourceId, useEvaluationStore.getState().resources['workflow:app-3'].conditions[0].id, item.id, 'is_empty')
+
+ const updatedItem = useEvaluationStore.getState().resources['workflow:app-3'].conditions[0].items[0]
+ // 'is_empty' takes no value, so the stored condition value is expected to be null.
+ expect(getAllowedOperators(resourceType, timeField.id)).toEqual(['is', 'before', 'after', 'is_empty', 'is_not_empty'])
+ expect(requiresConditionValue('is_empty')).toBe(false)
+ expect(updatedItem.value).toBeNull()
+ })
+})
diff --git a/web/app/components/evaluation/index.tsx b/web/app/components/evaluation/index.tsx
new file mode 100644
index 0000000000..798e092eed
--- /dev/null
+++ b/web/app/components/evaluation/index.tsx
@@ -0,0 +1,1017 @@
+'use client'
+
+import type { TFunction } from 'i18next'
+import type { ChangeEvent, ReactNode } from 'react'
+import type {
+ ComparisonOperator,
+ CustomMetricMapping,
+ EvaluationFieldOption,
+ EvaluationMetric,
+ EvaluationResourceType,
+ JudgmentConditionGroup,
+} from './types'
+import {
+ RiAddLine,
+ RiArrowDownSLine,
+ RiCloseLine,
+ RiDeleteBinLine,
+ RiDownloadLine,
+ RiFileUploadLine,
+ RiFlaskLine,
+ RiLoader4Line,
+} from '@remixicon/react'
+import { useEffect, useMemo, useRef, useState } from 'react'
+import { useTranslation } from 'react-i18next'
+import Badge from '@/app/components/base/badge'
+import Button from '@/app/components/base/button'
+import DatePicker from '@/app/components/base/date-and-time-picker/date-picker'
+import dayjs from '@/app/components/base/date-and-time-picker/utils/dayjs'
+import Input from '@/app/components/base/input'
+import Toast from '@/app/components/base/toast'
+import {
+ Popover,
+ PopoverContent,
+ PopoverTrigger,
+} from '@/app/components/base/ui/popover'
+import {
+ Select,
+ SelectContent,
+ SelectGroup,
+ SelectGroupLabel,
+ SelectItem,
+ SelectTrigger,
+ SelectValue,
+} from '@/app/components/base/ui/select'
+import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
+import { useModelList } from '@/app/components/header/account-setting/model-provider-page/hooks'
+import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
+import { cn } from '@/utils/classnames'
+import { getEvaluationMockConfig } from './mock'
+import {
+ getAllowedOperators,
+ isCustomMetricConfigured,
+ isEvaluationRunnable,
+ requiresConditionValue,
+ useEvaluationResource,
+ useEvaluationStore,
+} from './store'
+// Props shared by Evaluation and its sub-panels: the resource type plus the resource id.
+type EvaluationProps = {
+ resourceType: EvaluationResourceType
+ resourceId: string
+}
+// Shared class-name string for tab elements; the place where it is applied is not visible in this view.
+const TAB_CLASS_NAME = 'flex-1 rounded-lg px-3 py-2 text-left system-sm-medium'
+// Joins provider and model into the single "provider::model" string that is stored as the judge model id.
+const encodeModelSelection = (provider: string, model: string) => `${provider}::${model}`
+// Parses a "provider::model" id (the encodeModelSelection format); returns undefined for null, empty, or ids missing either part.
+const decodeModelSelection = (judgeModelId: string | null) => {
+ if (!judgeModelId)
+ return undefined
+ // Split on the FIRST '::' only, so a model name that itself contains '::' is kept whole instead of being truncated.
+ const separatorIndex = judgeModelId.indexOf('::')
+ if (separatorIndex <= 0 || separatorIndex + 2 >= judgeModelId.length)
+ return undefined
+
+ return { provider: judgeModelId.slice(0, separatorIndex), model: judgeModelId.slice(separatorIndex + 2) }
+}
+// Symbolic labels for some comparison operators; getOperatorLabel applies them only when the field type is 'number'.
+const compactOperatorLabels: Partial> = {
+ is: '=',
+ is_not: '!=',
+ greater_than: '>',
+ less_than: '<',
+ greater_or_equal: '>=',
+ less_or_equal: '<=',
+}
+// Buckets field options by their `group` property and returns the buckets as [group, fields] pairs via Object.entries.
+const groupFieldOptions = (fieldOptions: EvaluationFieldOption[]) => {
+ return Object.entries(fieldOptions.reduce>((acc, field) => {
+ acc[field.group] = [...(acc[field.group] ?? []), field]
+ return acc
+ }, {}))
+}
+// Returns an operator's display label: the compact symbol for number fields when one exists, otherwise the translated string.
+const getOperatorLabel = (
+ operator: ComparisonOperator,
+ fieldType: EvaluationFieldOption['type'] | undefined,
+ t: TFunction<'evaluation'>,
+) => {
+ if (fieldType === 'number' && compactOperatorLabels[operator])
+ return compactOperatorLabels[operator] as string
+ // Every other case falls back to the `conditions.operators.<operator>` translation key.
+ return t(`conditions.operators.${operator}` as const)
+}
+// Maps a field type to an icon class name; any type other than number/boolean/enum/time gets 'i-ri-text'.
+const getFieldTypeIconClassName = (fieldType: EvaluationFieldOption['type']) => {
+ if (fieldType === 'number')
+ return 'i-ri-hashtag'
+
+ if (fieldType === 'boolean')
+ return 'i-ri-checkbox-circle-line'
+
+ if (fieldType === 'enum')
+ return 'i-ri-list-check-2'
+
+ if (fieldType === 'time')
+ return 'i-ri-time-line'
+
+ return 'i-ri-text'
+}
+// Shows the selected field's label and type, or the placeholder when no field is given (wrapper markup is missing from this view).
+const ConditionFieldLabel = ({
+ field,
+ placeholder,
+}: {
+ field?: EvaluationFieldOption
+ placeholder: string
+}) => {
+ if (!field)
+ return {placeholder}
+
+ return (
+
+
+
+ {field.label}
+
+
{field.type}
+
+ )
+}
+// Presentational header that outputs a title, a description and an optional action node (wrapper markup is missing from this view).
+const SectionHeader = ({
+ title,
+ description,
+ action,
+}: {
+ title: string
+ description: string
+ action?: ReactNode
+}) => {
+ return (
+
+
+
{title}
+
{description}
+
+ {action}
+
+ )
+}
+// Value editor for one condition: returns null when no field is chosen or the operator takes no value, else branches on field.type.
+const FieldValueInput = ({
+ field,
+ operator,
+ value,
+ onChange,
+}: {
+ field?: EvaluationFieldOption
+ operator: ComparisonOperator
+ value: string | number | boolean | null
+ onChange: (value: string | number | boolean | null) => void
+}) => {
+ const { t } = useTranslation('evaluation')
+
+ if (!field || !requiresConditionValue(operator))
+ return null
+ // Time fields: a non-empty string value is parsed with dayjs; picked dates are reported as ISO strings and clearing reports null.
+ if (field.type === 'time') {
+ const selectedTime = typeof value === 'string' && value ? dayjs(value) : undefined
+
+ return (
+
+ onChange(date ? date.toISOString() : null)}
+ onClear={() => onChange(null)}
+ placeholder={t('conditions.selectTime')}
+ triggerWrapClassName="w-full"
+ popupZIndexClassname="z-[1002]"
+ renderTrigger={({ handleClickTrigger }) => (
+
+ )}
+ />
+
+ )
+ }
+ // Boolean fields (the control markup is not visible in this view).
+ if (field.type === 'boolean') {
+ return (
+
+
+
+ )
+ }
+ // Enum fields (the control markup is not visible in this view).
+ if (field.type === 'enum') {
+ return (
+
+
+
+ )
+ }
+ // NOTE(review): for number fields Number(nextValue) yields NaN on non-numeric text unless the input element restricts it (not visible here).
+ return (
+
+ {
+ if (field.type === 'number') {
+ const nextValue = e.target.value
+ onChange(nextValue === '' ? null : Number(nextValue))
+ return
+ }
+ // Non-number fields report the raw input string.
+ onChange(e.target.value)
+ }}
+ />
+
+ )
+}
+// Field picker for a condition row: takes the current field, the selectable options, a placeholder and a change callback (markup not visible here).
+const ConditionFieldSelect = ({
+ field,
+ fieldOptions,
+ placeholder,
+ onChange,
+}: {
+ field?: EvaluationFieldOption
+ fieldOptions: EvaluationFieldOption[]
+ placeholder: string
+ onChange: (fieldId: string) => void
+}) => {
+ return (
+
+ )
+}
+// Operator picker for a condition row: takes the field, the current operator, the allowed operators and a change callback (markup not visible here).
+const ConditionOperatorSelect = ({
+ field,
+ operator,
+ operators,
+ onChange,
+}: {
+ field?: EvaluationFieldOption
+ operator: ComparisonOperator
+ operators: ComparisonOperator[]
+ onChange: (operator: ComparisonOperator) => void
+}) => {
+ const { t } = useTranslation('evaluation')
+
+ return (
+
+ )
+}
+// Judge-model picker bound to the store; when no judge model is stored it defaults to the first model of the first listed provider.
+const JudgeModelSelector = ({
+ resourceId,
+ resourceType,
+}: EvaluationProps) => {
+ const { data: modelList } = useModelList(ModelTypeEnum.textGeneration)
+ const resource = useEvaluationResource(resourceType, resourceId)
+ const setJudgeModel = useEvaluationStore(state => state.setJudgeModel)
+ const selectedModel = decodeModelSelection(resource.judgeModelId)
+ // Auto-select only while nothing is stored yet and the model list is non-empty.
+ useEffect(() => {
+ if (resource.judgeModelId || !modelList.length)
+ return
+ // NOTE(review): firstProvider.models is read before the !firstProvider guard below; only the length check above protects that access.
+ const firstProvider = modelList[0]
+ const firstModel = firstProvider.models[0]
+ if (!firstProvider || !firstModel)
+ return
+
+ setJudgeModel(resourceType, resourceId, encodeModelSelection(firstProvider.provider, firstModel.model))
+ }, [modelList, resource.judgeModelId, resourceId, resourceType, setJudgeModel])
+
+ return (
+ setJudgeModel(resourceType, resourceId, encodeModelSelection(model.provider, model.model))}
+ showDeprecatedWarnIcon
+ triggerClassName="h-11"
+ />
+ )
+}
+// Metric picker: filters builtin metrics by the search query, groups them, and shows a timer-driven loading state on open and on search.
+const MetricSelector = ({
+ resourceType,
+ resourceId,
+}: EvaluationProps) => {
+ const { t } = useTranslation('evaluation')
+ const config = getEvaluationMockConfig(resourceType)
+ const metricGroupLabels = {
+ quality: t('metrics.groups.quality'),
+ operations: t('metrics.groups.operations'),
+ }
+ const metrics = useEvaluationResource(resourceType, resourceId).metrics
+ const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric)
+ const addCustomMetric = useEvaluationStore(state => state.addCustomMetric)
+ const [open, setOpen] = useState(false)
+ const [query, setQuery] = useState('')
+ const [showAll, setShowAll] = useState(false)
+ const [isLoading, setIsLoading] = useState(false)
+ const loadingTimerRef = useRef(null)
+ // (Re)starts the 180 ms loading indicator, cancelling any pending timer first.
+ const triggerLoading = () => {
+ if (loadingTimerRef.current)
+ window.clearTimeout(loadingTimerRef.current)
+
+ setIsLoading(true)
+ loadingTimerRef.current = window.setTimeout(() => {
+ setIsLoading(false)
+ }, 180)
+ }
+ // Opening starts the loading state; closing cancels the pending timer and clears the loading state immediately.
+ const handleOpenChange = (nextOpen: boolean) => {
+ setOpen(nextOpen)
+
+ if (nextOpen) {
+ triggerLoading()
+ return
+ }
+
+ if (loadingTimerRef.current)
+ window.clearTimeout(loadingTimerRef.current)
+ setIsLoading(false)
+ }
+ // Typing updates the query and re-triggers the loading state while the picker is open.
+ const handleQueryChange = (event: ChangeEvent) => {
+ setQuery(event.target.value)
+ if (open)
+ triggerLoading()
+ }
+ // Cancel any pending loading timer on unmount.
+ useEffect(() => {
+ return () => {
+ if (loadingTimerRef.current)
+ window.clearTimeout(loadingTimerRef.current)
+ }
+ }, [])
+ // Case-insensitive match of the trimmed query against label or description; matches are then grouped by metric.group.
+ const filteredGroups = useMemo(() => {
+ const filteredMetrics = config.builtinMetrics.filter((metric) => {
+ const keyword = query.trim().toLowerCase()
+ if (!keyword)
+ return true
+
+ return metric.label.toLowerCase().includes(keyword) || metric.description.toLowerCase().includes(keyword)
+ })
+
+ const grouped = filteredMetrics.reduce>((acc, metric) => {
+ acc[metric.group] = [...(acc[metric.group] ?? []), metric]
+ return acc
+ }, {})
+
+ return Object.entries(grouped)
+ }, [config.builtinMetrics, query])
+
+ return (
+
+
+
+ {t('metrics.add')}
+
+
+
+
+
+ {isLoading && (
+
+ {['metric-skeleton-1', 'metric-skeleton-2', 'metric-skeleton-3'].map(key => (
+
+ ))}
+
+ )}
+ {!isLoading && filteredGroups.length === 0 && (
+
+ {t('metrics.noResults')}
+
+ )}
+ {!isLoading && filteredGroups.map(([groupName, options]) => {
+ const shownOptions = showAll ? options : options.slice(0, 2)
+ return (
+
+
{metricGroupLabels[groupName as keyof typeof metricGroupLabels] ?? groupName}
+
+ {shownOptions.map(option => (
+
+ ))}
+
+
+ )
+ })}
+
+ {filteredGroups.some(([, options]) => options.length > 2) && (
+
+ )}
+
+
+
+
+
+
+ )
+}
+// Editor for a custom metric's workflow choice and field mappings; returns null when the metric has no customConfig.
+const CustomMetricEditor = ({
+ resourceType,
+ resourceId,
+ metric,
+}: EvaluationProps & { metric: EvaluationMetric }) => {
+ const { t } = useTranslation('evaluation')
+ const config = getEvaluationMockConfig(resourceType)
+ const setCustomMetricWorkflow = useEvaluationStore(state => state.setCustomMetricWorkflow)
+ const addCustomMetricMapping = useEvaluationStore(state => state.addCustomMetricMapping)
+ const updateCustomMetricMapping = useEvaluationStore(state => state.updateCustomMetricMapping)
+ const removeCustomMetricMapping = useEvaluationStore(state => state.removeCustomMetricMapping)
+ const selectedWorkflow = config.workflowOptions.find(option => option.id === metric.customConfig?.workflowId)
+ const isConfigured = isCustomMetricConfigured(metric)
+ // This early return comes after every hook call above.
+ if (!metric.customConfig)
+ return null
+
+ return (
+
+
+
+
{t('metrics.custom.title')}
+
{t('metrics.custom.description')}
+
+ {!isConfigured &&
{t('metrics.custom.warningBadge')}}
+
+
+
+
{t('metrics.custom.workflowLabel')}
+
+ {selectedWorkflow &&
{selectedWorkflow.description}
}
+
+
+
+
{t('metrics.custom.mappingTitle')}
+
+
+
+ {metric.customConfig.mappings.map(mapping => (
+ updateCustomMetricMapping(resourceType, resourceId, metric.id, mapping.id, patch)}
+ onRemove={() => removeCustomMetricMapping(resourceType, resourceId, metric.id, mapping.id)}
+ />
+ ))}
+
+ {!isConfigured && (
+
+ {t('metrics.custom.mappingWarning')}
+
+ )}
+
+
+
+ )
+}
+// One mapping row: receives the mapping, the target options, and update/remove callbacks (row markup is not visible in this view).
+function MappingRow({
+ resourceType,
+ mapping,
+ targetOptions,
+ onUpdate,
+ onRemove,
+}: {
+ resourceType: EvaluationResourceType
+ mapping: CustomMetricMapping
+ targetOptions: Array<{ id: string, label: string }>
+ onUpdate: (patch: { sourceFieldId?: string | null, targetVariableId?: string | null }) => void
+ onRemove: () => void
+}) {
+ const { t } = useTranslation('evaluation')
+ const config = getEvaluationMockConfig(resourceType)
+
+ return (
+
+
+
+
+
+
+
+
+ )
+}
+// One judgment condition group: an and/or choice plus its condition items, with field/operator/value changes sent to store actions.
+const ConditionGroup = ({
+ resourceType,
+ resourceId,
+ group,
+ index,
+}: EvaluationProps & { group: JudgmentConditionGroup, index: number }) => {
+ const { t } = useTranslation('evaluation')
+ const config = getEvaluationMockConfig(resourceType)
+ const logicalLabels = {
+ and: t('conditions.logical.and'),
+ or: t('conditions.logical.or'),
+ }
+ const removeConditionGroup = useEvaluationStore(state => state.removeConditionGroup)
+ const setConditionGroupOperator = useEvaluationStore(state => state.setConditionGroupOperator)
+ const addConditionItem = useEvaluationStore(state => state.addConditionItem)
+ const removeConditionItem = useEvaluationStore(state => state.removeConditionItem)
+ const updateConditionField = useEvaluationStore(state => state.updateConditionField)
+ const updateConditionOperator = useEvaluationStore(state => state.updateConditionOperator)
+ const updateConditionValue = useEvaluationStore(state => state.updateConditionValue)
+
+ return (
+
+
+
+
{t('conditions.groupLabel', { index: index + 1 })}
+
+ {(['and', 'or'] as const).map(operator => (
+
+ ))}
+
+
+
+
+
+
+
+
+ {group.items.map((item) => {
+ const field = config.fieldOptions.find(option => option.id === item.fieldId)
+ const allowedOperators = getAllowedOperators(resourceType, item.fieldId)
+ const showValue = !!field && requiresConditionValue(item.operator)
+ // showValue is true only when a field is resolved and the operator takes a value.
+ return (
+
+
+
+
+ updateConditionField(resourceType, resourceId, group.id, item.id, value)}
+ />
+
+
+
updateConditionOperator(resourceType, resourceId, group.id, item.id, value)}
+ />
+
+ {showValue && (
+
+ updateConditionValue(resourceType, resourceId, group.id, item.id, value)}
+ />
+
+ )}
+
+
+
+
+
+ )
+ })}
+
+
+ )
+}
+// Batch test panel: template download, uploaded file name capture, a run action with a validation warning, and a history tab of batch records.
+const BatchTestPanel = ({
+ resourceType,
+ resourceId,
+}: EvaluationProps) => {
+ const { t } = useTranslation('evaluation')
+ const config = getEvaluationMockConfig(resourceType)
+ const tabLabels = {
+ 'input-fields': t('batch.tabs.input-fields'),
+ 'history': t('batch.tabs.history'),
+ }
+ const statusLabels = {
+ running: t('batch.status.running'),
+ success: t('batch.status.success'),
+ failed: t('batch.status.failed'),
+ }
+ const resource = useEvaluationResource(resourceType, resourceId)
+ const setBatchTab = useEvaluationStore(state => state.setBatchTab)
+ const setUploadedFileName = useEvaluationStore(state => state.setUploadedFileName)
+ const runBatchTest = useEvaluationStore(state => state.runBatchTest)
+ const fileInputRef = useRef(null)
+ const isRunnable = isEvaluationRunnable(resource)
+ // Builds a two-line sample CSV and downloads it through a detached data-URL anchor named config.templateFileName.
+ const handleDownloadTemplate = () => {
+ const content = ['case_id,input,expected', '1,Example input,Example output'].join('\n')
+ const link = document.createElement('a')
+ link.href = `data:text/csv;charset=utf-8,${encodeURIComponent(content)}`
+ link.download = config.templateFileName
+ link.click()
+ }
+ // Shows a warning toast and stops when the resource is not runnable; otherwise starts the batch through the store.
+ const handleRun = () => {
+ if (!isRunnable) {
+ Toast.notify({
+ type: 'warning',
+ message: t('batch.validation'),
+ })
+ return
+ }
+
+ runBatchTest(resourceType, resourceId)
+ }
+
+ return (
+
+
+
+
+ {t('batch.title')}
+
+
+
{t('batch.noticeTitle')}
+
{t('batch.noticeDescription')}
+
+
+ {(['input-fields', 'history'] as const).map(tab => (
+
+ ))}
+
+
+
+ {resource.activeBatchTab === 'input-fields' && (
+
+
+
{t('batch.requirementsTitle')}
+
+ {config.batchRequirements.map(requirement => (
+
+
+ {requirement}
+
+ ))}
+
+
+
+
+
{
+ const file = event.target.files?.[0]
+ setUploadedFileName(resourceType, resourceId, file?.name ?? null)
+ }}
+ />
+
+
+ {!isRunnable && (
+
+ {t('batch.validation')}
+
+ )}
+
+
+ )}
+ {resource.activeBatchTab === 'history' && (
+
+ {resource.batchRecords.length === 0 && (
+
+ {t('batch.emptyHistory')}
+
+ )}
+ {resource.batchRecords.map(record => (
+
+
+
+
{record.summary}
+
{record.fileName}
+
+
+ {record.status === 'running'
+ ? (
+
+
+ {statusLabels.running}
+
+ )
+ : statusLabels[record.status]}
+
+
+
{record.startedAt}
+
+ ))}
+
+ )}
+
+
+ )
+}
+// Top-level evaluation view for one resource; calls ensureResource on mount and whenever resourceType or resourceId changes.
+const Evaluation = ({
+ resourceType,
+ resourceId,
+}: EvaluationProps) => {
+ const { t } = useTranslation('evaluation')
+ const resource = useEvaluationResource(resourceType, resourceId)
+ const ensureResource = useEvaluationStore(state => state.ensureResource)
+ const removeMetric = useEvaluationStore(state => state.removeMetric)
+ const addConditionGroup = useEvaluationStore(state => state.addConditionGroup)
+
+ useEffect(() => {
+ ensureResource(resourceType, resourceId)
+ }, [ensureResource, resourceId, resourceType])
+
+ return (
+
+
+
+
+
+
+
+ }
+ />
+
+ {resource.metrics.map(metric => (
+
+
+
+
{metric.label}
+
{metric.description}
+
+ {metric.badges.map(badge => (
+ {badge}
+ ))}
+
+
+
+
+ {metric.kind === 'custom-workflow' && (
+
+ )}
+
+ ))}
+
+
+
+
+ addConditionGroup(resourceType, resourceId)}>
+
+ {t('conditions.addGroup')}
+
+ )}
+ />
+
+ {resource.conditions.length === 0 && (
+
+
{t('conditions.emptyTitle')}
+
{t('conditions.emptyDescription')}
+
+ )}
+ {resource.conditions.map((group, index) => (
+
+ ))}
+
+
+
+
+
+
+
+
+
+ )
+}
+
+export default Evaluation
diff --git a/web/app/components/evaluation/mock.ts b/web/app/components/evaluation/mock.ts
new file mode 100644
index 0000000000..598e5ee675
--- /dev/null
+++ b/web/app/components/evaluation/mock.ts
@@ -0,0 +1,184 @@
+import type {
+ ComparisonOperator,
+ EvaluationFieldOption,
+ EvaluationMockConfig,
+ EvaluationResourceType,
+ MetricOption,
+} from './types'
+// Static judge model entries returned as `judgeModels` by every getEvaluationMockConfig branch.
+const judgeModels = [
+ {
+ id: 'gpt-4.1-mini',
+ label: 'GPT-4.1 mini',
+ provider: 'OpenAI',
+ },
+ {
+ id: 'claude-3-7-sonnet',
+ label: 'Claude 3.7 Sonnet',
+ provider: 'Anthropic',
+ },
+ {
+ id: 'gemini-2.0-flash',
+ label: 'Gemini 2.0 Flash',
+ provider: 'Google',
+ },
+]
+// Built-in metric catalogue; each entry carries a `group` ('quality' or 'operations' here) plus display badges.
+const builtinMetrics: MetricOption[] = [
+ {
+ id: 'answer-correctness',
+ label: 'Answer Correctness',
+ description: 'Compares the response with the expected answer and scores factual alignment.',
+ group: 'quality',
+ badges: ['LLM', 'Built-in'],
+ },
+ {
+ id: 'faithfulness',
+ label: 'Faithfulness',
+ description: 'Checks whether the answer stays grounded in the retrieved evidence.',
+ group: 'quality',
+ badges: ['LLM', 'Retrieval'],
+ },
+ {
+ id: 'relevance',
+ label: 'Relevance',
+ description: 'Evaluates how directly the answer addresses the original request.',
+ group: 'quality',
+ badges: ['LLM'],
+ },
+ {
+ id: 'latency',
+ label: 'Latency',
+ description: 'Captures runtime responsiveness for the full execution path.',
+ group: 'operations',
+ badges: ['System'],
+ },
+ {
+ id: 'token-usage',
+ label: 'Token Usage',
+ description: 'Tracks prompt and completion token consumption for the run.',
+ group: 'operations',
+ badges: ['System'],
+ },
+ {
+ id: 'tool-success-rate',
+ label: 'Tool Success Rate',
+ description: 'Measures whether each required tool invocation finishes without failure.',
+ group: 'operations',
+ badges: ['Workflow'],
+ },
+]
+// Mock workflow entries, each listing its own `targetVariables`; returned as `workflowOptions` for every resource type.
+const workflowOptions = [
+ {
+ id: 'workflow-precision-review',
+ label: 'Precision Review Workflow',
+ description: 'Custom evaluator for nuanced quality review.',
+ targetVariables: [
+ { id: 'query', label: 'query' },
+ { id: 'answer', label: 'answer' },
+ { id: 'reference', label: 'reference' },
+ ],
+ },
+ {
+ id: 'workflow-risk-review',
+ label: 'Risk Review Workflow',
+ description: 'Custom evaluator for policy and escalation checks.',
+ targetVariables: [
+ { id: 'input', label: 'input' },
+ { id: 'output', label: 'output' },
+ ],
+ },
+]
+// Field options returned for every resource type other than 'pipeline' and 'snippet' (the fallback branch of getEvaluationMockConfig).
+const workflowFields: EvaluationFieldOption[] = [
+ { id: 'app.input.query', label: 'Query', group: 'App Input', type: 'string' },
+ { id: 'app.input.locale', label: 'Locale', group: 'App Input', type: 'enum', options: [{ value: 'en-US', label: 'en-US' }, { value: 'zh-Hans', label: 'zh-Hans' }] },
+ { id: 'app.output.answer', label: 'Answer', group: 'App Output', type: 'string' },
+ { id: 'app.output.score', label: 'Score', group: 'App Output', type: 'number' },
+ { id: 'app.output.published_at', label: 'Publication Date', group: 'App Output', type: 'time' },
+ { id: 'system.has_context', label: 'Has Context', group: 'System', type: 'boolean' },
+]
+// Field options returned for the 'pipeline' resource type.
+const pipelineFields: EvaluationFieldOption[] = [
+ { id: 'dataset.input.document_id', label: 'Document ID', group: 'Dataset', type: 'string' },
+ { id: 'dataset.input.chunk_count', label: 'Chunk Count', group: 'Dataset', type: 'number' },
+ { id: 'dataset.input.updated_at', label: 'Updated At', group: 'Dataset', type: 'time' },
+ { id: 'retrieval.output.hit_rate', label: 'Hit Rate', group: 'Retrieval', type: 'number' },
+ { id: 'retrieval.output.source', label: 'Source', group: 'Retrieval', type: 'enum', options: [{ value: 'bm25', label: 'BM25' }, { value: 'hybrid', label: 'Hybrid' }] },
+ { id: 'pipeline.output.published', label: 'Published', group: 'Output', type: 'boolean' },
+]
+// Field options returned for the 'snippet' resource type.
+const snippetFields: EvaluationFieldOption[] = [
+ { id: 'snippet.input.blog_url', label: 'Blog URL', group: 'Snippet Input', type: 'string' },
+ { id: 'snippet.input.platforms', label: 'Platforms', group: 'Snippet Input', type: 'string' },
+ { id: 'snippet.output.content', label: 'Generated Content', group: 'Snippet Output', type: 'string' },
+ { id: 'snippet.output.length', label: 'Output Length', group: 'Snippet Output', type: 'number' },
+ { id: 'snippet.output.scheduled_at', label: 'Scheduled At', group: 'Snippet Output', type: 'time' },
+ { id: 'system.requires_review', label: 'Requires Review', group: 'System', type: 'boolean' },
+]
+// Lists the comparison operators for a field type; the first entry is what getDefaultOperator returns as the default.
+export const getComparisonOperators = (fieldType: EvaluationFieldOption['type']): ComparisonOperator[] => {
+ if (fieldType === 'number')
+ return ['is', 'is_not', 'greater_than', 'less_than', 'greater_or_equal', 'less_or_equal', 'is_empty', 'is_not_empty']
+
+ if (fieldType === 'time')
+ return ['is', 'before', 'after', 'is_empty', 'is_not_empty']
+
+ if (fieldType === 'boolean' || fieldType === 'enum')
+ return ['is', 'is_not']
+ // Any remaining field type gets the text-style operator set.
+ return ['contains', 'not_contains', 'is', 'is_not', 'is_empty', 'is_not_empty']
+}
+// Default operator for a field type: the first operator that getComparisonOperators returns for it.
+export const getDefaultOperator = (fieldType: EvaluationFieldOption['type']): ComparisonOperator => {
+ return getComparisonOperators(fieldType)[0]
+}
+// Builds the mock config for a resource type; only fieldOptions, templateFileName, batchRequirements and historySummaryLabel differ per type.
+export const getEvaluationMockConfig = (resourceType: EvaluationResourceType): EvaluationMockConfig => {
+ if (resourceType === 'pipeline') {
+ return {
+ judgeModels,
+ builtinMetrics,
+ workflowOptions,
+ fieldOptions: pipelineFields,
+ templateFileName: 'pipeline-evaluation-template.csv',
+ batchRequirements: [
+ 'Include one row per retrieval scenario.',
+ 'Provide the expected source or target chunk for each case.',
+ 'Keep numeric metrics in plain number format.',
+ ],
+ historySummaryLabel: 'Pipeline evaluation batch',
+ }
+ }
+
+ if (resourceType === 'snippet') {
+ return {
+ judgeModels,
+ builtinMetrics,
+ workflowOptions,
+ fieldOptions: snippetFields,
+ templateFileName: 'snippet-evaluation-template.csv',
+ batchRequirements: [
+ 'Include one row per snippet execution case.',
+ 'Provide the expected final content or acceptance rule.',
+ 'Keep optional fields empty when not used.',
+ ],
+ historySummaryLabel: 'Snippet evaluation batch',
+ }
+ }
+ // Any other resource type falls back to the workflow configuration.
+ return {
+ judgeModels,
+ builtinMetrics,
+ workflowOptions,
+ fieldOptions: workflowFields,
+ templateFileName: 'workflow-evaluation-template.csv',
+ batchRequirements: [
+ 'Include one row per workflow test case.',
+ 'Provide both user input and expected answer when available.',
+ 'Keep boolean columns as true or false.',
+ ],
+ historySummaryLabel: 'Workflow evaluation batch',
+ }
+}
diff --git a/web/app/components/evaluation/store.ts b/web/app/components/evaluation/store.ts
new file mode 100644
index 0000000000..59b2b6d907
--- /dev/null
+++ b/web/app/components/evaluation/store.ts
@@ -0,0 +1,635 @@
+import type {
+ BatchTestRecord,
+ ComparisonOperator,
+ EvaluationFieldOption,
+ EvaluationMetric,
+ EvaluationResourceState,
+ EvaluationResourceType,
+ JudgmentConditionGroup,
+} from './types'
+import { create } from 'zustand'
+import { getComparisonOperators, getDefaultOperator, getEvaluationMockConfig } from './mock'
+
+type EvaluationStore = { // state plus actions; every action addresses one resource by (resourceType, resourceId)
+  resources: Record<string, EvaluationResourceState> // keyed by `${resourceType}:${resourceId}`
+  ensureResource: (resourceType: EvaluationResourceType, resourceId: string) => void
+  setJudgeModel: (resourceType: EvaluationResourceType, resourceId: string, judgeModelId: string) => void
+  addBuiltinMetric: (resourceType: EvaluationResourceType, resourceId: string, optionId: string) => void
+  addCustomMetric: (resourceType: EvaluationResourceType, resourceId: string) => void
+  removeMetric: (resourceType: EvaluationResourceType, resourceId: string, metricId: string) => void
+  setCustomMetricWorkflow: (resourceType: EvaluationResourceType, resourceId: string, metricId: string, workflowId: string) => void
+  addCustomMetricMapping: (resourceType: EvaluationResourceType, resourceId: string, metricId: string) => void
+  updateCustomMetricMapping: (
+    resourceType: EvaluationResourceType,
+    resourceId: string,
+    metricId: string,
+    mappingId: string,
+    patch: { sourceFieldId?: string | null, targetVariableId?: string | null }, // shallow-merged into the mapping
+  ) => void
+  removeCustomMetricMapping: (resourceType: EvaluationResourceType, resourceId: string, metricId: string, mappingId: string) => void
+  addConditionGroup: (resourceType: EvaluationResourceType, resourceId: string) => void
+  removeConditionGroup: (resourceType: EvaluationResourceType, resourceId: string, groupId: string) => void
+  setConditionGroupOperator: (resourceType: EvaluationResourceType, resourceId: string, groupId: string, logicalOperator: 'and' | 'or') => void
+  addConditionItem: (resourceType: EvaluationResourceType, resourceId: string, groupId: string) => void
+  removeConditionItem: (resourceType: EvaluationResourceType, resourceId: string, groupId: string, itemId: string) => void
+  updateConditionField: (resourceType: EvaluationResourceType, resourceId: string, groupId: string, itemId: string, fieldId: string) => void
+  updateConditionOperator: (resourceType: EvaluationResourceType, resourceId: string, groupId: string, itemId: string, operator: ComparisonOperator) => void
+  updateConditionValue: (
+    resourceType: EvaluationResourceType,
+    resourceId: string,
+    groupId: string,
+    itemId: string,
+    value: string | number | boolean | null, // null means "no value"
+  ) => void
+  setBatchTab: (resourceType: EvaluationResourceType, resourceId: string, tab: EvaluationResourceState['activeBatchTab']) => void
+  setUploadedFileName: (resourceType: EvaluationResourceType, resourceId: string, uploadedFileName: string | null) => void
+  runBatchTest: (resourceType: EvaluationResourceType, resourceId: string) => void
+}
+
+const buildResourceKey = (resourceType: EvaluationResourceType, resourceId: string) => [resourceType, resourceId].join(':') // "<type>:<id>" key into the resources map
+const initialResourceCache: Record<string, EvaluationResourceState> = {} // default states handed out by useEvaluationResource before the store has an entry
+
+const createId = (prefix: string) => [prefix, Math.random().toString(36).substring(2, 10)].join('-') // prefix + up to 8 random base-36 chars; uniqueness is probabilistic only
+
+export const conditionOperatorsWithoutValue: ComparisonOperator[] = ['is_empty', 'is_not_empty'] // presence checks: a condition using one of these stores no comparison value
+
+export const requiresConditionValue = (operator: ComparisonOperator) => conditionOperatorsWithoutValue.every(valueless => valueless !== operator) // false only for the presence-check operators
+
+// Decides what value a condition keeps after its field or operator changes.
+const getConditionValue = (
+  field: EvaluationFieldOption | undefined,
+  operator: ComparisonOperator,
+  previousValue: string | number | boolean | null = null,
+) => {
+  // No field selected, or a presence-only operator: there is nothing to store.
+  if (!field || !requiresConditionValue(operator))
+    return null
+
+  // boolean and number fields keep a value of their own primitive type;
+  // every other field type (enum, time, string) keeps a string.
+  const expectedType = field.type === 'boolean' || field.type === 'number'
+    ? field.type
+    : 'string'
+
+  // Anything of the wrong runtime type is discarded rather than coerced.
+  return typeof previousValue === expectedType ? previousValue : null
+}
+
+const buildConditionItem = (resourceType: EvaluationResourceType) => {
+  // New conditions start on the first configured field with that field's default operator.
+  const [firstField] = getEvaluationMockConfig(resourceType).fieldOptions
+  const defaultOperator = firstField ? getDefaultOperator(firstField.type) : 'contains'
+  return {
+    id: createId('condition'),
+    fieldId: firstField?.id ?? null,
+    operator: defaultOperator,
+    value: getConditionValue(firstField, defaultOperator),
+  }
+}
+
+const buildInitialState = (resourceType: EvaluationResourceType): EvaluationResourceState => {
+  // Seeded with the first built-in metric (when the catalog has one) and a single one-item "and" group.
+  const [firstBuiltin] = getEvaluationMockConfig(resourceType).builtinMetrics
+  const metrics: EvaluationMetric[] = []
+  if (firstBuiltin) {
+    metrics.push({
+      id: createId('metric'),
+      optionId: firstBuiltin.id,
+      kind: 'builtin',
+      label: firstBuiltin.label,
+      description: firstBuiltin.description,
+      badges: firstBuiltin.badges,
+    })
+  }
+
+  return {
+    judgeModelId: null,
+    metrics,
+    conditions: [
+      { id: createId('group'), logicalOperator: 'and', items: [buildConditionItem(resourceType)] },
+    ],
+    activeBatchTab: 'input-fields',
+    uploadedFileName: null,
+    batchRecords: [],
+  }
+}
+
+const withResourceState = (
+  resources: EvaluationStore['resources'],
+  resourceType: EvaluationResourceType,
+  resourceId: string,
+) => {
+  const resourceKey = buildResourceKey(resourceType, resourceId)
+  // Fall back to a fresh default slice so callers never need to handle a missing resource;
+  // the fallback itself is not stored here.
+  const resource = resources[resourceKey] ?? buildInitialState(resourceType)
+
+  return { resourceKey, resource }
+}
+
+const updateMetric = (
+  metrics: EvaluationMetric[],
+  metricId: string,
+  updater: (metric: EvaluationMetric) => EvaluationMetric,
+) => metrics.map(candidate => candidate.id !== metricId ? candidate : updater(candidate)) // non-matching metrics keep their identity
+
+const updateConditionGroup = (
+  groups: JudgmentConditionGroup[],
+  groupId: string,
+  updater: (group: JudgmentConditionGroup) => JudgmentConditionGroup,
+) => groups.map(candidate => candidate.id !== groupId ? candidate : updater(candidate)) // non-matching groups keep their identity
+
+export const isCustomMetricConfigured = (metric: EvaluationMetric) => {
+  if (metric.kind !== 'custom-workflow') // built-in metrics have nothing to configure
+    return true
+
+  const config = metric.customConfig
+  if (!config?.workflowId) // no config at all, or no workflow picked yet
+    return false
+  // At least one mapping is required and none may be left half-filled.
+  return config.mappings.length > 0 && !config.mappings.some(mapping => !mapping.sourceFieldId || !mapping.targetVariableId)
+}
+
+export const isEvaluationRunnable = (state: EvaluationResourceState) => {
+  if (!state.judgeModelId || state.metrics.length === 0) // a judge model and at least one metric are mandatory
+    return false
+  // Every custom metric must be fully configured and some condition group must be non-empty.
+  return state.metrics.every(isCustomMetricConfigured) && state.conditions.some(group => group.items.length > 0)
+}
+
+export const useEvaluationStore = create<EvaluationStore>((set, get) => ({ // explicit state type so set/get and every action are type-checked
+  resources: {}, // keyed by `${resourceType}:${resourceId}` (see buildResourceKey)
+  ensureResource: (resourceType, resourceId) => { // creates the default slice once; later calls are no-ops
+    const resourceKey = buildResourceKey(resourceType, resourceId)
+    if (get().resources[resourceKey])
+      return
+
+    set(state => ({
+      resources: {
+        ...state.resources,
+        [resourceKey]: buildInitialState(resourceType),
+      },
+    }))
+  },
+  setJudgeModel: (resourceType, resourceId, judgeModelId) => { // stores the selected judge model id
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            judgeModelId,
+          },
+        },
+      }
+    })
+  },
+  addBuiltinMetric: (resourceType, resourceId, optionId) => { // ignores unknown option ids and options already added
+    const option = getEvaluationMockConfig(resourceType).builtinMetrics.find(metric => metric.id === optionId)
+    if (!option)
+      return
+
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      if (resource.metrics.some(metric => metric.optionId === optionId && metric.kind === 'builtin'))
+        return state
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            metrics: [
+              ...resource.metrics,
+              {
+                id: createId('metric'),
+                optionId: option.id,
+                kind: 'builtin',
+                label: option.label,
+                description: option.description,
+                badges: option.badges,
+              },
+            ],
+          },
+        },
+      }
+    })
+  },
+  addCustomMetric: (resourceType, resourceId) => { // appends a custom metric with no workflow and one empty mapping
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            metrics: [
+              ...resource.metrics,
+              {
+                id: createId('metric'),
+                optionId: createId('custom'),
+                kind: 'custom-workflow',
+                label: 'Custom Evaluator',
+                description: 'Map workflow variables to your evaluation inputs.',
+                badges: ['Workflow'],
+                customConfig: {
+                  workflowId: null,
+                  mappings: [{
+                    id: createId('mapping'),
+                    sourceFieldId: null,
+                    targetVariableId: null,
+                  }],
+                },
+              },
+            ],
+          },
+        },
+      }
+    })
+  },
+  removeMetric: (resourceType, resourceId, metricId) => { // drops the metric with the given id, if present
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            metrics: resource.metrics.filter(metric => metric.id !== metricId),
+          },
+        },
+      }
+    })
+  },
+  setCustomMetricWorkflow: (resourceType, resourceId, metricId, workflowId) => { // switching workflow clears every mapping's target variable
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            metrics: updateMetric(resource.metrics, metricId, metric => ({
+              ...metric,
+              customConfig: metric.customConfig
+                ? {
+                    ...metric.customConfig,
+                    workflowId,
+                    mappings: metric.customConfig.mappings.map(mapping => ({
+                      ...mapping,
+                      targetVariableId: null,
+                    })),
+                  }
+                : metric.customConfig,
+            })),
+          },
+        },
+      }
+    })
+  },
+  addCustomMetricMapping: (resourceType, resourceId, metricId) => { // appends an empty mapping; metrics without customConfig are left as-is
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            metrics: updateMetric(resource.metrics, metricId, metric => ({
+              ...metric,
+              customConfig: metric.customConfig
+                ? {
+                    ...metric.customConfig,
+                    mappings: [
+                      ...metric.customConfig.mappings,
+                      {
+                        id: createId('mapping'),
+                        sourceFieldId: null,
+                        targetVariableId: null,
+                      },
+                    ],
+                  }
+                : metric.customConfig,
+            })),
+          },
+        },
+      }
+    })
+  },
+  updateCustomMetricMapping: (resourceType, resourceId, metricId, mappingId, patch) => { // shallow-merges patch into the matching mapping
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            metrics: updateMetric(resource.metrics, metricId, metric => ({
+              ...metric,
+              customConfig: metric.customConfig
+                ? {
+                    ...metric.customConfig,
+                    mappings: metric.customConfig.mappings.map(mapping => mapping.id === mappingId ? { ...mapping, ...patch } : mapping),
+                  }
+                : metric.customConfig,
+            })),
+          },
+        },
+      }
+    })
+  },
+  removeCustomMetricMapping: (resourceType, resourceId, metricId, mappingId) => { // may leave the metric with zero mappings (then not configured)
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            metrics: updateMetric(resource.metrics, metricId, metric => ({
+              ...metric,
+              customConfig: metric.customConfig
+                ? {
+                    ...metric.customConfig,
+                    mappings: metric.customConfig.mappings.filter(mapping => mapping.id !== mappingId),
+                  }
+                : metric.customConfig,
+            })),
+          },
+        },
+      }
+    })
+  },
+  addConditionGroup: (resourceType, resourceId) => { // appends an 'and' group holding one default condition
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            conditions: [
+              ...resource.conditions,
+              {
+                id: createId('group'),
+                logicalOperator: 'and',
+                items: [buildConditionItem(resourceType)],
+              },
+            ],
+          },
+        },
+      }
+    })
+  },
+  removeConditionGroup: (resourceType, resourceId, groupId) => { // drops the group with the given id, if present
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            conditions: resource.conditions.filter(group => group.id !== groupId),
+          },
+        },
+      }
+    })
+  },
+  setConditionGroupOperator: (resourceType, resourceId, groupId, logicalOperator) => { // switches a group between 'and' and 'or'
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            conditions: updateConditionGroup(resource.conditions, groupId, group => ({
+              ...group,
+              logicalOperator,
+            })),
+          },
+        },
+      }
+    })
+  },
+  addConditionItem: (resourceType, resourceId, groupId) => { // appends one default condition to the group
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            conditions: updateConditionGroup(resource.conditions, groupId, group => ({
+              ...group,
+              items: [
+                ...group.items,
+                buildConditionItem(resourceType),
+              ],
+            })),
+          },
+        },
+      }
+    })
+  },
+  removeConditionItem: (resourceType, resourceId, groupId, itemId) => { // the group itself is kept even when it becomes empty
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            conditions: updateConditionGroup(resource.conditions, groupId, group => ({
+              ...group,
+              items: group.items.filter(item => item.id !== itemId),
+            })),
+          },
+        },
+      }
+    })
+  },
+  updateConditionField: (resourceType, resourceId, groupId, itemId, fieldId) => { // new field => default operator for its type, value cleared
+    const field = getEvaluationMockConfig(resourceType).fieldOptions.find(option => option.id === fieldId)
+
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            conditions: updateConditionGroup(resource.conditions, groupId, group => ({
+              ...group,
+              items: group.items.map((item) => {
+                if (item.id !== itemId)
+                  return item
+
+                return {
+                  ...item,
+                  fieldId,
+                  operator: field ? getDefaultOperator(field.type) : item.operator, // unknown field id keeps the current operator
+                  value: getConditionValue(field, field ? getDefaultOperator(field.type) : item.operator), // no previous value passed, so this is always null
+                }
+              }),
+            })),
+          },
+        },
+      }
+    })
+  },
+  updateConditionOperator: (resourceType, resourceId, groupId, itemId, operator) => { // keeps the value only if it still fits the field type and operator
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+      const fieldOptions = getEvaluationMockConfig(resourceType).fieldOptions
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            conditions: updateConditionGroup(resource.conditions, groupId, group => ({
+              ...group,
+              items: group.items.map((item) => {
+                if (item.id !== itemId)
+                  return item
+
+                const field = fieldOptions.find(option => option.id === item.fieldId)
+
+                return {
+                  ...item,
+                  operator,
+                  value: getConditionValue(field, operator, item.value),
+                }
+              }),
+            })),
+          },
+        },
+      }
+    })
+  },
+  updateConditionValue: (resourceType, resourceId, groupId, itemId, value) => { // stores the value as given; no type validation here
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            conditions: updateConditionGroup(resource.conditions, groupId, group => ({
+              ...group,
+              items: group.items.map(item => item.id === itemId ? { ...item, value } : item),
+            })),
+          },
+        },
+      }
+    })
+  },
+  setBatchTab: (resourceType, resourceId, tab) => { // selects the active batch tab
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            activeBatchTab: tab,
+          },
+        },
+      }
+    })
+  },
+  setUploadedFileName: (resourceType, resourceId, uploadedFileName) => { // null clears the uploaded file
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            uploadedFileName,
+          },
+        },
+      }
+    })
+  },
+  runBatchTest: (resourceType, resourceId) => { // mock run: adds a 'running' record, then settles it after a fixed delay
+    const config = getEvaluationMockConfig(resourceType)
+    const recordId = createId('batch')
+    const nextRecord: BatchTestRecord = {
+      id: recordId,
+      fileName: get().resources[buildResourceKey(resourceType, resourceId)]?.uploadedFileName ?? config.templateFileName, // no upload => template file name
+      status: 'running',
+      startedAt: new Date().toLocaleTimeString(),
+      summary: config.historySummaryLabel,
+    }
+
+    set((state) => {
+      const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+      return {
+        resources: {
+          ...state.resources,
+          [resourceKey]: {
+            ...resource,
+            activeBatchTab: 'history', // jump to the history tab so the new record is visible
+            batchRecords: [nextRecord, ...resource.batchRecords], // newest first
+          },
+        },
+      }
+    })
+
+    window.setTimeout(() => {
+      set((state) => {
+        const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+
+        return {
+          resources: {
+            ...state.resources,
+            [resourceKey]: {
+              ...resource,
+              batchRecords: resource.batchRecords.map(record => record.id === recordId
+                ? {
+                    ...record,
+                    status: resource.metrics.length > 1 ? 'success' : 'failed', // mock outcome: succeeds only with more than one metric
+                  }
+                : record),
+            },
+          },
+        }
+      })
+    }, 1200)
+  },
+}))
+
+export const useEvaluationResource = (resourceType: EvaluationResourceType, resourceId: string) => {
+  const cacheKey = buildResourceKey(resourceType, resourceId)
+  return useEvaluationStore(({ resources }) => resources[cacheKey] ?? (initialResourceCache[cacheKey] ??= buildInitialState(resourceType))) // fallback is cached so repeated selector runs return the same object
+}
+
+export const getAllowedOperators = (resourceType: EvaluationResourceType, fieldId: string | null) => {
+  const { fieldOptions } = getEvaluationMockConfig(resourceType)
+  const matchedField = fieldOptions.find(option => option.id === fieldId)
+  // Unknown or unset field: only 'contains' is offered.
+  return matchedField
+    ? getComparisonOperators(matchedField.type)
+    : ['contains'] as ComparisonOperator[]
+}
diff --git a/web/app/components/evaluation/types.ts b/web/app/components/evaluation/types.ts
new file mode 100644
index 0000000000..9b9cfda31c
--- /dev/null
+++ b/web/app/components/evaluation/types.ts
@@ -0,0 +1,117 @@
+export type EvaluationResourceType = 'workflow' | 'pipeline' | 'snippet' // kinds of resource that can own evaluation state
+
+export type MetricKind = 'builtin' | 'custom-workflow' // discriminates catalog metrics from workflow-backed custom metrics
+
+export type BatchTestTab = 'input-fields' | 'history' // allowed values of EvaluationResourceState.activeBatchTab
+
+export type FieldType = 'string' | 'number' | 'boolean' | 'enum' | 'time' // value type of a field an evaluation can reference
+
+export type ComparisonOperator // every operator a judgment condition may carry
+ = | 'contains'
+ | 'not_contains'
+ | 'is'
+ | 'is_not'
+ | 'is_empty' // presence check
+ | 'is_not_empty' // presence check
+ | 'greater_than'
+ | 'less_than'
+ | 'greater_or_equal'
+ | 'less_or_equal'
+ | 'before' // temporal comparison
+ | 'after' // temporal comparison
+
+export type JudgeModelOption = { // one selectable judge model
+ id: string
+ label: string // display text
+ provider: string // presumably the model provider name — confirm against the mock data
+}
+
+export type MetricOption = { // catalog entry for a built-in metric
+ id: string
+ label: string
+ description: string
+ group: string // presumably a category used to group options in the picker — confirm
+ badges: string[]
+}
+
+export type EvaluationWorkflowOption = { // a workflow offered as a custom evaluator
+ id: string
+ label: string
+ description: string
+ targetVariables: Array<{ // presumably the variables a mapping's targetVariableId can refer to — confirm
+ id: string
+ label: string
+ }>
+}
+
+export type EvaluationFieldOption = { // a field that conditions and mappings can reference by id
+ id: string
+ label: string
+ group: string
+ type: FieldType
+ options?: Array<{ // presumably the selectable values of an enum field — confirm
+ value: string
+ label: string
+ }>
+}
+
+export type CustomMetricMapping = { // links one source field to one workflow variable
+ id: string
+ sourceFieldId: string | null // null until chosen
+ targetVariableId: string | null // null until chosen
+}
+
+export type CustomMetricConfig = { // configuration carried by a custom metric
+ workflowId: string | null // null until a workflow is selected
+ mappings: CustomMetricMapping[]
+}
+
+export type EvaluationMetric = { // a metric attached to a resource's evaluation
+ id: string // identifies this metric instance
+ optionId: string // id of the option this metric was created from
+ kind: MetricKind
+ label: string
+ description: string
+ badges: string[]
+ customConfig?: CustomMetricConfig // optional; presumably only set for 'custom-workflow' metrics — confirm
+}
+
+export type JudgmentConditionItem = { // a single "<field> <operator> <value>" condition
+ id: string
+ fieldId: string | null // null when no field is selected
+ operator: ComparisonOperator
+ value: string | number | boolean | null // null when no value is set
+}
+
+export type JudgmentConditionGroup = { // conditions combined with one logical operator
+ id: string
+ logicalOperator: 'and' | 'or'
+ items: JudgmentConditionItem[]
+}
+
+export type BatchTestRecord = { // one entry in the batch test history
+ id: string
+ fileName: string
+ status: 'running' | 'success' | 'failed'
+ startedAt: string // a string, not a Date; format is decided by whoever creates the record
+ summary: string
+}
+
+export type EvaluationResourceState = { // evaluation state held for one resource
+ judgeModelId: string | null // null until a judge model is chosen
+ metrics: EvaluationMetric[]
+ conditions: JudgmentConditionGroup[]
+ activeBatchTab: BatchTestTab
+ uploadedFileName: string | null // null when no file has been uploaded
+ batchRecords: BatchTestRecord[]
+}
+
+export type EvaluationMockConfig = { // shape of the mock configuration supplied per resource type
+ judgeModels: JudgeModelOption[]
+ builtinMetrics: MetricOption[]
+ workflowOptions: EvaluationWorkflowOption[]
+ fieldOptions: EvaluationFieldOption[]
+ templateFileName: string // name of the batch template file
+ batchRequirements: string[] // human-readable requirement sentences
+ historySummaryLabel: string // text used as the summary of batch records
+}
diff --git a/web/app/components/snippets/components/snippet-main.tsx b/web/app/components/snippets/components/snippet-main.tsx
index fd9bd14373..d1f936a0c5 100644
--- a/web/app/components/snippets/components/snippet-main.tsx
+++ b/web/app/components/snippets/components/snippet-main.tsx
@@ -2,7 +2,7 @@
import type { NavIcon } from '@/app/components/app-sidebar/nav-link'
import type { WorkflowProps } from '@/app/components/workflow'
-import type { SnippetDetailPayload, SnippetInputField } from '@/models/snippet'
+import type { SnippetDetailPayload, SnippetInputField, SnippetSection } from '@/models/snippet'
import {
RiFlaskFill,
RiFlaskLine,
@@ -17,6 +17,7 @@ import NavLink from '@/app/components/app-sidebar/nav-link'
import SnippetInfo from '@/app/components/app-sidebar/snippet-info'
import { useStore as useAppStore } from '@/app/components/app/store'
import Toast from '@/app/components/base/toast'
+import Evaluation from '@/app/components/evaluation'
import { WorkflowWithInnerContext } from '@/app/components/workflow'
import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
import { useSnippetDetailStore } from '../store'
@@ -25,6 +26,7 @@ import SnippetChildren from './snippet-children'
type SnippetMainProps = {
payload: SnippetDetailPayload
snippetId: string
+ section: SnippetSection
} & Pick
const ORCHESTRATE_ICONS: { normal: NavIcon, selected: NavIcon } = {
@@ -40,6 +42,7 @@ const EVALUATION_ICONS: { normal: NavIcon, selected: NavIcon } = {
const SnippetMain = ({
payload,
snippetId,
+ section,
nodes,
edges,
viewport,
@@ -51,7 +54,6 @@ const SnippetMain = ({
const [fields, setFields] = useState(payload.inputFields)
const setAppSidebarExpand = useAppStore(state => state.setAppSidebarExpand)
const {
- activeSection,
editingField,
isEditorOpen,
isInputPanelOpen,
@@ -59,12 +61,10 @@ const SnippetMain = ({
closeEditor,
openEditor,
reset,
- setActiveSection,
setInputPanelOpen,
toggleInputPanel,
togglePublishMenu,
} = useSnippetDetailStore(useShallow(state => ({
- activeSection: state.activeSection,
editingField: state.editingField,
isEditorOpen: state.isEditorOpen,
isInputPanelOpen: state.isInputPanelOpen,
@@ -72,7 +72,6 @@ const SnippetMain = ({
closeEditor: state.closeEditor,
openEditor: state.openEditor,
reset: state.reset,
- setActiveSection: state.setActiveSection,
setInputPanelOpen: state.setInputPanelOpen,
toggleInputPanel: state.toggleInputPanel,
togglePublishMenu: state.togglePublishMenu,
@@ -145,15 +144,15 @@ const SnippetMain = ({
mode={mode}
name={t('sectionOrchestrate')}
iconMap={ORCHESTRATE_ICONS}
- active={activeSection === 'orchestrate'}
- onClick={() => setActiveSection('orchestrate')}
+ href={`/snippets/${snippetId}/orchestrate`}
+ active={section === 'orchestrate'}
/>
setActiveSection('evaluation')}
+ href={`/snippets/${snippetId}/evaluation`}
+ active={section === 'evaluation'}
/>
>
)}
@@ -161,29 +160,35 @@ const SnippetMain = ({
-
-
-
+ {section === 'evaluation'
+ ? (
+
+ )
+ : (
+
+
+
+ )}
diff --git a/web/app/components/snippets/index.tsx b/web/app/components/snippets/index.tsx
index 690a8d6376..63392a24b5 100644
--- a/web/app/components/snippets/index.tsx
+++ b/web/app/components/snippets/index.tsx
@@ -1,5 +1,6 @@
'use client'
+import type { SnippetSection } from '@/models/snippet'
import { useMemo } from 'react'
import { useTranslation } from 'react-i18next'
import Loading from '@/app/components/base/loading'
@@ -14,10 +15,12 @@ import { useSnippetInit } from './hooks/use-snippet-init'
type SnippetPageProps = {
snippetId: string
+ section?: SnippetSection
}
const SnippetPage = ({
snippetId,
+ section = 'orchestrate',
}: SnippetPageProps) => {
const { t } = useTranslation('snippet')
const { data, isLoading } = useSnippetInit(snippetId)
@@ -62,6 +65,7 @@ const SnippetPage = ({