feat: evaluation

2026-05-06 18:27:19 +08:00 · 2026-03-16 10:46:33 +08:00 · 2026-03-16 10:46:33 +08:00 · 887c7710e9
commit 887c7710e9
parent 7a722773c7
22 changed files with 2493 additions and 84 deletions
--- a/web/app/(commonLayout)/app/(appDetailLayout)/[appId]/evaluation/page.tsx
+++ b/web/app/(commonLayout)/app/(appDetailLayout)/[appId]/evaluation/page.tsx
@ -0,0 +1,11 @@
+import Evaluation from '@/app/components/evaluation'
+
+type PageProps = {
+  params: Promise<{ appId: string }>
+}
+
+/** Route entry for `/app/[appId]/evaluation`: renders the evaluation view with the app as a workflow resource. */
+const Page = async ({ params }: PageProps) => {
+  const resolvedParams = await params
+
+  return <Evaluation resourceType="workflow" resourceId={resolvedParams.appId} />
+}
+
+export default Page
--- a/web/app/(commonLayout)/app/(appDetailLayout)/[appId]/layout-main.tsx
+++ b/web/app/(commonLayout)/app/(appDetailLayout)/[appId]/layout-main.tsx
@ -7,6 +7,8 @@ import {
  RiDashboard2Line,
  RiFileList3Fill,
  RiFileList3Line,
+  RiFlaskFill,
+  RiFlaskLine,
  RiTerminalBoxFill,
  RiTerminalBoxLine,
  RiTerminalWindowFill,
@ -67,40 +69,47 @@ const AppDetailLayout: FC<IAppDetailLayoutProps> = (props) => {
  }>>([])

  const getNavigationConfig = useCallback((appId: string, isCurrentWorkspaceEditor: boolean, mode: AppModeEnum) => {
-    const navConfig = [
-      ...(isCurrentWorkspaceEditor
-        ? [{
-            name: t('appMenus.promptEng', { ns: 'common' }),
-            href: `/app/${appId}/${(mode === AppModeEnum.WORKFLOW || mode === AppModeEnum.ADVANCED_CHAT) ? 'workflow' : 'configuration'}`,
-            icon: RiTerminalWindowLine,
-            selectedIcon: RiTerminalWindowFill,
-          }]
-        : []
-      ),
-      {
-        name: t('appMenus.apiAccess', { ns: 'common' }),
-        href: `/app/${appId}/develop`,
-        icon: RiTerminalBoxLine,
-        selectedIcon: RiTerminalBoxFill,
-      },
-      ...(isCurrentWorkspaceEditor
-        ? [{
-            name: mode !== AppModeEnum.WORKFLOW
-              ? t('appMenus.logAndAnn', { ns: 'common' })
-              : t('appMenus.logs', { ns: 'common' }),
-            href: `/app/${appId}/logs`,
-            icon: RiFileList3Line,
-            selectedIcon: RiFileList3Fill,
-          }]
-        : []
-      ),
-      {
-        name: t('appMenus.overview', { ns: 'common' }),
-        href: `/app/${appId}/overview`,
-        icon: RiDashboard2Line,
-        selectedIcon: RiDashboard2Fill,
-      },
-    ]
+    // Workflow and advanced-chat apps are edited under `workflow`; every other mode uses `configuration`.
+    const editsAsWorkflow = mode === AppModeEnum.WORKFLOW || mode === AppModeEnum.ADVANCED_CHAT
+
+    // Start from the entry that is always shown, then place the remaining ones around it.
+    const navConfig = [
+      {
+        name: t('appMenus.apiAccess', { ns: 'common' }),
+        href: `/app/${appId}/develop`,
+        icon: RiTerminalBoxLine,
+        selectedIcon: RiTerminalBoxFill,
+      },
+    ]
+
+    if (isCurrentWorkspaceEditor) {
+      // Editor-only: orchestration and evaluation lead the menu...
+      navConfig.unshift(
+        {
+          name: t('appMenus.promptEng', { ns: 'common' }),
+          href: `/app/${appId}/${editsAsWorkflow ? 'workflow' : 'configuration'}`,
+          icon: RiTerminalWindowLine,
+          selectedIcon: RiTerminalWindowFill,
+        },
+        {
+          name: t('appMenus.evaluation', { ns: 'common' }),
+          href: `/app/${appId}/evaluation`,
+          icon: RiFlaskLine,
+          selectedIcon: RiFlaskFill,
+        },
+      )
+      // ...and logs come right after API access.
+      navConfig.push({
+        name: mode === AppModeEnum.WORKFLOW
+          ? t('appMenus.logs', { ns: 'common' })
+          : t('appMenus.logAndAnn', { ns: 'common' }),
+        href: `/app/${appId}/logs`,
+        icon: RiFileList3Line,
+        selectedIcon: RiFileList3Fill,
+      })
+    }
+
+    // Overview always closes the menu.
+    navConfig.push({
+      name: t('appMenus.overview', { ns: 'common' }),
+      href: `/app/${appId}/overview`,
+      icon: RiDashboard2Line,
+      selectedIcon: RiDashboard2Fill,
+    })
    return navConfig
  }, [t])

--- a/web/app/(commonLayout)/datasets/(datasetDetailLayout)/[datasetId]/evaluation/page.tsx
+++ b/web/app/(commonLayout)/datasets/(datasetDetailLayout)/[datasetId]/evaluation/page.tsx
@ -0,0 +1,11 @@
+import Evaluation from '@/app/components/evaluation'
+
+type PageProps = {
+  params: Promise<{ datasetId: string }>
+}
+
+/** Route entry for `/datasets/[datasetId]/evaluation`: renders the evaluation view with the dataset as a pipeline resource. */
+const Page = async ({ params }: PageProps) => {
+  const resolvedParams = await params
+
+  return <Evaluation resourceType="pipeline" resourceId={resolvedParams.datasetId} />
+}
+
+export default Page
--- a/web/app/(commonLayout)/datasets/(datasetDetailLayout)/[datasetId]/layout-main.tsx
+++ b/web/app/(commonLayout)/datasets/(datasetDetailLayout)/[datasetId]/layout-main.tsx
@ -6,6 +6,8 @@ import {
  RiEqualizer2Line,
  RiFileTextFill,
  RiFileTextLine,
+  RiFlaskFill,
+  RiFlaskLine,
  RiFocus2Fill,
  RiFocus2Line,
 } from '@remixicon/react'
@ -86,20 +88,30 @@ const DatasetDetailLayout: FC<IAppDetailLayoutProps> = (props) => {
    ]

    if (datasetRes?.provider !== 'external') {
-      baseNavigation.unshift({
-        name: t('datasetMenus.pipeline', { ns: 'common' }),
-        href: `/datasets/${datasetId}/pipeline`,
-        icon: PipelineLine as RemixiconComponentType,
-        selectedIcon: PipelineFill as RemixiconComponentType,
-        disabled: false,
-      })
-      baseNavigation.unshift({
-        name: t('datasetMenus.documents', { ns: 'common' }),
-        href: `/datasets/${datasetId}/documents`,
-        icon: RiFileTextLine,
-        selectedIcon: RiFileTextFill,
-        disabled: isButtonDisabledWithPipeline,
-      })
+      return [
+        {
+          name: t('datasetMenus.documents', { ns: 'common' }),
+          href: `/datasets/${datasetId}/documents`,
+          icon: RiFileTextLine,
+          selectedIcon: RiFileTextFill,
+          disabled: isButtonDisabledWithPipeline,
+        },
+        {
+          name: t('datasetMenus.pipeline', { ns: 'common' }),
+          href: `/datasets/${datasetId}/pipeline`,
+          icon: PipelineLine as RemixiconComponentType,
+          selectedIcon: PipelineFill as RemixiconComponentType,
+          disabled: false,
+        },
+        {
+          name: t('datasetMenus.evaluation', { ns: 'common' }),
+          href: `/datasets/${datasetId}/evaluation`,
+          icon: RiFlaskLine,
+          selectedIcon: RiFlaskFill,
+          disabled: false,
+        },
+        ...baseNavigation,
+      ]
    }

    return baseNavigation
--- a/web/app/(commonLayout)/snippets/[snippetId]/evaluation/page.tsx
+++ b/web/app/(commonLayout)/snippets/[snippetId]/evaluation/page.tsx
@ -0,0 +1,11 @@
+import SnippetPage from '@/app/components/snippets'
+
+type PageProps = {
+  params: Promise<{ snippetId: string }>
+}
+
+/** Route entry for `/snippets/[snippetId]/evaluation`: renders the snippet page on its evaluation section. */
+const Page = async ({ params }: PageProps) => {
+  const resolvedParams = await params
+
+  return <SnippetPage snippetId={resolvedParams.snippetId} section="evaluation" />
+}
+
+export default Page
--- a/web/app/(commonLayout)/snippets/[snippetId]/orchestrate/page.tsx
+++ b/web/app/(commonLayout)/snippets/[snippetId]/orchestrate/page.tsx
@ -0,0 +1,11 @@
+import SnippetPage from '@/app/components/snippets'
+
+type PageProps = {
+  params: Promise<{ snippetId: string }>
+}
+
+/** Route entry for `/snippets/[snippetId]/orchestrate`: renders the snippet page on its orchestrate section. */
+const Page = async ({ params }: PageProps) => {
+  const resolvedParams = await params
+
+  return <SnippetPage snippetId={resolvedParams.snippetId} section="orchestrate" />
+}
+
+export default Page
--- a/web/app/(commonLayout)/snippets/[snippetId]/page.spec.ts
+++ b/web/app/(commonLayout)/snippets/[snippetId]/page.spec.ts
@ -0,0 +1,21 @@
+import Page from './page'
+
+// Spy standing in for `redirect`; it records the target path instead of navigating.
+const mockRedirect = vi.fn()
+
+// The arrow wrapper looks up `mockRedirect` only when `redirect` is actually called.
+vi.mock('next/navigation', () => ({
+  redirect: (path: string) => mockRedirect(path),
+}))
+
+describe('snippet detail redirect page', () => {
+  // Start every case with a clean call history on the redirect spy.
+  beforeEach(() => {
+    vi.clearAllMocks()
+  })
+
+  it('should redirect legacy snippet detail routes to orchestrate', async () => {
+    const params = Promise.resolve({ snippetId: 'snippet-1' })
+
+    await Page({ params })
+
+    expect(mockRedirect).toHaveBeenCalledWith('/snippets/snippet-1/orchestrate')
+  })
+})
--- a/web/app/(commonLayout)/snippets/[snippetId]/page.tsx
+++ b/web/app/(commonLayout)/snippets/[snippetId]/page.tsx
@ -1,11 +1,11 @@
-import SnippetPage from '@/app/components/snippets'
+import { redirect } from 'next/navigation'

 const Page = async (props: {
  params: Promise<{ snippetId: string }>
 }) => {
-  const { params } = props
+  const { snippetId } = await props.params

-  return <SnippetPage snippetId={(await params).snippetId} />
+  redirect(`/snippets/${snippetId}/orchestrate`) // the bare snippet URL no longer renders; it forwards to the orchestrate section
 }

 export default Page
--- a/web/app/components/apps/list.tsx
+++ b/web/app/components/apps/list.tsx
@ -91,7 +91,7 @@ const SnippetCard = ({
  snippet: SnippetListItem
 }) => {
  return (
-    <Link href={`/snippets/${snippet.id}`} className="group col-span-1">
+    <Link href={`/snippets/${snippet.id}/orchestrate`} className="group col-span-1">
      <article className="relative inline-flex h-[160px] w-full flex-col rounded-xl border border-components-card-border bg-components-card-bg shadow-sm transition-all duration-200 ease-in-out hover:-translate-y-0.5 hover:shadow-lg">
        {snippet.status && (
          <div className="absolute right-0 top-0 rounded-bl-lg rounded-tr-xl bg-background-default-dimmed px-2 py-1 text-[10px] font-medium uppercase leading-3 text-text-placeholder">
--- a/web/app/components/evaluation/tests/index.spec.tsx
+++ b/web/app/components/evaluation/tests/index.spec.tsx
@ -0,0 +1,112 @@
+import { act, fireEvent, render, screen } from '@testing-library/react'
+import Evaluation from '..'
+import { getEvaluationMockConfig } from '../mock'
+import { useEvaluationStore } from '../store'
+
+vi.mock('@/app/components/header/account-setting/model-provider-page/hooks', () => ({ // stub: the model list holds a single openai / gpt-4o-mini entry
+  useModelList: () => ({
+    data: [{
+      provider: 'openai',
+      models: [{ model: 'gpt-4o-mini' }],
+    }],
+  }),
+}))
+
+vi.mock('@/app/components/header/account-setting/model-provider-page/model-selector', () => ({ // stub: prints "provider:model", or "empty" when no default model is passed
+  default: ({ defaultModel }: { defaultModel?: { provider: string, model: string } }) => (
+    <div data-testid="evaluation-model-selector">
+      {defaultModel ? `${defaultModel.provider}:${defaultModel.model}` : 'empty'}
+    </div>
+  ),
+}))
+
+describe('Evaluation', () => {
+  beforeEach(() => {
+    // Each case starts from an empty store so resources never carry over between tests.
+    useEvaluationStore.setState({ resources: {} })
+  })
+
+  afterEach(() => {
+    // Restore real timers here rather than at the end of a test body: a failing
+    // assertion would skip an inline restore and leave fake timers installed for
+    // every test that runs afterwards.
+    vi.useRealTimers()
+  })
+
+  it('should search, add metrics, and create a batch history record', async () => {
+    vi.useFakeTimers()
+
+    render(<Evaluation resourceType="workflow" resourceId="app-1" />)
+
+    // The stubbed model list is expected to surface as the selector's default model.
+    expect(screen.getByTestId('evaluation-model-selector')).toHaveTextContent('openai:gpt-4o-mini')
+
+    fireEvent.click(screen.getByRole('button', { name: 'evaluation.metrics.add' }))
+    expect(screen.getByTestId('evaluation-metric-loading')).toBeInTheDocument()
+
+    // Let the pending timer behind the loading state elapse.
+    await act(async () => {
+      vi.advanceTimersByTime(200)
+    })
+
+    fireEvent.change(screen.getByPlaceholderText('evaluation.metrics.searchPlaceholder'), {
+      target: { value: 'does-not-exist' },
+    })
+
+    await act(async () => {
+      vi.advanceTimersByTime(200)
+    })
+
+    expect(screen.getByText('evaluation.metrics.noResults')).toBeInTheDocument()
+
+    fireEvent.change(screen.getByPlaceholderText('evaluation.metrics.searchPlaceholder'), {
+      target: { value: 'faith' },
+    })
+
+    await act(async () => {
+      vi.advanceTimersByTime(200)
+    })
+
+    fireEvent.click(screen.getByRole('button', { name: /Faithfulness/i }))
+    expect(screen.getAllByText('Faithfulness').length).toBeGreaterThan(0)
+
+    fireEvent.click(screen.getByRole('button', { name: 'evaluation.batch.run' }))
+    expect(screen.getByText('evaluation.batch.status.running')).toBeInTheDocument()
+
+    // Advance far enough for the running batch to settle into its final status.
+    await act(async () => {
+      vi.advanceTimersByTime(1300)
+    })
+
+    expect(screen.getByText('evaluation.batch.status.success')).toBeInTheDocument()
+    expect(screen.getByText('Workflow evaluation batch')).toBeInTheDocument()
+  })
+
+  it('should render time placeholders and hide the value row for empty operators', () => {
+    const resourceType = 'workflow'
+    const resourceId = 'app-2'
+    const store = useEvaluationStore.getState()
+    const config = getEvaluationMockConfig(resourceType)
+
+    const timeField = config.fieldOptions.find(field => field.type === 'time')!
+    let groupId = ''
+    let itemId = ''
+
+    // Seed the store before rendering: point the first condition at a time field with the `before` operator.
+    act(() => {
+      store.ensureResource(resourceType, resourceId)
+      store.setJudgeModel(resourceType, resourceId, 'openai::gpt-4o-mini')
+
+      const group = useEvaluationStore.getState().resources['workflow:app-2'].conditions[0]
+      groupId = group.id
+      itemId = group.items[0].id
+
+      store.updateConditionField(resourceType, resourceId, groupId, itemId, timeField.id)
+      store.updateConditionOperator(resourceType, resourceId, groupId, itemId, 'before')
+    })
+
+    let rerender: ReturnType<typeof render>['rerender']
+    act(() => {
+      ({ rerender } = render(<Evaluation resourceType={resourceType} resourceId={resourceId} />))
+    })
+
+    expect(screen.getByText('evaluation.conditions.selectTime')).toBeInTheDocument()
+
+    // Switching to `is_empty` should remove the time placeholder from the rendered output.
+    act(() => {
+      store.updateConditionOperator(resourceType, resourceId, groupId, itemId, 'is_empty')
+      rerender(<Evaluation resourceType={resourceType} resourceId={resourceId} />)
+    })
+
+    expect(screen.queryByText('evaluation.conditions.selectTime')).not.toBeInTheDocument()
+  })
+})
--- a/web/app/components/evaluation/tests/store.spec.ts
+++ b/web/app/components/evaluation/tests/store.spec.ts
@ -0,0 +1,96 @@
+import { getEvaluationMockConfig } from '../mock'
+import {
+  getAllowedOperators,
+  isCustomMetricConfigured,
+  requiresConditionValue,
+  useEvaluationStore,
+} from '../store'
+
+describe('evaluation store', () => {
+  // Always reads the latest snapshot, so every assertion sees the state after the preceding action.
+  const readResource = (key: string) => useEvaluationStore.getState().resources[key]
+
+  beforeEach(() => {
+    useEvaluationStore.setState({ resources: {} })
+  })
+
+  it('should configure a custom metric mapping to a valid state', () => {
+    const resourceType = 'workflow'
+    const resourceId = 'app-1'
+    const actions = useEvaluationStore.getState()
+    const { fieldOptions, workflowOptions } = getEvaluationMockConfig(resourceType)
+
+    actions.ensureResource(resourceType, resourceId)
+    actions.addCustomMetric(resourceType, resourceId)
+
+    // A freshly added custom metric has no workflow or mapping yet, so it is not configured.
+    const draftMetric = readResource('workflow:app-1').metrics.find(metric => metric.kind === 'custom-workflow')
+    expect(draftMetric).toBeDefined()
+    expect(isCustomMetricConfigured(draftMetric!)).toBe(false)
+
+    actions.setCustomMetricWorkflow(resourceType, resourceId, draftMetric!.id, workflowOptions[0].id)
+    actions.updateCustomMetricMapping(resourceType, resourceId, draftMetric!.id, draftMetric!.customConfig!.mappings[0].id, {
+      sourceFieldId: fieldOptions[0].id,
+      targetVariableId: workflowOptions[0].targetVariables[0].id,
+    })
+
+    const savedMetric = readResource('workflow:app-1').metrics.find(metric => metric.id === draftMetric!.id)
+    expect(isCustomMetricConfigured(savedMetric!)).toBe(true)
+  })
+
+  it('should add and remove builtin metrics', () => {
+    const resourceType = 'workflow'
+    const resourceId = 'app-2'
+    const actions = useEvaluationStore.getState()
+    const { builtinMetrics } = getEvaluationMockConfig(resourceType)
+
+    actions.ensureResource(resourceType, resourceId)
+    actions.addBuiltinMetric(resourceType, resourceId, builtinMetrics[1].id)
+
+    const addedMetric = readResource('workflow:app-2').metrics.find(metric => metric.optionId === builtinMetrics[1].id)
+    expect(addedMetric).toBeDefined()
+
+    actions.removeMetric(resourceType, resourceId, addedMetric!.id)
+
+    const stillListed = readResource('workflow:app-2').metrics.some(metric => metric.id === addedMetric!.id)
+    expect(stillListed).toBe(false)
+  })
+
+  it('should update condition groups and adapt operators to field types', () => {
+    const resourceType = 'pipeline'
+    const resourceId = 'dataset-1'
+    const actions = useEvaluationStore.getState()
+    const { fieldOptions } = getEvaluationMockConfig(resourceType)
+
+    actions.ensureResource(resourceType, resourceId)
+
+    const firstGroup = readResource('pipeline:dataset-1').conditions[0]
+    actions.setConditionGroupOperator(resourceType, resourceId, firstGroup.id, 'or')
+    actions.addConditionGroup(resourceType, resourceId)
+
+    const booleanField = fieldOptions.find(field => field.type === 'boolean')!
+    const firstItem = readResource('pipeline:dataset-1').conditions[0].items[0]
+    actions.updateConditionField(resourceType, resourceId, firstGroup.id, firstItem.id, booleanField.id)
+
+    // Re-read the group: the first item's operator should now match the boolean field.
+    const groupAfterUpdate = readResource('pipeline:dataset-1').conditions[0]
+    expect(groupAfterUpdate.logicalOperator).toBe('or')
+    expect(groupAfterUpdate.items[0].operator).toBe('is')
+    expect(getAllowedOperators(resourceType, booleanField.id)).toEqual(['is', 'is_not'])
+  })
+
+  it('should support time fields and clear values for empty operators', () => {
+    const resourceType = 'workflow'
+    const resourceId = 'app-3'
+    const actions = useEvaluationStore.getState()
+    const { fieldOptions } = getEvaluationMockConfig(resourceType)
+    // Looked up afresh on every call so each action targets the current first group.
+    const currentGroup = () => readResource('workflow:app-3').conditions[0]
+
+    actions.ensureResource(resourceType, resourceId)
+
+    const timeField = fieldOptions.find(field => field.type === 'time')!
+    const firstItem = currentGroup().items[0]
+
+    actions.updateConditionField(resourceType, resourceId, currentGroup().id, firstItem.id, timeField.id)
+    actions.updateConditionOperator(resourceType, resourceId, currentGroup().id, firstItem.id, 'is_empty')
+
+    const itemAfterUpdate = currentGroup().items[0]
+
+    expect(getAllowedOperators(resourceType, timeField.id)).toEqual(['is', 'before', 'after', 'is_empty', 'is_not_empty'])
+    expect(requiresConditionValue('is_empty')).toBe(false)
+    expect(itemAfterUpdate.value).toBeNull()
+  })
+})
--- a/web/app/components/evaluation/index.tsx
+++ b/web/app/components/evaluation/index.tsx
--- a/web/app/components/evaluation/mock.ts
+++ b/web/app/components/evaluation/mock.ts
@ -0,0 +1,184 @@
+import type {
+  ComparisonOperator,
+  EvaluationFieldOption,
+  EvaluationMockConfig,
+  EvaluationResourceType,
+  MetricOption,
+} from './types'
+
+const judgeModels = [ // judge model choices; returned unchanged for every resource type by getEvaluationMockConfig
+  {
+    id: 'gpt-4.1-mini',
+    label: 'GPT-4.1 mini',
+    provider: 'OpenAI',
+  },
+  {
+    id: 'claude-3-7-sonnet',
+    label: 'Claude 3.7 Sonnet',
+    provider: 'Anthropic',
+  },
+  {
+    id: 'gemini-2.0-flash',
+    label: 'Gemini 2.0 Flash',
+    provider: 'Google',
+  },
+]
+
+const builtinMetrics: MetricOption[] = [ // built-in metric catalogue shared by all resource types; `group` is either 'quality' or 'operations'
+  {
+    id: 'answer-correctness',
+    label: 'Answer Correctness',
+    description: 'Compares the response with the expected answer and scores factual alignment.',
+    group: 'quality',
+    badges: ['LLM', 'Built-in'],
+  },
+  {
+    id: 'faithfulness',
+    label: 'Faithfulness',
+    description: 'Checks whether the answer stays grounded in the retrieved evidence.',
+    group: 'quality',
+    badges: ['LLM', 'Retrieval'],
+  },
+  {
+    id: 'relevance',
+    label: 'Relevance',
+    description: 'Evaluates how directly the answer addresses the original request.',
+    group: 'quality',
+    badges: ['LLM'],
+  },
+  {
+    id: 'latency',
+    label: 'Latency',
+    description: 'Captures runtime responsiveness for the full execution path.',
+    group: 'operations',
+    badges: ['System'],
+  },
+  {
+    id: 'token-usage',
+    label: 'Token Usage',
+    description: 'Tracks prompt and completion token consumption for the run.',
+    group: 'operations',
+    badges: ['System'],
+  },
+  {
+    id: 'tool-success-rate',
+    label: 'Tool Success Rate',
+    description: 'Measures whether each required tool invocation finishes without failure.',
+    group: 'operations',
+    badges: ['Workflow'],
+  },
+]
+
+const workflowOptions = [ // evaluator workflows a custom metric can point at; `targetVariables` are the variables a mapping can target
+  {
+    id: 'workflow-precision-review',
+    label: 'Precision Review Workflow',
+    description: 'Custom evaluator for nuanced quality review.',
+    targetVariables: [
+      { id: 'query', label: 'query' },
+      { id: 'answer', label: 'answer' },
+      { id: 'reference', label: 'reference' },
+    ],
+  },
+  {
+    id: 'workflow-risk-review',
+    label: 'Risk Review Workflow',
+    description: 'Custom evaluator for policy and escalation checks.',
+    targetVariables: [
+      { id: 'input', label: 'input' },
+      { id: 'output', label: 'output' },
+    ],
+  },
+]
+
+const workflowFields: EvaluationFieldOption[] = [ // field options for the default (workflow) config; `type` selects the operator set in getComparisonOperators
+  { id: 'app.input.query', label: 'Query', group: 'App Input', type: 'string' },
+  { id: 'app.input.locale', label: 'Locale', group: 'App Input', type: 'enum', options: [{ value: 'en-US', label: 'en-US' }, { value: 'zh-Hans', label: 'zh-Hans' }] },
+  { id: 'app.output.answer', label: 'Answer', group: 'App Output', type: 'string' },
+  { id: 'app.output.score', label: 'Score', group: 'App Output', type: 'number' },
+  { id: 'app.output.published_at', label: 'Publication Date', group: 'App Output', type: 'time' },
+  { id: 'system.has_context', label: 'Has Context', group: 'System', type: 'boolean' },
+]
+
+const pipelineFields: EvaluationFieldOption[] = [ // field options returned for the 'pipeline' resource type
+  { id: 'dataset.input.document_id', label: 'Document ID', group: 'Dataset', type: 'string' },
+  { id: 'dataset.input.chunk_count', label: 'Chunk Count', group: 'Dataset', type: 'number' },
+  { id: 'dataset.input.updated_at', label: 'Updated At', group: 'Dataset', type: 'time' },
+  { id: 'retrieval.output.hit_rate', label: 'Hit Rate', group: 'Retrieval', type: 'number' },
+  { id: 'retrieval.output.source', label: 'Source', group: 'Retrieval', type: 'enum', options: [{ value: 'bm25', label: 'BM25' }, { value: 'hybrid', label: 'Hybrid' }] },
+  { id: 'pipeline.output.published', label: 'Published', group: 'Output', type: 'boolean' },
+]
+
+const snippetFields: EvaluationFieldOption[] = [ // field options returned for the 'snippet' resource type
+  { id: 'snippet.input.blog_url', label: 'Blog URL', group: 'Snippet Input', type: 'string' },
+  { id: 'snippet.input.platforms', label: 'Platforms', group: 'Snippet Input', type: 'string' },
+  { id: 'snippet.output.content', label: 'Generated Content', group: 'Snippet Output', type: 'string' },
+  { id: 'snippet.output.length', label: 'Output Length', group: 'Snippet Output', type: 'number' },
+  { id: 'snippet.output.scheduled_at', label: 'Scheduled At', group: 'Snippet Output', type: 'time' },
+  { id: 'system.requires_review', label: 'Requires Review', group: 'System', type: 'boolean' },
+]
+
+/**
+ * Lists the comparison operators available for a condition on a field of the given type.
+ *
+ * Number, time, and boolean/enum fields each have their own list; every other
+ * field type receives the text-oriented list. The order is significant:
+ * `getDefaultOperator` returns the first entry.
+ *
+ * @param fieldType - The `type` of the field the condition targets.
+ * @returns A new array of operators on every call.
+ */
+export const getComparisonOperators = (fieldType: EvaluationFieldOption['type']): ComparisonOperator[] => {
+  switch (fieldType) {
+    case 'number':
+      return ['is', 'is_not', 'greater_than', 'less_than', 'greater_or_equal', 'less_or_equal', 'is_empty', 'is_not_empty']
+    case 'time':
+      return ['is', 'before', 'after', 'is_empty', 'is_not_empty']
+    case 'boolean':
+    case 'enum':
+      return ['is', 'is_not']
+    default:
+      return ['contains', 'not_contains', 'is', 'is_not', 'is_empty', 'is_not_empty']
+  }
+}
+
+/**
+ * Returns the default operator for a field type: the first entry of the list
+ * `getComparisonOperators` produces for that type.
+ */
+export const getDefaultOperator = (fieldType: EvaluationFieldOption['type']): ComparisonOperator => {
+  const [firstOperator] = getComparisonOperators(fieldType)
+
+  return firstOperator
+}
+
+/**
+ * Builds the mock evaluation configuration for a resource type.
+ *
+ * Judge models, built-in metrics, and workflow options are the same for every
+ * resource type. Field options, the template file name, the batch requirements,
+ * and the history summary label differ for 'pipeline' and 'snippet'; any other
+ * resource type receives the workflow configuration.
+ *
+ * @param resourceType - The kind of resource being evaluated.
+ * @returns A new configuration object on every call.
+ */
+export const getEvaluationMockConfig = (resourceType: EvaluationResourceType): EvaluationMockConfig => {
+  const shared = { judgeModels, builtinMetrics, workflowOptions }
+
+  switch (resourceType) {
+    case 'pipeline':
+      return {
+        ...shared,
+        fieldOptions: pipelineFields,
+        templateFileName: 'pipeline-evaluation-template.csv',
+        batchRequirements: [
+          'Include one row per retrieval scenario.',
+          'Provide the expected source or target chunk for each case.',
+          'Keep numeric metrics in plain number format.',
+        ],
+        historySummaryLabel: 'Pipeline evaluation batch',
+      }
+    case 'snippet':
+      return {
+        ...shared,
+        fieldOptions: snippetFields,
+        templateFileName: 'snippet-evaluation-template.csv',
+        batchRequirements: [
+          'Include one row per snippet execution case.',
+          'Provide the expected final content or acceptance rule.',
+          'Keep optional fields empty when not used.',
+        ],
+        historySummaryLabel: 'Snippet evaluation batch',
+      }
+    default:
+      // Workflow configuration, also used for any resource type not handled above.
+      return {
+        ...shared,
+        fieldOptions: workflowFields,
+        templateFileName: 'workflow-evaluation-template.csv',
+        batchRequirements: [
+          'Include one row per workflow test case.',
+          'Provide both user input and expected answer when available.',
+          'Keep boolean columns as true or false.',
+        ],
+        historySummaryLabel: 'Workflow evaluation batch',
+      }
+  }
+}
--- a/web/app/components/evaluation/store.ts
+++ b/web/app/components/evaluation/store.ts
@ -0,0 +1,635 @@
+import type {
+  BatchTestRecord,
+  ComparisonOperator,
+  EvaluationFieldOption,
+  EvaluationMetric,
+  EvaluationResourceState,
+  EvaluationResourceType,
+  JudgmentConditionGroup,
+} from './types'
+import { create } from 'zustand'
+import { getComparisonOperators, getDefaultOperator, getEvaluationMockConfig } from './mock'
+
+/**
+ * Shape of the evaluation store: per-resource editor state plus the actions that change it.
+ * Every action takes the resource type and id identifying the resource it applies to.
+ */
+type EvaluationStore = {
+  // Resource states, keyed by the string produced by buildResourceKey.
+  resources: Record<string, EvaluationResourceState>
+  // Adds the initial state for a resource that is not in `resources` yet; no-op otherwise.
+  ensureResource: (resourceType: EvaluationResourceType, resourceId: string) => void
+  // --- Judge model and metrics ---
+  setJudgeModel: (resourceType: EvaluationResourceType, resourceId: string, judgeModelId: string) => void
+  addBuiltinMetric: (resourceType: EvaluationResourceType, resourceId: string, optionId: string) => void
+  addCustomMetric: (resourceType: EvaluationResourceType, resourceId: string) => void
+  removeMetric: (resourceType: EvaluationResourceType, resourceId: string, metricId: string) => void
+  // --- Custom-workflow metric configuration (workflow choice and field-to-variable mappings) ---
+  setCustomMetricWorkflow: (resourceType: EvaluationResourceType, resourceId: string, metricId: string, workflowId: string) => void
+  addCustomMetricMapping: (resourceType: EvaluationResourceType, resourceId: string, metricId: string) => void
+  updateCustomMetricMapping: (
+    resourceType: EvaluationResourceType,
+    resourceId: string,
+    metricId: string,
+    mappingId: string,
+    patch: { sourceFieldId?: string | null, targetVariableId?: string | null },
+  ) => void
+  removeCustomMetricMapping: (resourceType: EvaluationResourceType, resourceId: string, metricId: string, mappingId: string) => void
+  // --- Judgment conditions (groups of field/operator/value items) ---
+  addConditionGroup: (resourceType: EvaluationResourceType, resourceId: string) => void
+  removeConditionGroup: (resourceType: EvaluationResourceType, resourceId: string, groupId: string) => void
+  setConditionGroupOperator: (resourceType: EvaluationResourceType, resourceId: string, groupId: string, logicalOperator: 'and' | 'or') => void
+  addConditionItem: (resourceType: EvaluationResourceType, resourceId: string, groupId: string) => void
+  removeConditionItem: (resourceType: EvaluationResourceType, resourceId: string, groupId: string, itemId: string) => void
+  updateConditionField: (resourceType: EvaluationResourceType, resourceId: string, groupId: string, itemId: string, fieldId: string) => void
+  updateConditionOperator: (resourceType: EvaluationResourceType, resourceId: string, groupId: string, itemId: string, operator: ComparisonOperator) => void
+  updateConditionValue: (
+    resourceType: EvaluationResourceType,
+    resourceId: string,
+    groupId: string,
+    itemId: string,
+    value: string | number | boolean | null,
+  ) => void
+  // --- Batch test ---
+  setBatchTab: (resourceType: EvaluationResourceType, resourceId: string, tab: EvaluationResourceState['activeBatchTab']) => void
+  setUploadedFileName: (resourceType: EvaluationResourceType, resourceId: string, uploadedFileName: string | null) => void
+  runBatchTest: (resourceType: EvaluationResourceType, resourceId: string) => void
+}
+
+// Resource states are stored under a `<resourceType>:<resourceId>` key.
+const buildResourceKey = (resourceType: EvaluationResourceType, resourceId: string) => [resourceType, resourceId].join(':')
+
+// Initial states handed out by useEvaluationResource for resources that are not in the store yet,
+// cached per resource key so repeated reads return the same object.
+const initialResourceCache: Record<string, EvaluationResourceState> = {}
+
+// Produces `<prefix>-<random base-36 characters>` (at most 8 of them); uniqueness is not guaranteed.
+const createId = (prefix: string) => {
+  const randomPart = Math.random().toString(36).substring(2, 10)
+  return `${prefix}-${randomPart}`
+}
+
+/** Operators for which a condition stores no comparison value. */
+export const conditionOperatorsWithoutValue: ComparisonOperator[] = ['is_empty', 'is_not_empty']
+
+/** Returns true when `operator` is not listed in conditionOperatorsWithoutValue. */
+export const requiresConditionValue = (operator: ComparisonOperator) => {
+  const isValueless = conditionOperatorsWithoutValue.includes(operator)
+  return !isValueless
+}
+
+/**
+ * Decides which value a condition keeps after its field or operator changes.
+ *
+ * Returns null when there is no field or the operator takes no value. Otherwise the previous
+ * value is kept only when its runtime type fits the field: boolean for 'boolean' fields,
+ * number for 'number' fields and string for every other field type.
+ */
+const getConditionValue = (
+  field: EvaluationFieldOption | undefined,
+  operator: ComparisonOperator,
+  previousValue: string | number | boolean | null = null,
+) => {
+  if (!field || !requiresConditionValue(operator))
+    return null
+
+  let expectedType: 'boolean' | 'number' | 'string'
+  switch (field.type) {
+    case 'boolean':
+    case 'number':
+      expectedType = field.type
+      break
+    default:
+      expectedType = 'string'
+  }
+
+  // A null previous value never matches (`typeof null` is 'object'), so it stays null.
+  return typeof previousValue === expectedType ? previousValue : null
+}
+
+/**
+ * Creates a condition item preselecting the first field option of the resource type.
+ * Without any field option the item has no field and uses the 'contains' operator.
+ */
+const buildConditionItem = (resourceType: EvaluationResourceType) => {
+  const [firstField] = getEvaluationMockConfig(resourceType).fieldOptions
+  const operator = firstField ? getDefaultOperator(firstField.type) : 'contains'
+  const id = createId('condition')
+  const fieldId = firstField?.id ?? null
+  // No previous value is passed, so this resolves to null.
+  const value = getConditionValue(firstField, operator)
+
+  return { id, fieldId, operator, value }
+}
+
+/**
+ * Builds the starting state of a resource: no judge model, the first builtin metric
+ * (when the config has one), a single 'and' group holding one condition item, the
+ * 'input-fields' batch tab, no uploaded file and no batch records.
+ */
+const buildInitialState = (resourceType: EvaluationResourceType): EvaluationResourceState => {
+  const [firstBuiltin] = getEvaluationMockConfig(resourceType).builtinMetrics
+  const metrics: EvaluationMetric[] = []
+
+  if (firstBuiltin) {
+    metrics.push({
+      id: createId('metric'),
+      optionId: firstBuiltin.id,
+      kind: 'builtin',
+      label: firstBuiltin.label,
+      description: firstBuiltin.description,
+      badges: firstBuiltin.badges,
+    })
+  }
+
+  const initialGroup: JudgmentConditionGroup = {
+    id: createId('group'),
+    logicalOperator: 'and',
+    items: [buildConditionItem(resourceType)],
+  }
+
+  return {
+    judgeModelId: null,
+    metrics,
+    conditions: [initialGroup],
+    activeBatchTab: 'input-fields',
+    uploadedFileName: null,
+    batchRecords: [],
+  }
+}
+
+/**
+ * Resolves the storage key and the current state of a resource.
+ *
+ * When the resource is not in `resources` yet, the fallback is the entry of
+ * initialResourceCache for that key (created on first use). useEvaluationResource serves
+ * the same cache entry, so a mutation applied before the resource is stored works on the
+ * ids that are currently rendered instead of on a freshly generated set of random ids.
+ *
+ * @param resources - The `resources` map of the store.
+ * @param resourceType - Type of the resource to look up.
+ * @param resourceId - Id of the resource to look up.
+ * @returns The resource key and the stored (or cached initial) resource state.
+ */
+const withResourceState = (
+  resources: EvaluationStore['resources'],
+  resourceType: EvaluationResourceType,
+  resourceId: string,
+) => {
+  const resourceKey = buildResourceKey(resourceType, resourceId)
+
+  return {
+    resourceKey,
+    resource: resources[resourceKey] ?? (initialResourceCache[resourceKey] ??= buildInitialState(resourceType)),
+  }
+}
+
+// Returns a copy of `metrics` where the metric with `metricId` is replaced by `updater(metric)`.
+const updateMetric = (
+  metrics: EvaluationMetric[],
+  metricId: string,
+  updater: (metric: EvaluationMetric) => EvaluationMetric,
+) => metrics.map((candidate) => {
+  if (candidate.id !== metricId)
+    return candidate
+
+  return updater(candidate)
+})
+
+// Returns a copy of `groups` where the group with `groupId` is replaced by `updater(group)`.
+const updateConditionGroup = (
+  groups: JudgmentConditionGroup[],
+  groupId: string,
+  updater: (group: JudgmentConditionGroup) => JudgmentConditionGroup,
+) => groups.map((candidate) => {
+  if (candidate.id !== groupId)
+    return candidate
+
+  return updater(candidate)
+})
+
+/**
+ * Tells whether a metric is fully configured.
+ *
+ * Metrics that are not 'custom-workflow' always count as configured. A custom-workflow
+ * metric needs a workflow id and at least one mapping, and every mapping must have both
+ * a source field and a target variable.
+ */
+export const isCustomMetricConfigured = (metric: EvaluationMetric) => {
+  if (metric.kind !== 'custom-workflow')
+    return true
+
+  const config = metric.customConfig
+  if (!config?.workflowId || config.mappings.length === 0)
+    return false
+
+  return config.mappings.every(({ sourceFieldId, targetVariableId }) => Boolean(sourceFieldId) && Boolean(targetVariableId))
+}
+
+/**
+ * Tells whether a batch test can run for the given state: a judge model is selected,
+ * there is at least one metric, every metric passes isCustomMetricConfigured, and at
+ * least one condition group contains an item.
+ */
+export const isEvaluationRunnable = (state: EvaluationResourceState) => {
+  if (!state.judgeModelId || state.metrics.length === 0)
+    return false
+
+  if (!state.metrics.every(isCustomMetricConfigured))
+    return false
+
+  return state.conditions.some(group => group.items.length > 0)
+}
+
+type ResourcePatch = (resource: EvaluationResourceState) => Partial<EvaluationResourceState> | null
+type MetricCustomConfig = NonNullable<EvaluationMetric['customConfig']>
+type ConditionItem = JudgmentConditionGroup['items'][number]
+
+/**
+ * Computes the store update for one resource.
+ *
+ * The resource is resolved through withResourceState and `patch(resource)` is merged over
+ * it. When `patch` returns null the same state object is returned, i.e. nothing changes.
+ */
+const patchResource = (
+  state: EvaluationStore,
+  resourceType: EvaluationResourceType,
+  resourceId: string,
+  patch: ResourcePatch,
+): Partial<EvaluationStore> => {
+  const { resource, resourceKey } = withResourceState(state.resources, resourceType, resourceId)
+  const changes = patch(resource)
+
+  if (!changes)
+    return state
+
+  return {
+    resources: {
+      ...state.resources,
+      [resourceKey]: {
+        ...resource,
+        ...changes,
+      },
+    },
+  }
+}
+
+// Replaces the customConfig of one metric with `updater(config)`; a metric without a
+// customConfig keeps its (missing) config.
+const updateCustomConfig = (
+  metrics: EvaluationMetric[],
+  metricId: string,
+  updater: (config: MetricCustomConfig) => MetricCustomConfig,
+) => updateMetric(metrics, metricId, metric => ({
+  ...metric,
+  customConfig: metric.customConfig ? updater(metric.customConfig) : metric.customConfig,
+}))
+
+// Replaces one condition item inside one condition group with `updater(item)`.
+const updateConditionItem = (
+  groups: JudgmentConditionGroup[],
+  groupId: string,
+  itemId: string,
+  updater: (item: ConditionItem) => ConditionItem,
+) => updateConditionGroup(groups, groupId, group => ({
+  ...group,
+  items: group.items.map(item => item.id === itemId ? updater(item) : item),
+}))
+
+// Creates a mapping with a new id and neither side selected.
+const buildEmptyMapping = (): MetricCustomConfig['mappings'][number] => ({
+  id: createId('mapping'),
+  sourceFieldId: null,
+  targetVariableId: null,
+})
+
+/**
+ * Evaluation store. State is kept per resource under `resources`; every action resolves
+ * the targeted resource (falling back to an initial state when it is not stored yet) and
+ * writes a new state object for it, leaving the other resources untouched.
+ */
+export const useEvaluationStore = create<EvaluationStore>((set, get) => {
+  // Applies `patch` to the targeted resource in a single `set` call.
+  const update = (resourceType: EvaluationResourceType, resourceId: string, patch: ResourcePatch) => {
+    set(state => patchResource(state, resourceType, resourceId, patch))
+  }
+
+  return {
+    resources: {},
+    ensureResource: (resourceType, resourceId) => {
+      const resourceKey = buildResourceKey(resourceType, resourceId)
+      if (get().resources[resourceKey])
+        return
+
+      set(state => ({
+        resources: {
+          ...state.resources,
+          // Reuse the initial state useEvaluationResource may already have served for this
+          // key, so the generated ids do not change once the resource is stored.
+          [resourceKey]: initialResourceCache[resourceKey] ?? buildInitialState(resourceType),
+        },
+      }))
+    },
+    setJudgeModel: (resourceType, resourceId, judgeModelId) => {
+      update(resourceType, resourceId, () => ({ judgeModelId }))
+    },
+    addBuiltinMetric: (resourceType, resourceId, optionId) => {
+      const option = getEvaluationMockConfig(resourceType).builtinMetrics.find(metric => metric.id === optionId)
+      if (!option)
+        return
+
+      update(resourceType, resourceId, (resource) => {
+        // Each builtin option can be added only once.
+        if (resource.metrics.some(metric => metric.optionId === optionId && metric.kind === 'builtin'))
+          return null
+
+        return {
+          metrics: [
+            ...resource.metrics,
+            {
+              id: createId('metric'),
+              optionId: option.id,
+              kind: 'builtin',
+              label: option.label,
+              description: option.description,
+              badges: option.badges,
+            },
+          ],
+        }
+      })
+    },
+    addCustomMetric: (resourceType, resourceId) => {
+      update(resourceType, resourceId, resource => ({
+        metrics: [
+          ...resource.metrics,
+          {
+            id: createId('metric'),
+            optionId: createId('custom'),
+            kind: 'custom-workflow',
+            label: 'Custom Evaluator',
+            description: 'Map workflow variables to your evaluation inputs.',
+            badges: ['Workflow'],
+            customConfig: {
+              workflowId: null,
+              mappings: [buildEmptyMapping()],
+            },
+          },
+        ],
+      }))
+    },
+    removeMetric: (resourceType, resourceId, metricId) => {
+      update(resourceType, resourceId, resource => ({
+        metrics: resource.metrics.filter(metric => metric.id !== metricId),
+      }))
+    },
+    setCustomMetricWorkflow: (resourceType, resourceId, metricId, workflowId) => {
+      update(resourceType, resourceId, resource => ({
+        metrics: updateCustomConfig(resource.metrics, metricId, config => ({
+          ...config,
+          workflowId,
+          // Switching the workflow clears the selected target variable of every mapping.
+          mappings: config.mappings.map(mapping => ({
+            ...mapping,
+            targetVariableId: null,
+          })),
+        })),
+      }))
+    },
+    addCustomMetricMapping: (resourceType, resourceId, metricId) => {
+      update(resourceType, resourceId, resource => ({
+        metrics: updateCustomConfig(resource.metrics, metricId, config => ({
+          ...config,
+          mappings: [...config.mappings, buildEmptyMapping()],
+        })),
+      }))
+    },
+    updateCustomMetricMapping: (resourceType, resourceId, metricId, mappingId, patch) => {
+      update(resourceType, resourceId, resource => ({
+        metrics: updateCustomConfig(resource.metrics, metricId, config => ({
+          ...config,
+          mappings: config.mappings.map(mapping => mapping.id === mappingId ? { ...mapping, ...patch } : mapping),
+        })),
+      }))
+    },
+    removeCustomMetricMapping: (resourceType, resourceId, metricId, mappingId) => {
+      update(resourceType, resourceId, resource => ({
+        metrics: updateCustomConfig(resource.metrics, metricId, config => ({
+          ...config,
+          mappings: config.mappings.filter(mapping => mapping.id !== mappingId),
+        })),
+      }))
+    },
+    addConditionGroup: (resourceType, resourceId) => {
+      update(resourceType, resourceId, resource => ({
+        conditions: [
+          ...resource.conditions,
+          {
+            id: createId('group'),
+            logicalOperator: 'and',
+            items: [buildConditionItem(resourceType)],
+          },
+        ],
+      }))
+    },
+    removeConditionGroup: (resourceType, resourceId, groupId) => {
+      update(resourceType, resourceId, resource => ({
+        conditions: resource.conditions.filter(group => group.id !== groupId),
+      }))
+    },
+    setConditionGroupOperator: (resourceType, resourceId, groupId, logicalOperator) => {
+      update(resourceType, resourceId, resource => ({
+        conditions: updateConditionGroup(resource.conditions, groupId, group => ({
+          ...group,
+          logicalOperator,
+        })),
+      }))
+    },
+    addConditionItem: (resourceType, resourceId, groupId) => {
+      update(resourceType, resourceId, resource => ({
+        conditions: updateConditionGroup(resource.conditions, groupId, group => ({
+          ...group,
+          items: [
+            ...group.items,
+            buildConditionItem(resourceType),
+          ],
+        })),
+      }))
+    },
+    removeConditionItem: (resourceType, resourceId, groupId, itemId) => {
+      update(resourceType, resourceId, resource => ({
+        conditions: updateConditionGroup(resource.conditions, groupId, group => ({
+          ...group,
+          items: group.items.filter(item => item.id !== itemId),
+        })),
+      }))
+    },
+    updateConditionField: (resourceType, resourceId, groupId, itemId, fieldId) => {
+      const field = getEvaluationMockConfig(resourceType).fieldOptions.find(option => option.id === fieldId)
+
+      update(resourceType, resourceId, resource => ({
+        conditions: updateConditionItem(resource.conditions, groupId, itemId, (item) => {
+          // A known field resets the operator to its default; an unknown one keeps the operator.
+          const operator = field ? getDefaultOperator(field.type) : item.operator
+
+          return {
+            ...item,
+            fieldId,
+            operator,
+            // No previous value is passed, so the value is cleared.
+            value: getConditionValue(field, operator),
+          }
+        }),
+      }))
+    },
+    updateConditionOperator: (resourceType, resourceId, groupId, itemId, operator) => {
+      const fieldOptions = getEvaluationMockConfig(resourceType).fieldOptions
+
+      update(resourceType, resourceId, resource => ({
+        conditions: updateConditionItem(resource.conditions, groupId, itemId, (item) => {
+          const field = fieldOptions.find(option => option.id === item.fieldId)
+
+          return {
+            ...item,
+            operator,
+            // Keeps the current value only when it still fits the field and operator.
+            value: getConditionValue(field, operator, item.value),
+          }
+        }),
+      }))
+    },
+    updateConditionValue: (resourceType, resourceId, groupId, itemId, value) => {
+      update(resourceType, resourceId, resource => ({
+        conditions: updateConditionItem(resource.conditions, groupId, itemId, item => ({ ...item, value })),
+      }))
+    },
+    setBatchTab: (resourceType, resourceId, tab) => {
+      update(resourceType, resourceId, () => ({ activeBatchTab: tab }))
+    },
+    setUploadedFileName: (resourceType, resourceId, uploadedFileName) => {
+      update(resourceType, resourceId, () => ({ uploadedFileName }))
+    },
+    runBatchTest: (resourceType, resourceId) => {
+      const config = getEvaluationMockConfig(resourceType)
+      const recordId = createId('batch')
+      const nextRecord: BatchTestRecord = {
+        id: recordId,
+        // Falls back to the template file name when no file name is stored for the resource.
+        fileName: get().resources[buildResourceKey(resourceType, resourceId)]?.uploadedFileName ?? config.templateFileName,
+        status: 'running',
+        startedAt: new Date().toLocaleTimeString(),
+        summary: config.historySummaryLabel,
+      }
+
+      // Newest record first; the history tab is activated at the same time.
+      update(resourceType, resourceId, resource => ({
+        activeBatchTab: 'history',
+        batchRecords: [nextRecord, ...resource.batchRecords],
+      }))
+
+      // Mock completion after 1200 ms: the record becomes 'success' when the resource has
+      // more than one metric at that moment, otherwise 'failed'.
+      window.setTimeout(() => {
+        update(resourceType, resourceId, resource => ({
+          batchRecords: resource.batchRecords.map(record => record.id === recordId
+            ? {
+                ...record,
+                status: resource.metrics.length > 1 ? 'success' : 'failed',
+              }
+            : record),
+        }))
+      }, 1200)
+    },
+  }
+})
+
+/**
+ * Subscribes to the state of one resource. While the resource is not in the store, an
+ * initial state cached per resource key is returned, so repeated selector calls yield
+ * the same object.
+ */
+export const useEvaluationResource = (resourceType: EvaluationResourceType, resourceId: string) => {
+  const resourceKey = buildResourceKey(resourceType, resourceId)
+
+  return useEvaluationStore((state) => {
+    const stored = state.resources[resourceKey]
+    if (stored)
+      return stored
+
+    return (initialResourceCache[resourceKey] ??= buildInitialState(resourceType))
+  })
+}
+
+/**
+ * Lists the comparison operators selectable for a field of the given resource type.
+ * When no field option has the given id (including a null id), only 'contains' is returned.
+ */
+export const getAllowedOperators = (resourceType: EvaluationResourceType, fieldId: string | null) => {
+  const { fieldOptions } = getEvaluationMockConfig(resourceType)
+  const matchedField = fieldOptions.find(option => option.id === fieldId)
+
+  return matchedField
+    ? getComparisonOperators(matchedField.type)
+    : ['contains'] as ComparisonOperator[]
+}
--- a/web/app/components/evaluation/types.ts
+++ b/web/app/components/evaluation/types.ts
@ -0,0 +1,117 @@
+/** Kinds of resources an evaluation state can belong to. */
+export type EvaluationResourceType = 'workflow' | 'pipeline' | 'snippet'
+
+/** Origin of a metric: a builtin option or a custom workflow. */
+export type MetricKind = 'builtin' | 'custom-workflow'
+
+/** Tabs of the batch test panel. */
+export type BatchTestTab = 'input-fields' | 'history'
+
+/** Data type of an evaluation field. */
+export type FieldType = 'string' | 'number' | 'boolean' | 'enum' | 'time'
+
+/** Operators a judgment condition can apply to a field. */
+export type ComparisonOperator
+  = | 'contains'
+    | 'not_contains'
+    | 'is'
+    | 'is_not'
+    | 'is_empty'
+    | 'is_not_empty'
+    | 'greater_than'
+    | 'less_than'
+    | 'greater_or_equal'
+    | 'less_or_equal'
+    | 'before'
+    | 'after'
+
+/** A selectable judge model. */
+export type JudgeModelOption = {
+  id: string
+  label: string
+  provider: string
+}
+
+/** A builtin metric that can be added to an evaluation. */
+export type MetricOption = {
+  id: string
+  label: string
+  description: string
+  group: string
+  badges: string[]
+}
+
+/** A workflow selectable for a custom metric, with the variables mappings can target. */
+export type EvaluationWorkflowOption = {
+  id: string
+  label: string
+  description: string
+  targetVariables: Array<{
+    id: string
+    label: string
+  }>
+}
+
+/** A field that conditions can reference. */
+export type EvaluationFieldOption = {
+  id: string
+  label: string
+  group: string
+  type: FieldType
+  // Selectable values; presumably only provided for 'enum' fields — confirm against the mock data.
+  options?: Array<{
+    value: string
+    label: string
+  }>
+}
+
+/** One source-field to target-variable pair of a custom metric; either side may be unset (null). */
+export type CustomMetricMapping = {
+  id: string
+  sourceFieldId: string | null
+  targetVariableId: string | null
+}
+
+/** Configuration of a custom-workflow metric: the chosen workflow (null until selected) and its mappings. */
+export type CustomMetricConfig = {
+  workflowId: string | null
+  mappings: CustomMetricMapping[]
+}
+
+/** A metric added to an evaluation. */
+export type EvaluationMetric = {
+  id: string
+  optionId: string
+  kind: MetricKind
+  label: string
+  description: string
+  badges: string[]
+  // Optional; expected on 'custom-workflow' metrics.
+  customConfig?: CustomMetricConfig
+}
+
+/** A single condition: a field, an operator and an optional comparison value. */
+export type JudgmentConditionItem = {
+  id: string
+  fieldId: string | null
+  operator: ComparisonOperator
+  value: string | number | boolean | null
+}
+
+/** A group of condition items combined with a logical operator. */
+export type JudgmentConditionGroup = {
+  id: string
+  logicalOperator: 'and' | 'or'
+  items: JudgmentConditionItem[]
+}
+
+/** One entry of the batch test history. */
+export type BatchTestRecord = {
+  id: string
+  fileName: string
+  status: 'running' | 'success' | 'failed'
+  // Start time as a display string.
+  startedAt: string
+  summary: string
+}
+
+/** Complete evaluation state kept for one resource. */
+export type EvaluationResourceState = {
+  judgeModelId: string | null
+  metrics: EvaluationMetric[]
+  conditions: JudgmentConditionGroup[]
+  activeBatchTab: BatchTestTab
+  uploadedFileName: string | null
+  batchRecords: BatchTestRecord[]
+}
+
+/** Mock configuration: the selectable options and batch test texts. */
+export type EvaluationMockConfig = {
+  judgeModels: JudgeModelOption[]
+  builtinMetrics: MetricOption[]
+  workflowOptions: EvaluationWorkflowOption[]
+  fieldOptions: EvaluationFieldOption[]
+  templateFileName: string
+  batchRequirements: string[]
+  historySummaryLabel: string
+}
--- a/web/app/components/snippets/components/snippet-main.tsx
+++ b/web/app/components/snippets/components/snippet-main.tsx
@ -2,7 +2,7 @@

 import type { NavIcon } from '@/app/components/app-sidebar/nav-link'
 import type { WorkflowProps } from '@/app/components/workflow'
-import type { SnippetDetailPayload, SnippetInputField } from '@/models/snippet'
+import type { SnippetDetailPayload, SnippetInputField, SnippetSection } from '@/models/snippet'
 import {
  RiFlaskFill,
  RiFlaskLine,
@ -17,6 +17,7 @@ import NavLink from '@/app/components/app-sidebar/nav-link'
 import SnippetInfo from '@/app/components/app-sidebar/snippet-info'
 import { useStore as useAppStore } from '@/app/components/app/store'
 import Toast from '@/app/components/base/toast'
+import Evaluation from '@/app/components/evaluation'
 import { WorkflowWithInnerContext } from '@/app/components/workflow'
 import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
 import { useSnippetDetailStore } from '../store'
@ -25,6 +26,7 @@ import SnippetChildren from './snippet-children'
 type SnippetMainProps = {
  payload: SnippetDetailPayload
  snippetId: string
+  section: SnippetSection
 } & Pick<WorkflowProps, 'nodes' | 'edges' | 'viewport'>

 const ORCHESTRATE_ICONS: { normal: NavIcon, selected: NavIcon } = {
@ -40,6 +42,7 @@ const EVALUATION_ICONS: { normal: NavIcon, selected: NavIcon } = {
 const SnippetMain = ({
  payload,
  snippetId,
+  section,
  nodes,
  edges,
  viewport,
@ -51,7 +54,6 @@ const SnippetMain = ({
  const [fields, setFields] = useState<SnippetInputField[]>(payload.inputFields)
  const setAppSidebarExpand = useAppStore(state => state.setAppSidebarExpand)
  const {
-    activeSection,
    editingField,
    isEditorOpen,
    isInputPanelOpen,
@ -59,12 +61,10 @@ const SnippetMain = ({
    closeEditor,
    openEditor,
    reset,
-    setActiveSection,
    setInputPanelOpen,
    toggleInputPanel,
    togglePublishMenu,
  } = useSnippetDetailStore(useShallow(state => ({
-    activeSection: state.activeSection,
    editingField: state.editingField,
    isEditorOpen: state.isEditorOpen,
    isInputPanelOpen: state.isInputPanelOpen,
@ -72,7 +72,6 @@ const SnippetMain = ({
    closeEditor: state.closeEditor,
    openEditor: state.openEditor,
    reset: state.reset,
-    setActiveSection: state.setActiveSection,
    setInputPanelOpen: state.setInputPanelOpen,
    toggleInputPanel: state.toggleInputPanel,
    togglePublishMenu: state.togglePublishMenu,
@ -145,15 +144,15 @@ const SnippetMain = ({
              mode={mode}
              name={t('sectionOrchestrate')}
              iconMap={ORCHESTRATE_ICONS}
-              active={activeSection === 'orchestrate'}
-              onClick={() => setActiveSection('orchestrate')}
+              href={`/snippets/${snippetId}/orchestrate`}
+              active={section === 'orchestrate'}
            />
            <NavLink
              mode={mode}
              name={t('sectionEvaluation')}
              iconMap={EVALUATION_ICONS}
-              active={activeSection === 'evaluation'}
-              onClick={() => setActiveSection('evaluation')}
+              href={`/snippets/${snippetId}/evaluation`}
+              active={section === 'evaluation'}
            />
          </>
        )}
@ -161,29 +160,35 @@ const SnippetMain = ({

      <div className="relative min-h-0 min-w-0 grow overflow-hidden">
        <div className="absolute inset-0 min-h-0 min-w-0 overflow-hidden">
-          <WorkflowWithInnerContext
-            nodes={nodes}
-            edges={edges}
-            viewport={viewport ?? graph.viewport}
-          >
-            <SnippetChildren
-              fields={fields}
-              uiMeta={uiMeta}
-              editingField={editingField}
-              isEditorOpen={isEditorOpen}
-              isInputPanelOpen={isInputPanelOpen}
-              isPublishMenuOpen={isPublishMenuOpen}
-              onToggleInputPanel={handleToggleInputPanel}
-              onTogglePublishMenu={togglePublishMenu}
-              onCloseInputPanel={handleCloseInputPanel}
-              onOpenEditor={openEditor}
-              onCloseEditor={closeEditor}
-              onSubmitField={handleSubmitField}
-              onRemoveField={handleRemoveField}
-              onPrimarySortChange={handlePrimarySortChange}
-              onSecondarySortChange={handleSecondarySortChange}
-            />
-          </WorkflowWithInnerContext>
+          {section === 'evaluation'
+            ? (
+                <Evaluation resourceType="snippet" resourceId={snippetId} />
+              )
+            : (
+                <WorkflowWithInnerContext
+                  nodes={nodes}
+                  edges={edges}
+                  viewport={viewport ?? graph.viewport}
+                >
+                  <SnippetChildren
+                    fields={fields}
+                    uiMeta={uiMeta}
+                    editingField={editingField}
+                    isEditorOpen={isEditorOpen}
+                    isInputPanelOpen={isInputPanelOpen}
+                    isPublishMenuOpen={isPublishMenuOpen}
+                    onToggleInputPanel={handleToggleInputPanel}
+                    onTogglePublishMenu={togglePublishMenu}
+                    onCloseInputPanel={handleCloseInputPanel}
+                    onOpenEditor={openEditor}
+                    onCloseEditor={closeEditor}
+                    onSubmitField={handleSubmitField}
+                    onRemoveField={handleRemoveField}
+                    onPrimarySortChange={handlePrimarySortChange}
+                    onSecondarySortChange={handleSecondarySortChange}
+                  />
+                </WorkflowWithInnerContext>
+              )}
        </div>
      </div>
    </div>
--- a/web/app/components/snippets/index.tsx
+++ b/web/app/components/snippets/index.tsx
@ -1,5 +1,6 @@
 'use client'

+import type { SnippetSection } from '@/models/snippet'
 import { useMemo } from 'react'
 import { useTranslation } from 'react-i18next'
 import Loading from '@/app/components/base/loading'
@ -14,10 +15,12 @@ import { useSnippetInit } from './hooks/use-snippet-init'

 type SnippetPageProps = {
  snippetId: string
+  section?: SnippetSection
 }

 const SnippetPage = ({
  snippetId,
+  section = 'orchestrate',
 }: SnippetPageProps) => {
  const { t } = useTranslation('snippet')
  const { data, isLoading } = useSnippetInit(snippetId)
@ -62,6 +65,7 @@ const SnippetPage = ({
      <SnippetMain
        key={snippetId}
        snippetId={snippetId}
+        section={section}
        payload={data}
        nodes={nodesData}
        edges={edgesData}
--- a/web/i18n-config/resources.ts
+++ b/web/i18n-config/resources.ts
@ -14,6 +14,7 @@ import type datasetPipeline from '../i18n/en-US/dataset-pipeline.json'
 import type datasetSettings from '../i18n/en-US/dataset-settings.json'
 import type dataset from '../i18n/en-US/dataset.json'
 import type education from '../i18n/en-US/education.json'
+import type evaluation from '../i18n/en-US/evaluation.json'
 import type explore from '../i18n/en-US/explore.json'
 import type layout from '../i18n/en-US/layout.json'
 import type login from '../i18n/en-US/login.json'
@ -48,6 +49,7 @@ export type Resources = {
  datasetPipeline: typeof datasetPipeline
  datasetSettings: typeof datasetSettings
  education: typeof education
+  evaluation: typeof evaluation
  explore: typeof explore
  layout: typeof layout
  login: typeof login
@ -82,6 +84,7 @@ export const namespaces = [
  'datasetPipeline',
  'datasetSettings',
  'education',
+  'evaluation',
  'explore',
  'layout',
  'login',
--- a/web/i18n/en-US/common.json
+++ b/web/i18n/en-US/common.json
@ -93,6 +93,7 @@
  "apiBasedExtension.type": "Type",
  "appMenus.apiAccess": "API Access",
  "appMenus.apiAccessTip": "This knowledge base is accessible via the Service API",
+  "appMenus.evaluation": "Evaluation",
  "appMenus.logAndAnn": "Logs & Annotations",
  "appMenus.logs": "Logs",
  "appMenus.overview": "Monitoring",
@ -149,6 +150,7 @@
  "dataSource.website.with": "With",
  "datasetMenus.documents": "Documents",
  "datasetMenus.emptyTip": "This Knowledge has not been integrated within any application. Please refer to the document for guidance.",
+  "datasetMenus.evaluation": "Evaluation",
  "datasetMenus.hitTesting": "Retrieval Testing",
  "datasetMenus.noRelatedApp": "No linked apps",
  "datasetMenus.pipeline": "Pipeline",
--- a/web/i18n/en-US/evaluation.json
+++ b/web/i18n/en-US/evaluation.json
@ -0,0 +1,73 @@
+{
+  "batch.downloadTemplate": "Download Excel Template",
+  "batch.emptyHistory": "No test history yet.",
+  "batch.noticeDescription": "Download the template, upload your cases, then run a local mock batch test.",
+  "batch.noticeTitle": "Quick start",
+  "batch.requirementsTitle": "Data requirements",
+  "batch.run": "Run Test",
+  "batch.status.failed": "Failed",
+  "batch.status.running": "Running",
+  "batch.status.success": "Success",
+  "batch.tabs.history": "Test History",
+  "batch.tabs.input-fields": "Input Fields",
+  "batch.title": "Batch Test",
+  "batch.uploadHint": "Select a .csv or .xlsx file",
+  "batch.uploadTitle": "Upload test file",
+  "batch.validation": "Complete the judge model, metrics, and custom mappings before running a batch test.",
+  "conditions.addCondition": "Add Condition",
+  "conditions.addGroup": "Add Condition Group",
+  "conditions.boolean.false": "False",
+  "conditions.boolean.true": "True",
+  "conditions.description": "Define additional rules for when results should pass or fail.",
+  "conditions.emptyDescription": "Start with a condition group to define evaluation gates.",
+  "conditions.emptyTitle": "No conditions yet",
+  "conditions.fieldPlaceholder": "Select field",
+  "conditions.groupLabel": "Group {{index}}",
+  "conditions.logical.and": "AND",
+  "conditions.logical.or": "OR",
+  "conditions.operators.after": "After",
+  "conditions.operators.before": "Before",
+  "conditions.operators.contains": "Contains",
+  "conditions.operators.greater_or_equal": "Greater than or equal to",
+  "conditions.operators.greater_than": "Greater than",
+  "conditions.operators.is": "Is",
+  "conditions.operators.is_empty": "Is empty",
+  "conditions.operators.is_not": "Is not",
+  "conditions.operators.is_not_empty": "Is not empty",
+  "conditions.operators.less_or_equal": "Less than or equal to",
+  "conditions.operators.less_than": "Less than",
+  "conditions.operators.not_contains": "Does not contain",
+  "conditions.removeCondition": "Remove condition",
+  "conditions.removeGroup": "Remove condition group",
+  "conditions.selectFieldFirst": "Select a field first",
+  "conditions.selectTime": "Choose a time...",
+  "conditions.selectValue": "Choose a value",
+  "conditions.title": "Judgment Conditions",
+  "conditions.valuePlaceholder": "Enter a value",
+  "description": "Configure judge models, metrics, and batch tests for this resource.",
+  "judgeModel.description": "Choose the model used to score your evaluation results.",
+  "judgeModel.title": "Judge Model",
+  "metrics.add": "Add Metric",
+  "metrics.addCustom": "Add Custom Metrics",
+  "metrics.added": "Added",
+  "metrics.custom.addMapping": "Add Mapping",
+  "metrics.custom.description": "Select an evaluation workflow and map your variables before running tests.",
+  "metrics.custom.mappingTitle": "Variable Mapping",
+  "metrics.custom.mappingWarning": "Complete the workflow selection and each variable mapping to enable batch tests.",
+  "metrics.custom.sourcePlaceholder": "Source variable",
+  "metrics.custom.targetPlaceholder": "Target variable",
+  "metrics.custom.title": "Custom Evaluator",
+  "metrics.custom.warningBadge": "Needs setup",
+  "metrics.custom.workflowLabel": "Evaluation Workflow",
+  "metrics.custom.workflowPlaceholder": "Select a workflow",
+  "metrics.description": "Combine built-in metrics with custom evaluator workflows.",
+  "metrics.groups.operations": "Operations",
+  "metrics.groups.quality": "Quality",
+  "metrics.noResults": "No metrics match your search.",
+  "metrics.remove": "Remove metric",
+  "metrics.searchPlaceholder": "Search metrics",
+  "metrics.showLess": "Show less",
+  "metrics.showMore": "Show more",
+  "metrics.title": "Metrics",
+  "title": "Evaluation"
+}
--- a/web/i18n/zh-Hans/common.json
+++ b/web/i18n/zh-Hans/common.json
@ -93,6 +93,7 @@
  "apiBasedExtension.type": "类型",
  "appMenus.apiAccess": "访问 API",
  "appMenus.apiAccessTip": "此知识库可通过服务 API 访问",
+  "appMenus.evaluation": "评测",
  "appMenus.logAndAnn": "日志与标注",
  "appMenus.logs": "日志",
  "appMenus.overview": "监测",
@ -149,6 +150,7 @@
  "dataSource.website.with": "使用",
  "datasetMenus.documents": "文档",
  "datasetMenus.emptyTip": "此知识尚未集成到任何应用程序中。请参阅文档以获取指导。",
+  "datasetMenus.evaluation": "评测",
  "datasetMenus.hitTesting": "召回测试",
  "datasetMenus.noRelatedApp": "无关联应用",
  "datasetMenus.pipeline": "流水线",
--- a/web/i18n/zh-Hans/evaluation.json
+++ b/web/i18n/zh-Hans/evaluation.json
@ -0,0 +1,73 @@
+{
+  "batch.downloadTemplate": "下载 Excel 模板",
+  "batch.emptyHistory": "还没有测试历史。",
+  "batch.noticeDescription": "先下载模板，再上传测试集，然后运行本地模拟批量测试。",
+  "batch.noticeTitle": "快速开始",
+  "batch.requirementsTitle": "数据要求",
+  "batch.run": "运行测试",
+  "batch.status.failed": "失败",
+  "batch.status.running": "运行中",
+  "batch.status.success": "成功",
+  "batch.tabs.history": "测试历史",
+  "batch.tabs.input-fields": "输入字段",
+  "batch.title": "批量测试",
+  "batch.uploadHint": "选择 .csv 或 .xlsx 文件",
+  "batch.uploadTitle": "上传测试文件",
+  "batch.validation": "运行批量测试前，请先完成判定模型、指标和自定义映射配置。",
+  "conditions.addCondition": "添加条件",
+  "conditions.addGroup": "添加条件组",
+  "conditions.boolean.false": "否",
+  "conditions.boolean.true": "是",
+  "conditions.description": "定义额外规则，决定结果何时通过或失败。",
+  "conditions.emptyDescription": "从一个条件组开始，定义评测门槛。",
+  "conditions.emptyTitle": "还没有条件",
+  "conditions.fieldPlaceholder": "选择字段",
+  "conditions.groupLabel": "条件组 {{index}}",
+  "conditions.logical.and": "且",
+  "conditions.logical.or": "或",
+  "conditions.operators.after": "晚于",
+  "conditions.operators.before": "早于",
+  "conditions.operators.contains": "包含",
+  "conditions.operators.greater_or_equal": "大于等于",
+  "conditions.operators.greater_than": "大于",
+  "conditions.operators.is": "等于",
+  "conditions.operators.is_empty": "为空",
+  "conditions.operators.is_not": "不等于",
+  "conditions.operators.is_not_empty": "不为空",
+  "conditions.operators.less_or_equal": "小于等于",
+  "conditions.operators.less_than": "小于",
+  "conditions.operators.not_contains": "不包含",
+  "conditions.removeCondition": "删除条件",
+  "conditions.removeGroup": "删除条件组",
+  "conditions.selectFieldFirst": "请先选择字段",
+  "conditions.selectTime": "选择时间...",
+  "conditions.selectValue": "选择值",
+  "conditions.title": "判定条件",
+  "conditions.valuePlaceholder": "输入值",
+  "description": "为当前资源配置判定模型、评测指标和批量测试。",
+  "judgeModel.description": "选择用于打分和判定评测结果的模型。",
+  "judgeModel.title": "判定模型",
+  "metrics.add": "添加指标",
+  "metrics.addCustom": "添加自定义指标",
+  "metrics.added": "已添加",
+  "metrics.custom.addMapping": "添加映射",
+  "metrics.custom.description": "选择评测工作流并完成变量映射后即可运行测试。",
+  "metrics.custom.mappingTitle": "变量映射",
+  "metrics.custom.mappingWarning": "请先完成工作流选择和所有变量映射，再运行批量测试。",
+  "metrics.custom.sourcePlaceholder": "源变量",
+  "metrics.custom.targetPlaceholder": "目标变量",
+  "metrics.custom.title": "自定义评测器",
+  "metrics.custom.warningBadge": "待配置",
+  "metrics.custom.workflowLabel": "评测工作流",
+  "metrics.custom.workflowPlaceholder": "选择工作流",
+  "metrics.description": "组合内置指标和自定义评测工作流。",
+  "metrics.groups.operations": "运行",
+  "metrics.groups.quality": "质量",
+  "metrics.noResults": "没有匹配的指标。",
+  "metrics.remove": "删除指标",
+  "metrics.searchPlaceholder": "搜索指标",
+  "metrics.showLess": "收起",
+  "metrics.showMore": "展开更多",
+  "metrics.title": "指标",
+  "title": "评测"
+}