feat(web): evaluation run detail

2026-05-12 07:37:09 +08:00 · 2026-04-10 17:48:28 +08:00 · 2026-04-10 17:48:28 +08:00 · 79fc352a5a
commit 79fc352a5a
parent 8b6b3cddea
12 changed files with 518 additions and 13 deletions
--- a/web/app/components/evaluation/components/batch-test-panel/history-tab.tsx
+++ b/web/app/components/evaluation/components/batch-test-panel/history-tab.tsx
@ -1,7 +1,7 @@
 import type { EvaluationResourceProps } from '../../types'
-import type { EvaluationLogFile } from '@/types/evaluation'
+import type { EvaluationLog, EvaluationLogFile } from '@/types/evaluation'
 import { keepPreviousData, useMutation, useQuery } from '@tanstack/react-query'
-import { useState } from 'react'
+import { useEffect, useMemo, useState } from 'react'
 import { useTranslation } from 'react-i18next'
 import Pagination from '@/app/components/base/pagination'
 import {
@ -11,7 +11,9 @@ import {
  DropdownMenuTrigger,
 } from '@/app/components/base/ui/dropdown-menu'
 import { consoleClient, consoleQuery } from '@/service/client'
+import { cn } from '@/utils/classnames'
 import { downloadUrl } from '@/utils/download'
+import { useEvaluationResource, useEvaluationStore } from '../../store'

 const PAGE_SIZE = 16
 const LOADING_ROW_IDS = ['1', '2', '3', '4', '5', '6']
@ -23,12 +25,18 @@ const formatCreatedAt = (createdAt: string) => {
  return createdAt.includes('T') ? createdAt.slice(0, 10) : createdAt
 }

+/**
+ * Picks the run identifier carried by a log record.
+ * Tries `run_id`, then `evaluation_run_id`, then `id`; yields null when all are null/undefined.
+ */
+const getLogRunId = (record: EvaluationLog) => {
+  const { run_id: runId, evaluation_run_id: evaluationRunId, id: logId } = record
+  return runId ?? evaluationRunId ?? logId ?? null
+}
+
 const HistoryTab = ({
  resourceType,
  resourceId,
 }: EvaluationResourceProps) => {
  const { t } = useTranslation('evaluation')
  const [page, setPage] = useState(0)
+  const resource = useEvaluationResource(resourceType, resourceId)
+  const setSelectedRunId = useEvaluationStore(state => state.setSelectedRunId)
  const logsQuery = useQuery({
    ...consoleQuery.evaluation.logs.queryOptions({
      input: {
@ -58,10 +66,19 @@ const HistoryTab = ({
      downloadUrl({ url: fileInfo.download_url, fileName: file.name })
    },
  })
-  const records = logsQuery.data?.data ?? []
+  const records = useMemo(() => logsQuery.data?.data ?? [], [logsQuery.data?.data])
  const total = logsQuery.data?.total ?? 0
  const isInitialLoading = logsQuery.isLoading && !logsQuery.data

+  useEffect(() => {
+    if (resource.selectedRunId)
+      return
+
+    const firstRunId = records.map(getLogRunId).find((runId): runId is string => !!runId)
+    if (firstRunId)
+      setSelectedRunId(resourceType, resourceId, firstRunId)
+  }, [records, resource.selectedRunId, resourceId, resourceType, setSelectedRunId])
+
  return (
    <div className="flex min-h-full flex-col">
      <div className="min-h-0 flex-1 overflow-hidden">
@ -98,7 +115,19 @@ const HistoryTab = ({
              </tr>
            ))}
            {!isInitialLoading && records.map(record => (
-              <tr key={`${record.created_at}-${record.test_file.id}`} className="border-b border-divider-subtle">
+              <tr
+                key={`${record.created_at}-${record.test_file.id}`}
+                className={cn(
+                  'border-b border-divider-subtle',
+                  getLogRunId(record) && 'cursor-pointer hover:bg-state-base-hover',
+                  getLogRunId(record) === resource.selectedRunId && 'bg-background-default-subtle',
+                )}
+                onClick={() => {
+                  const runId = getLogRunId(record)
+                  if (runId)
+                    setSelectedRunId(resourceType, resourceId, runId)
+                }}
+              >
                <td className="h-10 truncate px-3 system-sm-regular text-text-secondary">{formatCreatedAt(record.created_at)}</td>
                <td className="h-10 truncate px-3 system-sm-regular text-text-secondary">{record.created_by}</td>
                <td className="h-10 truncate px-3 system-sm-regular text-text-secondary">{record.version || '-'}</td>
@ -115,6 +144,7 @@ const HistoryTab = ({
                          type="button"
                          aria-label={t('history.actions.open')}
                          className="inline-flex h-8 w-8 items-center justify-center rounded-md text-text-tertiary hover:bg-state-base-hover hover:text-text-secondary"
+                          onClick={event => event.stopPropagation()}
                        />
                      )}
                    >
@ -123,7 +153,10 @@ const HistoryTab = ({
                    <DropdownMenuContent popupClassName="w-[180px] rounded-lg border-[0.5px] border-components-panel-border py-1 shadow-lg">
                      <DropdownMenuItem
                        className="gap-2"
-                        onClick={() => fileDownloadMutation.mutate(record.test_file)}
+                        onClick={(event) => {
+                          event.stopPropagation()
+                          fileDownloadMutation.mutate(record.test_file)
+                        }}
                      >
                        <span aria-hidden="true" className="i-ri-file-download-line h-4 w-4" />
                        {t('history.actions.downloadTestFile')}
@ -131,7 +164,11 @@ const HistoryTab = ({
                      <DropdownMenuItem
                        className="gap-2"
                        disabled={!record.result_file}
-                        onClick={() => record.result_file && fileDownloadMutation.mutate(record.result_file)}
+                        onClick={(event) => {
+                          event.stopPropagation()
+                          if (record.result_file)
+                            fileDownloadMutation.mutate(record.result_file)
+                        }}
                      >
                        <span aria-hidden="true" className="i-ri-download-2-line h-4 w-4" />
                        {t('history.actions.downloadResultFile')}
--- a/web/app/components/evaluation/components/batch-test-panel/input-fields/use-input-fields-actions.ts
+++ b/web/app/components/evaluation/components/batch-test-panel/input-fields/use-input-fields-actions.ts
@ -36,6 +36,7 @@ export const useInputFieldsActions = ({
  const { t } = useTranslation('evaluation')
  const resource = useEvaluationResource(resourceType, resourceId)
  const setBatchTab = useEvaluationStore(state => state.setBatchTab)
+  const setSelectedRunId = useEvaluationStore(state => state.setSelectedRunId)
  const setUploadedFile = useEvaluationStore(state => state.setUploadedFile)
  const setUploadedFileName = useEvaluationStore(state => state.setUploadedFileName)
  const startRunMutation = useStartEvaluationRunMutation()
@ -115,8 +116,9 @@ export const useInputFieldsActions = ({
      },
      body,
    }, {
-      onSuccess: () => {
+      onSuccess: (run) => {
        toast.success(t('batch.runStarted'))
+        setSelectedRunId(resourceType, resourceId, run.id)
        setIsUploadPopoverOpen(false)
        setBatchTab(resourceType, resourceId, 'history')
      },
--- a/web/app/components/evaluation/components/layout/pipeline-evaluation.tsx
+++ b/web/app/components/evaluation/components/layout/pipeline-evaluation.tsx
@ -84,7 +84,10 @@ const PipelineEvaluation = ({
      </div>

      <div className="min-h-0 flex-1 bg-background-default">
-        <PipelineResultsPanel />
+        <PipelineResultsPanel
+          resourceType={resourceType}
+          resourceId={resourceId}
+        />
      </div>
    </div>
  )
--- a/web/app/components/evaluation/components/pipeline/pipeline-results-panel.tsx
+++ b/web/app/components/evaluation/components/pipeline/pipeline-results-panel.tsx
@ -1,16 +1,132 @@
 'use client'

+import type { EvaluationResourceProps } from '../../types'
+import { skipToken, useMutation, useQuery } from '@tanstack/react-query'
 import { useTranslation } from 'react-i18next'
+import { consoleClient, consoleQuery } from '@/service/client'
+import { downloadUrl } from '@/utils/download'
+import { useEvaluationResource } from '../../store'
+import { decodeModelSelection } from '../../utils'
+import PipelineResultsTable from './pipeline-results-table'
+import { getMetricColumns, getRunDate } from './pipeline-results-utils'

-const PipelineResultsPanel = () => {
+const PAGE_SIZE = 100
+
+/**
+ * Results pane for the run currently selected in the evaluation store.
+ *
+ * Requests the first page (up to `PAGE_SIZE` items) of the selected run's detail and renders
+ * a summary line, an export button for the run's result file, and the per-item table.
+ * Shows the empty placeholder when no run is selected or the run loaded without items,
+ * and the load-failed message when the detail request errors.
+ */
+const PipelineResultsPanel = ({
+  resourceType,
+  resourceId,
+}: EvaluationResourceProps) => {
  const { t } = useTranslation('evaluation')
+  const resource = useEvaluationResource(resourceType, resourceId)
+  const selectedModel = decodeModelSelection(resource.judgeModelId)
+  const selectedRunId = resource.selectedRunId
+  // `skipToken` keeps the query idle until a run has been selected.
+  const runDetailQuery = useQuery(consoleQuery.evaluation.runDetail.queryOptions({
+    input: selectedRunId
+      ? {
+          params: {
+            targetType: resourceType,
+            targetId: resourceId,
+            runId: selectedRunId,
+          },
+          query: {
+            page: 1,
+            page_size: PAGE_SIZE,
+          },
+        }
+      : skipToken,
+    refetchOnWindowFocus: false,
+  }))
+  // Resolves the file's download URL first, then triggers the browser download.
+  const resultFileDownloadMutation = useMutation({
+    mutationFn: async (fileId: string) => {
+      const fileInfo = await consoleClient.evaluation.file({
+        params: {
+          targetType: resourceType,
+          targetId: resourceId,
+          fileId,
+        },
+      })
+
+      downloadUrl({ url: fileInfo.download_url, fileName: fileInfo.name })
+    },
+  })
+  const runDetail = runDetailQuery.data
+  const items = runDetail?.items.data ?? []
+  const metricColumns = getMetricColumns(resource, items)
+  const thresholdColumns = metricColumns.filter(column => column.threshold !== undefined)
+  // A failed request also leaves `items` empty; it must reach the error message below
+  // instead of being reported as "no results", so the error state is excluded here.
+  const isEmpty = !selectedRunId || (!runDetailQuery.isLoading && !runDetailQuery.isError && items.length === 0)
+
+  if (isEmpty) {
+    return (
+      <div className="flex min-h-[360px] flex-1 items-center justify-center xl:min-h-0">
+        <div className="flex flex-col items-center gap-4 px-4 text-center">
+          <span aria-hidden="true" className="i-ri-file-list-3-line h-12 w-12 text-text-quaternary" />
+          <div className="system-md-medium text-text-quaternary">{t('results.empty')}</div>
+        </div>
+      </div>
+    )
+  }

  return (
-    <div className="flex min-h-[360px] flex-1 items-center justify-center xl:min-h-0">
-      <div className="flex flex-col items-center gap-4 px-4 text-center">
-        <span aria-hidden="true" className="i-ri-file-list-3-line h-12 w-12 text-text-quaternary" />
-        <div className="system-md-medium text-text-quaternary">{t('results.empty')}</div>
+    <div className="flex h-full min-h-0 flex-col border-l border-divider-subtle bg-background-default">
+      <div className="shrink-0 px-6 pt-4 pb-2">
+        <h2 className="system-xl-semibold text-text-primary">{t('results.title')}</h2>
      </div>
+      {runDetailQuery.isError && (
+        <div className="px-6 py-4 system-sm-regular text-text-destructive">{t('results.loadFailed')}</div>
+      )}
+      {!runDetailQuery.isError && (
+        <div className="flex min-h-0 flex-1 flex-col px-6 py-1">
+          <div className="flex shrink-0 flex-wrap items-center justify-between gap-3 py-1">
+            <div className="flex min-w-0 flex-wrap items-center gap-2 system-xs-regular text-text-secondary">
+              <span>{getRunDate(runDetail?.run.started_at ?? runDetail?.run.created_at ?? null)}</span>
+              <span aria-hidden="true">·</span>
+              <span>{t('results.queryCount', { count: runDetail?.run.total_items ?? runDetail?.items.total ?? items.length })}</span>
+              {selectedModel && (
+                <>
+                  <span aria-hidden="true">·</span>
+                  <span className="inline-flex min-w-0 items-center gap-1.5 rounded-lg bg-background-section-burn px-2 py-1">
+                    <span aria-hidden="true" className="i-ri-robot-2-line h-4 w-4 shrink-0 text-text-accent" />
+                    <span className="truncate">{selectedModel.model}</span>
+                  </span>
+                </>
+              )}
+              {thresholdColumns.length > 0 && (
+                <>
+                  <span aria-hidden="true">·</span>
+                  <span className="flex min-w-0 flex-wrap items-center gap-1">
+                    {thresholdColumns.map(column => (
+                      <span
+                        key={column.id}
+                        className="rounded-lg border-[0.5px] border-divider-subtle bg-background-section px-2 py-1 text-text-tertiary"
+                      >
+                        {t('results.metricThreshold', { metric: column.label, threshold: column.threshold })}
+                      </span>
+                    ))}
+                  </span>
+                </>
+              )}
+            </div>
+            <button
+              type="button"
+              className="inline-flex h-7 shrink-0 items-center gap-1 rounded-md border-[0.5px] border-components-button-secondary-border bg-components-button-secondary-bg px-2 system-xs-medium text-components-button-secondary-text shadow-xs disabled:cursor-not-allowed disabled:opacity-50"
+              disabled={!runDetail?.run.result_file_id || resultFileDownloadMutation.isPending}
+              onClick={() => {
+                if (runDetail?.run.result_file_id)
+                  resultFileDownloadMutation.mutate(runDetail.run.result_file_id)
+              }}
+            >
+              <span aria-hidden="true" className="i-ri-download-2-line h-3.5 w-3.5" />
+              {t('results.export')}
+            </button>
+          </div>
+
+          <PipelineResultsTable
+            items={items}
+            metricColumns={metricColumns}
+            isLoading={runDetailQuery.isLoading}
+          />
+        </div>
+      )}
    </div>
  )
 }
--- a/web/app/components/evaluation/components/pipeline/pipeline-results-table.tsx
+++ b/web/app/components/evaluation/components/pipeline/pipeline-results-table.tsx
@ -0,0 +1,118 @@
+import type { MetricColumn } from './pipeline-results-utils'
+import type { EvaluationRunItem } from '@/types/evaluation'
+import { useTranslation } from 'react-i18next'
+import { cn } from '@/utils/classnames'
+import {
+  formatValue,
+  getIsItemPassed,
+  getMetricTextClassName,
+  getMetricValue,
+  getQueryContent,
+} from './pipeline-results-utils'
+
+const LOADING_ROW_IDS = ['1', '2', '3', '4', '5', '6']
+
+type PipelineResultsTableProps = {
+  items: EvaluationRunItem[]
+  metricColumns: MetricColumn[]
+  isLoading: boolean
+}
+
+/**
+ * Table of evaluation run items: a pass/fail icon, the query, the expected and actual
+ * output, followed by one cell per metric column. While `isLoading` is true it renders
+ * placeholder rows instead of the items.
+ */
+const PipelineResultsTable = ({
+  items,
+  metricColumns,
+  isLoading,
+}: PipelineResultsTableProps) => {
+  const { t } = useTranslation('evaluation')
+
+  return (
+    <div className="min-h-0 flex-1 overflow-auto py-2">
+      <table className="min-w-full table-fixed border-collapse overflow-hidden rounded-lg">
+        <colgroup>
+          <col className="w-10" />
+          <col className="w-[220px]" />
+          <col className="w-[190px]" />
+          <col className="w-[220px]" />
+          {metricColumns.map(column => <col key={column.id} className="w-24" />)}
+        </colgroup>
+        <thead>
+          <tr className="bg-background-section">
+            <th className="h-7 rounded-l-lg" />
+            <th className="h-7 px-3 text-left system-xs-medium-uppercase text-text-tertiary">{t('results.columns.query')}</th>
+            <th className="h-7 px-3 text-left system-xs-medium-uppercase text-text-tertiary">{t('results.columns.expected')}</th>
+            <th
+              className={cn(
+                'h-7 px-3 text-left system-xs-medium-uppercase text-text-tertiary',
+                // Without metric columns this is the last header cell, so it closes the rounded header bar.
+                metricColumns.length === 0 && 'rounded-r-lg',
+              )}
+            >
+              {t('results.columns.actual')}
+            </th>
+            {metricColumns.map((column, index) => (
+              <th
+                key={column.id}
+                className={cn(
+                  'h-7 px-3 text-left system-xs-medium-uppercase text-text-tertiary',
+                  index === metricColumns.length - 1 && 'rounded-r-lg',
+                )}
+              >
+                {column.label}
+              </th>
+            ))}
+          </tr>
+        </thead>
+        <tbody>
+          {isLoading && LOADING_ROW_IDS.map(rowId => (
+            <tr key={rowId} className="border-b border-divider-subtle">
+              <td colSpan={4 + metricColumns.length} className="h-10 px-3">
+                <div className="h-4 animate-pulse rounded bg-background-section" />
+              </td>
+            </tr>
+          ))}
+          {!isLoading && items.map((item) => {
+            const isPassed = getIsItemPassed(item, metricColumns)
+            // An item error takes the place of the actual output in the "actual" cell.
+            const actualOutput = item.error ?? item.actual_output
+
+            return (
+              <tr key={item.id} className="border-b border-divider-subtle even:bg-background-default-subtle">
+                <td className="h-10 px-3 align-top">
+                  {/* role="img" lets the aria-label name this icon-only element */}
+                  <span
+                    role="img"
+                    aria-label={isPassed ? t('results.status.passed') : t('results.status.failed')}
+                    className={cn(
+                      'mt-3 inline-block h-4 w-4',
+                      isPassed
+                        ? 'i-ri-check-line text-util-colors-green-green-600'
+                        : 'i-ri-close-line text-util-colors-red-red-600',
+                    )}
+                  />
+                </td>
+                <td className="h-10 px-3 py-3 align-top system-sm-regular text-text-secondary">
+                  <div className="line-clamp-2 break-words">{getQueryContent(item)}</div>
+                </td>
+                <td className="h-10 px-3 py-3 align-top system-sm-regular text-text-secondary">
+                  <div className="line-clamp-2 break-words">{formatValue(item.expected_output)}</div>
+                </td>
+                <td className={cn(
+                  'h-10 px-3 py-3 align-top system-sm-regular',
+                  actualOutput ? 'text-text-secondary' : 'text-text-destructive',
+                )}
+                >
+                  <div className="line-clamp-2 break-words">
+                    {actualOutput ? formatValue(actualOutput) : t('results.noResult')}
+                  </div>
+                </td>
+                {metricColumns.map((column) => {
+                  const metricValue = getMetricValue(item.metrics, column)
+
+                  return (
+                    <td
+                      key={column.id}
+                      className={cn('h-10 px-3 py-3 align-top system-sm-regular', getMetricTextClassName(metricValue, column))}
+                    >
+                      {formatValue(metricValue)}
+                    </td>
+                  )
+                })}
+              </tr>
+            )
+          })}
+        </tbody>
+      </table>
+    </div>
+  )
+}
+
+export default PipelineResultsTable
--- a/web/app/components/evaluation/components/pipeline/pipeline-results-utils.ts
+++ b/web/app/components/evaluation/components/pipeline/pipeline-results-utils.ts
@ -0,0 +1,179 @@
+import type { EvaluationResourceState } from '../../types'
+import type { EvaluationRunItem, EvaluationRunMetric } from '@/types/evaluation'
+import { formatTime } from '@/utils/time'
+
+// Input keys probed, in this order, when choosing which input value to show as a row's query text.
+const PREFERRED_QUERY_INPUT_KEYS = ['query', 'question', 'input']
+
+/** Describes one metric column of the results table. */
+export type MetricColumn = {
+  // Metric identifier; compared against metric names after normalization.
+  id: string
+  // Header text; also accepted as an alternative match for metric names.
+  label: string
+  // Minimum numeric value treated as passing (compared with >=); absent when no threshold applies.
+  threshold?: number
+}
+
+/** Comparison key for a metric name: lower-cased, with whitespace, underscores and hyphens removed. */
+const normalizeMetricKey = (value: string) => {
+  const lowered = value.toLowerCase()
+  return lowered.replace(/[\s_-]/g, '')
+}
+
+/**
+ * Converts a hyphen/underscore separated metric name into display text:
+ * empty segments are dropped, each remaining segment gets an upper-cased first
+ * character, and the segments are joined with single spaces.
+ */
+const humanizeMetricName = (name: string) => {
+  const words: string[] = []
+
+  for (const segment of name.split(/[-_]/g)) {
+    if (segment)
+      words.push(segment.charAt(0).toUpperCase() + segment.slice(1))
+  }
+
+  return words.join(' ')
+}
+
+/**
+ * Renders an arbitrary cell value as display text; always returns a string.
+ *
+ * - null, undefined and the empty string become '-'
+ * - strings are returned unchanged
+ * - integers are printed as-is; other numbers are locale-formatted with at most 3 fraction digits
+ * - booleans become 'true' / 'false'
+ * - bigints are printed in decimal
+ * - anything else is JSON-serialised, falling back to String() when JSON cannot represent it
+ */
+export const formatValue = (value: unknown) => {
+  if (value === null || value === undefined || value === '')
+    return '-'
+
+  if (typeof value === 'string')
+    return value
+
+  if (typeof value === 'number') {
+    return Number.isInteger(value)
+      ? String(value)
+      : value.toLocaleString(undefined, { maximumFractionDigits: 3 })
+  }
+
+  if (typeof value === 'boolean')
+    return value ? 'true' : 'false'
+
+  // JSON.stringify throws on BigInt, so handle it before serialising.
+  if (typeof value === 'bigint')
+    return value.toString()
+
+  try {
+    // JSON.stringify yields undefined for functions and symbols; keep the return type a string.
+    return JSON.stringify(value) ?? String(value)
+  }
+  catch {
+    // Circular structures (or a throwing toJSON) cannot be serialised; degrade instead of breaking the render.
+    return String(value)
+  }
+}
+
+/**
+ * Picks the text shown in the "query" column for a run item.
+ *
+ * The preferred input keys are tried in order; the first one holding a non-blank value wins.
+ * Otherwise the first non-blank input value of any key is used. Blank means undefined,
+ * null or the empty string. Returns '-' (via formatValue) when every input is blank.
+ */
+export const getQueryContent = (item: EvaluationRunItem) => {
+  const isPresent = (value: unknown) => value !== undefined && value !== null && value !== ''
+
+  for (const key of PREFERRED_QUERY_INPUT_KEYS) {
+    const value = item.inputs[key]
+    // A blank preferred key must not hide a later key that does carry content.
+    if (isPresent(value))
+      return formatValue(value)
+  }
+
+  const firstValue = Object.values(item.inputs).find(isPresent)
+  return formatValue(firstValue)
+}
+
+/**
+ * Looks up the value of the first metric whose normalized name equals the column's
+ * normalized id or normalized label. Metrics without a name are ignored.
+ * Returns undefined when nothing matches.
+ */
+export const getMetricValue = (metrics: EvaluationRunMetric[], column: MetricColumn) => {
+  const acceptedKeys = [normalizeMetricKey(column.id), normalizeMetricKey(column.label)]
+
+  for (const candidate of metrics) {
+    if (candidate.name && acceptedKeys.includes(normalizeMetricKey(candidate.name)))
+      return candidate.value
+  }
+
+  return undefined
+}
+
+/**
+ * Numeric view of a column's metric value: numbers pass through, non-blank strings are
+ * parsed with Number(), and anything else (including unparsable strings) yields null.
+ */
+const getNumericMetricValue = (metrics: EvaluationRunMetric[], column: MetricColumn) => {
+  const raw = getMetricValue(metrics, column)
+
+  switch (typeof raw) {
+    case 'number':
+      return raw
+    case 'string': {
+      if (raw.trim() === '')
+        return null
+
+      const parsed = Number(raw)
+      return Number.isNaN(parsed) ? null : parsed
+    }
+    default:
+      return null
+  }
+}
+
+/**
+ * Chooses the text colour class for a metric cell.
+ *
+ * Values that are not numeric (or not parsable as a number), and columns without a
+ * threshold, use the neutral class. Otherwise: green when the value meets the threshold,
+ * red when it is exactly 0, and the warning colour for everything in between.
+ */
+export const getMetricTextClassName = (value: unknown, column: MetricColumn) => {
+  const neutralClassName = 'text-text-secondary'
+
+  let score: number
+  if (typeof value === 'number')
+    score = value
+  else if (typeof value === 'string' && value.trim() !== '')
+    score = Number(value)
+  else
+    return neutralClassName
+
+  const { threshold } = column
+  if (Number.isNaN(score) || threshold === undefined)
+    return neutralClassName
+
+  if (score >= threshold)
+    return 'text-util-colors-green-green-600'
+
+  return score === 0
+    ? 'text-util-colors-red-red-600'
+    : 'text-util-colors-warning-warning-600'
+}
+
+/**
+ * Reads an explicit pass/fail verdict out of a judgment object.
+ *
+ * The keys 'passed', 'pass', 'success' and 'result' are inspected in that order. A boolean
+ * is returned as-is; a string is compared case-insensitively against known pass/fail words.
+ * Keys with any other value are skipped. Returns null when no key gives a verdict.
+ */
+const getJudgmentResult = (judgment: Record<string, unknown>) => {
+  const passWords = ['passed', 'pass', 'success', 'succeeded', 'true']
+  const failWords = ['failed', 'fail', 'failure', 'false']
+
+  for (const key of ['passed', 'pass', 'success', 'result']) {
+    const verdict = judgment[key]
+
+    if (typeof verdict === 'boolean')
+      return verdict
+
+    if (typeof verdict !== 'string')
+      continue
+
+    const lowered = verdict.toLowerCase()
+    if (passWords.includes(lowered))
+      return true
+
+    if (failWords.includes(lowered))
+      return false
+  }
+
+  return null
+}
+
+/**
+ * Decides whether a run item counts as passed.
+ *
+ * Order of precedence:
+ * 1. an item with an error never passes;
+ * 2. an explicit verdict found in the item's judgment is used as-is;
+ * 3. if any column has a threshold, every such column must have a numeric value >= its threshold;
+ * 4. otherwise a null overall score passes, and a non-null one passes only when it is > 0.
+ */
+export const getIsItemPassed = (item: EvaluationRunItem, metricColumns: MetricColumn[]) => {
+  if (item.error)
+    return false
+
+  const verdict = getJudgmentResult(item.judgment)
+  if (verdict !== null)
+    return verdict
+
+  const gatedColumns = metricColumns.filter(column => column.threshold !== undefined)
+  if (gatedColumns.length === 0)
+    return item.overall_score === null ? true : item.overall_score > 0
+
+  for (const column of gatedColumns) {
+    const score = getNumericMetricValue(item.metrics, column)
+    // Written with >= so a NaN score fails the check.
+    const meetsThreshold = column.threshold !== undefined && score !== null && score >= column.threshold
+    if (!meetsThreshold)
+      return false
+  }
+
+  return true
+}
+
+/**
+ * Builds the ordered list of metric columns for the results table.
+ *
+ * Columns configured on the resource come first and carry their label and threshold.
+ * Metrics that appear only in the run items are appended, in first-seen order, with a
+ * humanized label and no threshold. An item metric is treated as already covered when its
+ * normalized name equals a configured column's normalized id or normalized label, which is
+ * the same matching rule getMetricValue uses to fill the cells.
+ */
+export const getMetricColumns = (
+  resource: EvaluationResourceState,
+  items: EvaluationRunItem[],
+) => {
+  const columns = new Map<string, MetricColumn>()
+  // Every normalized name an existing column already answers to (its id and its label).
+  const claimedKeys = new Set<string>()
+
+  resource.metrics.forEach((metric) => {
+    const normalizedId = normalizeMetricKey(metric.optionId)
+    columns.set(normalizedId, {
+      id: metric.optionId,
+      label: metric.label,
+      threshold: metric.threshold,
+    })
+    claimedKeys.add(normalizedId)
+    // Reserve the label too, so a metric reported under the label does not add a duplicate column.
+    claimedKeys.add(normalizeMetricKey(metric.label))
+  })
+
+  items.forEach((item) => {
+    item.metrics.forEach((metric) => {
+      if (!metric.name)
+        return
+
+      const normalizedName = normalizeMetricKey(metric.name)
+      if (claimedKeys.has(normalizedName))
+        return
+
+      columns.set(normalizedName, {
+        id: metric.name,
+        label: humanizeMetricName(metric.name),
+      })
+      claimedKeys.add(normalizedName)
+    })
+  })
+
+  return Array.from(columns.values())
+}
+
+/**
+ * Formats a run timestamp as 'YYYY-MM-DD HH:mm', or '-' when the timestamp is null or 0.
+ * Values above 1e12 are passed through unchanged; smaller values are multiplied by 1000
+ * first (presumably seconds vs. milliseconds since the epoch; confirm against the API).
+ */
+export const getRunDate = (timestamp: number | null) => {
+  if (!timestamp)
+    return '-'
+
+  const alreadyMilliseconds = timestamp > 1_000_000_000_000
+  return formatTime({
+    date: alreadyMilliseconds ? timestamp : timestamp * 1000,
+    dateFormat: 'YYYY-MM-DD HH:mm',
+  })
+}
--- a/web/app/components/evaluation/store-utils.ts
+++ b/web/app/components/evaluation/store-utils.ts
@ -394,6 +394,7 @@ export const buildInitialState = (_resourceType: EvaluationResourceType): Evalua
    activeBatchTab: 'input-fields',
    uploadedFileId: null,
    uploadedFileName: null,
+    selectedRunId: null,
    batchRecords: [],
  }
 }
--- a/web/app/components/evaluation/store.ts
+++ b/web/app/components/evaluation/store.ts
@ -82,6 +82,7 @@ type EvaluationStore = {
    uploadedFile: { id: string, name: string } | null,
  ) => void
  setUploadedFileName: (resourceType: EvaluationResourceType, resourceId: string, uploadedFileName: string | null) => void
+  setSelectedRunId: (resourceType: EvaluationResourceType, resourceId: string, runId: string | null) => void
  runBatchTest: (resourceType: EvaluationResourceType, resourceId: string) => void
 }

@ -110,6 +111,7 @@ export const useEvaluationStore = create<EvaluationStore>((set, get) => ({
          activeBatchTab: state.resources[buildResourceKey(resourceType, resourceId)]?.activeBatchTab ?? 'input-fields',
          uploadedFileId: state.resources[buildResourceKey(resourceType, resourceId)]?.uploadedFileId ?? null,
          uploadedFileName: state.resources[buildResourceKey(resourceType, resourceId)]?.uploadedFileName ?? null,
+          selectedRunId: state.resources[buildResourceKey(resourceType, resourceId)]?.selectedRunId ?? null,
          batchRecords: state.resources[buildResourceKey(resourceType, resourceId)]?.batchRecords ?? [],
        },
      },
@ -393,6 +395,14 @@ export const useEvaluationStore = create<EvaluationStore>((set, get) => ({
      })),
    }))
  },
+  setSelectedRunId: (resourceType, resourceId, runId) => {
+    set(state => ({
+      resources: updateResourceState(state.resources, resourceType, resourceId, resource => ({
+        ...resource,
+        selectedRunId: runId,
+      })),
+    }))
+  },
  runBatchTest: (resourceType, resourceId) => {
    const { uploadedFileName } = get().resources[buildResourceKey(resourceType, resourceId)] ?? buildInitialState(resourceType)
    const nextRecord = createBatchTestRecord(resourceType, uploadedFileName)
--- a/web/app/components/evaluation/types.ts
+++ b/web/app/components/evaluation/types.ts
@ -138,6 +138,7 @@ export type EvaluationResourceState = {
  activeBatchTab: BatchTestTab
  uploadedFileId: string | null
  uploadedFileName: string | null
+  selectedRunId: string | null
  batchRecords: BatchTestRecord[]
 }

--- a/web/i18n/en-US/evaluation.json
+++ b/web/i18n/en-US/evaluation.json
@ -115,6 +115,18 @@
  "metrics.update": "Update",
  "pipeline.passIf": "Pass if \u2265",
  "pipeline.uploadAndRun": "Upload & Run Test",
+  "results.columns.actual": "Actual Result",
+  "results.columns.expected": "Expected Result",
+  "results.columns.query": "Query Content",
  "results.empty": "No evaluation results yet.",
+  "results.export": "Export",
+  "results.loadFailed": "Failed to load evaluation results.",
+  "results.metricThreshold": "{{metric}} \u2265 {{threshold}}",
+  "results.noResult": "No Result",
+  "results.queryCount_one": "{{count}} query",
+  "results.queryCount_other": "{{count}} queries",
+  "results.status.failed": "Failed",
+  "results.status.passed": "Passed",
+  "results.title": "Test Details",
  "title": "Evaluation"
 }
--- a/web/i18n/zh-Hans/evaluation.json
+++ b/web/i18n/zh-Hans/evaluation.json
@ -91,13 +91,36 @@
  "metrics.custom.workflowLabel": "评测工作流",
  "metrics.custom.workflowPlaceholder": "选择工作流",
  "metrics.description": "从内置指标中选择，如 Groundedness 和 Correctness ，以评估您的工作流输出。",
+  "metrics.expandNodes": "展开节点",
  "metrics.groups.operations": "运行",
+  "metrics.groups.other": "其他",
  "metrics.groups.quality": "质量",
+  "metrics.noNodesInWorkflow": "当前工作流中没有 LLM 节点",
  "metrics.noResults": "没有匹配的指标。",
+  "metrics.nodesAll": "全部节点",
+  "metrics.nodesLabel": "节点范围",
+  "metrics.nodesSelected": "已选节点",
  "metrics.remove": "删除指标",
+  "metrics.searchNodeOrMetrics": "搜索节点或指标",
  "metrics.searchPlaceholder": "搜索指标",
  "metrics.showLess": "收起",
  "metrics.showMore": "展开更多",
  "metrics.title": "指标",
+  "metrics.update": "更新",
+  "pipeline.passIf": "通过条件 \u2265",
+  "pipeline.uploadAndRun": "上传并运行测试",
+  "results.columns.actual": "实际结果",
+  "results.columns.expected": "预期结果",
+  "results.columns.query": "Query 内容",
+  "results.empty": "还没有评测结果。",
+  "results.export": "导出",
+  "results.loadFailed": "加载评测结果失败。",
+  "results.metricThreshold": "{{metric}} \u2265 {{threshold}}",
+  "results.noResult": "无结果",
+  "results.queryCount_one": "{{count}} 条 query",
+  "results.queryCount_other": "{{count}} 条 query",
+  "results.status.failed": "失败",
+  "results.status.passed": "通过",
+  "results.title": "测试详情",
  "title": "评测"
 }
--- a/web/types/evaluation.ts
+++ b/web/types/evaluation.ts
@ -83,6 +83,9 @@ export type EvaluationLogFile = {
 }

 export type EvaluationLog = {
+  id?: string
+  run_id?: string
+  evaluation_run_id?: string
  created_at: string
  created_by: string
  test_file: EvaluationLogFile