feat(web): evaluation run detail

This commit is contained in:
JzoNg 2026-04-10 17:48:28 +08:00
parent 8b6b3cddea
commit 79fc352a5a
12 changed files with 518 additions and 13 deletions

View File

@ -1,7 +1,7 @@
import type { EvaluationResourceProps } from '../../types'
import type { EvaluationLogFile } from '@/types/evaluation'
import type { EvaluationLog, EvaluationLogFile } from '@/types/evaluation'
import { keepPreviousData, useMutation, useQuery } from '@tanstack/react-query'
import { useState } from 'react'
import { useEffect, useMemo, useState } from 'react'
import { useTranslation } from 'react-i18next'
import Pagination from '@/app/components/base/pagination'
import {
@ -11,7 +11,9 @@ import {
DropdownMenuTrigger,
} from '@/app/components/base/ui/dropdown-menu'
import { consoleClient, consoleQuery } from '@/service/client'
import { cn } from '@/utils/classnames'
import { downloadUrl } from '@/utils/download'
import { useEvaluationResource, useEvaluationStore } from '../../store'
const PAGE_SIZE = 16
const LOADING_ROW_IDS = ['1', '2', '3', '4', '5', '6']
@ -23,12 +25,18 @@ const formatCreatedAt = (createdAt: string) => {
return createdAt.includes('T') ? createdAt.slice(0, 10) : createdAt
}
/**
 * Resolves the run identifier carried by a log record.
 *
 * Candidates are checked in the order `run_id`, `evaluation_run_id`, `id`;
 * the first one that is neither `null` nor `undefined` wins.
 *
 * @returns The resolved identifier, or `null` when the record carries none.
 */
const getLogRunId = (record: EvaluationLog) => {
  for (const candidate of [record.run_id, record.evaluation_run_id, record.id]) {
    if (candidate !== undefined && candidate !== null)
      return candidate
  }
  return null
}
const HistoryTab = ({
resourceType,
resourceId,
}: EvaluationResourceProps) => {
const { t } = useTranslation('evaluation')
const [page, setPage] = useState(0)
const resource = useEvaluationResource(resourceType, resourceId)
const setSelectedRunId = useEvaluationStore(state => state.setSelectedRunId)
const logsQuery = useQuery({
...consoleQuery.evaluation.logs.queryOptions({
input: {
@ -58,10 +66,19 @@ const HistoryTab = ({
downloadUrl({ url: fileInfo.download_url, fileName: file.name })
},
})
const records = logsQuery.data?.data ?? []
const records = useMemo(() => logsQuery.data?.data ?? [], [logsQuery.data?.data])
const total = logsQuery.data?.total ?? 0
const isInitialLoading = logsQuery.isLoading && !logsQuery.data
useEffect(() => {
if (resource.selectedRunId)
return
const firstRunId = records.map(getLogRunId).find((runId): runId is string => !!runId)
if (firstRunId)
setSelectedRunId(resourceType, resourceId, firstRunId)
}, [records, resource.selectedRunId, resourceId, resourceType, setSelectedRunId])
return (
<div className="flex min-h-full flex-col">
<div className="min-h-0 flex-1 overflow-hidden">
@ -98,7 +115,19 @@ const HistoryTab = ({
</tr>
))}
{!isInitialLoading && records.map(record => (
<tr key={`${record.created_at}-${record.test_file.id}`} className="border-b border-divider-subtle">
<tr
key={`${record.created_at}-${record.test_file.id}`}
className={cn(
'border-b border-divider-subtle',
getLogRunId(record) && 'cursor-pointer hover:bg-state-base-hover',
getLogRunId(record) === resource.selectedRunId && 'bg-background-default-subtle',
)}
onClick={() => {
const runId = getLogRunId(record)
if (runId)
setSelectedRunId(resourceType, resourceId, runId)
}}
>
<td className="h-10 truncate px-3 system-sm-regular text-text-secondary">{formatCreatedAt(record.created_at)}</td>
<td className="h-10 truncate px-3 system-sm-regular text-text-secondary">{record.created_by}</td>
<td className="h-10 truncate px-3 system-sm-regular text-text-secondary">{record.version || '-'}</td>
@ -115,6 +144,7 @@ const HistoryTab = ({
type="button"
aria-label={t('history.actions.open')}
className="inline-flex h-8 w-8 items-center justify-center rounded-md text-text-tertiary hover:bg-state-base-hover hover:text-text-secondary"
onClick={event => event.stopPropagation()}
/>
)}
>
@ -123,7 +153,10 @@ const HistoryTab = ({
<DropdownMenuContent popupClassName="w-[180px] rounded-lg border-[0.5px] border-components-panel-border py-1 shadow-lg">
<DropdownMenuItem
className="gap-2"
onClick={() => fileDownloadMutation.mutate(record.test_file)}
onClick={(event) => {
event.stopPropagation()
fileDownloadMutation.mutate(record.test_file)
}}
>
<span aria-hidden="true" className="i-ri-file-download-line h-4 w-4" />
{t('history.actions.downloadTestFile')}
@ -131,7 +164,11 @@ const HistoryTab = ({
<DropdownMenuItem
className="gap-2"
disabled={!record.result_file}
onClick={() => record.result_file && fileDownloadMutation.mutate(record.result_file)}
onClick={(event) => {
event.stopPropagation()
if (record.result_file)
fileDownloadMutation.mutate(record.result_file)
}}
>
<span aria-hidden="true" className="i-ri-download-2-line h-4 w-4" />
{t('history.actions.downloadResultFile')}

View File

@ -36,6 +36,7 @@ export const useInputFieldsActions = ({
const { t } = useTranslation('evaluation')
const resource = useEvaluationResource(resourceType, resourceId)
const setBatchTab = useEvaluationStore(state => state.setBatchTab)
const setSelectedRunId = useEvaluationStore(state => state.setSelectedRunId)
const setUploadedFile = useEvaluationStore(state => state.setUploadedFile)
const setUploadedFileName = useEvaluationStore(state => state.setUploadedFileName)
const startRunMutation = useStartEvaluationRunMutation()
@ -115,8 +116,9 @@ export const useInputFieldsActions = ({
},
body,
}, {
onSuccess: () => {
onSuccess: (run) => {
toast.success(t('batch.runStarted'))
setSelectedRunId(resourceType, resourceId, run.id)
setIsUploadPopoverOpen(false)
setBatchTab(resourceType, resourceId, 'history')
},

View File

@ -84,7 +84,10 @@ const PipelineEvaluation = ({
</div>
<div className="min-h-0 flex-1 bg-background-default">
<PipelineResultsPanel />
<PipelineResultsPanel
resourceType={resourceType}
resourceId={resourceId}
/>
</div>
</div>
)

View File

@ -1,16 +1,132 @@
'use client'
import type { EvaluationResourceProps } from '../../types'
import { skipToken, useMutation, useQuery } from '@tanstack/react-query'
import { useTranslation } from 'react-i18next'
import { consoleClient, consoleQuery } from '@/service/client'
import { downloadUrl } from '@/utils/download'
import { useEvaluationResource } from '../../store'
import { decodeModelSelection } from '../../utils'
import PipelineResultsTable from './pipeline-results-table'
import { getMetricColumns, getRunDate } from './pipeline-results-utils'
const PipelineResultsPanel = () => {
const PAGE_SIZE = 100
const PipelineResultsPanel = ({
resourceType,
resourceId,
}: EvaluationResourceProps) => {
const { t } = useTranslation('evaluation')
const resource = useEvaluationResource(resourceType, resourceId)
const selectedModel = decodeModelSelection(resource.judgeModelId)
const selectedRunId = resource.selectedRunId
const runDetailQuery = useQuery(consoleQuery.evaluation.runDetail.queryOptions({
input: selectedRunId
? {
params: {
targetType: resourceType,
targetId: resourceId,
runId: selectedRunId,
},
query: {
page: 1,
page_size: PAGE_SIZE,
},
}
: skipToken,
refetchOnWindowFocus: false,
}))
const resultFileDownloadMutation = useMutation({
mutationFn: async (fileId: string) => {
const fileInfo = await consoleClient.evaluation.file({
params: {
targetType: resourceType,
targetId: resourceId,
fileId,
},
})
downloadUrl({ url: fileInfo.download_url, fileName: fileInfo.name })
},
})
const runDetail = runDetailQuery.data
const items = runDetail?.items.data ?? []
const metricColumns = getMetricColumns(resource, items)
const thresholdColumns = metricColumns.filter(column => column.threshold !== undefined)
const isEmpty = !selectedRunId || (!runDetailQuery.isLoading && items.length === 0)
if (isEmpty) {
return (
<div className="flex min-h-[360px] flex-1 items-center justify-center xl:min-h-0">
<div className="flex flex-col items-center gap-4 px-4 text-center">
<span aria-hidden="true" className="i-ri-file-list-3-line h-12 w-12 text-text-quaternary" />
<div className="system-md-medium text-text-quaternary">{t('results.empty')}</div>
</div>
</div>
)
}
return (
<div className="flex min-h-[360px] flex-1 items-center justify-center xl:min-h-0">
<div className="flex flex-col items-center gap-4 px-4 text-center">
<span aria-hidden="true" className="i-ri-file-list-3-line h-12 w-12 text-text-quaternary" />
<div className="system-md-medium text-text-quaternary">{t('results.empty')}</div>
<div className="flex h-full min-h-0 flex-col border-l border-divider-subtle bg-background-default">
<div className="shrink-0 px-6 pt-4 pb-2">
<h2 className="system-xl-semibold text-text-primary">{t('results.title')}</h2>
</div>
{runDetailQuery.isError && (
<div className="px-6 py-4 system-sm-regular text-text-destructive">{t('results.loadFailed')}</div>
)}
{!runDetailQuery.isError && (
<div className="flex min-h-0 flex-1 flex-col px-6 py-1">
<div className="flex shrink-0 flex-wrap items-center justify-between gap-3 py-1">
<div className="flex min-w-0 flex-wrap items-center gap-2 system-xs-regular text-text-secondary">
<span>{getRunDate(runDetail?.run.started_at ?? runDetail?.run.created_at ?? null)}</span>
<span aria-hidden="true">·</span>
<span>{t('results.queryCount', { count: runDetail?.run.total_items ?? runDetail?.items.total ?? items.length })}</span>
{selectedModel && (
<>
<span aria-hidden="true">·</span>
<span className="inline-flex min-w-0 items-center gap-1.5 rounded-lg bg-background-section-burn px-2 py-1">
<span aria-hidden="true" className="i-ri-robot-2-line h-4 w-4 shrink-0 text-text-accent" />
<span className="truncate">{selectedModel.model}</span>
</span>
</>
)}
{thresholdColumns.length > 0 && (
<>
<span aria-hidden="true">·</span>
<span className="flex min-w-0 flex-wrap items-center gap-1">
{thresholdColumns.map(column => (
<span
key={column.id}
className="rounded-lg border-[0.5px] border-divider-subtle bg-background-section px-2 py-1 text-text-tertiary"
>
{t('results.metricThreshold', { metric: column.label, threshold: column.threshold })}
</span>
))}
</span>
</>
)}
</div>
<button
type="button"
className="inline-flex h-7 shrink-0 items-center gap-1 rounded-md border-[0.5px] border-components-button-secondary-border bg-components-button-secondary-bg px-2 system-xs-medium text-components-button-secondary-text shadow-xs disabled:cursor-not-allowed disabled:opacity-50"
disabled={!runDetail?.run.result_file_id || resultFileDownloadMutation.isPending}
onClick={() => {
if (runDetail?.run.result_file_id)
resultFileDownloadMutation.mutate(runDetail.run.result_file_id)
}}
>
<span aria-hidden="true" className="i-ri-download-2-line h-3.5 w-3.5" />
{t('results.export')}
</button>
</div>
<PipelineResultsTable
items={items}
metricColumns={metricColumns}
isLoading={runDetailQuery.isLoading}
/>
</div>
)}
</div>
)
}

View File

@ -0,0 +1,118 @@
import type { MetricColumn } from './pipeline-results-utils'
import type { EvaluationRunItem } from '@/types/evaluation'
import { useTranslation } from 'react-i18next'
import { cn } from '@/utils/classnames'
import {
formatValue,
getIsItemPassed,
getMetricTextClassName,
getMetricValue,
getQueryContent,
} from './pipeline-results-utils'
// Keys for the placeholder rows rendered while `isLoading` is true (one skeleton row per entry).
const LOADING_ROW_IDS = ['1', '2', '3', '4', '5', '6']
/** Props accepted by the PipelineResultsTable component. */
type PipelineResultsTableProps = {
// Run items to display; each item becomes one table row.
items: EvaluationRunItem[]
// Metric columns rendered after the fixed status/query/expected/actual columns.
metricColumns: MetricColumn[]
// When true, skeleton rows are rendered instead of the items.
isLoading: boolean
}
/**
 * Renders the per-item results of an evaluation run as a table.
 *
 * Layout: a pass/fail status icon, the query, the expected output, the actual
 * output (or the item's error), followed by one column per entry in
 * `metricColumns`. While `isLoading` is true, skeleton rows are shown instead
 * of the items.
 *
 * @param items - Run items to render, one row each.
 * @param metricColumns - Metric columns appended after the four fixed columns.
 * @param isLoading - Whether to show skeleton rows in place of the items.
 */
const PipelineResultsTable = ({
  items,
  metricColumns,
  isLoading,
}: PipelineResultsTableProps) => {
  const { t } = useTranslation('evaluation')
  // Without metric columns the "actual" header is the right-most cell and
  // must carry the right-hand rounding itself.
  const hasMetricColumns = metricColumns.length > 0

  return (
    <div className="min-h-0 flex-1 overflow-auto py-2">
      <table className="min-w-full table-fixed border-collapse overflow-hidden rounded-lg">
        <colgroup>
          <col className="w-10" />
          <col className="w-[220px]" />
          <col className="w-[190px]" />
          <col className="w-[220px]" />
          {metricColumns.map(column => <col key={column.id} className="w-24" />)}
        </colgroup>
        <thead>
          <tr className="bg-background-section">
            <th className="h-7 rounded-l-lg" />
            <th className="h-7 px-3 text-left system-xs-medium-uppercase text-text-tertiary">{t('results.columns.query')}</th>
            <th className="h-7 px-3 text-left system-xs-medium-uppercase text-text-tertiary">{t('results.columns.expected')}</th>
            <th
              className={cn(
                'h-7 px-3 text-left system-xs-medium-uppercase text-text-tertiary',
                !hasMetricColumns && 'rounded-r-lg',
              )}
            >
              {t('results.columns.actual')}
            </th>
            {metricColumns.map((column, index) => (
              <th
                key={column.id}
                className={cn(
                  'h-7 px-3 text-left system-xs-medium-uppercase text-text-tertiary',
                  index === metricColumns.length - 1 && 'rounded-r-lg',
                )}
              >
                {column.label}
              </th>
            ))}
          </tr>
        </thead>
        <tbody>
          {isLoading && LOADING_ROW_IDS.map(rowId => (
            <tr key={rowId} className="border-b border-divider-subtle">
              <td colSpan={4 + metricColumns.length} className="h-10 px-3">
                <div className="h-4 animate-pulse rounded bg-background-section" />
              </td>
            </tr>
          ))}
          {!isLoading && items.map((item) => {
            const isPassed = getIsItemPassed(item, metricColumns)
            // `||` rather than `??`: an empty-string error must not hide a real
            // output, matching the truthiness check used for the pass/fail status.
            const actualOutput = item.error || item.actual_output
            return (
              <tr key={item.id} className="border-b border-divider-subtle even:bg-background-default-subtle">
                <td className="h-10 px-3 align-top">
                  {/* role="img" gives the icon-only span a role that supports naming via aria-label. */}
                  <span
                    role="img"
                    aria-label={isPassed ? t('results.status.passed') : t('results.status.failed')}
                    className={cn(
                      'mt-3 inline-block h-4 w-4',
                      isPassed
                        ? 'i-ri-check-line text-util-colors-green-green-600'
                        : 'i-ri-close-line text-util-colors-red-red-600',
                    )}
                  />
                </td>
                <td className="h-10 px-3 py-3 align-top system-sm-regular text-text-secondary">
                  <div className="line-clamp-2 break-words">{getQueryContent(item)}</div>
                </td>
                <td className="h-10 px-3 py-3 align-top system-sm-regular text-text-secondary">
                  <div className="line-clamp-2 break-words">{formatValue(item.expected_output)}</div>
                </td>
                <td className={cn(
                  'h-10 px-3 py-3 align-top system-sm-regular',
                  actualOutput ? 'text-text-secondary' : 'text-text-destructive',
                )}
                >
                  <div className="line-clamp-2 break-words">
                    {actualOutput ? formatValue(actualOutput) : t('results.noResult')}
                  </div>
                </td>
                {metricColumns.map((column) => {
                  const metricValue = getMetricValue(item.metrics, column)
                  return (
                    <td
                      key={column.id}
                      className={cn('h-10 px-3 py-3 align-top system-sm-regular', getMetricTextClassName(metricValue, column))}
                    >
                      {formatValue(metricValue)}
                    </td>
                  )
                })}
              </tr>
            )
          })}
        </tbody>
      </table>
    </div>
  )
}
export default PipelineResultsTable

View File

@ -0,0 +1,179 @@
import type { EvaluationResourceState } from '../../types'
import type { EvaluationRunItem, EvaluationRunMetric } from '@/types/evaluation'
import { formatTime } from '@/utils/time'
// Input keys tried, in this order, when picking the value shown as an item's query.
const PREFERRED_QUERY_INPUT_KEYS = ['query', 'question', 'input']
/** Describes one metric column of the results table. */
export type MetricColumn = {
// Identifier of the column; also matched (after normalization) against run metric names.
id: string
// Header text; also matched (after normalization) against run metric names.
label: string
// Optional pass threshold; a metric value passes when it is >= this number.
threshold?: number
}
/**
 * Produces a comparison key for a metric name: lower-cased, with every
 * whitespace character, underscore and hyphen removed.
 */
const normalizeMetricKey = (value: string) => {
  const lowered = value.toLowerCase()
  return lowered.split(/[\s_-]/).join('')
}
/**
 * Turns a hyphen/underscore separated metric name into a display label:
 * empty segments are dropped, the first character of each remaining segment
 * is upper-cased, and the segments are joined with single spaces.
 */
const humanizeMetricName = (name: string) => {
  const words: string[] = []
  for (const segment of name.split(/[-_]/)) {
    if (segment.length === 0)
      continue
    words.push(`${segment.charAt(0).toUpperCase()}${segment.slice(1)}`)
  }
  return words.join(' ')
}
/**
 * Converts an arbitrary value into the text shown in a table cell.
 *
 * - `null`, `undefined` and the empty string become the placeholder `'-'`.
 * - Strings are returned unchanged.
 * - Integers are stringified as-is; other numbers are locale-formatted with at
 *   most three fraction digits.
 * - Booleans become `'true'` / `'false'`; bigints use their decimal form.
 * - Anything else is serialized with `JSON.stringify`; values that cannot be
 *   serialized fall back to `'-'`.
 *
 * @param value - The value to display.
 * @returns A string that is always safe to render.
 */
export const formatValue = (value: unknown) => {
  if (value === null || value === undefined || value === '')
    return '-'

  if (typeof value === 'string')
    return value

  if (typeof value === 'number') {
    return Number.isInteger(value)
      ? String(value)
      : value.toLocaleString(undefined, { maximumFractionDigits: 3 })
  }

  if (typeof value === 'boolean')
    return value ? 'true' : 'false'

  // JSON.stringify throws a TypeError on BigInt, so handle it explicitly.
  if (typeof value === 'bigint')
    return value.toString()

  try {
    // JSON.stringify yields undefined for functions and symbols at runtime.
    return JSON.stringify(value) ?? '-'
  }
  catch {
    // Circular structures (or a throwing toJSON) must not break rendering.
    return '-'
  }
}
/**
 * Picks the text shown in the "query" column for a run item.
 *
 * The preferred input keys are tried in order and the first one holding a
 * non-empty value wins. If none of them does, the first non-empty value among
 * all inputs is used. A preferred key that is present but empty (`null` or
 * `''`) is skipped, so it cannot mask a later key that has content.
 *
 * @param item - The run item whose `inputs` are inspected.
 * @returns The formatted query text, or `'-'` when every input is empty.
 */
export const getQueryContent = (item: EvaluationRunItem) => {
  const hasContent = (value: unknown) => value !== undefined && value !== null && value !== ''

  for (const key of PREFERRED_QUERY_INPUT_KEYS) {
    const value = item.inputs[key]
    if (hasContent(value))
      return formatValue(value)
  }

  return formatValue(Object.values(item.inputs).find(hasContent))
}
/**
 * Looks up the value of the run metric that belongs to a table column.
 *
 * A metric belongs to the column when its normalized name equals the
 * normalized column id or the normalized column label. Metrics without a name
 * are ignored.
 *
 * @returns The value of the first matching metric, or `undefined` if none matches.
 */
export const getMetricValue = (metrics: EvaluationRunMetric[], column: MetricColumn) => {
  const acceptedKeys = [normalizeMetricKey(column.id), normalizeMetricKey(column.label)]

  for (const metric of metrics) {
    if (metric.name && acceptedKeys.includes(normalizeMetricKey(metric.name)))
      return metric.value
  }

  return undefined
}
/**
 * Reads a column's metric value as a number.
 *
 * Numbers are returned as-is. Non-blank strings are converted with `Number()`
 * and yield `null` when the conversion gives `NaN`. Every other value,
 * including blank strings and a missing metric, yields `null`.
 */
const getNumericMetricValue = (metrics: EvaluationRunMetric[], column: MetricColumn) => {
  const rawValue = getMetricValue(metrics, column)

  switch (typeof rawValue) {
    case 'number':
      return rawValue
    case 'string': {
      if (rawValue.trim() === '')
        return null
      const parsed = Number(rawValue)
      return Number.isNaN(parsed) ? null : parsed
    }
    default:
      return null
  }
}
/**
 * Chooses the text color class for a metric cell.
 *
 * The neutral class is used when the value is not numeric (anything other
 * than a number or a non-blank numeric string) or when the column has no
 * threshold. Otherwise: green when the value reaches the threshold, red when
 * it is exactly zero, and the warning color for everything in between.
 */
export const getMetricTextClassName = (value: unknown, column: MetricColumn) => {
  const neutralClassName = 'text-text-secondary'

  let score: number
  if (typeof value === 'number')
    score = value
  else if (typeof value === 'string' && value.trim() !== '')
    score = Number(value)
  else
    return neutralClassName

  if (Number.isNaN(score) || column.threshold === undefined)
    return neutralClassName

  if (score >= column.threshold)
    return 'text-util-colors-green-green-600'

  return score === 0
    ? 'text-util-colors-red-red-600'
    : 'text-util-colors-warning-warning-600'
}
/**
 * Extracts an explicit pass/fail verdict from a judgment object.
 *
 * The keys `passed`, `pass`, `success` and `result` are inspected in that
 * order. A boolean is returned directly; a string is compared
 * case-insensitively against known pass/fail words. Unrecognized values are
 * skipped and the next key is tried.
 *
 * @returns `true` or `false` for a recognized verdict, otherwise `null`.
 */
const getJudgmentResult = (judgment: Record<string, unknown>) => {
  const passWords = new Set(['passed', 'pass', 'success', 'succeeded', 'true'])
  const failWords = new Set(['failed', 'fail', 'failure', 'false'])

  for (const key of ['passed', 'pass', 'success', 'result']) {
    const verdict = judgment[key]

    if (typeof verdict === 'boolean')
      return verdict
    if (typeof verdict !== 'string')
      continue

    const word = verdict.toLowerCase()
    if (passWords.has(word))
      return true
    if (failWords.has(word))
      return false
  }

  return null
}
/**
 * Decides whether a run item counts as passed.
 *
 * Evaluation order:
 * 1. An item with an error never passes.
 * 2. An explicit verdict found in the item's judgment is used as-is.
 * 3. If any column defines a threshold, every such column must have a numeric
 *    metric value that is >= its threshold.
 * 4. Otherwise the item passes when `overall_score` is `null` or greater than 0.
 */
export const getIsItemPassed = (item: EvaluationRunItem, metricColumns: MetricColumn[]) => {
  if (item.error)
    return false

  const verdict = getJudgmentResult(item.judgment)
  if (verdict !== null)
    return verdict

  const gatedColumns = metricColumns.filter(column => column.threshold !== undefined)
  if (gatedColumns.length === 0)
    return item.overall_score === null || item.overall_score > 0

  for (const column of gatedColumns) {
    const score = getNumericMetricValue(item.metrics, column)
    // Negated `>=` (rather than `<`) so that a NaN score is treated as a failure.
    if (column.threshold === undefined || score === null || !(score >= column.threshold))
      return false
  }
  return true
}
/**
 * Builds the ordered list of metric columns for the results table.
 *
 * Columns for the metrics configured on the resource come first and carry
 * their threshold. Named metrics found on the run items are then appended,
 * with a humanized label and no threshold, unless they are already covered.
 *
 * A run metric counts as covered when its normalized name equals the
 * normalized `optionId` or the normalized label of a configured metric, or
 * the name of a previously appended metric. Matching on the label as well
 * keeps this in line with `getMetricValue`, which resolves a column's value by
 * id or label; otherwise a metric reported under its label would get a second
 * column showing the same value.
 *
 * @param resource - Resource state providing the configured metrics.
 * @param items - Run items whose metrics may contribute additional columns.
 * @returns The columns in insertion order.
 */
export const getMetricColumns = (
  resource: EvaluationResourceState,
  items: EvaluationRunItem[],
) => {
  const columns = new Map<string, MetricColumn>()
  // Every normalized name that already resolves to some column.
  const coveredKeys = new Set<string>()

  resource.metrics.forEach((metric) => {
    const normalizedId = normalizeMetricKey(metric.optionId)
    columns.set(normalizedId, {
      id: metric.optionId,
      label: metric.label,
      threshold: metric.threshold,
    })
    coveredKeys.add(normalizedId)
    coveredKeys.add(normalizeMetricKey(metric.label))
  })

  items.forEach((item) => {
    item.metrics.forEach((metric) => {
      if (!metric.name)
        return

      const normalizedName = normalizeMetricKey(metric.name)
      if (coveredKeys.has(normalizedName))
        return

      coveredKeys.add(normalizedName)
      columns.set(normalizedName, {
        id: metric.name,
        label: humanizeMetricName(metric.name),
      })
    })
  })

  return Array.from(columns.values())
}
/**
 * Formats a run timestamp as `YYYY-MM-DD HH:mm`.
 *
 * Values above 1e12 are passed through unchanged; smaller values are
 * multiplied by 1000 first (presumably second-based epochs being converted to
 * milliseconds; confirm against the API). Falsy timestamps (`null`, `0`)
 * yield the placeholder `'-'`.
 */
export const getRunDate = (timestamp: number | null) => {
  if (!timestamp)
    return '-'

  const millisecondFloor = 1_000_000_000_000
  const isAlreadyMilliseconds = timestamp > millisecondFloor
  const date = isAlreadyMilliseconds ? timestamp : timestamp * 1000

  return formatTime({ date, dateFormat: 'YYYY-MM-DD HH:mm' })
}

View File

@ -394,6 +394,7 @@ export const buildInitialState = (_resourceType: EvaluationResourceType): Evalua
activeBatchTab: 'input-fields',
uploadedFileId: null,
uploadedFileName: null,
selectedRunId: null,
batchRecords: [],
}
}

View File

@ -82,6 +82,7 @@ type EvaluationStore = {
uploadedFile: { id: string, name: string } | null,
) => void
setUploadedFileName: (resourceType: EvaluationResourceType, resourceId: string, uploadedFileName: string | null) => void
setSelectedRunId: (resourceType: EvaluationResourceType, resourceId: string, runId: string | null) => void
runBatchTest: (resourceType: EvaluationResourceType, resourceId: string) => void
}
@ -110,6 +111,7 @@ export const useEvaluationStore = create<EvaluationStore>((set, get) => ({
activeBatchTab: state.resources[buildResourceKey(resourceType, resourceId)]?.activeBatchTab ?? 'input-fields',
uploadedFileId: state.resources[buildResourceKey(resourceType, resourceId)]?.uploadedFileId ?? null,
uploadedFileName: state.resources[buildResourceKey(resourceType, resourceId)]?.uploadedFileName ?? null,
selectedRunId: state.resources[buildResourceKey(resourceType, resourceId)]?.selectedRunId ?? null,
batchRecords: state.resources[buildResourceKey(resourceType, resourceId)]?.batchRecords ?? [],
},
},
@ -393,6 +395,14 @@ export const useEvaluationStore = create<EvaluationStore>((set, get) => ({
})),
}))
},
// Stores `runId` (or null) as the selected run of the resource identified by
// `resourceType` + `resourceId`; all other fields of that resource's state are kept.
setSelectedRunId: (resourceType, resourceId, runId) => {
set(state => ({
resources: updateResourceState(state.resources, resourceType, resourceId, resource => ({
...resource,
selectedRunId: runId,
})),
}))
},
runBatchTest: (resourceType, resourceId) => {
const { uploadedFileName } = get().resources[buildResourceKey(resourceType, resourceId)] ?? buildInitialState(resourceType)
const nextRecord = createBatchTestRecord(resourceType, uploadedFileName)

View File

@ -138,6 +138,7 @@ export type EvaluationResourceState = {
activeBatchTab: BatchTestTab
uploadedFileId: string | null
uploadedFileName: string | null
selectedRunId: string | null
batchRecords: BatchTestRecord[]
}

View File

@ -115,6 +115,18 @@
"metrics.update": "Update",
"pipeline.passIf": "Pass if \u2265",
"pipeline.uploadAndRun": "Upload & Run Test",
"results.columns.actual": "Actual Result",
"results.columns.expected": "Expected Result",
"results.columns.query": "Query Content",
"results.empty": "No evaluation results yet.",
"results.export": "Export",
"results.loadFailed": "Failed to load evaluation results.",
"results.metricThreshold": "{{metric}} \u2265 {{threshold}}",
"results.noResult": "No Result",
"results.queryCount_one": "{{count}} query",
"results.queryCount_other": "{{count}} queries",
"results.status.failed": "Failed",
"results.status.passed": "Passed",
"results.title": "Test Details",
"title": "Evaluation"
}

View File

@ -91,13 +91,36 @@
"metrics.custom.workflowLabel": "评测工作流",
"metrics.custom.workflowPlaceholder": "选择工作流",
"metrics.description": "从内置指标中选择,如 Groundedness 和 Correctness ,以评估您的工作流输出。",
"metrics.expandNodes": "展开节点",
"metrics.groups.operations": "运行",
"metrics.groups.other": "其他",
"metrics.groups.quality": "质量",
"metrics.noNodesInWorkflow": "当前工作流中没有 LLM 节点",
"metrics.noResults": "没有匹配的指标。",
"metrics.nodesAll": "全部节点",
"metrics.nodesLabel": "节点范围",
"metrics.nodesSelected": "已选节点",
"metrics.remove": "删除指标",
"metrics.searchNodeOrMetrics": "搜索节点或指标",
"metrics.searchPlaceholder": "搜索指标",
"metrics.showLess": "收起",
"metrics.showMore": "展开更多",
"metrics.title": "指标",
"metrics.update": "更新",
"pipeline.passIf": "通过条件 \u2265",
"pipeline.uploadAndRun": "上传并运行测试",
"results.columns.actual": "实际结果",
"results.columns.expected": "预期结果",
"results.columns.query": "Query 内容",
"results.empty": "还没有评测结果。",
"results.export": "导出",
"results.loadFailed": "加载评测结果失败。",
"results.metricThreshold": "{{metric}} \u2265 {{threshold}}",
"results.noResult": "无结果",
"results.queryCount_one": "{{count}} 条 query",
"results.queryCount_other": "{{count}} 条 query",
"results.status.failed": "失败",
"results.status.passed": "通过",
"results.title": "测试详情",
"title": "评测"
}

View File

@ -83,6 +83,9 @@ export type EvaluationLogFile = {
}
export type EvaluationLog = {
id?: string
run_id?: string
evaluation_run_id?: string
created_at: string
created_by: string
test_file: EvaluationLogFile