Tool Evaluation with OpenAI Evals
This guide shows how to use tool evaluation to measure and improve a model's ability to extract structured information from source code. In this example, the target is the set of symbols (functions, classes, methods, and variables) defined in Python files.
Setup
Install the latest openai Python package (≥ 1.14.0) and set the OPENAI_API_KEY
environment variable. If you also want to evaluate assistants that use tools, enable the Assistants v2 beta in your account.
pip install --upgrade openai
export OPENAI_API_KEY=sk-...
Below we import the SDK, create a client, and define a helper that builds a small dataset from files shipped inside the openai package itself.
%pip install --upgrade openai pandas jinja2 rich --quiet
import os
import time
import openai
from rich import print
client = openai.OpenAI(
    api_key=os.getenv("OPENAI_API_KEY") or os.getenv("_OPENAI_API_KEY"),
)
Dataset factory and grading rubric
- `get_dataset` builds a small in-memory dataset by reading several SDK files.
- `structured_output_grader` defines a detailed grading rubric.
- `sample.output_tools[0].function.arguments.symbols` points at the symbols the model extracted from the code file via its tool call.
- `client.evals.create(...)` registers the eval with the platform.
def get_dataset(limit=None):
    openai_sdk_file_path = os.path.dirname(openai.__file__)
    file_paths = [
        os.path.join(openai_sdk_file_path, "resources", "evals", "evals.py"),
        os.path.join(openai_sdk_file_path, "resources", "responses", "responses.py"),
        os.path.join(openai_sdk_file_path, "resources", "images.py"),
        os.path.join(openai_sdk_file_path, "resources", "embeddings.py"),
        os.path.join(openai_sdk_file_path, "resources", "files.py"),
    ]
    items = []
    for file_path in file_paths:
        items.append({"input": open(file_path, "r").read()})
    if limit:
        return items[:limit]
    return items
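If you want to confirm what `get_dataset` returns before wiring it into a run, a quick optional check (not part of the original workflow) is to print the number of items and the size of each input:
# Optional sanity check: how many files were loaded and how large each one is.
dataset = get_dataset()
print(len(dataset), "files;", [len(item["input"]) for item in dataset], "characters each")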
structured_output_grader = """
You are a helpful assistant that grades the quality of extracted information from a code file.
You will be given a code file and a list of extracted information.
You should grade the quality of the extracted information.
You should grade the quality on a scale of 1 to 7.
You should apply the following criteria, and calculate your score as follows:
You should first check for completeness on a scale of 1 to 7.
Then you should apply a quality modifier.
The quality modifier is a multiplier from 0 to 1 that you multiply by the completeness score.
If there is 100% coverage for completion and it is all high quality, then you would return 7*1.
If there is 100% coverage for completion but it is all low quality, then you would return 7*0.5.
etc.
"""
structured_output_grader_user_prompt = """
<Code File>
{{item.input}}
</Code File>
<Extracted Information>
{{sample.output_tools[0].function.arguments.symbols}}
</Extracted Information>
"""
Creating the eval
Here we create an eval that grades the quality of the information extracted from the code files.
logs_eval = client.evals.create(
    name="Code QA Eval",
    data_source_config={
        "type": "custom",
        "item_schema": {"type": "object", "properties": {"input": {"type": "string"}}},
        "include_sample_schema": True,
    },
    testing_criteria=[
        {
            "type": "score_model",
            "name": "General Evaluator",
            "model": "o3",
            "input": [
                {"role": "system", "content": structured_output_grader},
                {"role": "user", "content": structured_output_grader_user_prompt},
            ],
            "range": [1, 7],
            "pass_threshold": 5.0,
        }
    ],
)
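A quick optional check that the eval registered correctly is to print the identifiers returned by the API (the fields shown are assumptions based on the standard eval object):
# Optional: confirm the eval was created and note its id for later runs.
print(logs_eval.id, logs_eval.name)
Next, we define the function tool whose argument schema the model must follow when extracting symbols.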
symbol_tool = {
    "name": "extract_symbols",
    "description": "Extract the symbols from the code file",
    "parameters": {
        "type": "object",
        "properties": {
            "symbols": {
                "type": "array",
                "description": "A list of symbols extracted from Python code.",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string", "description": "The name of the symbol."},
                        "symbol_type": {"type": "string", "description": "The type of the symbol, e.g., variable, function, class."},
                    },
                    "required": ["name", "symbol_type"],
                    "additionalProperties": False,
                },
            }
        },
        "required": ["symbols"],
        "additionalProperties": False,
    },
}
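`symbol_tool` is a plain JSON Schema for the tool's arguments, so you can sanity-check an example payload against it before running the eval. The sketch below assumes the third-party `jsonschema` package is installed; it is optional and not used elsewhere in this guide.
import json
import jsonschema  # third-party; pip install jsonschema

# A hand-written example payload that should satisfy the tool's parameter schema.
example_arguments = {
    "symbols": [
        {"name": "Images", "symbol_type": "class"},
        {"name": "generate", "symbol_type": "method"},
    ]
}
jsonschema.validate(instance=example_arguments, schema=symbol_tool["parameters"])
print(json.dumps(example_arguments, indent=2))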
Kick off model runs
Here we launch two runs against the same eval: one calls the Completions endpoint and the other calls the Responses endpoint.
gpt_4one_completions_run = client.evals.runs.create(
    name="gpt-4.1",
    eval_id=logs_eval.id,
    data_source={
        "type": "completions",
        "source": {"type": "file_content", "content": [{"item": item} for item in get_dataset(limit=1)]},
        "input_messages": {
            "type": "template",
            "template": [
                {"type": "message", "role": "system", "content": {"type": "input_text", "text": "You are a helpful assistant."}},
                {"type": "message", "role": "user", "content": {"type": "input_text", "text": "Extract the symbols from the code file {{item.input}}"}},
            ],
        },
        "model": "gpt-4.1",
        "sampling_params": {
            "seed": 42,
            "temperature": 0.7,
            "max_completions_tokens": 10000,
            "top_p": 0.9,
            "tools": [{"type": "function", "function": symbol_tool}],
        },
    },
)
gpt_4one_responses_run = client.evals.runs.create(
    name="gpt-4.1-mini",
    eval_id=logs_eval.id,
    data_source={
        "type": "responses",
        "source": {"type": "file_content", "content": [{"item": item} for item in get_dataset(limit=1)]},
        "input_messages": {
            "type": "template",
            "template": [
                {"type": "message", "role": "system", "content": {"type": "input_text", "text": "You are a helpful assistant."}},
                {"type": "message", "role": "user", "content": {"type": "input_text", "text": "Extract the symbols from the code file {{item.input}}"}},
            ],
        },
        "model": "gpt-4.1-mini",
        "sampling_params": {
            "seed": 42,
            "temperature": 0.7,
            "max_completions_tokens": 10000,
            "top_p": 0.9,
            "tools": [{"type": "function", **symbol_tool}],
        },
    },
)
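Note the small difference in how the tool is passed: the Completions run nests the schema under a "function" key, while the Responses run spreads the same fields onto the tool object itself. An illustrative comparison (not executed by the eval itself):
# Chat Completions style: schema nested under "function".
completions_tools = [{"type": "function", "function": symbol_tool}]
# Responses style: the same fields flattened onto the tool object.
responses_tools = [{"type": "function", **symbol_tool}]
print(sorted(completions_tools[0].keys()))  # ['function', 'type']
print(sorted(responses_tools[0].keys()))    # ['description', 'name', 'parameters', 'type']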
Utility poller
We define a small utility that polls the eval runs until they finish and reports their results.
def poll_runs(eval_id, run_ids):
    # poll both runs at the same time, until they are complete or failed
    while True:
        runs = [client.evals.runs.retrieve(run_id, eval_id=eval_id) for run_id in run_ids]
        for run in runs:
            print(run.id, run.status, run.result_counts)
        if all(run.status in ("completed", "failed") for run in runs):
            break
        time.sleep(5)
poll_runs(logs_eval.id, [gpt_4one_completions_run.id, gpt_4one_responses_run.id])
evalrun_6848e2269570819198b757fe12b979da completed ResultCounts(errored=0, failed=1, passed=0, total=1)
evalrun_6848e227d3a481918a9b970c897b5998 completed ResultCounts(errored=0, failed=1, passed=0, total=1)
Fetch outputs
completions_output = client.evals.runs.output_items.list(
    run_id=gpt_4one_completions_run.id, eval_id=logs_eval.id
)
responses_output = client.evals.runs.output_items.list(
    run_id=gpt_4one_responses_run.id, eval_id=logs_eval.id
)
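Before parsing everything, it can help to peek at one raw output item to see the structure the extraction helper below relies on. This sketch assumes at least one output item exists and that the page object exposes its items via `.data`:
# Optional: inspect the first output item's tool-call arguments (truncated).
first_item = completions_output.data[0]
print(first_item.sample.output[0].tool_calls[0]["function"]["arguments"][:300])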
Inspect results
For both the completions and responses runs, we print the symbol dictionaries returned by the model. You can diff them against a reference answer or compute precision/recall (a minimal sketch of the latter follows the comparison table below).
import json
import pandas as pd
from IPython.display import display, HTML

def extract_symbols(output_list):
    symbols_list = []
    for item in output_list:
        try:
            args = item.sample.output[0].tool_calls[0]["function"]["arguments"]
            symbols = json.loads(args)["symbols"]
            symbols_list.append(symbols)
        except Exception as e:
            symbols_list.append([{"error": str(e)}])
    return symbols_list

completions_symbols = extract_symbols(completions_output)
responses_symbols = extract_symbols(responses_output)
def symbols_to_html_table(symbols):
    if symbols and isinstance(symbols, list):
        df = pd.DataFrame(symbols)
        return (
            df.style
            .set_properties(**{
                'white-space': 'pre-wrap',
                'word-break': 'break-word',
                'padding': '2px 6px',
                'border': '1px solid #C3E7FA',
                'font-size': '0.92em',
                'background-color': '#FDFEFF'
            })
            .set_table_styles([{
                'selector': 'th',
                'props': [
                    ('font-size', '0.95em'),
                    ('background-color', '#1CA7EC'),
                    ('color', '#fff'),
                    ('border-bottom', '1px solid #18647E'),
                    ('padding', '2px 6px')
                ]
            }])
            .hide(axis='index')
            .to_html()
        )
    return f"<div style='padding:4px 0;color:#D9534F;font-style:italic;font-size:0.9em'>{str(symbols)}</div>"
table_rows = []
max_len = max(len(completions_symbols), len(responses_symbols))
for i in range(max_len):
    c_html = symbols_to_html_table(completions_symbols[i]) if i < len(completions_symbols) else ""
    r_html = symbols_to_html_table(responses_symbols[i]) if i < len(responses_symbols) else ""
    table_rows.append(f"""
    <tr style="height:1.2em;">
        <td style="vertical-align:top; background:#F6F8FA; border-right:1px solid #E3E3E3; padding:2px 4px;">{c_html}</td>
        <td style="vertical-align:top; background:#F6F8FA; padding:2px 4px;">{r_html}</td>
    </tr>
    """)

table_html = f"""
<div style="margin-bottom:0.5em;margin-top:0.2em;">
  <h4 style="color:#1CA7EC;font-weight:600;letter-spacing:0.5px;
    text-shadow:0 1px 2px rgba(0,0,0,0.06), 0 0px 0px #fff;font-size:1.05em;margin:0 0 0.35em 0;">
    Completions vs Responses Output Symbols
  </h4>
  <table style="border-collapse:separate;border-spacing:0 0.2em;width:100%;border-radius:5px;overflow:hidden;box-shadow:0 1px 7px #BEE7FA22;">
    <thead>
      <tr style="height:1.4em;">
        <th style="width:50%;background:#323C50;color:#fff;font-size:1em;padding:6px 10px;border-bottom:2px solid #1CA7EC;text-align:center;">Completions Output</th>
        <th style="width:50%;background:#323C50;color:#fff;font-size:1em;padding:6px 10px;border-bottom:2px solid #1CA7EC;text-align:center;">Responses Output</th>
      </tr>
    </thead>
    <tbody>
      {''.join(table_rows)}
    </tbody>
  </table>
</div>
"""
display(HTML(table_html))
[Rendered output: "Completions vs Responses Output Symbols" comparison table]
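As noted above, you can also score the extraction yourself against a hand-labelled reference. A minimal sketch, assuming a made-up `reference_symbols` set for the first file:
# Hypothetical ground-truth symbol names for the first file (replace with real labels).
reference_symbols = {"Images", "generate", "edit"}
predicted_symbols = {s["name"] for s in completions_symbols[0] if "name" in s}

true_positives = len(predicted_symbols & reference_symbols)
precision = true_positives / len(predicted_symbols) if predicted_symbols else 0.0
recall = true_positives / len(reference_symbols) if reference_symbols else 0.0
print(f"precision={precision:.2f} recall={recall:.2f}")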
Visualize in the Evals Dashboard
You can navigate to the Evals Dashboard to explore the data visually.
Once a run completes, the dashboard also shows explanations for failed results, as illustrated in the image below.
This notebook has demonstrated how to use OpenAI Evals with tool calling to evaluate and improve a model's ability to extract structured information from Python code.
OpenAI Evals provides a robust, reproducible framework for assessing LLM performance on structured extraction tasks. By combining a clear tool schema, a strict grading rubric, and a well-structured dataset, you can measure and improve overall performance.
For more details, see the OpenAI Evals documentation.