czyPL commited on
Commit
9cb15fd
·
1 Parent(s): d23511c

add application file

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +1 -3
  2. app.py +877 -4
  3. results/Claude-3.7-Sonnet/nonthinking_context-120000_bon-3_summary.json +164 -0
  4. results/Claude-3.7-Sonnet/thinking_context-120000_bon-3_summary.json +164 -0
  5. results/Claude-4-Sonnet/nonthinking_context-1000000_bon-3_summary.json +164 -0
  6. results/Claude-4-Sonnet/thinking_context-1000000_bon-3_summary.json +164 -0
  7. results/DeepSeek-R1-0528/thinking_context-120000_bon-3_summary.json +164 -0
  8. results/DeepSeek-R1/thinking_context-120000_bon-3_summary.json +164 -0
  9. results/DeepSeek-V3-0324/nonthinking_context-120000_bon-3_summary.json +164 -0
  10. results/DeepSeek-V3-0324/thinking_context-120000_bon-3_summary.json +164 -0
  11. results/DeepSeek-V3.1/nonthinking_context-120000_bon-3_summary.json +164 -0
  12. results/DeepSeek-V3.1/thinking_context-120000_bon-3_summary.json +164 -0
  13. results/DeepSeek-V3.2/nonthinking_context-120000_bon-3_summary.json +164 -0
  14. results/DeepSeek-V3.2/thinking_context-120000_bon-3_summary.json +164 -0
  15. results/GLM-4.5/nonthinking_context-120000_bon-3_summary.json +164 -0
  16. results/GLM-4.5/thinking_context-120000_bon-3_summary.json +164 -0
  17. results/GLM-4.6/nonthinking_context-120000_bon-3_summary.json +164 -0
  18. results/GLM-4.6/thinking_context-120000_bon-3_summary.json +164 -0
  19. results/GPT-4o/nonthinking_context-120000_bon-3_summary.json +164 -0
  20. results/GPT-4o/thinking_context-120000_bon-3_summary.json +164 -0
  21. results/GPT-5/thinking_context-272000_bon-3_summary.json +164 -0
  22. results/GPT-OSS-120B/thinking_context-120000_bon-3_summary.json +164 -0
  23. results/GPT-OSS-20B/thinking_context-120000_bon-3_summary.json +164 -0
  24. results/Gemini-2.5-Flash/nonthinking_context-1000000_bon-3_summary.json +164 -0
  25. results/Gemini-2.5-Flash/thinking_context-1000000_bon-3_summary.json +164 -0
  26. results/Gemini-2.5-Pro/thinking_context-1000000_bon-3_summary.json +164 -0
  27. results/Gemma-3-12B-It/nonthinking_context-120000_bon-3_summary.json +164 -0
  28. results/Gemma-3-12B-It/thinking_context-120000_bon-3_summary.json +164 -0
  29. results/Gemma-3-27B-It/nonthinking_context-120000_bon-3_summary.json +164 -0
  30. results/Gemma-3-27B-It/thinking_context-120000_bon-3_summary.json +164 -0
  31. results/Gemma-3-4B-It/nonthinking_context-120000_bon-3_summary.json +164 -0
  32. results/Gemma-3-4B-It/thinking_context-120000_bon-3_summary.json +164 -0
  33. results/Kimi-K2-Instruct-0905/nonthinking_context-224000_bon-3_summary.json +164 -0
  34. results/Kimi-K2-Instruct-0905/thinking_context-224000_bon-3_summary.json +164 -0
  35. results/Llama-3.1-405B-Instruct/nonthinking_context-120000_bon-3_summary.json +164 -0
  36. results/Llama-3.1-405B-Instruct/thinking_context-120000_bon-3_summary.json +164 -0
  37. results/Llama-3.1-70B-Instruct/nonthinking_context-120000_bon-3_summary.json +164 -0
  38. results/Llama-3.1-70B-Instruct/thinking_context-120000_bon-3_summary.json +164 -0
  39. results/Llama-3.1-8B-Instruct/nonthinking_context-120000_bon-3_summary.json +164 -0
  40. results/Llama-3.1-8B-Instruct/thinking_context-120000_bon-3_summary.json +164 -0
  41. results/Llama-3.2-3B-Instruct/nonthinking_context-120000_bon-3_summary.json +164 -0
  42. results/Llama-3.2-3B-Instruct/thinking_context-120000_bon-3_summary.json +164 -0
  43. results/Llama-3.3-70B-Instruct/nonthinking_context-120000_bon-3_summary.json +164 -0
  44. results/Llama-3.3-70B-Instruct/thinking_context-120000_bon-3_summary.json +164 -0
  45. results/Magistral-Small-2509/thinking_context-120000_bon-3_summary.json +164 -0
  46. results/MiniMax-M2/thinking_context-1000000_bon-3_summary.json +164 -0
  47. results/MiniMax-Text-01/nonthinking_context-1000000_bon-3_summary.json +164 -0
  48. results/MiniMax-Text-01/thinking_context-1000000_bon-3_summary.json +164 -0
  49. results/Ministral-3-14B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json +164 -0
  50. results/Ministral-3-14B-Instruct-2512/thinking_context-224000_bon-3_summary.json +164 -0
README.md CHANGED
@@ -9,6 +9,4 @@ app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
  short_description: Realistic and Comprehensive Bilingual Long-Context Benchmark
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
9
  pinned: false
10
  license: apache-2.0
11
  short_description: Realistic and Comprehensive Bilingual Long-Context Benchmark
12
+ ---
 
 
app.py CHANGED
@@ -1,7 +1,880 @@
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ LongBenchmark 结果可视化
5
+ """
6
+
7
+ import json
8
+ import re
9
+ import pandas as pd
10
+ from pathlib import Path
11
  import gradio as gr
12
+ import plotly.graph_objects as go
13
+
14
# Optional per-model metadata keyed by model directory name. The leaderboard
# code reads the "type", "context_length" and "url" fields of each entry and
# already falls back to defaults for models that have no entry, so a missing
# or unreadable metadata file should not prevent the app from starting.
# NOTE: the identifier is misspelled ("MODLE"); it is kept as-is because other
# code in this file refers to it by this exact name.
try:
    with open('./results/model_info.json', 'r', encoding='utf-8') as f:
        MODLE_INFO_DICT = json.load(f)
except (OSError, json.JSONDecodeError) as e:
    print(f"Could not load ./results/model_info.json, continuing without model metadata: {e}")
    MODLE_INFO_DICT = {}
16
+
17
def get_color(index):
    """Return an HSL colour string for the trace at position ``index``.

    Consecutive indices are 137.508 degrees of hue apart (the golden angle),
    wrapped with ``% 360``, so neighbouring traces get clearly different
    colours no matter how many traces there are. Saturation and lightness are
    fixed at 70% and 60%.
    """
    golden_angle_deg = 137.508
    hue = (index * golden_angle_deg) % 360
    return "hsl({}, 70%, 60%)".format(hue)
23
+
24
class ResultParser:
    """Collect benchmark summary files and turn them into leaderboard rows.

    ``scan_all_results`` walks ``<output_dir>/<model_name>/*_summary.json`` and
    stores one dict per file in ``self.results``; ``get_leaderboard_data``
    aggregates those dicts into one table row per model.
    """

    def __init__(self, output_dir: str):
        """Remember the results root; nothing is read until ``scan_all_results``."""
        self.output_dir = Path(output_dir)
        self.results = []

    def parse_filename(self, filename: str):
        """Extract run settings encoded in a summary file name.

        Returns:
            ``(context_length, has_thinking, has_nonthinking)`` where
            ``context_length`` is the integer after ``context-`` (0 if absent)
            and the two flags are mutually exclusive.
        """
        context_match = re.search(r'context-(\d+)', filename)
        context_length = int(context_match.group(1)) if context_match else 0

        filename_lower = filename.lower()
        # "nonthinking" contains "thinking", so it has to be tested first and
        # then excluded from the plain "thinking" check.
        has_nonthinking = 'nonthinking' in filename_lower
        has_thinking = 'thinking' in filename_lower and not has_nonthinking

        return context_length, has_thinking, has_nonthinking

    def parse_result_file(self, model_name: str, file_path: Path):
        """Parse one summary JSON file into a flat result dict.

        Returns:
            The result dict, or ``None`` if the file cannot be read or parsed
            (the error is printed; scanning is deliberately best-effort).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            context_length, has_thinking, has_nonthinking = self.parse_filename(file_path.name)

            # Only keep the BoN entries that actually carry an overall metric.
            bon_data = {
                bon_key: data[bon_key]['overall_metric']
                for bon_key in ('BoN-1', 'BoN-2', 'BoN-3')
                if bon_key in data and 'overall_metric' in data[bon_key]
            }

            return {
                'model_name': model_name,
                # The evaluation date comes from the JSON "date" field.
                'eval_date': data.get('date', "未知"),
                'context_length': context_length,
                'has_thinking': has_thinking,
                'has_nonthinking': has_nonthinking,
                'overall_metric': data.get('average_overall_metric', 0.0),
                'token_length_metrics': data.get('average_token_length_metric', {}),
                'contextual_requirement': data.get('average_contextual_requirement_metric', {}),
                'difficulty': data.get('average_difficulty_metric', {}),
                'primary_task': data.get('average_primary_task_metric', {}),
                'language': data.get('average_language_metric', {}),
                'bon_data': bon_data,
                'pass_at_k': {
                    'pass@1': data.get('pass@1'),
                    'pass@2': data.get('pass@2'),
                    'pass@3': data.get('pass@3')
                }
            }

        except Exception as e:
            print(f"解析文件 {file_path} 时出错: {e}")
            return None

    def scan_all_results(self):
        """Reload ``self.results`` from every ``*_summary.json`` under ``output_dir``.

        Directories and files are visited in sorted order so that the order of
        ``self.results`` (and everything derived from it) does not depend on
        the filesystem's directory listing order.
        """
        self.results = []

        if not self.output_dir.exists():
            print(f"输出目录不存在: {self.output_dir}")
            return

        for model_dir in sorted(self.output_dir.iterdir()):
            # Plain files directly under the root are not model directories.
            if not model_dir.is_dir():
                continue

            model_name = model_dir.name
            print(f"扫描模型: {model_name}")

            for file_path in sorted(model_dir.glob("*_summary.json")):
                print(f"  解析文件: {file_path.name}")
                result = self.parse_result_file(model_name, file_path)
                if result:
                    self.results.append(result)

        print(f"总共解析了 {len(self.results)} 个结果文件")

    @staticmethod
    def _format_context_length(tokens):
        """Format a token count as e.g. ``"120k"``, ``"1M"`` or ``"1.5M"``."""
        if tokens >= 1000000:
            return f"{tokens/1000000:.0f}M" if tokens % 1000000 == 0 else f"{tokens/1000000:.1f}M"
        if tokens >= 1000:
            return f"{tokens/1000:.0f}k" if tokens % 1000 == 0 else f"{tokens/1000:.1f}k"
        return str(tokens)

    @staticmethod
    def _mean_score(scores):
        """Return ``(mean, display_string)``; ``(0, "-")`` when there are no scores."""
        if not scores:
            return 0, "-"
        mean = sum(scores) / len(scores)
        return mean, f"{mean * 100:.2f}"

    def get_leaderboard_data(self):
        """Build the leaderboard table, one row per model, best score first.

        Returns:
            A ``pandas.DataFrame`` with the display columns used by the UI, or
            an empty DataFrame when no results have been loaded.
        """
        if not self.results:
            return pd.DataFrame()

        # Group the per-file results by model.
        model_groups = {}
        for result in self.results:
            group = model_groups.setdefault(result['model_name'], {
                'contexts': [],
                'thinking_scores': [],
                'non_thinking_scores': []
            })
            group['contexts'].append(result['context_length'])
            # Anything that is not explicitly a "thinking" run counts as non-thinking.
            bucket = 'thinking_scores' if result['has_thinking'] else 'non_thinking_scores'
            group[bucket].append(result['overall_metric'])

        leaderboard_data = []
        for model_name, group in model_groups.items():
            max_context = max(group['contexts']) if group['contexts'] else 0
            context_str = self._format_context_length(max_context)

            # Look up optional metadata; an entry may be a dict or a bare type string.
            model_type = "Unknown"
            model_context = "-"
            model_url = ""
            if model_name in MODLE_INFO_DICT:
                model_info = MODLE_INFO_DICT[model_name]
                if isinstance(model_info, dict):
                    model_type = model_info.get("type", "Unknown")
                    model_context = model_info.get("context_length", "-")
                    model_url = model_info.get("url", "")
                else:
                    model_type = str(model_info)

            # Render the name as a Markdown link when a URL is known.
            display_model_name = f"[{model_name}]({model_url})" if model_url else model_name

            nt_score_val, nt_score_str = self._mean_score(group['non_thinking_scores'])
            t_score_val, t_score_str = self._mean_score(group['thinking_scores'])

            leaderboard_data.append({
                '模型名称': display_model_name,
                '模型类型': model_type,
                '上下文长度': model_context,
                '截断长度': context_str,
                '非思考得分': nt_score_str,
                '思考得分': t_score_str,
                '_sort_score': max(nt_score_val, t_score_val)
            })

        df = pd.DataFrame(leaderboard_data)
        if not df.empty:
            # Rank by the better of the two scores; a stable sort keeps tied
            # models in scan order instead of an arbitrary one.
            df = (
                df.sort_values('_sort_score', ascending=False, kind='stable')
                .drop(columns=['_sort_score'])
                .reset_index(drop=True)
            )

        return df
203
+
204
def get_display_name_for_result(result):
    """Return the name a result is listed under in selectors and chart legends.

    The model name gets a ``_nonthinking`` or ``_thinking`` suffix according to
    the result's flags (``has_nonthinking`` wins if both are set); with neither
    flag the bare model name is returned.
    """
    if result.get('has_nonthinking'):
        suffix = "_nonthinking"
    elif result.get('has_thinking'):
        suffix = "_thinking"
    else:
        suffix = ""
    return f"{result['model_name']}{suffix}"
212
+
213
def get_model_color_index(model_name, all_models):
    """Return the position of ``model_name`` in ``all_models``, or 0 if it is absent."""
    try:
        position = all_models.index(model_name)
    except ValueError:
        # Unknown models share the first colour slot.
        position = 0
    return position
219
+
220
def _build_grouped_bar_chart(results, selected_models, metric_key, title, xaxis_title):
    """Shared implementation of the per-category grouped bar charts.

    Args:
        results: Result dicts as produced by ``ResultParser``.
        selected_models: Display names (see ``get_display_name_for_result``)
            to plot; their order defines trace order and colours.
        metric_key: Key of the ``{category: score}`` dict inside each result.
        title: Chart title.
        xaxis_title: X axis title.

    Returns:
        A ``go.Figure`` with one bar trace per selected model that has data.
        Scores are multiplied by 100; a category a model lacks is drawn as 0.
    """
    if not selected_models:
        return go.Figure()

    chart_data = {}   # display name -> {category: score * 100}
    categories = []   # categories in first-seen order (intentionally not sorted)
    for result in results:
        display_name = get_display_name_for_result(result)
        if display_name not in selected_models:
            continue
        model_scores = chart_data.setdefault(display_name, {})
        # Values are taken straight from the summary; no averaging is done here.
        for category, score in (result.get(metric_key) or {}).items():
            if category not in categories:
                categories.append(category)
            if score is None:
                # A null score in the JSON would otherwise raise on "* 100".
                continue
            model_scores[category] = score * 100

    fig = go.Figure()
    for model_name in selected_models:
        if model_name not in chart_data:
            continue
        scores = [chart_data[model_name].get(category, 0) for category in categories]
        color_index = get_model_color_index(model_name, selected_models)
        fig.add_trace(go.Bar(
            name=model_name,
            x=categories,
            y=scores,
            marker_color=get_color(color_index),
            text=[f"{score:.2f}" for score in scores],
            textposition='auto'
        ))

    fig.update_layout(
        title=title,
        xaxis_title=xaxis_title,
        yaxis_title='平均得分',
        barmode='group',
        autosize=True,
        # Horizontal legend below the plot; the bottom margin makes room for it.
        legend=dict(
            orientation="h",
            yanchor="top",
            y=-0.25,
            xanchor="center",
            x=0.5
        ),
        margin=dict(b=100)
    )

    return fig


def create_contextual_requirement_chart(results, selected_models):
    """Grouped bar chart of scores per contextual-requirement category."""
    return _build_grouped_bar_chart(
        results,
        selected_models,
        metric_key='contextual_requirement',
        title='模型在不同上下文需求上的性能对比',
        xaxis_title='上下文需求类型',
    )


def create_primary_task_radar_chart(results, selected_models):
    """Radar chart of scores per primary task.

    Task keys are grouped by the text before their first ``.``; scores sharing
    a prefix are averaged per model, and at most the first 11 prefixes (in
    first-seen order) are plotted. A prefix a model has no score for is drawn
    as 0.
    """
    if not selected_models:
        return go.Figure()

    prefix_order = []          # task prefixes in first-seen order
    model_prefix_scores = {}   # display name -> {prefix: [score * 100, ...]}

    for result in results:
        display_name = get_display_name_for_result(result)
        if display_name not in selected_models:
            continue
        per_prefix = model_prefix_scores.setdefault(display_name, {})
        for task_key, score in (result.get('primary_task') or {}).items():
            prefix = task_key.split('.')[0].strip() if isinstance(task_key, str) else str(task_key)
            if prefix not in prefix_order:
                prefix_order.append(prefix)
            if score is None:
                # A null score in the JSON would otherwise raise on "* 100".
                continue
            per_prefix.setdefault(prefix, []).append(score * 100)

    categories = prefix_order[:11]

    fig = go.Figure()

    for model_name in selected_models:
        if model_name not in model_prefix_scores:
            continue
        values = []
        for prefix in categories:
            scores = model_prefix_scores[model_name].get(prefix, [])
            values.append(sum(scores) / len(scores) if scores else 0)
        # Repeat the first point at the end so the outline closes.
        r_values = values + ([values[0]] if values else [])
        theta_values = categories + ([categories[0]] if categories else [])
        color_index = get_model_color_index(model_name, selected_models)
        fig.add_trace(go.Scatterpolar(
            r=r_values,
            theta=theta_values,
            mode='lines+markers',
            name=model_name,
            line=dict(color=get_color(color_index), width=3),
            marker=dict(size=6),
            fill='toself'
        ))

    fig.update_layout(
        title='模型在不同主要任务上的性能对比',
        polar=dict(
            radialaxis=dict(visible=True, range=[0, 100])
        ),
        legend=dict(
            orientation="h",
            yanchor="top",
            y=-0.2,
            xanchor="center",
            x=0.5
        ),
        margin=dict(b=100)
    )

    return fig


def create_language_chart(results, selected_models):
    """Grouped bar chart of scores per language."""
    return _build_grouped_bar_chart(
        results,
        selected_models,
        metric_key='language',
        title='模型在不同语言上的性能对比',
        xaxis_title='语言类型',
    )


def create_difficulty_chart(results, selected_models):
    """Grouped bar chart of scores per difficulty level."""
    return _build_grouped_bar_chart(
        results,
        selected_models,
        metric_key='difficulty',
        title='模型在不同难度上的性能对比',
        xaxis_title='难度类型',
    )
490
+
491
def create_length_heatmap(results, selected_models):
    """Heatmap of scores by context-length bucket (x) and model (y).

    Args:
        results: Result dicts as produced by ``ResultParser``.
        selected_models: Display names to include.

    Returns:
        A ``go.Figure`` with one row per matching result, in ``results`` order.
        Scores are multiplied by 100; a bucket with no (or a null) score is
        left as a gap and labelled ``N/A``.
    """
    if not selected_models:
        return go.Figure()

    # Bucket keys as they appear in the summaries' token-length metrics; they
    # double as the x axis labels, so there is a single list to maintain.
    length_keys = ['8k', '16k', '32k', '64k', '128k', '256k']

    heatmap_data = []
    model_names = []

    for result in results:
        display_name = get_display_name_for_result(result)
        if display_name not in selected_models:
            continue
        model_names.append(display_name)

        token_length_metrics = result.get('token_length_metrics') or {}
        row_data = []
        for key in length_keys:
            value = token_length_metrics.get(key)
            # A missing or null bucket becomes a gap instead of raising on "* 100".
            row_data.append(value * 100 if value is not None else None)
        heatmap_data.append(row_data)

    fig = go.Figure(data=go.Heatmap(
        z=heatmap_data,
        x=length_keys,
        y=model_names,
        # "_r" reverses RdYlBu, so low scores are blue and high scores are red.
        colorscale='RdYlBu_r',
        showscale=True,
        text=[[f"{val:.2f}" if val is not None else "N/A" for val in row] for row in heatmap_data],
        texttemplate="%{text}",
        textfont={"size": 10},
        hoverongaps=False
    ))

    fig.update_layout(
        title='模型在不同Context长度上的性能热力图',
        xaxis_title='Context长度 (tokens)',
        yaxis_title='模型名称',
        autosize=True,
        # Grow with the number of rows, but never below 400px.
        height=max(400, len(model_names) * 50),
        # Wide left margin leaves room for the model names.
        margin=dict(l=150, r=50, t=80, b=80)
    )

    return fig
544
+
545
def _build_top_n_line_chart(results, selected_models, data_key, labels, title, yaxis_title):
    """Shared implementation of the BoN and Pass@N line charts.

    Args:
        results: Result dicts as produced by ``ResultParser``.
        selected_models: Display names to include; also used to pick colours.
        data_key: Key of the ``{label: score}`` dict inside each result.
        labels: Series labels in x order; label ``i`` is plotted at ``x = i + 1``.
        title: Chart title.
        yaxis_title: Y axis title.

    Returns:
        A ``go.Figure`` with one line per model that has at least one value.
        Scores are multiplied by 100; missing or null points are left as gaps.
    """
    if not selected_models:
        return go.Figure()

    indices = list(range(1, len(labels) + 1))

    model_data = {}  # display name -> {x index: score * 100}
    for result in results:
        display_name = get_display_name_for_result(result)
        if display_name not in selected_models:
            continue
        series = model_data.setdefault(display_name, {})
        values = result.get(data_key) or {}
        for idx, label in zip(indices, labels):
            val = values.get(label)
            if val is not None:
                series[idx] = val * 100

    fig = go.Figure()

    for model_name, data in model_data.items():
        if not data:
            continue

        y_values = [data.get(idx) for idx in indices]
        text_values = [f"{data[idx]:.2f}" if idx in data else "" for idx in indices]
        color_index = get_model_color_index(model_name, selected_models)

        # NOTE(review): "mode" has no "+text", so per Plotly's scatter reference
        # the text shows on hover only and textposition has no visible effect.
        # Kept as in the original; confirm whether on-chart labels were intended.
        fig.add_trace(go.Scatter(
            x=indices,
            y=y_values,
            mode='lines+markers',
            name=model_name,
            line=dict(color=get_color(color_index), width=3),
            marker=dict(size=10),
            text=text_values,
            textposition='top center',
            connectgaps=False
        ))

    fig.update_layout(
        title=title,
        xaxis_title='N',
        yaxis_title=yaxis_title,
        autosize=True,
        # Show the series labels instead of the bare 1..N positions.
        xaxis=dict(
            tickmode='array',
            tickvals=indices,
            ticktext=list(labels),
            tickangle=0
        ),
        legend=dict(
            orientation="h",
            yanchor="top",
            y=-0.25,
            xanchor="center",
            x=0.5
        ),
        margin=dict(b=100)
    )

    return fig


def create_bon_chart(results, selected_models):
    """Line chart of the overall metric at BoN-1, BoN-2 and BoN-3 per model."""
    return _build_top_n_line_chart(
        results,
        selected_models,
        data_key='bon_data',
        labels=['BoN-1', 'BoN-2', 'BoN-3'],
        title='模型在不同Best-of-N下的对比',
        yaxis_title='平均得分',
    )


def create_pass_k_chart(results, selected_models):
    """Line chart of pass@1, pass@2 and pass@3 per model."""
    return _build_top_n_line_chart(
        results,
        selected_models,
        data_key='pass_at_k',
        labels=['pass@1', 'pass@2', 'pass@3'],
        title='模型在不同Pass@N下的对比',
        yaxis_title='Pass@N (%)',
    )
713
+
714
def create_gradio_interface(parser: ResultParser):
    """Build the Gradio Blocks app: a leaderboard table plus per-dimension charts.

    Args:
        parser: The ``ResultParser`` whose results back the table and charts.
            It is re-scanned every time the page loads.

    Returns:
        The assembled (not yet launched) ``gr.Blocks`` app.
    """

    def refresh_data():
        """Re-scan the results directory and return the leaderboard table."""
        parser.scan_all_results()
        return parser.get_leaderboard_data()

    def get_model_choices():
        """Sorted, de-duplicated display names of all loaded results."""
        if not parser.results:
            return []
        return sorted({get_display_name_for_result(r) for r in parser.results})

    def update_charts(selected_models):
        """Rebuild all seven charts, in the order of the button's outputs."""
        if not selected_models:
            return (None,) * 7

        builders = (
            create_length_heatmap,
            create_contextual_requirement_chart,
            create_primary_task_radar_chart,
            create_language_chart,
            create_difficulty_chart,
            create_bon_chart,
            create_pass_k_chart,
        )
        return tuple(build(parser.results, selected_models) for build in builders)

    # Custom CSS: centre the page title and every table header, and centre all
    # table cells except the row-number and model-name columns.
    custom_css = """
    /* 强制标题居中 */
    h1 {
        text-align: center;
        display: block;
    }

    /* 表头居中 */
    #leaderboard_table th,
    #leaderboard_table th button,
    #leaderboard_table th span {
        text-align: center !important;
        justify-content: center !important;
    }

    /* 内容列居中:从第3列开始(跳过行号和模型名称) */
    #leaderboard_table td:nth-child(n+3) {
        text-align: center !important;
    }
    """

    with gr.Blocks(title="LongBench Pro 结果可视化", theme=gr.themes.Soft(), css=custom_css) as demo:
        gr.Markdown("# LongBench Pro 结果可视化")

        # Badge row linking to the dataset, code, leaderboard and paper.
        gr.HTML("""
        <div style="text-align: center; display: flex; justify-content: center; gap: 10px; margin-bottom: 20px;">
        <a href="https://huggingface.co/datasets/caskcsg/LongBench-Pro" target="_blank"><img src="https://img.shields.io/badge/HF-Dataset-yellow?logo=huggingface&logoColor=white" alt="HF Dataset"></a>
        <a href="https://github.com/caskcsg/longcontext/tree/main/LongBench_Pro" target="_blank"><img src="https://img.shields.io/badge/Github-Code-blue?logo=github&logoColor=white" alt="Github Code"></a>
        <a href="https://huggingface.co/spaces/caskcsg/LongBench-Pro-Leaderboard" target="_blank"><img src="https://img.shields.io/badge/🏆-Leaderboard-red" alt="Leaderboard"></a>
        <a href="#" target="_blank"><img src="https://img.shields.io/badge/📄-Arxiv_Paper-green" alt="Paper"></a>
        </div>
        """)

        # Leaderboard section.
        gr.Markdown("## 🏆 总体性能排行榜")
        gr.Markdown("""
        - *思考模型和混合思考模型的思考得分,使用本身的思考能力(Non-Thinking Prompt)*
        - *指令模型的思考得分,使用思考提示获得(Thinking Prompt)*
        """)
        leaderboard_df = gr.Dataframe(
            headers=["模型名称", "模型类型", "上下文长度", "截断长度", "非思考得分", "思考得分"],
            datatype=["markdown", "str", "str", "str", "str", "str"],
            interactive=False,
            wrap=True,
            show_row_numbers=True,
            show_search="filter",
            show_fullscreen_button=True,
            max_height=800,
            column_widths=["250px", "100px", "100px", "100px", "120px", "120px"],
            elem_id="leaderboard_table"
        )

        # Model picker and chart section.
        gr.HTML("<br>")
        gr.Markdown("## 📊 特定维度对比")
        with gr.Row():
            with gr.Column(scale=4):
                model_selector = gr.Dropdown(
                    choices=[],
                    label="选择模型",
                    value=[],
                    multiselect=True,
                    interactive=True
                )
            with gr.Column(scale=1):
                update_charts_btn = gr.Button("更新图表", variant="primary", size="lg")

        with gr.Tabs():
            with gr.TabItem("语言维度"):
                language_plot = gr.Plot()

            with gr.TabItem("难度维度"):
                difficulty_plot = gr.Plot()

            with gr.TabItem("长度维度"):
                length_heatmap = gr.Plot()

            with gr.TabItem("主要任务维度"):
                primary_task_radar_plot = gr.Plot()

            with gr.TabItem("上下文需求维度"):
                contextual_plot = gr.Plot()

            with gr.TabItem("BoN维度"):
                bon_plot = gr.Plot()

            with gr.TabItem("Pass@N维度"):
                pass_k_plot = gr.Plot()

        def update_model_choices():
            """Refill the dropdown with the current model names and clear the selection."""
            return gr.Dropdown(choices=get_model_choices(), value=[])

        # The outputs order must match the tuple returned by update_charts.
        update_charts_btn.click(
            fn=update_charts,
            inputs=[model_selector],
            outputs=[
                length_heatmap,
                contextual_plot,
                primary_task_radar_plot,
                language_plot,
                difficulty_plot,
                bon_plot,
                pass_k_plot,
            ]
        )

        # On page load: refresh the table first, then the dropdown choices.
        demo.load(
            fn=refresh_data,
            outputs=[leaderboard_df]
        ).then(
            fn=update_model_choices,
            outputs=[model_selector]
        )

    return demo
862
 
863
def main():
    """Entry point: load the results under ./results, build the UI and serve it."""
    results_root = "./results"

    print("初始化结果解析器...")
    result_parser = ResultParser(results_root)

    print("扫描结果文件...")
    result_parser.scan_all_results()

    print("创建Gradio界面...")
    app = create_gradio_interface(result_parser)

    print("启动服务器...")
    app.launch()


if __name__ == "__main__":
    main()
results/Claude-3.7-Sonnet/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.5144730485997339,
9
+ "inference_iteration_1_overall_metric": 0.5192628714494713,
10
+ "inference_iteration_2_overall_metric": 0.5090899475543829,
11
+ "inference_iteration_3_overall_metric": 0.515066326795347,
12
+ "average_token_length_metric": {
13
+ "8k": 0.5927607589235461,
14
+ "16k": 0.5922491004183165,
15
+ "32k": 0.5555486925170308,
16
+ "64k": 0.4991997081584744,
17
+ "128k": 0.45285894052515324,
18
+ "256k": 0.39422109105588254
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.47584256909012274,
22
+ "Partial": 0.5636391134301498
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.6868572950582806,
26
+ "Moderate": 0.48375113564429373,
27
+ "Hard": 0.4728683670759167,
28
+ "Extreme": 0.3731393645349295
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.7586982224527067,
32
+ "T2. Sequencing & Structure Reconstruction": 0.7545327049493711,
33
+ "T3. Evidence-Grounded QA": 0.5277777777777779,
34
+ "T4. Summarization & Synthesis": 0.5250996637138268,
35
+ "T5. Attribution & Citation Alignment": 0.5254132304220211,
36
+ "T6. Aggregation & Clustering": 0.47394883159992857,
37
+ "T7. Consistency & Compliance Checking": 0.3040021982475052,
38
+ "T8. Structured & Numeric Reasoning": 0.41188271604938276,
39
+ "T9. Version & Code Diff Analysis": 0.6042705189653765,
40
+ "T10. Rule Induction & In-Context Learning": 0.5114814814814815,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.43888888888888894
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.5100199196277736,
45
+ "English": 0.5189261775716956
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.5192628714494713,
49
+ "token_length": {
50
+ "8k": 0.5937761874854375,
51
+ "16k": 0.606154781504802,
52
+ "32k": 0.5701163293545726,
53
+ "64k": 0.49747085680734393,
54
+ "128k": 0.4476635155122931,
55
+ "256k": 0.40039555803238175
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.47604900489990065,
59
+ "Partial": 0.5742623379671082
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.6993477849390543,
63
+ "Moderate": 0.4824572359285609,
64
+ "Hard": 0.47426941765067004,
65
+ "Extreme": 0.37571516352087697
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.7482072090157142,
69
+ "T2. Sequencing & Structure Reconstruction": 0.7523087560587559,
70
+ "T3. Evidence-Grounded QA": 0.5333333333333333,
71
+ "T4. Summarization & Synthesis": 0.5249609995597003,
72
+ "T5. Attribution & Citation Alignment": 0.5307787048666445,
73
+ "T6. Aggregation & Clustering": 0.47337460590728553,
74
+ "T7. Consistency & Compliance Checking": 0.30808530916861365,
75
+ "T8. Structured & Numeric Reasoning": 0.40740740740740744,
76
+ "T9. Version & Code Diff Analysis": 0.6209514621148434,
77
+ "T10. Rule Induction & In-Context Learning": 0.5469444444444443,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.45
79
+ },
80
+ "language": {
81
+ "Chinese": 0.5151862466463957,
82
+ "English": 0.5233394962525484
83
+ }
84
+ },
85
+ "pass@1": 0.2673333333333333,
86
+ "BoN-2": {
87
+ "overall_metric": 0.5638452937577435,
88
+ "token_length": {
89
+ "8k": 0.6334745299899752,
90
+ "16k": 0.6535009894669588,
91
+ "32k": 0.6109603205298609,
92
+ "64k": 0.5566414838337063,
93
+ "128k": 0.5030500434216845,
94
+ "256k": 0.4254443953042751
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.523183868231744,
98
+ "Partial": 0.6155961989726518
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.7554808778953406,
102
+ "Moderate": 0.5292531239954449,
103
+ "Hard": 0.5124984336029716,
104
+ "Extreme": 0.41036353942072434
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.8018540164669292,
108
+ "T2. Sequencing & Structure Reconstruction": 0.7873358123358121,
109
+ "T3. Evidence-Grounded QA": 0.5666666666666667,
110
+ "T4. Summarization & Synthesis": 0.5413926274234249,
111
+ "T5. Attribution & Citation Alignment": 0.5675265408978069,
112
+ "T6. Aggregation & Clustering": 0.5409163851157314,
113
+ "T7. Consistency & Compliance Checking": 0.34558583778191654,
114
+ "T8. Structured & Numeric Reasoning": 0.4685185185185185,
115
+ "T9. Version & Code Diff Analysis": 0.6508982849457906,
116
+ "T10. Rule Induction & In-Context Learning": 0.6081944444444445,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.49166666666666664
118
+ },
119
+ "language": {
120
+ "Chinese": 0.5545031574164562,
121
+ "English": 0.5731874300990306
122
+ }
123
+ },
124
+ "pass@2": 0.30533333333333335,
125
+ "BoN-3": {
126
+ "overall_metric": 0.5987860690936202,
127
+ "token_length": {
128
+ "8k": 0.6767030215651172,
129
+ "16k": 0.6801366595488965,
130
+ "32k": 0.6345247903374839,
131
+ "64k": 0.6039272497657204,
132
+ "128k": 0.5233376257525678,
133
+ "256k": 0.47408706759193875
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.5614328850984434,
137
+ "Partial": 0.6463264850874838
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.7882244354415668,
141
+ "Moderate": 0.5766160107480841,
142
+ "Hard": 0.5655081578600745,
143
+ "Extreme": 0.42768418415872295
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.8178727620501064,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8166698116698113,
148
+ "T3. Evidence-Grounded QA": 0.6166666666666667,
149
+ "T4. Summarization & Synthesis": 0.5469270547891242,
150
+ "T5. Attribution & Citation Alignment": 0.581425502230592,
151
+ "T6. Aggregation & Clustering": 0.5699768544212985,
152
+ "T7. Consistency & Compliance Checking": 0.3875679491876082,
153
+ "T8. Structured & Numeric Reasoning": 0.5462962962962963,
154
+ "T9. Version & Code Diff Analysis": 0.6792246386283735,
155
+ "T10. Rule Induction & In-Context Learning": 0.6452777777777778,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.525
157
+ },
158
+ "language": {
159
+ "Chinese": 0.5867683815613326,
160
+ "English": 0.6108037566259096
161
+ }
162
+ },
163
+ "pass@3": 0.3433333333333333
164
+ }
results/Claude-3.7-Sonnet/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.5966078087059191,
9
+ "inference_iteration_1_overall_metric": 0.5938171820634314,
10
+ "inference_iteration_2_overall_metric": 0.5955816438384393,
11
+ "inference_iteration_3_overall_metric": 0.6004246002158852,
12
+ "average_token_length_metric": {
13
+ "8k": 0.6997135645823386,
14
+ "16k": 0.6577212798228894,
15
+ "32k": 0.6419035800281319,
16
+ "64k": 0.6238264957040918,
17
+ "128k": 0.523846643485212,
18
+ "256k": 0.43263528861285133
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.5527640561663963,
22
+ "Partial": 0.652408948301675
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.7825568834883242,
26
+ "Moderate": 0.6155843766907921,
27
+ "Hard": 0.5658238514809286,
28
+ "Extreme": 0.4006643574451805
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.8486700113628681,
32
+ "T2. Sequencing & Structure Reconstruction": 0.8000456983629063,
33
+ "T3. Evidence-Grounded QA": 0.5027777777777777,
34
+ "T4. Summarization & Synthesis": 0.5309981037882555,
35
+ "T5. Attribution & Citation Alignment": 0.5878801280848494,
36
+ "T6. Aggregation & Clustering": 0.5732629573374424,
37
+ "T7. Consistency & Compliance Checking": 0.3939611740106759,
38
+ "T8. Structured & Numeric Reasoning": 0.6212962962962962,
39
+ "T9. Version & Code Diff Analysis": 0.69342672946219,
40
+ "T10. Rule Induction & In-Context Learning": 0.5610185185185187,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5499999999999997
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.588350975552306,
45
+ "English": 0.6048646418595319
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.5938171820634314,
49
+ "token_length": {
50
+ "8k": 0.7191856133745022,
51
+ "16k": 0.6402554520543442,
52
+ "32k": 0.6341044882273853,
53
+ "64k": 0.6259136300211012,
54
+ "128k": 0.5177437687626877,
55
+ "256k": 0.4257001399405685
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.5550251903269423,
59
+ "Partial": 0.6431888079098727
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.7745826629942724,
63
+ "Moderate": 0.6051939662024207,
64
+ "Hard": 0.5720458613452599,
65
+ "Extreme": 0.40262413493324634
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.8535130690589529,
69
+ "T2. Sequencing & Structure Reconstruction": 0.7808294450361011,
70
+ "T3. Evidence-Grounded QA": 0.5,
71
+ "T4. Summarization & Synthesis": 0.5297721568932299,
72
+ "T5. Attribution & Citation Alignment": 0.5849862779597039,
73
+ "T6. Aggregation & Clustering": 0.5655938313957183,
74
+ "T7. Consistency & Compliance Checking": 0.3878929800328972,
75
+ "T8. Structured & Numeric Reasoning": 0.6314814814814815,
76
+ "T9. Version & Code Diff Analysis": 0.6772724985908704,
77
+ "T10. Rule Induction & In-Context Learning": 0.5855555555555555,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333333
79
+ },
80
+ "language": {
81
+ "Chinese": 0.5905648959587577,
82
+ "English": 0.5970694681681056
83
+ }
84
+ },
85
+ "pass@1": 0.37266666666666665,
86
+ "BoN-2": {
87
+ "overall_metric": 0.6528466974974912,
88
+ "token_length": {
89
+ "8k": 0.7638003125719123,
90
+ "16k": 0.7101654799888735,
91
+ "32k": 0.7028814358570394,
92
+ "64k": 0.6870948174265773,
93
+ "128k": 0.565071084237125,
94
+ "256k": 0.4880670549034259
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.6177398094156152,
98
+ "Partial": 0.6975281914198818
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.8281250142111442,
102
+ "Moderate": 0.6919969423858446,
103
+ "Hard": 0.6370194614390577,
104
+ "Extreme": 0.4455182924797047
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.8893518121473104,
108
+ "T2. Sequencing & Structure Reconstruction": 0.8424985248669454,
109
+ "T3. Evidence-Grounded QA": 0.6,
110
+ "T4. Summarization & Synthesis": 0.544816527949887,
111
+ "T5. Attribution & Citation Alignment": 0.6380851790586051,
112
+ "T6. Aggregation & Clustering": 0.6286177167059521,
113
+ "T7. Consistency & Compliance Checking": 0.45873422235423905,
114
+ "T8. Structured & Numeric Reasoning": 0.6578703703703704,
115
+ "T9. Version & Code Diff Analysis": 0.7510537661056169,
116
+ "T10. Rule Induction & In-Context Learning": 0.618611111111111,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6583333333333333
118
+ },
119
+ "language": {
120
+ "Chinese": 0.6376749344767387,
121
+ "English": 0.6680184605182464
122
+ }
123
+ },
124
+ "pass@2": 0.44333333333333336,
125
+ "BoN-3": {
126
+ "overall_metric": 0.6830970097115105,
127
+ "token_length": {
128
+ "8k": 0.7837282981657816,
129
+ "16k": 0.7434736805597991,
130
+ "32k": 0.7293201301488285,
131
+ "64k": 0.7233152422607191,
132
+ "128k": 0.6019561639441493,
133
+ "256k": 0.5167885431897875
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.6474666039819896,
137
+ "Partial": 0.7284447988218108
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.8607457235279341,
141
+ "Moderate": 0.7303808913214545,
142
+ "Hard": 0.6594975210053281,
143
+ "Extreme": 0.47293457878264067
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.9066883296818435,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8643315529499735,
148
+ "T3. Evidence-Grounded QA": 0.6166666666666667,
149
+ "T4. Summarization & Synthesis": 0.5522881837833883,
150
+ "T5. Attribution & Citation Alignment": 0.6507666087400348,
151
+ "T6. Aggregation & Clustering": 0.6674614963830652,
152
+ "T7. Consistency & Compliance Checking": 0.48674980053348876,
153
+ "T8. Structured & Numeric Reasoning": 0.7092592592592593,
154
+ "T9. Version & Code Diff Analysis": 0.7716543341971472,
155
+ "T10. Rule Induction & In-Context Learning": 0.6727777777777777,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.7083333333333334
157
+ },
158
+ "language": {
159
+ "Chinese": 0.6635013624534305,
160
+ "English": 0.7026926569695916
161
+ }
162
+ },
163
+ "pass@3": 0.4826666666666667
164
+ }
results/Claude-4-Sonnet/nonthinking_context-1000000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.5606565628619046,
9
+ "inference_iteration_1_overall_metric": 0.5620036050651629,
10
+ "inference_iteration_2_overall_metric": 0.5631248059457928,
11
+ "inference_iteration_3_overall_metric": 0.5568412775747574,
12
+ "average_token_length_metric": {
13
+ "8k": 0.6071528197414233,
14
+ "16k": 0.5816154959256097,
15
+ "32k": 0.5612446325027117,
16
+ "64k": 0.5254403501645888,
17
+ "128k": 0.5465188702735214,
18
+ "256k": 0.541967208563576
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.5264359183584719,
22
+ "Partial": 0.6042101104117302
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.6842286640015465,
26
+ "Moderate": 0.5396282888053806,
27
+ "Hard": 0.5757154269645611,
28
+ "Extreme": 0.4292097599695439
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.8091570544332052,
32
+ "T2. Sequencing & Structure Reconstruction": 0.8207229190562522,
33
+ "T3. Evidence-Grounded QA": 0.5027777777777778,
34
+ "T4. Summarization & Synthesis": 0.5379205858992453,
35
+ "T5. Attribution & Citation Alignment": 0.6634794615844591,
36
+ "T6. Aggregation & Clustering": 0.5193980953038955,
37
+ "T7. Consistency & Compliance Checking": 0.41333574812040713,
38
+ "T8. Structured & Numeric Reasoning": 0.3430555555555555,
39
+ "T9. Version & Code Diff Analysis": 0.7423632867012375,
40
+ "T10. Rule Induction & In-Context Learning": 0.5125462962962964,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5055555555555556
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.5499191967861178,
45
+ "English": 0.5713939289376934
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.5620036050651629,
49
+ "token_length": {
50
+ "8k": 0.6116650341526972,
51
+ "16k": 0.5789009200875381,
52
+ "32k": 0.564884526756138,
53
+ "64k": 0.5150575277083638,
54
+ "128k": 0.5512053153279016,
55
+ "256k": 0.5503083063583438
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.5327035707633319,
59
+ "Partial": 0.5992945578129499
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.6903796852415736,
63
+ "Moderate": 0.5418819297458669,
64
+ "Hard": 0.5712079185155163,
65
+ "Extreme": 0.4285441662812896
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.8022069774152061,
69
+ "T2. Sequencing & Structure Reconstruction": 0.8176220076220074,
70
+ "T3. Evidence-Grounded QA": 0.5333333333333333,
71
+ "T4. Summarization & Synthesis": 0.5377531684167628,
72
+ "T5. Attribution & Citation Alignment": 0.6702134910265403,
73
+ "T6. Aggregation & Clustering": 0.509165168922684,
74
+ "T7. Consistency & Compliance Checking": 0.4132211721134369,
75
+ "T8. Structured & Numeric Reasoning": 0.336574074074074,
76
+ "T9. Version & Code Diff Analysis": 0.755058796168736,
77
+ "T10. Rule Induction & In-Context Learning": 0.5204166666666666,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5
79
+ },
80
+ "language": {
81
+ "Chinese": 0.5525156622096248,
82
+ "English": 0.5714915479207029
83
+ }
84
+ },
85
+ "pass@1": 0.2986666666666667,
86
+ "BoN-2": {
87
+ "overall_metric": 0.6070413152649775,
88
+ "token_length": {
89
+ "8k": 0.648428309386666,
90
+ "16k": 0.6138312363156012,
91
+ "32k": 0.6050527198490672,
92
+ "64k": 0.5709459527531863,
93
+ "128k": 0.6043903989020145,
94
+ "256k": 0.5995992743833368
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.5725088578790201,
98
+ "Partial": 0.6509917155743812
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.7433988294310949,
102
+ "Moderate": 0.5959678783418261,
103
+ "Hard": 0.6127141026903381,
104
+ "Extreme": 0.461215101348602
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.8352274736404431,
108
+ "T2. Sequencing & Structure Reconstruction": 0.8427542827542828,
109
+ "T3. Evidence-Grounded QA": 0.6,
110
+ "T4. Summarization & Synthesis": 0.5511890444295781,
111
+ "T5. Attribution & Citation Alignment": 0.709250434569062,
112
+ "T6. Aggregation & Clustering": 0.5590519848415796,
113
+ "T7. Consistency & Compliance Checking": 0.4538880045271121,
114
+ "T8. Structured & Numeric Reasoning": 0.400462962962963,
115
+ "T9. Version & Code Diff Analysis": 0.7896296658102724,
116
+ "T10. Rule Induction & In-Context Learning": 0.5898611111111111,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.55
118
+ },
119
+ "language": {
120
+ "Chinese": 0.6018498726474579,
121
+ "English": 0.6122327578825001
122
+ }
123
+ },
124
+ "pass@2": 0.3486666666666667,
125
+ "BoN-3": {
126
+ "overall_metric": 0.6359674546066088,
127
+ "token_length": {
128
+ "8k": 0.6791089276748106,
129
+ "16k": 0.6344789904129863,
130
+ "32k": 0.6382754639404674,
131
+ "64k": 0.6155932927177382,
132
+ "128k": 0.6246025171745627,
133
+ "256k": 0.6237455357190941
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.5997037562420825,
137
+ "Partial": 0.6821212525251004
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.7810114340069705,
141
+ "Moderate": 0.6237269051463531,
142
+ "Hard": 0.6375808018524918,
143
+ "Extreme": 0.4840585077179298
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.8531556276526557,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8630080105080103,
148
+ "T3. Evidence-Grounded QA": 0.6416666666666667,
149
+ "T4. Summarization & Synthesis": 0.5557327849332047,
150
+ "T5. Attribution & Citation Alignment": 0.7262293044645985,
151
+ "T6. Aggregation & Clustering": 0.5966415249311195,
152
+ "T7. Consistency & Compliance Checking": 0.47966164058768634,
153
+ "T8. Structured & Numeric Reasoning": 0.4393518518518518,
154
+ "T9. Version & Code Diff Analysis": 0.7950682623015005,
155
+ "T10. Rule Induction & In-Context Learning": 0.6329166666666667,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6083333333333333
157
+ },
158
+ "language": {
159
+ "Chinese": 0.6245809400158543,
160
+ "English": 0.6473539691973664
161
+ }
162
+ },
163
+ "pass@3": 0.38
164
+ }
results/Claude-4-Sonnet/thinking_context-1000000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 3,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.6987364832054667,
9
+ "inference_iteration_1_overall_metric": 0.7019992434982991,
10
+ "inference_iteration_2_overall_metric": 0.6978899327024527,
11
+ "inference_iteration_3_overall_metric": 0.6963202734156487,
12
+ "average_token_length_metric": {
13
+ "8k": 0.7273068305229948,
14
+ "16k": 0.7148161402734813,
15
+ "32k": 0.7282156837997693,
16
+ "64k": 0.7051754330841736,
17
+ "128k": 0.6642984844940268,
18
+ "256k": 0.6526063270583587
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.6617044440872328,
22
+ "Partial": 0.7458681693559481
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.8377531760390221,
26
+ "Moderate": 0.7658446956684767,
27
+ "Hard": 0.7472224806628969,
28
+ "Extreme": 0.4705256363582413
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.907681462321177,
32
+ "T2. Sequencing & Structure Reconstruction": 0.8890326186159514,
33
+ "T3. Evidence-Grounded QA": 0.661111111111111,
34
+ "T4. Summarization & Synthesis": 0.5383660231848545,
35
+ "T5. Attribution & Citation Alignment": 0.7860152219301051,
36
+ "T6. Aggregation & Clustering": 0.6671470819716809,
37
+ "T7. Consistency & Compliance Checking": 0.5518199768375653,
38
+ "T8. Structured & Numeric Reasoning": 0.6859567901234568,
39
+ "T9. Version & Code Diff Analysis": 0.8575767924690506,
40
+ "T10. Rule Induction & In-Context Learning": 0.6481481481481483,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5888888888888888
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.6865315143381795,
45
+ "English": 0.7109414520727555
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.7019992434982991,
49
+ "token_length": {
50
+ "8k": 0.7253942984634015,
51
+ "16k": 0.7347686831241128,
52
+ "32k": 0.7405843026072749,
53
+ "64k": 0.6852109109698611,
54
+ "128k": 0.6670707753146399,
55
+ "256k": 0.6589664905105064
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.6664641511659741,
59
+ "Partial": 0.747225724648532
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.8574789158116471,
63
+ "Moderate": 0.7537775763318981,
64
+ "Hard": 0.7422849474268556,
65
+ "Extreme": 0.47120899649989867
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.9235851079801015,
69
+ "T2. Sequencing & Structure Reconstruction": 0.8866411828911827,
70
+ "T3. Evidence-Grounded QA": 0.7166666666666667,
71
+ "T4. Summarization & Synthesis": 0.5397488846450174,
72
+ "T5. Attribution & Citation Alignment": 0.7968571187727533,
73
+ "T6. Aggregation & Clustering": 0.6625413639156731,
74
+ "T7. Consistency & Compliance Checking": 0.5348173324914223,
75
+ "T8. Structured & Numeric Reasoning": 0.7013888888888888,
76
+ "T9. Version & Code Diff Analysis": 0.8605924270512658,
77
+ "T10. Rule Induction & In-Context Learning": 0.6277777777777778,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.575
79
+ },
80
+ "language": {
81
+ "Chinese": 0.7011338968631923,
82
+ "English": 0.7028645901334066
83
+ }
84
+ },
85
+ "pass@1": 0.4786666666666667,
86
+ "BoN-2": {
87
+ "overall_metric": 0.7662921811826703,
88
+ "token_length": {
89
+ "8k": 0.7864626566034867,
90
+ "16k": 0.7903224578099031,
91
+ "32k": 0.7829418518359753,
92
+ "64k": 0.776573418015718,
93
+ "128k": 0.7369784925906059,
94
+ "256k": 0.724474210240336
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.7387739445109663,
98
+ "Partial": 0.8013153914921128
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.9044185942106279,
102
+ "Moderate": 0.8489744455895216,
103
+ "Hard": 0.8300963355442819,
104
+ "Extreme": 0.5187660309473054
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.9337455622019767,
108
+ "T2. Sequencing & Structure Reconstruction": 0.920133061383061,
109
+ "T3. Evidence-Grounded QA": 0.8,
110
+ "T4. Summarization & Synthesis": 0.5512587507775879,
111
+ "T5. Attribution & Citation Alignment": 0.851460921546061,
112
+ "T6. Aggregation & Clustering": 0.7263341037175177,
113
+ "T7. Consistency & Compliance Checking": 0.6403373459035246,
114
+ "T8. Structured & Numeric Reasoning": 0.7763888888888889,
115
+ "T9. Version & Code Diff Analysis": 0.8952412388875778,
116
+ "T10. Rule Induction & In-Context Learning": 0.7372222222222222,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.675
118
+ },
119
+ "language": {
120
+ "Chinese": 0.7559027655426326,
121
+ "English": 0.7766815968227087
122
+ }
123
+ },
124
+ "pass@2": 0.5586666666666666,
125
+ "BoN-3": {
126
+ "overall_metric": 0.7907893209368455,
127
+ "token_length": {
128
+ "8k": 0.8018918816218435,
129
+ "16k": 0.8122398647423218,
130
+ "32k": 0.8052123235968958,
131
+ "64k": 0.8059913515132954,
132
+ "128k": 0.7658095463175019,
133
+ "256k": 0.7535909578292158
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.7602540929686584,
137
+ "Partial": 0.8296523383509014
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.9273787695935313,
141
+ "Moderate": 0.8740334338695857,
142
+ "Hard": 0.8656496547603084,
143
+ "Extreme": 0.5373159132889705
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.9422092659363656,
147
+ "T2. Sequencing & Structure Reconstruction": 0.9295053557553554,
148
+ "T3. Evidence-Grounded QA": 0.8416666666666667,
149
+ "T4. Summarization & Synthesis": 0.5573876341160701,
150
+ "T5. Attribution & Citation Alignment": 0.8621875560226953,
151
+ "T6. Aggregation & Clustering": 0.7544804518953127,
152
+ "T7. Consistency & Compliance Checking": 0.6657566567798248,
153
+ "T8. Structured & Numeric Reasoning": 0.8055555555555556,
154
+ "T9. Version & Code Diff Analysis": 0.9043321479784867,
155
+ "T10. Rule Induction & In-Context Learning": 0.7838888888888889,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.725
157
+ },
158
+ "language": {
159
+ "Chinese": 0.7791329047815275,
160
+ "English": 0.8024457370921636
161
+ }
162
+ },
163
+ "pass@3": 0.598
164
+ }
results/DeepSeek-R1-0528/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 1,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.61893761586453,
9
+ "inference_iteration_1_overall_metric": 0.6270481753191374,
10
+ "inference_iteration_2_overall_metric": 0.6115350419668117,
11
+ "inference_iteration_3_overall_metric": 0.6182296303076407,
12
+ "average_token_length_metric": {
13
+ "8k": 0.7165288754873151,
14
+ "16k": 0.6828199674990499,
15
+ "32k": 0.6181133860648209,
16
+ "64k": 0.6286866574208946,
17
+ "128k": 0.5812085902020846,
18
+ "256k": 0.48626821851301744
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.5840521215265608,
22
+ "Partial": 0.6633373359310379
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.8266775116602355,
26
+ "Moderate": 0.6653109531944901,
27
+ "Hard": 0.5367579918507135,
28
+ "Extreme": 0.41488622286022364
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.8218247532961501,
32
+ "T2. Sequencing & Structure Reconstruction": 0.8170893828393826,
33
+ "T3. Evidence-Grounded QA": 0.5638888888888887,
34
+ "T4. Summarization & Synthesis": 0.5505430321723027,
35
+ "T5. Attribution & Citation Alignment": 0.5766419140172199,
36
+ "T6. Aggregation & Clustering": 0.5593864809441226,
37
+ "T7. Consistency & Compliance Checking": 0.44029922393263027,
38
+ "T8. Structured & Numeric Reasoning": 0.6998456790123458,
39
+ "T9. Version & Code Diff Analysis": 0.7073888549627423,
40
+ "T10. Rule Induction & In-Context Learning": 0.613935185185185,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5361111111111111
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.6388577912805221,
45
+ "English": 0.5990174404485396
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.6270481753191374,
49
+ "token_length": {
50
+ "8k": 0.7085495640791158,
51
+ "16k": 0.6852891854029057,
52
+ "32k": 0.6523714114738722,
53
+ "64k": 0.6349276261566131,
54
+ "128k": 0.5990851877476868,
55
+ "256k": 0.48206607705463184
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.5954038691856903,
59
+ "Partial": 0.6673227467617081
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.842688379979359,
63
+ "Moderate": 0.6856872320332753,
64
+ "Hard": 0.5371806890772243,
65
+ "Extreme": 0.4113355332448204
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.8085658637642787,
69
+ "T2. Sequencing & Structure Reconstruction": 0.8118172105672106,
70
+ "T3. Evidence-Grounded QA": 0.6083333333333333,
71
+ "T4. Summarization & Synthesis": 0.5488292098363808,
72
+ "T5. Attribution & Citation Alignment": 0.5873852854613878,
73
+ "T6. Aggregation & Clustering": 0.5544843141923278,
74
+ "T7. Consistency & Compliance Checking": 0.4479896544786345,
75
+ "T8. Structured & Numeric Reasoning": 0.7194444444444444,
76
+ "T9. Version & Code Diff Analysis": 0.704738113297963,
77
+ "T10. Rule Induction & In-Context Learning": 0.6438888888888888,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666
79
+ },
80
+ "language": {
81
+ "Chinese": 0.6382004484945908,
82
+ "English": 0.6158959021436853
83
+ }
84
+ },
85
+ "pass@1": 0.39066666666666666,
86
+ "BoN-2": {
87
+ "overall_metric": 0.6908377344426165,
88
+ "token_length": {
89
+ "8k": 0.7595240418657973,
90
+ "16k": 0.7570136230136708,
91
+ "32k": 0.7096605299566262,
92
+ "64k": 0.7135170329369038,
93
+ "128k": 0.6539368768633997,
94
+ "256k": 0.5513743020193068
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.6581544826620869,
98
+ "Partial": 0.7324346003451124
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.900422588090864,
102
+ "Moderate": 0.7645982045756979,
103
+ "Hard": 0.6173528116491919,
104
+ "Extreme": 0.46106606935258343
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.8656746491642889,
108
+ "T2. Sequencing & Structure Reconstruction": 0.8850374162874161,
109
+ "T3. Evidence-Grounded QA": 0.6833333333333333,
110
+ "T4. Summarization & Synthesis": 0.5661734768963829,
111
+ "T5. Attribution & Citation Alignment": 0.6630436055268355,
112
+ "T6. Aggregation & Clustering": 0.6284207499424888,
113
+ "T7. Consistency & Compliance Checking": 0.520982919531347,
114
+ "T8. Structured & Numeric Reasoning": 0.7888888888888889,
115
+ "T9. Version & Code Diff Analysis": 0.7536592506692632,
116
+ "T10. Rule Induction & In-Context Learning": 0.7194444444444444,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5916666666666667
118
+ },
119
+ "language": {
120
+ "Chinese": 0.6996429460445758,
121
+ "English": 0.6820325228406605
122
+ }
123
+ },
124
+ "pass@2": 0.4533333333333333,
125
+ "BoN-3": {
126
+ "overall_metric": 0.7254771584689355,
127
+ "token_length": {
128
+ "8k": 0.7994316297691582,
129
+ "16k": 0.7937454228456009,
130
+ "32k": 0.7297465435731235,
131
+ "64k": 0.7326783552830499,
132
+ "128k": 0.6951047141031044,
133
+ "256k": 0.6021562852395752
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.6913631461546793,
137
+ "Partial": 0.7688949923234434
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.9337047416686951,
141
+ "Moderate": 0.8141591847806626,
142
+ "Hard": 0.6485103013588345,
143
+ "Extreme": 0.4896785698290395
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.8968239339375051,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8988072575572574,
148
+ "T3. Evidence-Grounded QA": 0.7583333333333333,
149
+ "T4. Summarization & Synthesis": 0.5734497230075248,
150
+ "T5. Attribution & Citation Alignment": 0.6928069332150858,
151
+ "T6. Aggregation & Clustering": 0.6670419649341216,
152
+ "T7. Consistency & Compliance Checking": 0.5624760870987693,
153
+ "T8. Structured & Numeric Reasoning": 0.8055555555555556,
154
+ "T9. Version & Code Diff Analysis": 0.7817439995394256,
155
+ "T10. Rule Induction & In-Context Learning": 0.7638888888888888,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.65
157
+ },
158
+ "language": {
159
+ "Chinese": 0.7321739274049238,
160
+ "English": 0.7187803895329472
161
+ }
162
+ },
163
+ "pass@3": 0.49666666666666665
164
+ }
results/DeepSeek-R1/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.6006714049681133,
9
+ "inference_iteration_1_overall_metric": 0.6007584621917721,
10
+ "inference_iteration_2_overall_metric": 0.5960043654782469,
11
+ "inference_iteration_3_overall_metric": 0.6052513872343173,
12
+ "average_token_length_metric": {
13
+ "8k": 0.6896237775198697,
14
+ "16k": 0.66847824761939,
15
+ "32k": 0.6242811862728697,
16
+ "64k": 0.5907117819226532,
17
+ "128k": 0.526720556197483,
18
+ "256k": 0.5042128802764103
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.5734808170096616,
22
+ "Partial": 0.6352776078243236
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.8244195631460464,
26
+ "Moderate": 0.5882837964508552,
27
+ "Hard": 0.5338546774181954,
28
+ "Extreme": 0.4075883160627708
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.8460279171139484,
32
+ "T2. Sequencing & Structure Reconstruction": 0.7927840387644306,
33
+ "T3. Evidence-Grounded QA": 0.5666666666666665,
34
+ "T4. Summarization & Synthesis": 0.5315482688091906,
35
+ "T5. Attribution & Citation Alignment": 0.46763122932017526,
36
+ "T6. Aggregation & Clustering": 0.5661396588973091,
37
+ "T7. Consistency & Compliance Checking": 0.4411785360364781,
38
+ "T8. Structured & Numeric Reasoning": 0.6290123456790124,
39
+ "T9. Version & Code Diff Analysis": 0.7118775193966861,
40
+ "T10. Rule Induction & In-Context Learning": 0.6290277777777776,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5083333333333333
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.5813030699136731,
45
+ "English": 0.6200397400225522
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.6007584621917721,
49
+ "token_length": {
50
+ "8k": 0.6794004597378533,
51
+ "16k": 0.6605152745514365,
52
+ "32k": 0.637696287010787,
53
+ "64k": 0.6015965809771497,
54
+ "128k": 0.5259184504039809,
55
+ "256k": 0.49942372046943184
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.5724096385120696,
59
+ "Partial": 0.6368387832386698
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.8239541273708798,
63
+ "Moderate": 0.5859117167110014,
64
+ "Hard": 0.541012801830159,
65
+ "Extreme": 0.40525140462840953
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.8346607474979665,
69
+ "T2. Sequencing & Structure Reconstruction": 0.7960157843246078,
70
+ "T3. Evidence-Grounded QA": 0.5916666666666667,
71
+ "T4. Summarization & Synthesis": 0.5314348105743746,
72
+ "T5. Attribution & Citation Alignment": 0.46439938615714244,
73
+ "T6. Aggregation & Clustering": 0.5590113115895492,
74
+ "T7. Consistency & Compliance Checking": 0.4443221207730568,
75
+ "T8. Structured & Numeric Reasoning": 0.612962962962963,
76
+ "T9. Version & Code Diff Analysis": 0.7031087891880523,
77
+ "T10. Rule Induction & In-Context Learning": 0.6470833333333333,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5166666666666667
79
+ },
80
+ "language": {
81
+ "Chinese": 0.5732559357352887,
82
+ "English": 0.6282609886482589
83
+ }
84
+ },
85
+ "pass@1": 0.3433333333333333,
86
+ "BoN-2": {
87
+ "overall_metric": 0.6591761776037405,
88
+ "token_length": {
89
+ "8k": 0.7392791599059808,
90
+ "16k": 0.7298238663653037,
91
+ "32k": 0.6777532203191601,
92
+ "64k": 0.6787515982515117,
93
+ "128k": 0.5821796500623371,
94
+ "256k": 0.5472695707181553
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.6319274531975012,
98
+ "Partial": 0.6938563723025926
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.8783150831551243,
102
+ "Moderate": 0.678502573566839,
103
+ "Hard": 0.5963536523109759,
104
+ "Extreme": 0.4476885160139848
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.8861689034562782,
108
+ "T2. Sequencing & Structure Reconstruction": 0.8374043195366726,
109
+ "T3. Evidence-Grounded QA": 0.625,
110
+ "T4. Summarization & Synthesis": 0.5455058096240588,
111
+ "T5. Attribution & Citation Alignment": 0.5369499475317325,
112
+ "T6. Aggregation & Clustering": 0.6381104251141014,
113
+ "T7. Consistency & Compliance Checking": 0.5019132623573087,
114
+ "T8. Structured & Numeric Reasoning": 0.699537037037037,
115
+ "T9. Version & Code Diff Analysis": 0.7605821531353517,
116
+ "T10. Rule Induction & In-Context Learning": 0.7220833333333334,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5666666666666667
118
+ },
119
+ "language": {
120
+ "Chinese": 0.6390711207410246,
121
+ "English": 0.6792812344664584
122
+ }
123
+ },
124
+ "pass@2": 0.4093333333333333,
125
+ "BoN-3": {
126
+ "overall_metric": 0.6982292810132508,
127
+ "token_length": {
128
+ "8k": 0.783866781971667,
129
+ "16k": 0.7689573555512769,
130
+ "32k": 0.7150318293064207,
131
+ "64k": 0.707631405403529,
132
+ "128k": 0.6291537694328186,
133
+ "256k": 0.5847345444137952
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.6747138288729008,
137
+ "Partial": 0.7281580382827888
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.9164006912153285,
141
+ "Moderate": 0.7293351521236777,
142
+ "Hard": 0.6332713918179128,
143
+ "Extreme": 0.48146703898856563
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.9025424539049985,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8813529526029528,
148
+ "T3. Evidence-Grounded QA": 0.6666666666666666,
149
+ "T4. Summarization & Synthesis": 0.5516760401143904,
150
+ "T5. Attribution & Citation Alignment": 0.5770877507616411,
151
+ "T6. Aggregation & Clustering": 0.6710035196738627,
152
+ "T7. Consistency & Compliance Checking": 0.5581333121650413,
153
+ "T8. Structured & Numeric Reasoning": 0.7560185185185184,
154
+ "T9. Version & Code Diff Analysis": 0.8001126786344129,
155
+ "T10. Rule Induction & In-Context Learning": 0.7456944444444444,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.625
157
+ },
158
+ "language": {
159
+ "Chinese": 0.6861725652789407,
160
+ "English": 0.7102859967475628
161
+ }
162
+ },
163
+ "pass@3": 0.45866666666666667
164
+ }
results/DeepSeek-V3-0324/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.5169762636111047,
9
+ "inference_iteration_1_overall_metric": 0.5181528065498966,
10
+ "inference_iteration_2_overall_metric": 0.5148683077997773,
11
+ "inference_iteration_3_overall_metric": 0.5179076764836414,
12
+ "average_token_length_metric": {
13
+ "8k": 0.5649502252093578,
14
+ "16k": 0.5347800008319371,
15
+ "32k": 0.5556420045489457,
16
+ "64k": 0.5214603320658495,
17
+ "128k": 0.4864319755387441,
18
+ "256k": 0.4385930434717982
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.477916384525216,
22
+ "Partial": 0.566688836993147
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.6526369307346341,
26
+ "Moderate": 0.4967551723461267,
27
+ "Hard": 0.48299903154456436,
28
+ "Extreme": 0.4039646133594433
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.7877679677233363,
32
+ "T2. Sequencing & Structure Reconstruction": 0.7655772360963239,
33
+ "T3. Evidence-Grounded QA": 0.5416666666666666,
34
+ "T4. Summarization & Synthesis": 0.5446412810076181,
35
+ "T5. Attribution & Citation Alignment": 0.5358172435200781,
36
+ "T6. Aggregation & Clustering": 0.4889882988114357,
37
+ "T7. Consistency & Compliance Checking": 0.3557205172395749,
38
+ "T8. Structured & Numeric Reasoning": 0.24367283950617283,
39
+ "T9. Version & Code Diff Analysis": 0.6358733797519817,
40
+ "T10. Rule Induction & In-Context Learning": 0.6043981481481482,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4138888888888889
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.5177833454262655,
45
+ "English": 0.5161691817959461
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.5181528065498966,
49
+ "token_length": {
50
+ "8k": 0.5625899513794126,
51
+ "16k": 0.5301053854931655,
52
+ "32k": 0.5507157770563107,
53
+ "64k": 0.5301772699202785,
54
+ "128k": 0.4898481902149404,
55
+ "256k": 0.44548026523527584
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.4761374399130544,
59
+ "Partial": 0.571626909542243
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.6469710156877221,
63
+ "Moderate": 0.5054097919629513,
64
+ "Hard": 0.48366558283696526,
65
+ "Extreme": 0.4080599930595185
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.7857412502114394,
69
+ "T2. Sequencing & Structure Reconstruction": 0.7680012781036988,
70
+ "T3. Evidence-Grounded QA": 0.5416666666666666,
71
+ "T4. Summarization & Synthesis": 0.5417591122015827,
72
+ "T5. Attribution & Citation Alignment": 0.5399240477985636,
73
+ "T6. Aggregation & Clustering": 0.4939990937730982,
74
+ "T7. Consistency & Compliance Checking": 0.3541605841917901,
75
+ "T8. Structured & Numeric Reasoning": 0.24212962962962964,
76
+ "T9. Version & Code Diff Analysis": 0.6325782099444311,
77
+ "T10. Rule Induction & In-Context Learning": 0.6151388888888889,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667
79
+ },
80
+ "language": {
81
+ "Chinese": 0.5160793653421631,
82
+ "English": 0.5202262477576317
83
+ }
84
+ },
85
+ "pass@1": 0.23666666666666666,
86
+ "BoN-2": {
87
+ "overall_metric": 0.5442270665213056,
88
+ "token_length": {
89
+ "8k": 0.594568981140541,
90
+ "16k": 0.5576674233487121,
91
+ "32k": 0.5818313481616337,
92
+ "64k": 0.5446955915354488,
93
+ "128k": 0.5245780781334816,
94
+ "256k": 0.4620209768080174
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.5071422110822568,
98
+ "Partial": 0.591425973443732
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.6783617155027842,
102
+ "Moderate": 0.5372286722751785,
103
+ "Hard": 0.5038179540985808,
104
+ "Extreme": 0.4284267679263639
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.8015300122926023,
108
+ "T2. Sequencing & Structure Reconstruction": 0.787322466174887,
109
+ "T3. Evidence-Grounded QA": 0.5666666666666667,
110
+ "T4. Summarization & Synthesis": 0.5591478963061529,
111
+ "T5. Attribution & Citation Alignment": 0.5635377708713467,
112
+ "T6. Aggregation & Clustering": 0.5151496972716835,
113
+ "T7. Consistency & Compliance Checking": 0.37875998396209754,
114
+ "T8. Structured & Numeric Reasoning": 0.29953703703703705,
115
+ "T9. Version & Code Diff Analysis": 0.6509912195762164,
116
+ "T10. Rule Induction & In-Context Learning": 0.6334722222222222,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.45
118
+ },
119
+ "language": {
120
+ "Chinese": 0.5466945351249929,
121
+ "English": 0.5417595979176187
122
+ }
123
+ },
124
+ "pass@2": 0.26066666666666666,
125
+ "BoN-3": {
126
+ "overall_metric": 0.5555999285609496,
127
+ "token_length": {
128
+ "8k": 0.6111433308773455,
129
+ "16k": 0.5701468581090299,
130
+ "32k": 0.5948383611006594,
131
+ "64k": 0.5601715262733609,
132
+ "128k": 0.5307382241597844,
133
+ "256k": 0.4665612708455233
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.5192835049000674,
137
+ "Partial": 0.6018208314020757
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.6899946609782567,
141
+ "Moderate": 0.55112703775723,
142
+ "Hard": 0.5166174713215149,
143
+ "Extreme": 0.4369188707412474
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.8147581778505403,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8027788153812361,
148
+ "T3. Evidence-Grounded QA": 0.5916666666666667,
149
+ "T4. Summarization & Synthesis": 0.5641241115145229,
150
+ "T5. Attribution & Citation Alignment": 0.5716268051520625,
151
+ "T6. Aggregation & Clustering": 0.5308177346459271,
152
+ "T7. Consistency & Compliance Checking": 0.3957991644610091,
153
+ "T8. Structured & Numeric Reasoning": 0.29953703703703705,
154
+ "T9. Version & Code Diff Analysis": 0.6606747373420039,
155
+ "T10. Rule Induction & In-Context Learning": 0.6418055555555555,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4583333333333333
157
+ },
158
+ "language": {
159
+ "Chinese": 0.5602606340750268,
160
+ "English": 0.550939223046875
161
+ }
162
+ },
163
+ "pass@3": 0.272
164
+ }
results/DeepSeek-V3-0324/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.5670800708470047,
9
+ "inference_iteration_1_overall_metric": 0.5592880863605422,
10
+ "inference_iteration_2_overall_metric": 0.5704394040472569,
11
+ "inference_iteration_3_overall_metric": 0.5715127221332137,
12
+ "average_token_length_metric": {
13
+ "8k": 0.6286936796561847,
14
+ "16k": 0.6309027535519853,
15
+ "32k": 0.5969011989319307,
16
+ "64k": 0.5427727165403452,
17
+ "128k": 0.5275644070173147,
18
+ "256k": 0.4756456693842662
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.5368141482212665,
22
+ "Partial": 0.605600336007035
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.7920271132046242,
26
+ "Moderate": 0.5713738824882528,
27
+ "Hard": 0.4620098327210685,
28
+ "Extreme": 0.3868526000236004
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.8062267128065881,
32
+ "T2. Sequencing & Structure Reconstruction": 0.7434196920363564,
33
+ "T3. Evidence-Grounded QA": 0.5111111111111112,
34
+ "T4. Summarization & Synthesis": 0.5146898974811284,
35
+ "T5. Attribution & Citation Alignment": 0.537979547816877,
36
+ "T6. Aggregation & Clustering": 0.5248106585111237,
37
+ "T7. Consistency & Compliance Checking": 0.3681804918962003,
38
+ "T8. Structured & Numeric Reasoning": 0.5841049382716048,
39
+ "T9. Version & Code Diff Analysis": 0.6466057172430281,
40
+ "T10. Rule Induction & In-Context Learning": 0.6183796296296297,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4944444444444444
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.5527161660546936,
45
+ "English": 0.5814439756393159
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.5592880863605422,
49
+ "token_length": {
50
+ "8k": 0.6182266355968743,
51
+ "16k": 0.6224911215102044,
52
+ "32k": 0.6006190818475612,
53
+ "64k": 0.5386968289399401,
54
+ "128k": 0.5299641908835836,
55
+ "256k": 0.4457306593850912
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.530042096148415,
59
+ "Partial": 0.5965102557214322
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.7861925110638937,
63
+ "Moderate": 0.563744851626512,
64
+ "Hard": 0.4535437025457247,
65
+ "Extreme": 0.377252152391456
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.7951398867629517,
69
+ "T2. Sequencing & Structure Reconstruction": 0.751023606023606,
70
+ "T3. Evidence-Grounded QA": 0.4666666666666667,
71
+ "T4. Summarization & Synthesis": 0.5136736975702932,
72
+ "T5. Attribution & Citation Alignment": 0.510566852786565,
73
+ "T6. Aggregation & Clustering": 0.5116905358775884,
74
+ "T7. Consistency & Compliance Checking": 0.37914696318940605,
75
+ "T8. Structured & Numeric Reasoning": 0.5847222222222223,
76
+ "T9. Version & Code Diff Analysis": 0.6690241210962063,
77
+ "T10. Rule Induction & In-Context Learning": 0.5966666666666666,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.475
79
+ },
80
+ "language": {
81
+ "Chinese": 0.5492580351840802,
82
+ "English": 0.569318137537005
83
+ }
84
+ },
85
+ "pass@1": 0.30666666666666664,
86
+ "BoN-2": {
87
+ "overall_metric": 0.6262109782348795,
88
+ "token_length": {
89
+ "8k": 0.6883768372773764,
90
+ "16k": 0.7011104482454619,
91
+ "32k": 0.647383679165818,
92
+ "64k": 0.6092322863406843,
93
+ "128k": 0.5965297187489229,
94
+ "256k": 0.5146328996310173
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.5949295492853987,
98
+ "Partial": 0.6660237059887659
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.8509257637159784,
102
+ "Moderate": 0.66205670168914,
103
+ "Hard": 0.5144444684176784,
104
+ "Extreme": 0.42991229790988206
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.8583478908910738,
108
+ "T2. Sequencing & Structure Reconstruction": 0.7814144189144188,
109
+ "T3. Evidence-Grounded QA": 0.5916666666666667,
110
+ "T4. Summarization & Synthesis": 0.5277169061289491,
111
+ "T5. Attribution & Citation Alignment": 0.5960013921722541,
112
+ "T6. Aggregation & Clustering": 0.5992683212004715,
113
+ "T7. Consistency & Compliance Checking": 0.4404830107556436,
114
+ "T8. Structured & Numeric Reasoning": 0.6166666666666667,
115
+ "T9. Version & Code Diff Analysis": 0.7117518441173546,
116
+ "T10. Rule Induction & In-Context Learning": 0.7094444444444444,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5666666666666667
118
+ },
119
+ "language": {
120
+ "Chinese": 0.6172660802474611,
121
+ "English": 0.6351558762222995
122
+ }
123
+ },
124
+ "pass@2": 0.37133333333333335,
125
+ "BoN-3": {
126
+ "overall_metric": 0.656766091226133,
127
+ "token_length": {
128
+ "8k": 0.7136143676151803,
129
+ "16k": 0.723535463035206,
130
+ "32k": 0.676630607848943,
131
+ "64k": 0.6371662004749992,
132
+ "128k": 0.6230203333837121,
133
+ "256k": 0.5666295749987625
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.6274188340913682,
137
+ "Partial": 0.6941171457612914
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.8771215787680936,
141
+ "Moderate": 0.7073341252772278,
142
+ "Hard": 0.5445129149531545,
143
+ "Extreme": 0.4558925937418165
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.8681438231282913,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8017822455322453,
148
+ "T3. Evidence-Grounded QA": 0.625,
149
+ "T4. Summarization & Synthesis": 0.5352432221044823,
150
+ "T5. Attribution & Citation Alignment": 0.6492191718976088,
151
+ "T6. Aggregation & Clustering": 0.6200836170157672,
152
+ "T7. Consistency & Compliance Checking": 0.4675370424175185,
153
+ "T8. Structured & Numeric Reasoning": 0.6592592592592593,
154
+ "T9. Version & Code Diff Analysis": 0.7376455774030053,
155
+ "T10. Rule Induction & In-Context Learning": 0.7722222222222221,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6
157
+ },
158
+ "language": {
159
+ "Chinese": 0.6466196520294557,
160
+ "English": 0.6669125304228131
161
+ }
162
+ },
163
+ "pass@3": 0.39866666666666667
164
+ }
results/DeepSeek-V3.1/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.513858634133048,
9
+ "inference_iteration_1_overall_metric": 0.5123343209652136,
10
+ "inference_iteration_2_overall_metric": 0.5169477472023125,
11
+ "inference_iteration_3_overall_metric": 0.5122938342316177,
12
+ "average_token_length_metric": {
13
+ "8k": 0.5798800160519532,
14
+ "16k": 0.557162234839459,
15
+ "32k": 0.5231647768475723,
16
+ "64k": 0.5020895430155518,
17
+ "128k": 0.47482295470763564,
18
+ "256k": 0.44603227933611866
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.4799762392602454,
22
+ "Partial": 0.556981682152979
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.6361477184952151,
26
+ "Moderate": 0.4879546435756716,
27
+ "Hard": 0.4929175841249406,
28
+ "Extreme": 0.4106651751804595
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.7899439606505894,
32
+ "T2. Sequencing & Structure Reconstruction": 0.7633837790575241,
33
+ "T3. Evidence-Grounded QA": 0.5472222222222223,
34
+ "T4. Summarization & Synthesis": 0.5504457320532966,
35
+ "T5. Attribution & Citation Alignment": 0.5405950417654154,
36
+ "T6. Aggregation & Clustering": 0.4826886948879801,
37
+ "T7. Consistency & Compliance Checking": 0.3782532668616311,
38
+ "T8. Structured & Numeric Reasoning": 0.20864197530864195,
39
+ "T9. Version & Code Diff Analysis": 0.6426366556970474,
40
+ "T10. Rule Induction & In-Context Learning": 0.5235185185185185,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4611111111111112
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.5242036714829956,
45
+ "English": 0.5035135967831006
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.5123343209652136,
49
+ "token_length": {
50
+ "8k": 0.5775546994411811,
51
+ "16k": 0.5639037168302903,
52
+ "32k": 0.5253784942631851,
53
+ "64k": 0.4921258363031359,
54
+ "128k": 0.46961700213634144,
55
+ "256k": 0.4454261768171497
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.4805828046453364,
59
+ "Partial": 0.5527453417359676
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.6289501731523275,
63
+ "Moderate": 0.48863452317355005,
64
+ "Hard": 0.49859679584956984,
65
+ "Extreme": 0.4091764699789028
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.7832093443146274,
69
+ "T2. Sequencing & Structure Reconstruction": 0.758390884975293,
70
+ "T3. Evidence-Grounded QA": 0.525,
71
+ "T4. Summarization & Synthesis": 0.5526444826651147,
72
+ "T5. Attribution & Citation Alignment": 0.550721576382896,
73
+ "T6. Aggregation & Clustering": 0.49314760273206115,
74
+ "T7. Consistency & Compliance Checking": 0.36201890631349554,
75
+ "T8. Structured & Numeric Reasoning": 0.19351851851851853,
76
+ "T9. Version & Code Diff Analysis": 0.6396574046033499,
77
+ "T10. Rule Induction & In-Context Learning": 0.5548611111111109,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4666666666666667
79
+ },
80
+ "language": {
81
+ "Chinese": 0.5269577477981102,
82
+ "English": 0.49771089413231795
83
+ }
84
+ },
85
+ "pass@1": 0.24,
86
+ "BoN-2": {
87
+ "overall_metric": 0.5575639036526701,
88
+ "token_length": {
89
+ "8k": 0.6295556911757817,
90
+ "16k": 0.600201088330424,
91
+ "32k": 0.568079050385207,
92
+ "64k": 0.540874671214598,
93
+ "128k": 0.5142278846286528,
94
+ "256k": 0.4924450361813656
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.5270669681600962,
98
+ "Partial": 0.596378185188677
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.6861712816104073,
102
+ "Moderate": 0.5385181729697447,
103
+ "Hard": 0.5320539370618487,
104
+ "Extreme": 0.4459453589628649
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.7998040423032986,
108
+ "T2. Sequencing & Structure Reconstruction": 0.7974421612945821,
109
+ "T3. Evidence-Grounded QA": 0.6083333333333333,
110
+ "T4. Summarization & Synthesis": 0.5644516725623473,
111
+ "T5. Attribution & Citation Alignment": 0.603125221836478,
112
+ "T6. Aggregation & Clustering": 0.5375390883957035,
113
+ "T7. Consistency & Compliance Checking": 0.42616915873421196,
114
+ "T8. Structured & Numeric Reasoning": 0.2569444444444444,
115
+ "T9. Version & Code Diff Analysis": 0.6918022158557025,
116
+ "T10. Rule Induction & In-Context Learning": 0.5736111111111112,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5
118
+ },
119
+ "language": {
120
+ "Chinese": 0.5701747476373757,
121
+ "English": 0.5449530596679679
122
+ }
123
+ },
124
+ "pass@2": 0.278,
125
+ "BoN-3": {
126
+ "overall_metric": 0.5783189478755758,
127
+ "token_length": {
128
+ "8k": 0.6444603085006093,
129
+ "16k": 0.6168368410260194,
130
+ "32k": 0.591356956659339,
131
+ "64k": 0.5741881409846333,
132
+ "128k": 0.5329008044819139,
133
+ "256k": 0.5101706356009489
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.5453502508711524,
137
+ "Partial": 0.6202791076993917
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.700691507162887,
141
+ "Moderate": 0.5586892040625308,
142
+ "Hard": 0.5577221017016739,
143
+ "Extreme": 0.47068692726136246
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.8323002597882068,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8148032724056932,
148
+ "T3. Evidence-Grounded QA": 0.625,
149
+ "T4. Summarization & Synthesis": 0.5726119582175254,
150
+ "T5. Attribution & Citation Alignment": 0.625566004790705,
151
+ "T6. Aggregation & Clustering": 0.5576354028999125,
152
+ "T7. Consistency & Compliance Checking": 0.4599095603777464,
153
+ "T8. Structured & Numeric Reasoning": 0.2736111111111111,
154
+ "T9. Version & Code Diff Analysis": 0.7066934638816516,
155
+ "T10. Rule Induction & In-Context Learning": 0.5819444444444444,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333333
157
+ },
158
+ "language": {
159
+ "Chinese": 0.5882921354915299,
160
+ "English": 0.5683457602596252
161
+ }
162
+ },
163
+ "pass@3": 0.3
164
+ }
results/DeepSeek-V3.1/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 8,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.6621817899708398,
9
+ "inference_iteration_1_overall_metric": 0.6612230154154042,
10
+ "inference_iteration_2_overall_metric": 0.6610111426397741,
11
+ "inference_iteration_3_overall_metric": 0.6643112118573413,
12
+ "average_token_length_metric": {
13
+ "8k": 0.7494820895775017,
14
+ "16k": 0.7158886748078707,
15
+ "32k": 0.668616684861116,
16
+ "64k": 0.7028333128738413,
17
+ "128k": 0.6150251691532579,
18
+ "256k": 0.5212448085514543
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.6306995662884327,
22
+ "Partial": 0.7022500746575411
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.8572162949660228,
26
+ "Moderate": 0.7353266184513482,
27
+ "Hard": 0.622190936275892,
28
+ "Extreme": 0.4267542215146938
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.8547667520039813,
32
+ "T2. Sequencing & Structure Reconstruction": 0.8502990373823708,
33
+ "T3. Evidence-Grounded QA": 0.5944444444444446,
34
+ "T4. Summarization & Synthesis": 0.5592502973941748,
35
+ "T5. Attribution & Citation Alignment": 0.664951753589773,
36
+ "T6. Aggregation & Clustering": 0.6143401320362227,
37
+ "T7. Consistency & Compliance Checking": 0.5020434872004602,
38
+ "T8. Structured & Numeric Reasoning": 0.7200617283950619,
39
+ "T9. Version & Code Diff Analysis": 0.7327346609657337,
40
+ "T10. Rule Induction & In-Context Learning": 0.6883796296296296,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5777777777777778
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.6626168849921547,
45
+ "English": 0.6617466949495263
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.6612230154154042,
49
+ "token_length": {
50
+ "8k": 0.7464911564030304,
51
+ "16k": 0.7250299866543051,
52
+ "32k": 0.658322634935698,
53
+ "64k": 0.7169507057254954,
54
+ "128k": 0.6020278750216188,
55
+ "256k": 0.5185157337522811
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.6306907002796491,
59
+ "Partial": 0.7000823255881854
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.8474131666511834,
63
+ "Moderate": 0.7235563816620875,
64
+ "Hard": 0.6392066996365914,
65
+ "Extreme": 0.4307791961407244
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.858023793743841,
69
+ "T2. Sequencing & Structure Reconstruction": 0.8565782365782365,
70
+ "T3. Evidence-Grounded QA": 0.6083333333333333,
71
+ "T4. Summarization & Synthesis": 0.5579680390729822,
72
+ "T5. Attribution & Citation Alignment": 0.6494356501600668,
73
+ "T6. Aggregation & Clustering": 0.6299150042042198,
74
+ "T7. Consistency & Compliance Checking": 0.5158224119304962,
75
+ "T8. Structured & Numeric Reasoning": 0.6930555555555555,
76
+ "T9. Version & Code Diff Analysis": 0.7300925156020261,
77
+ "T10. Rule Induction & In-Context Learning": 0.6966666666666667,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.55
79
+ },
80
+ "language": {
81
+ "Chinese": 0.6664246885361451,
82
+ "English": 0.6560213422946652
83
+ }
84
+ },
85
+ "pass@1": 0.44533333333333336,
86
+ "BoN-2": {
87
+ "overall_metric": 0.7247597749822192,
88
+ "token_length": {
89
+ "8k": 0.7863113902541472,
90
+ "16k": 0.7674558325218811,
91
+ "32k": 0.7281178415241295,
92
+ "64k": 0.7821445768686631,
93
+ "128k": 0.6970380441541918,
94
+ "256k": 0.5874909645703058
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.6960065402743163,
98
+ "Partial": 0.7613548009740968
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.9116793007094617,
102
+ "Moderate": 0.8131496116751571,
103
+ "Hard": 0.7011503473822069,
104
+ "Extreme": 0.47744898037225214
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.8830939332648011,
108
+ "T2. Sequencing & Structure Reconstruction": 0.8806919931919931,
109
+ "T3. Evidence-Grounded QA": 0.6916666666666667,
110
+ "T4. Summarization & Synthesis": 0.5727671170579177,
111
+ "T5. Attribution & Citation Alignment": 0.7446276254629812,
112
+ "T6. Aggregation & Clustering": 0.683646617964251,
113
+ "T7. Consistency & Compliance Checking": 0.5837976903893948,
114
+ "T8. Structured & Numeric Reasoning": 0.7902777777777777,
115
+ "T9. Version & Code Diff Analysis": 0.7892333891029187,
116
+ "T10. Rule Induction & In-Context Learning": 0.7691666666666667,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6416666666666667
118
+ },
119
+ "language": {
120
+ "Chinese": 0.7206824886096936,
121
+ "English": 0.7288370613547459
122
+ }
123
+ },
124
+ "pass@2": 0.5166666666666667,
125
+ "BoN-3": {
126
+ "overall_metric": 0.7535692341250043,
127
+ "token_length": {
128
+ "8k": 0.8201462848118084,
129
+ "16k": 0.7948417017172265,
130
+ "32k": 0.7537778254297324,
131
+ "64k": 0.8032112869269382,
132
+ "128k": 0.7298931033645517,
133
+ "256k": 0.6195452024997666
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.726098496655499,
137
+ "Partial": 0.7885319909043751
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.9344410438349966,
141
+ "Moderate": 0.844684360754373,
142
+ "Hard": 0.7404763342771397,
143
+ "Extreme": 0.5041859708975693
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.9022474827372764,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8967201779701777,
148
+ "T3. Evidence-Grounded QA": 0.7333333333333333,
149
+ "T4. Summarization & Synthesis": 0.5812947594742044,
150
+ "T5. Attribution & Citation Alignment": 0.7893345155800883,
151
+ "T6. Aggregation & Clustering": 0.7043090297107774,
152
+ "T7. Consistency & Compliance Checking": 0.6227060379432763,
153
+ "T8. Structured & Numeric Reasoning": 0.8129629629629629,
154
+ "T9. Version & Code Diff Analysis": 0.8129681115419459,
155
+ "T10. Rule Induction & In-Context Learning": 0.79375,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.7
157
+ },
158
+ "language": {
159
+ "Chinese": 0.7447219230644647,
160
+ "English": 0.7624165451855438
161
+ }
162
+ },
163
+ "pass@3": 0.552
164
+ }
results/DeepSeek-V3.2/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.5167049903246114,
9
+ "inference_iteration_1_overall_metric": 0.5175993160365915,
10
+ "inference_iteration_2_overall_metric": 0.5135807596157895,
11
+ "inference_iteration_3_overall_metric": 0.5189348953214519,
12
+ "average_token_length_metric": {
13
+ "8k": 0.5691616296699119,
14
+ "16k": 0.5676134549372556,
15
+ "32k": 0.5289760437098003,
16
+ "64k": 0.5015696259811485,
17
+ "128k": 0.4857001095282947,
18
+ "256k": 0.44720907812125815
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.48158748704864085,
22
+ "Partial": 0.5613999944940294
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.6212240545937193,
26
+ "Moderate": 0.5136346834318135,
27
+ "Hard": 0.5163049021668649,
28
+ "Extreme": 0.4044885248516518
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.7713866456415331,
32
+ "T2. Sequencing & Structure Reconstruction": 0.762687420604087,
33
+ "T3. Evidence-Grounded QA": 0.5333333333333333,
34
+ "T4. Summarization & Synthesis": 0.5515992544844097,
35
+ "T5. Attribution & Citation Alignment": 0.5944535310423185,
36
+ "T6. Aggregation & Clustering": 0.4789878465188747,
37
+ "T7. Consistency & Compliance Checking": 0.39465891182344254,
38
+ "T8. Structured & Numeric Reasoning": 0.2149691358024691,
39
+ "T9. Version & Code Diff Analysis": 0.6451135379199672,
40
+ "T10. Rule Induction & In-Context Learning": 0.49787037037037035,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.46944444444444444
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.5273254322066815,
45
+ "English": 0.5060845484425422
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.5175993160365915,
49
+ "token_length": {
50
+ "8k": 0.5773249164903291,
51
+ "16k": 0.5604627326667504,
52
+ "32k": 0.5374659261246943,
53
+ "64k": 0.5074991434398594,
54
+ "128k": 0.4702901190904467,
55
+ "256k": 0.45255305840747634
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.4801143243240927,
59
+ "Partial": 0.5653074873070485
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.6321632804118208,
63
+ "Moderate": 0.5036215268809261,
64
+ "Hard": 0.5164886663122836,
65
+ "Extreme": 0.40201006150807705
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.7671950408606149,
69
+ "T2. Sequencing & Structure Reconstruction": 0.7640499777999779,
70
+ "T3. Evidence-Grounded QA": 0.5833333333333334,
71
+ "T4. Summarization & Synthesis": 0.5488326711816013,
72
+ "T5. Attribution & Citation Alignment": 0.58422686569879,
73
+ "T6. Aggregation & Clustering": 0.47446486157270457,
74
+ "T7. Consistency & Compliance Checking": 0.3846286380881271,
75
+ "T8. Structured & Numeric Reasoning": 0.2226851851851852,
76
+ "T9. Version & Code Diff Analysis": 0.6587133120918426,
77
+ "T10. Rule Induction & In-Context Learning": 0.5076388888888889,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.43333333333333335
79
+ },
80
+ "language": {
81
+ "Chinese": 0.5202198683535451,
82
+ "English": 0.5149787637196416
83
+ }
84
+ },
85
+ "pass@1": 0.24533333333333332,
86
+ "BoN-2": {
87
+ "overall_metric": 0.5858993921620037,
88
+ "token_length": {
89
+ "8k": 0.6332517244583726,
90
+ "16k": 0.6310887297252004,
91
+ "32k": 0.6114096243459353,
92
+ "64k": 0.5723573466459502,
93
+ "128k": 0.5523069746695444,
94
+ "256k": 0.5149819531270248
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.5499254609160144,
98
+ "Partial": 0.6316843955659928
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.6997370485006186,
102
+ "Moderate": 0.578931807315116,
103
+ "Hard": 0.59328992133015,
104
+ "Extreme": 0.46091761656187924
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.8076556299673503,
108
+ "T2. Sequencing & Structure Reconstruction": 0.8115853128353127,
109
+ "T3. Evidence-Grounded QA": 0.65,
110
+ "T4. Summarization & Synthesis": 0.5673030178914519,
111
+ "T5. Attribution & Citation Alignment": 0.6729248677257385,
112
+ "T6. Aggregation & Clustering": 0.5463889541830715,
113
+ "T7. Consistency & Compliance Checking": 0.4736347042901523,
114
+ "T8. Structured & Numeric Reasoning": 0.2773148148148148,
115
+ "T9. Version & Code Diff Analysis": 0.7103491970064771,
116
+ "T10. Rule Induction & In-Context Learning": 0.5745833333333332,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5833333333333334
118
+ },
119
+ "language": {
120
+ "Chinese": 0.5980713707246905,
121
+ "English": 0.5737274135993191
122
+ }
123
+ },
124
+ "pass@2": 0.312,
125
+ "BoN-3": {
126
+ "overall_metric": 0.6250975643039252,
127
+ "token_length": {
128
+ "8k": 0.6624896013016575,
129
+ "16k": 0.667371637798531,
130
+ "32k": 0.6458484887458542,
131
+ "64k": 0.619893995338142,
132
+ "128k": 0.5995137452171235,
133
+ "256k": 0.5554679174222451
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.5850937206249248,
137
+ "Partial": 0.6760115471681097
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.7532527253719965,
141
+ "Moderate": 0.6215376875723243,
142
+ "Hard": 0.6270141217985696,
143
+ "Extreme": 0.48578877254181324
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.8312391426962816,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8401638639138637,
148
+ "T3. Evidence-Grounded QA": 0.725,
149
+ "T4. Summarization & Synthesis": 0.5737811360033489,
150
+ "T5. Attribution & Citation Alignment": 0.7032234702446885,
151
+ "T6. Aggregation & Clustering": 0.5739851542792719,
152
+ "T7. Consistency & Compliance Checking": 0.5085181572307016,
153
+ "T8. Structured & Numeric Reasoning": 0.33425925925925926,
154
+ "T9. Version & Code Diff Analysis": 0.7396125292314827,
155
+ "T10. Rule Induction & In-Context Learning": 0.6588888888888889,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6166666666666667
157
+ },
158
+ "language": {
159
+ "Chinese": 0.6402707582944462,
160
+ "English": 0.6099243703134061
161
+ }
162
+ },
163
+ "pass@3": 0.3526666666666667
164
+ }
results/DeepSeek-V3.2/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.6782077426413915,
9
+ "inference_iteration_1_overall_metric": 0.671629754030229,
10
+ "inference_iteration_2_overall_metric": 0.6777556491690084,
11
+ "inference_iteration_3_overall_metric": 0.6852378247249357,
12
+ "average_token_length_metric": {
13
+ "8k": 0.755369154280727,
14
+ "16k": 0.7449467265987637,
15
+ "32k": 0.6953336880653428,
16
+ "64k": 0.6946800210314833,
17
+ "128k": 0.6477035080898761,
18
+ "256k": 0.5312133577821595
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.6459619783148297,
22
+ "Partial": 0.7192478063297452
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.8502179380964533,
26
+ "Moderate": 0.7507860067400632,
27
+ "Hard": 0.6772551692268365,
28
+ "Extreme": 0.4427333362390087
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.8628300416379104,
32
+ "T2. Sequencing & Structure Reconstruction": 0.8633894500561163,
33
+ "T3. Evidence-Grounded QA": 0.6277777777777778,
34
+ "T4. Summarization & Synthesis": 0.5645627813985595,
35
+ "T5. Attribution & Citation Alignment": 0.7367830500533472,
36
+ "T6. Aggregation & Clustering": 0.6168551563610202,
37
+ "T7. Consistency & Compliance Checking": 0.5431477714039084,
38
+ "T8. Structured & Numeric Reasoning": 0.6640432098765434,
39
+ "T9. Version & Code Diff Analysis": 0.7821104015574073,
40
+ "T10. Rule Induction & In-Context Learning": 0.6818518518518517,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.622222222222222
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.6775197474946019,
45
+ "English": 0.6788957377881832
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.671629754030229,
49
+ "token_length": {
50
+ "8k": 0.7397920433374541,
51
+ "16k": 0.7269924173975423,
52
+ "32k": 0.7007145536231846,
53
+ "64k": 0.6696695962094932,
54
+ "128k": 0.655131428243527,
55
+ "256k": 0.5374784853701818
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.6384885103680042,
59
+ "Partial": 0.7138095186912471
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.8524722619644284,
63
+ "Moderate": 0.7391126766592921,
64
+ "Hard": 0.6576844523580323,
65
+ "Extreme": 0.4383605238465565
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.8608914983834914,
69
+ "T2. Sequencing & Structure Reconstruction": 0.8649913049913043,
70
+ "T3. Evidence-Grounded QA": 0.6166666666666667,
71
+ "T4. Summarization & Synthesis": 0.5629979913624931,
72
+ "T5. Attribution & Citation Alignment": 0.727192234350903,
73
+ "T6. Aggregation & Clustering": 0.6142105299342737,
74
+ "T7. Consistency & Compliance Checking": 0.5448247044942024,
75
+ "T8. Structured & Numeric Reasoning": 0.6222222222222223,
76
+ "T9. Version & Code Diff Analysis": 0.7868571557580843,
77
+ "T10. Rule Induction & In-Context Learning": 0.7038888888888889,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6
79
+ },
80
+ "language": {
81
+ "Chinese": 0.6695591460858414,
82
+ "English": 0.6737003619746216
83
+ }
84
+ },
85
+ "pass@1": 0.442,
86
+ "BoN-2": {
87
+ "overall_metric": 0.739062545668539,
88
+ "token_length": {
89
+ "8k": 0.7985986496818439,
90
+ "16k": 0.8081758798304621,
91
+ "32k": 0.7769565912085228,
92
+ "64k": 0.7312553059908137,
93
+ "128k": 0.7215196949254835,
94
+ "256k": 0.5978691523741125
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.7083303283149,
98
+ "Partial": 0.7781762768459
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.9085052195225997,
102
+ "Moderate": 0.8289156864861236,
103
+ "Hard": 0.7590532764337315,
104
+ "Extreme": 0.4812983463842697
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.8977317620746387,
108
+ "T2. Sequencing & Structure Reconstruction": 0.9110066322566314,
109
+ "T3. Evidence-Grounded QA": 0.725,
110
+ "T4. Summarization & Synthesis": 0.5777725397440469,
111
+ "T5. Attribution & Citation Alignment": 0.807338444836897,
112
+ "T6. Aggregation & Clustering": 0.6747522573464864,
113
+ "T7. Consistency & Compliance Checking": 0.6096717826867489,
114
+ "T8. Structured & Numeric Reasoning": 0.7398148148148148,
115
+ "T9. Version & Code Diff Analysis": 0.8172408263391231,
116
+ "T10. Rule Induction & In-Context Learning": 0.7575,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.7083333333333334
118
+ },
119
+ "language": {
120
+ "Chinese": 0.7379779669884229,
121
+ "English": 0.7401471243486567
122
+ }
123
+ },
124
+ "pass@2": 0.5286666666666666,
125
+ "BoN-3": {
126
+ "overall_metric": 0.769484517884144,
127
+ "token_length": {
128
+ "8k": 0.8183989834849181,
129
+ "16k": 0.8287125912826261,
130
+ "32k": 0.8000120047613876,
131
+ "64k": 0.7929440904260506,
132
+ "128k": 0.761968193898885,
133
+ "256k": 0.6148712434509996
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.7396412839189731,
137
+ "Partial": 0.8074668156579995
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.9315740410553816,
141
+ "Moderate": 0.8685767211690967,
142
+ "Hard": 0.8013938214601453,
143
+ "Extreme": 0.5058786414037999
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.904701887970005,
147
+ "T2. Sequencing & Structure Reconstruction": 0.9204745254745246,
148
+ "T3. Evidence-Grounded QA": 0.7583333333333333,
149
+ "T4. Summarization & Synthesis": 0.5854934073644178,
150
+ "T5. Attribution & Citation Alignment": 0.8343370156947629,
151
+ "T6. Aggregation & Clustering": 0.6936792642304824,
152
+ "T7. Consistency & Compliance Checking": 0.6489698568119598,
153
+ "T8. Structured & Numeric Reasoning": 0.789814814814815,
154
+ "T9. Version & Code Diff Analysis": 0.8323537332622081,
155
+ "T10. Rule Induction & In-Context Learning": 0.8091666666666666,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.775
157
+ },
158
+ "language": {
159
+ "Chinese": 0.76992326145198,
160
+ "English": 0.7690457743163093
161
+ }
162
+ },
163
+ "pass@3": 0.572
164
+ }
results/GLM-4.5/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.43035083788419254,
9
+ "inference_iteration_1_overall_metric": 0.4323298899239496,
10
+ "inference_iteration_2_overall_metric": 0.42711968234411496,
11
+ "inference_iteration_3_overall_metric": 0.43160294138451205,
12
+ "average_token_length_metric": {
13
+ "8k": 0.5314683958937002,
14
+ "16k": 0.49409349830535854,
15
+ "32k": 0.5016963643416883,
16
+ "64k": 0.41773091498134723,
17
+ "128k": 0.3415048455402667,
18
+ "256k": 0.2956110082427921
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.40615033864921796,
22
+ "Partial": 0.46115147327415856
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.5567819400237102,
26
+ "Moderate": 0.36920960557839383,
27
+ "Hard": 0.4020802519560651,
28
+ "Extreme": 0.3505786202440932
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.698931526590424,
32
+ "T2. Sequencing & Structure Reconstruction": 0.6796812769380679,
33
+ "T3. Evidence-Grounded QA": 0.4333333333333333,
34
+ "T4. Summarization & Synthesis": 0.5382688702946171,
35
+ "T5. Attribution & Citation Alignment": 0.46682998936207576,
36
+ "T6. Aggregation & Clustering": 0.3861931118799582,
37
+ "T7. Consistency & Compliance Checking": 0.2699805385521367,
38
+ "T8. Structured & Numeric Reasoning": 0.19089506172839513,
39
+ "T9. Version & Code Diff Analysis": 0.5555800013857407,
40
+ "T10. Rule Induction & In-Context Learning": 0.4250462962962961,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3111111111111111
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.4302365847397278,
45
+ "English": 0.4304650910286559
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.4323298899239496,
49
+ "token_length": {
50
+ "8k": 0.5493363038285968,
51
+ "16k": 0.5003136640414932,
52
+ "32k": 0.48767860893851595,
53
+ "64k": 0.427526424572054,
54
+ "128k": 0.3355553710370271,
55
+ "256k": 0.29356896712600833
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.4069098702566493,
59
+ "Partial": 0.4646826422277853
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.549387464064302,
63
+ "Moderate": 0.38289221280334496,
64
+ "Hard": 0.4057019361680662,
65
+ "Extreme": 0.35405992762316457
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.6989719229147292,
69
+ "T2. Sequencing & Structure Reconstruction": 0.6906849631849629,
70
+ "T3. Evidence-Grounded QA": 0.4583333333333333,
71
+ "T4. Summarization & Synthesis": 0.5384461382258066,
72
+ "T5. Attribution & Citation Alignment": 0.4818936352277487,
73
+ "T6. Aggregation & Clustering": 0.37379481108384327,
74
+ "T7. Consistency & Compliance Checking": 0.26307329406050667,
75
+ "T8. Structured & Numeric Reasoning": 0.19120370370370374,
76
+ "T9. Version & Code Diff Analysis": 0.5459081401129251,
77
+ "T10. Rule Induction & In-Context Learning": 0.43111111111111106,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665
79
+ },
80
+ "language": {
81
+ "Chinese": 0.42606506993377413,
82
+ "English": 0.4385947099141243
83
+ }
84
+ },
85
+ "pass@1": 0.188,
86
+ "BoN-2": {
87
+ "overall_metric": 0.5016598810015709,
88
+ "token_length": {
89
+ "8k": 0.6177853878601806,
90
+ "16k": 0.5606584472775635,
91
+ "32k": 0.5705417377303723,
92
+ "64k": 0.49224051142500497,
93
+ "128k": 0.4148610132238459,
94
+ "256k": 0.3538721884924622
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.4781661168763403,
98
+ "Partial": 0.531561035342776
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.6469329658307705,
102
+ "Moderate": 0.4476620656378584,
103
+ "Hard": 0.47548119381384835,
104
+ "Extreme": 0.39518120452359784
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.7612773821912874,
108
+ "T2. Sequencing & Structure Reconstruction": 0.7455342342842339,
109
+ "T3. Evidence-Grounded QA": 0.5416666666666666,
110
+ "T4. Summarization & Synthesis": 0.5515008189706921,
111
+ "T5. Attribution & Citation Alignment": 0.5415377453921782,
112
+ "T6. Aggregation & Clustering": 0.46179351546743685,
113
+ "T7. Consistency & Compliance Checking": 0.33666684704302524,
114
+ "T8. Structured & Numeric Reasoning": 0.24537037037037035,
115
+ "T9. Version & Code Diff Analysis": 0.6269577879155583,
116
+ "T10. Rule Induction & In-Context Learning": 0.5281944444444444,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333
118
+ },
119
+ "language": {
120
+ "Chinese": 0.48816204809318214,
121
+ "English": 0.5151577139099619
122
+ }
123
+ },
124
+ "pass@2": 0.23933333333333334,
125
+ "BoN-3": {
126
+ "overall_metric": 0.5384969000760513,
127
+ "token_length": {
128
+ "8k": 0.6356799930117969,
129
+ "16k": 0.5929083561253189,
130
+ "32k": 0.6076554475255213,
131
+ "64k": 0.5450784408185039,
132
+ "128k": 0.4563507575799897,
133
+ "256k": 0.3933084053951815
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.5134959721986003,
137
+ "Partial": 0.5703162628291728
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.6991002895184023,
141
+ "Moderate": 0.4873593462709992,
142
+ "Hard": 0.49915491813734847,
143
+ "Extreme": 0.4219917912549902
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.7936494394174134,
147
+ "T2. Sequencing & Structure Reconstruction": 0.7682061919561918,
148
+ "T3. Evidence-Grounded QA": 0.6,
149
+ "T4. Summarization & Synthesis": 0.5604787610213925,
150
+ "T5. Attribution & Citation Alignment": 0.5757231079007503,
151
+ "T6. Aggregation & Clustering": 0.4903242104980166,
152
+ "T7. Consistency & Compliance Checking": 0.3811432372555578,
153
+ "T8. Structured & Numeric Reasoning": 0.274537037037037,
154
+ "T9. Version & Code Diff Analysis": 0.6567859123578721,
155
+ "T10. Rule Induction & In-Context Learning": 0.5740277777777778,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334
157
+ },
158
+ "language": {
159
+ "Chinese": 0.5274718845754229,
160
+ "English": 0.5495219155766814
161
+ }
162
+ },
163
+ "pass@3": 0.27466666666666667
164
+ }
results/GLM-4.5/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 2,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.5547937875815533,
9
+ "inference_iteration_1_overall_metric": 0.5516398292433491,
10
+ "inference_iteration_2_overall_metric": 0.5535867950098665,
11
+ "inference_iteration_3_overall_metric": 0.559154738491441,
12
+ "average_token_length_metric": {
13
+ "8k": 0.6972820277601768,
14
+ "16k": 0.6560112539595868,
15
+ "32k": 0.6029656036576351,
16
+ "64k": 0.5486294944947675,
17
+ "128k": 0.4403706772307823,
18
+ "256k": 0.38350366838636923
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.5245120995377551,
22
+ "Partial": 0.593334117819114
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.7655351117837857,
26
+ "Moderate": 0.5513197070461822,
27
+ "Hard": 0.473832729215578,
28
+ "Extreme": 0.37939478048385467
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.81334590378291,
32
+ "T2. Sequencing & Structure Reconstruction": 0.7505160653892534,
33
+ "T3. Evidence-Grounded QA": 0.5027777777777779,
34
+ "T4. Summarization & Synthesis": 0.5448942860165678,
35
+ "T5. Attribution & Citation Alignment": 0.5301683776855378,
36
+ "T6. Aggregation & Clustering": 0.5045905245734419,
37
+ "T7. Consistency & Compliance Checking": 0.3519808997100079,
38
+ "T8. Structured & Numeric Reasoning": 0.6371913580246912,
39
+ "T9. Version & Code Diff Analysis": 0.6439646495440424,
40
+ "T10. Rule Induction & In-Context Learning": 0.5808333333333334,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3277777777777777
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.5739046099726642,
45
+ "English": 0.5356829651904418
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.5516398292433491,
49
+ "token_length": {
50
+ "8k": 0.7055429088605052,
51
+ "16k": 0.6578481199386829,
52
+ "32k": 0.5956492591185892,
53
+ "64k": 0.5401522196952867,
54
+ "128k": 0.42395107911700697,
55
+ "256k": 0.386695388730029
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.5177580971479406,
59
+ "Partial": 0.5947620337284167
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.752722499925911,
63
+ "Moderate": 0.5341124686360985,
64
+ "Hard": 0.4868060322041808,
65
+ "Extreme": 0.3854592094497618
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.802009002313031,
69
+ "T2. Sequencing & Structure Reconstruction": 0.7465347152847152,
70
+ "T3. Evidence-Grounded QA": 0.5,
71
+ "T4. Summarization & Synthesis": 0.545377281117479,
72
+ "T5. Attribution & Citation Alignment": 0.5499549412575729,
73
+ "T6. Aggregation & Clustering": 0.5019983393746833,
74
+ "T7. Consistency & Compliance Checking": 0.36021434419736664,
75
+ "T8. Structured & Numeric Reasoning": 0.6138888888888889,
76
+ "T9. Version & Code Diff Analysis": 0.6552473446554453,
77
+ "T10. Rule Induction & In-Context Learning": 0.5405555555555556,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667
79
+ },
80
+ "language": {
81
+ "Chinese": 0.5760533640670983,
82
+ "English": 0.5272262944196019
83
+ }
84
+ },
85
+ "pass@1": 0.304,
86
+ "BoN-2": {
87
+ "overall_metric": 0.634985055308529,
88
+ "token_length": {
89
+ "8k": 0.7829957270581746,
90
+ "16k": 0.720868655519701,
91
+ "32k": 0.6955532292535292,
92
+ "64k": 0.6191323026708053,
93
+ "128k": 0.5371685233314611,
94
+ "256k": 0.45419189401750604
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.6004623267171497,
98
+ "Partial": 0.67892307351574
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.8494656930118193,
102
+ "Moderate": 0.6521983830806934,
103
+ "Hard": 0.561519594238419,
104
+ "Extreme": 0.43697868974062276
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.8580534190067889,
108
+ "T2. Sequencing & Structure Reconstruction": 0.8009605209605205,
109
+ "T3. Evidence-Grounded QA": 0.6166666666666667,
110
+ "T4. Summarization & Synthesis": 0.5623513196545108,
111
+ "T5. Attribution & Citation Alignment": 0.6128841050494817,
112
+ "T6. Aggregation & Clustering": 0.5858431630098295,
113
+ "T7. Consistency & Compliance Checking": 0.42985731617797807,
114
+ "T8. Structured & Numeric Reasoning": 0.7472222222222222,
115
+ "T9. Version & Code Diff Analysis": 0.7035408856813825,
116
+ "T10. Rule Induction & In-Context Learning": 0.6718055555555555,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4666666666666667
118
+ },
119
+ "language": {
120
+ "Chinese": 0.6487038966161633,
121
+ "English": 0.6212662140008957
122
+ }
123
+ },
124
+ "pass@2": 0.4,
125
+ "BoN-3": {
126
+ "overall_metric": 0.6753829492782434,
127
+ "token_length": {
128
+ "8k": 0.8185554078642306,
129
+ "16k": 0.7599530890518512,
130
+ "32k": 0.7311990883460826,
131
+ "64k": 0.6754068771726057,
132
+ "128k": 0.5730612976856176,
133
+ "256k": 0.49412193554907724
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.6412352384515387,
137
+ "Partial": 0.718843672148596
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.8844767087888782,
141
+ "Moderate": 0.717837978263145,
142
+ "Hard": 0.600883682680195,
143
+ "Extreme": 0.4673774778829585
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.896714960990196,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8421179283679281,
148
+ "T3. Evidence-Grounded QA": 0.675,
149
+ "T4. Summarization & Synthesis": 0.5712469046961466,
150
+ "T5. Attribution & Citation Alignment": 0.6300235179241888,
151
+ "T6. Aggregation & Clustering": 0.6348800775177585,
152
+ "T7. Consistency & Compliance Checking": 0.4736376213352886,
153
+ "T8. Structured & Numeric Reasoning": 0.7824074074074073,
154
+ "T9. Version & Code Diff Analysis": 0.7424625612755764,
155
+ "T10. Rule Induction & In-Context Learning": 0.765,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334
157
+ },
158
+ "language": {
159
+ "Chinese": 0.6895150466449506,
160
+ "English": 0.6612508519115379
161
+ }
162
+ },
163
+ "pass@3": 0.444
164
+ }
results/GLM-4.6/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.45854238430368943,
9
+ "inference_iteration_1_overall_metric": 0.44890397156188516,
10
+ "inference_iteration_2_overall_metric": 0.4676756901179884,
11
+ "inference_iteration_3_overall_metric": 0.45904749123119587,
12
+ "average_token_length_metric": {
13
+ "8k": 0.539826456202734,
14
+ "16k": 0.49883990878468565,
15
+ "32k": 0.5226004279628154,
16
+ "64k": 0.4617605114078172,
17
+ "128k": 0.3868307627999842,
18
+ "256k": 0.34139623866410224
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.43758451425043066,
22
+ "Partial": 0.4852160370987465
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.5881815896217503,
26
+ "Moderate": 0.40045739164324096,
27
+ "Hard": 0.43071501653405486,
28
+ "Extreme": 0.3729573279423014
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.7391294273870368,
32
+ "T2. Sequencing & Structure Reconstruction": 0.7168876379566528,
33
+ "T3. Evidence-Grounded QA": 0.48333333333333334,
34
+ "T4. Summarization & Synthesis": 0.54430615481068,
35
+ "T5. Attribution & Citation Alignment": 0.5036524549754317,
36
+ "T6. Aggregation & Clustering": 0.4230515938862694,
37
+ "T7. Consistency & Compliance Checking": 0.28024190494681,
38
+ "T8. Structured & Numeric Reasoning": 0.204783950617284,
39
+ "T9. Version & Code Diff Analysis": 0.5493453618981832,
40
+ "T10. Rule Induction & In-Context Learning": 0.48856481481481484,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3444444444444445
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.4607293656684521,
45
+ "English": 0.45635540293892746
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.44890397156188516,
49
+ "token_length": {
50
+ "8k": 0.5369079824065298,
51
+ "16k": 0.4950184758232463,
52
+ "32k": 0.5119378052749619,
53
+ "64k": 0.4359616248204175,
54
+ "128k": 0.3900490912931948,
55
+ "256k": 0.323548849752962
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.42475510461227917,
59
+ "Partial": 0.47963889313411256
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.5839417584803529,
63
+ "Moderate": 0.38543423470966093,
64
+ "Hard": 0.4251833017579716,
65
+ "Extreme": 0.3582444584458006
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.7552782575667283,
69
+ "T2. Sequencing & Structure Reconstruction": 0.72015059015059,
70
+ "T3. Evidence-Grounded QA": 0.425,
71
+ "T4. Summarization & Synthesis": 0.5415035414147457,
72
+ "T5. Attribution & Citation Alignment": 0.49621823831328055,
73
+ "T6. Aggregation & Clustering": 0.41038094275648807,
74
+ "T7. Consistency & Compliance Checking": 0.27110571290315905,
75
+ "T8. Structured & Numeric Reasoning": 0.1953703703703704,
76
+ "T9. Version & Code Diff Analysis": 0.5256412558109732,
77
+ "T10. Rule Induction & In-Context Learning": 0.4822222222222222,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.35
79
+ },
80
+ "language": {
81
+ "Chinese": 0.44671558878002343,
82
+ "English": 0.45109235434374795
83
+ }
84
+ },
85
+ "pass@1": 0.19266666666666668,
86
+ "BoN-2": {
87
+ "overall_metric": 0.5266630332287439,
88
+ "token_length": {
89
+ "8k": 0.604236555945032,
90
+ "16k": 0.5564543546519637,
91
+ "32k": 0.5870438443448399,
92
+ "64k": 0.5387565602861155,
93
+ "128k": 0.46692762105066604,
94
+ "256k": 0.4065592630938467
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.5015019335956029,
98
+ "Partial": 0.5586862509436522
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.6773656505276844,
102
+ "Moderate": 0.47328676466113107,
103
+ "Hard": 0.49837787163941455,
104
+ "Extreme": 0.4152118781770787
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.7867939033692525,
108
+ "T2. Sequencing & Structure Reconstruction": 0.7854105213890441,
109
+ "T3. Evidence-Grounded QA": 0.5666666666666667,
110
+ "T4. Summarization & Synthesis": 0.5572174880251567,
111
+ "T5. Attribution & Citation Alignment": 0.573956288098141,
112
+ "T6. Aggregation & Clustering": 0.4915724205045227,
113
+ "T7. Consistency & Compliance Checking": 0.33460325146257974,
114
+ "T8. Structured & Numeric Reasoning": 0.2578703703703704,
115
+ "T9. Version & Code Diff Analysis": 0.6423128731937172,
116
+ "T10. Rule Induction & In-Context Learning": 0.6198611111111111,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.425
118
+ },
119
+ "language": {
120
+ "Chinese": 0.518790207475404,
121
+ "English": 0.5345358589820849
122
+ }
123
+ },
124
+ "pass@2": 0.25466666666666665,
125
+ "BoN-3": {
126
+ "overall_metric": 0.5609610306922399,
127
+ "token_length": {
128
+ "8k": 0.6386076789983793,
129
+ "16k": 0.5860184400135818,
130
+ "32k": 0.6210153091497477,
131
+ "64k": 0.5871225156541778,
132
+ "128k": 0.4913536643924111,
133
+ "256k": 0.4416485759451446
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.5370910562726694,
137
+ "Partial": 0.5913409981353319
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.7100406216614994,
141
+ "Moderate": 0.5174437099567866,
142
+ "Hard": 0.5265960486680222,
143
+ "Extreme": 0.44880562762488274
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.815117939706351,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8064498061783285,
148
+ "T3. Evidence-Grounded QA": 0.6416666666666667,
149
+ "T4. Summarization & Synthesis": 0.5680717589841464,
150
+ "T5. Attribution & Citation Alignment": 0.604565984118908,
151
+ "T6. Aggregation & Clustering": 0.5283992168363204,
152
+ "T7. Consistency & Compliance Checking": 0.35872230397543203,
153
+ "T8. Structured & Numeric Reasoning": 0.28935185185185186,
154
+ "T9. Version & Code Diff Analysis": 0.6707362245587523,
155
+ "T10. Rule Induction & In-Context Learning": 0.6406944444444443,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5
157
+ },
158
+ "language": {
159
+ "Chinese": 0.5609588769304155,
160
+ "English": 0.5609631844540666
161
+ }
162
+ },
163
+ "pass@3": 0.286
164
+ }
results/GLM-4.6/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.5820993757625644,
9
+ "inference_iteration_1_overall_metric": 0.5900318347862288,
10
+ "inference_iteration_2_overall_metric": 0.5774825139114689,
11
+ "inference_iteration_3_overall_metric": 0.5787837785899949,
12
+ "average_token_length_metric": {
13
+ "8k": 0.7122784818137915,
14
+ "16k": 0.6603518496747058,
15
+ "32k": 0.6352743108645184,
16
+ "64k": 0.5897286272690893,
17
+ "128k": 0.475467875017661,
18
+ "256k": 0.4194951099356217
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.5470143190278319,
22
+ "Partial": 0.6267530843340428
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.7978473417092227,
26
+ "Moderate": 0.6094768922677877,
27
+ "Hard": 0.4892370620605133,
28
+ "Extreme": 0.3887688912252786
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.8197327977970514,
32
+ "T2. Sequencing & Structure Reconstruction": 0.8006519321293782,
33
+ "T3. Evidence-Grounded QA": 0.538888888888889,
34
+ "T4. Summarization & Synthesis": 0.5408566771607968,
35
+ "T5. Attribution & Citation Alignment": 0.5337112988588841,
36
+ "T6. Aggregation & Clustering": 0.5397680321862239,
37
+ "T7. Consistency & Compliance Checking": 0.380513781495624,
38
+ "T8. Structured & Numeric Reasoning": 0.6123456790123456,
39
+ "T9. Version & Code Diff Analysis": 0.6754501038965057,
40
+ "T10. Rule Induction & In-Context Learning": 0.6013425925925924,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.46666666666666673
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.5991918774535788,
45
+ "English": 0.5650068740715505
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.5900318347862288,
49
+ "token_length": {
50
+ "8k": 0.7190910013269595,
51
+ "16k": 0.6680291983169964,
52
+ "32k": 0.6447298296516131,
53
+ "64k": 0.5905857251798682,
54
+ "128k": 0.4766512837488421,
55
+ "256k": 0.4411039704930917
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.5455741223917344,
59
+ "Partial": 0.6466143778337665
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.8187580590040169,
63
+ "Moderate": 0.6184177733393077,
64
+ "Hard": 0.4890540077544932,
65
+ "Extreme": 0.38715232500749663
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.8264146064969754,
69
+ "T2. Sequencing & Structure Reconstruction": 0.8039418272654707,
70
+ "T3. Evidence-Grounded QA": 0.55,
71
+ "T4. Summarization & Synthesis": 0.5427878753470068,
72
+ "T5. Attribution & Citation Alignment": 0.5419657251498339,
73
+ "T6. Aggregation & Clustering": 0.5399584229450126,
74
+ "T7. Consistency & Compliance Checking": 0.37011974365960826,
75
+ "T8. Structured & Numeric Reasoning": 0.6416666666666667,
76
+ "T9. Version & Code Diff Analysis": 0.693087317328304,
77
+ "T10. Rule Induction & In-Context Learning": 0.6145833333333334,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.475
79
+ },
80
+ "language": {
81
+ "Chinese": 0.6075617792305364,
82
+ "English": 0.5725018903419208
83
+ }
84
+ },
85
+ "pass@1": 0.36466666666666664,
86
+ "BoN-2": {
87
+ "overall_metric": 0.6659160211594685,
88
+ "token_length": {
89
+ "8k": 0.7935291454736249,
90
+ "16k": 0.7469698033059613,
91
+ "32k": 0.7147019641303554,
92
+ "64k": 0.6716314717341791,
93
+ "128k": 0.5707409059251077,
94
+ "256k": 0.497922836387586
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.6309312423015924,
98
+ "Partial": 0.7104421033422219
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.8923409007483016,
102
+ "Moderate": 0.7327459154582487,
103
+ "Hard": 0.5632013922542815,
104
+ "Extreme": 0.4414476037490934
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.880882952904511,
108
+ "T2. Sequencing & Structure Reconstruction": 0.8476685999185997,
109
+ "T3. Evidence-Grounded QA": 0.6666666666666666,
110
+ "T4. Summarization & Synthesis": 0.5583277247093672,
111
+ "T5. Attribution & Citation Alignment": 0.6177785331058078,
112
+ "T6. Aggregation & Clustering": 0.6268647075743846,
113
+ "T7. Consistency & Compliance Checking": 0.4566279411326888,
114
+ "T8. Structured & Numeric Reasoning": 0.7217592592592593,
115
+ "T9. Version & Code Diff Analysis": 0.7464145919055787,
116
+ "T10. Rule Induction & In-Context Learning": 0.7400000000000001,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5583333333333333
118
+ },
119
+ "language": {
120
+ "Chinese": 0.6881369643762367,
121
+ "English": 0.6436950779427024
122
+ }
123
+ },
124
+ "pass@2": 0.43866666666666665,
125
+ "BoN-3": {
126
+ "overall_metric": 0.706844154531168,
127
+ "token_length": {
128
+ "8k": 0.8222844269368486,
129
+ "16k": 0.7772591556882481,
130
+ "32k": 0.7466099276083229,
131
+ "64k": 0.7234900151981559,
132
+ "128k": 0.62014724878678,
133
+ "256k": 0.551274152968658
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.6725119993190923,
137
+ "Partial": 0.7505396248010854
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.9285919042851156,
141
+ "Moderate": 0.7918869916930842,
142
+ "Hard": 0.6026672046571206,
143
+ "Extreme": 0.4764972072411785
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.9055704999975606,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8679013780100734,
148
+ "T3. Evidence-Grounded QA": 0.7416666666666667,
149
+ "T4. Summarization & Synthesis": 0.5683880603721403,
150
+ "T5. Attribution & Citation Alignment": 0.6545164467516165,
151
+ "T6. Aggregation & Clustering": 0.6671955091257745,
152
+ "T7. Consistency & Compliance Checking": 0.5102082868832639,
153
+ "T8. Structured & Numeric Reasoning": 0.7717592592592593,
154
+ "T9. Version & Code Diff Analysis": 0.7612642969391072,
155
+ "T10. Rule Induction & In-Context Learning": 0.7541666666666667,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6583333333333333
157
+ },
158
+ "language": {
159
+ "Chinese": 0.7225009620618484,
160
+ "English": 0.6911873470004898
161
+ }
162
+ },
163
+ "pass@3": 0.48133333333333334
164
+ }
results/GPT-4o/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.46665010092977977,
9
+ "inference_iteration_1_overall_metric": 0.4658401067882854,
10
+ "inference_iteration_2_overall_metric": 0.46753496394327626,
11
+ "inference_iteration_3_overall_metric": 0.466575232057776,
12
+ "average_token_length_metric": {
13
+ "8k": 0.5113488376851383,
14
+ "16k": 0.4997009141224516,
15
+ "32k": 0.5251055066966325,
16
+ "64k": 0.45692433752384126,
17
+ "128k": 0.4357776587958875,
18
+ "256k": 0.37104335075472816
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.4340554247509889,
22
+ "Partial": 0.5081342342482407
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.5937760942513476,
26
+ "Moderate": 0.4302951347009006,
27
+ "Hard": 0.4487990540053617,
28
+ "Extreme": 0.3629928487032055
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.7349022081832958,
32
+ "T2. Sequencing & Structure Reconstruction": 0.7250800279966945,
33
+ "T3. Evidence-Grounded QA": 0.522222222222222,
34
+ "T4. Summarization & Synthesis": 0.5082664460738612,
35
+ "T5. Attribution & Citation Alignment": 0.5342878338439898,
36
+ "T6. Aggregation & Clustering": 0.4265212400920285,
37
+ "T7. Consistency & Compliance Checking": 0.27964395302062434,
38
+ "T8. Structured & Numeric Reasoning": 0.21157407407407405,
39
+ "T9. Version & Code Diff Analysis": 0.5920364002998717,
40
+ "T10. Rule Induction & In-Context Learning": 0.4702777777777778,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.36944444444444435
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.4566116864733059,
45
+ "English": 0.47668851538625334
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.4658401067882854,
49
+ "token_length": {
50
+ "8k": 0.49942481947489,
51
+ "16k": 0.5075293232399243,
52
+ "32k": 0.535471182543319,
53
+ "64k": 0.4466006941173288,
54
+ "128k": 0.43362259025283534,
55
+ "256k": 0.37239203110142105
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.434998076864218,
59
+ "Partial": 0.5050935994189195
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.5852393493724929,
63
+ "Moderate": 0.43887833780663893,
64
+ "Hard": 0.4558206214636842,
65
+ "Extreme": 0.3593336239903742
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.7408543245056541,
69
+ "T2. Sequencing & Structure Reconstruction": 0.7499819162319159,
70
+ "T3. Evidence-Grounded QA": 0.5166666666666667,
71
+ "T4. Summarization & Synthesis": 0.5048929077004661,
72
+ "T5. Attribution & Citation Alignment": 0.5359141447270879,
73
+ "T6. Aggregation & Clustering": 0.42758922871826105,
74
+ "T7. Consistency & Compliance Checking": 0.27436727289246843,
75
+ "T8. Structured & Numeric Reasoning": 0.2152777777777778,
76
+ "T9. Version & Code Diff Analysis": 0.592312178161249,
77
+ "T10. Rule Induction & In-Context Learning": 0.44819444444444445,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.35833333333333334
79
+ },
80
+ "language": {
81
+ "Chinese": 0.4424240788239187,
82
+ "English": 0.48925613475265434
83
+ }
84
+ },
85
+ "pass@1": 0.194,
86
+ "BoN-2": {
87
+ "overall_metric": 0.5398849816112041,
88
+ "token_length": {
89
+ "8k": 0.5759687409807404,
90
+ "16k": 0.549906030476181,
91
+ "32k": 0.6066920241775022,
92
+ "64k": 0.5317817512096467,
93
+ "128k": 0.5214872601681293,
94
+ "256k": 0.4534740826550301
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.5083509910328655,
98
+ "Partial": 0.5800191514381833
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.6782240284299118,
102
+ "Moderate": 0.5120970500900949,
103
+ "Hard": 0.5256120585356611,
104
+ "Extreme": 0.41596717800169664
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.7993838319887435,
108
+ "T2. Sequencing & Structure Reconstruction": 0.7934912309912311,
109
+ "T3. Evidence-Grounded QA": 0.6666666666666666,
110
+ "T4. Summarization & Synthesis": 0.5276777173176291,
111
+ "T5. Attribution & Citation Alignment": 0.6042284709538017,
112
+ "T6. Aggregation & Clustering": 0.4920141039764696,
113
+ "T7. Consistency & Compliance Checking": 0.34899250013259275,
114
+ "T8. Structured & Numeric Reasoning": 0.26296296296296295,
115
+ "T9. Version & Code Diff Analysis": 0.6671322238361709,
116
+ "T10. Rule Induction & In-Context Learning": 0.5840277777777777,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.45
118
+ },
119
+ "language": {
120
+ "Chinese": 0.525731927704756,
121
+ "English": 0.5540380355176548
122
+ }
123
+ },
124
+ "pass@2": 0.25466666666666665,
125
+ "BoN-3": {
126
+ "overall_metric": 0.5744593750081309,
127
+ "token_length": {
128
+ "8k": 0.606512147802086,
129
+ "16k": 0.5958370007923852,
130
+ "32k": 0.6384127504911996,
131
+ "64k": 0.5707759455131253,
132
+ "128k": 0.5568976109117445,
133
+ "256k": 0.4783207945382497
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.5391592397427731,
137
+ "Partial": 0.6193868198913163
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.7196856800163218,
141
+ "Moderate": 0.540664277732673,
142
+ "Hard": 0.5601282438717835,
143
+ "Extreme": 0.44698074091055134
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.8103075764869724,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8166357716357717,
148
+ "T3. Evidence-Grounded QA": 0.7,
149
+ "T4. Summarization & Synthesis": 0.5377648665242384,
150
+ "T5. Attribution & Citation Alignment": 0.6423496422537093,
151
+ "T6. Aggregation & Clustering": 0.532547329213996,
152
+ "T7. Consistency & Compliance Checking": 0.40458496685251083,
153
+ "T8. Structured & Numeric Reasoning": 0.30046296296296293,
154
+ "T9. Version & Code Diff Analysis": 0.6915969977123051,
155
+ "T10. Rule Induction & In-Context Learning": 0.6340277777777777,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.49166666666666664
157
+ },
158
+ "language": {
159
+ "Chinese": 0.563099563818372,
160
+ "English": 0.5858191861978923
161
+ }
162
+ },
163
+ "pass@3": 0.286
164
+ }
results/GPT-4o/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.4943586900400841,
9
+ "inference_iteration_1_overall_metric": 0.4968481802669354,
10
+ "inference_iteration_2_overall_metric": 0.4953906178052376,
11
+ "inference_iteration_3_overall_metric": 0.49083727204807814,
12
+ "average_token_length_metric": {
13
+ "8k": 0.5879811335998006,
14
+ "16k": 0.5326400416753286,
15
+ "32k": 0.512948102728002,
16
+ "64k": 0.4721690409999518,
17
+ "128k": 0.44658724759711643,
18
+ "256k": 0.41382657364030534
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.46488316610562114,
22
+ "Partial": 0.5318729932294004
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.7183848834775743,
26
+ "Moderate": 0.43069679620968054,
27
+ "Hard": 0.41352044386464876,
28
+ "Extreme": 0.34385849736921437
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.7592784040430812,
32
+ "T2. Sequencing & Structure Reconstruction": 0.6992785706820794,
33
+ "T3. Evidence-Grounded QA": 0.41944444444444456,
34
+ "T4. Summarization & Synthesis": 0.4904128501144795,
35
+ "T5. Attribution & Citation Alignment": 0.5555445495468067,
36
+ "T6. Aggregation & Clustering": 0.46463843332938043,
37
+ "T7. Consistency & Compliance Checking": 0.26892122251820344,
38
+ "T8. Structured & Numeric Reasoning": 0.4810185185185185,
39
+ "T9. Version & Code Diff Analysis": 0.5265279154913766,
40
+ "T10. Rule Induction & In-Context Learning": 0.48212962962962963,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4250000000000001
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.46257560421163496,
45
+ "English": 0.526141775868533
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.4968481802669354,
49
+ "token_length": {
50
+ "8k": 0.5741867935281576,
51
+ "16k": 0.535374756046493,
52
+ "32k": 0.5202544808789716,
53
+ "64k": 0.47647427072064424,
54
+ "128k": 0.4412042204935842,
55
+ "256k": 0.4335945599337654
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.47247358832994213,
59
+ "Partial": 0.5278703881867464
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.7337781908034996,
63
+ "Moderate": 0.4336235016656763,
64
+ "Hard": 0.39945319018108144,
65
+ "Extreme": 0.3428000420213732
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.75748852621513,
69
+ "T2. Sequencing & Structure Reconstruction": 0.7128378857984122,
70
+ "T3. Evidence-Grounded QA": 0.4166666666666667,
71
+ "T4. Summarization & Synthesis": 0.4915611304736425,
72
+ "T5. Attribution & Citation Alignment": 0.5573645055278006,
73
+ "T6. Aggregation & Clustering": 0.4608094132930736,
74
+ "T7. Consistency & Compliance Checking": 0.2337605934000851,
75
+ "T8. Structured & Numeric Reasoning": 0.5046296296296297,
76
+ "T9. Version & Code Diff Analysis": 0.5617174175041987,
77
+ "T10. Rule Induction & In-Context Learning": 0.5141666666666667,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4
79
+ },
80
+ "language": {
81
+ "Chinese": 0.46378379039245493,
82
+ "English": 0.5299125701414172
83
+ }
84
+ },
85
+ "pass@1": 0.25266666666666665,
86
+ "BoN-2": {
87
+ "overall_metric": 0.5733973874130963,
88
+ "token_length": {
89
+ "8k": 0.6607408168420358,
90
+ "16k": 0.6168176180801352,
91
+ "32k": 0.6026001586229682,
92
+ "64k": 0.5549119793217003,
93
+ "128k": 0.5219672618111347,
94
+ "256k": 0.4833464898006067
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.5468729804164447,
98
+ "Partial": 0.6071557235906547
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.832354149230665,
102
+ "Moderate": 0.5035087598472016,
103
+ "Hard": 0.482911736547395,
104
+ "Extreme": 0.39505876757369657
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.8194314757440243,
108
+ "T2. Sequencing & Structure Reconstruction": 0.7647144522144521,
109
+ "T3. Evidence-Grounded QA": 0.5166666666666667,
110
+ "T4. Summarization & Synthesis": 0.507756894787762,
111
+ "T5. Attribution & Citation Alignment": 0.6431090618973574,
112
+ "T6. Aggregation & Clustering": 0.5325685690744169,
113
+ "T7. Consistency & Compliance Checking": 0.31928948521783374,
114
+ "T8. Structured & Numeric Reasoning": 0.5949074074074073,
115
+ "T9. Version & Code Diff Analysis": 0.5981405988039616,
116
+ "T10. Rule Induction & In-Context Learning": 0.6058333333333333,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666
118
+ },
119
+ "language": {
120
+ "Chinese": 0.5418805636874472,
121
+ "English": 0.6049142111387469
122
+ }
123
+ },
124
+ "pass@2": 0.33,
125
+ "BoN-3": {
126
+ "overall_metric": 0.6133026799603023,
127
+ "token_length": {
128
+ "8k": 0.7092280973251848,
129
+ "16k": 0.6573848999983455,
130
+ "32k": 0.6374049433271965,
131
+ "64k": 0.5884360272634136,
132
+ "128k": 0.5736764233126421,
133
+ "256k": 0.513685688535035
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.585559935676702,
137
+ "Partial": 0.6486116272303409
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.8579938866329095,
141
+ "Moderate": 0.5531379960724733,
142
+ "Hard": 0.5375404413799146,
143
+ "Extreme": 0.4345338594537549
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.8365910773415853,
147
+ "T2. Sequencing & Structure Reconstruction": 0.7977771302771303,
148
+ "T3. Evidence-Grounded QA": 0.5916666666666667,
149
+ "T4. Summarization & Synthesis": 0.5159806774505266,
150
+ "T5. Attribution & Citation Alignment": 0.6935011362804332,
151
+ "T6. Aggregation & Clustering": 0.5705653708431486,
152
+ "T7. Consistency & Compliance Checking": 0.381137305153889,
153
+ "T8. Structured & Numeric Reasoning": 0.6296296296296297,
154
+ "T9. Version & Code Diff Analysis": 0.6584905752696673,
155
+ "T10. Rule Induction & In-Context Learning": 0.6336111111111112,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5666666666666667
157
+ },
158
+ "language": {
159
+ "Chinese": 0.5780672262010988,
160
+ "English": 0.6485381337195074
161
+ }
162
+ },
163
+ "pass@3": 0.36666666666666664
164
+ }
results/GPT-5/thinking_context-272000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.726053089253122,
9
+ "inference_iteration_1_overall_metric": 0.7242860759291603,
10
+ "inference_iteration_2_overall_metric": 0.72436075729001,
11
+ "inference_iteration_3_overall_metric": 0.729512434540192,
12
+ "average_token_length_metric": {
13
+ "8k": 0.7537078410340138,
14
+ "16k": 0.7627066310839429,
15
+ "32k": 0.7434290864816196,
16
+ "64k": 0.7646193918174649,
17
+ "128k": 0.6936202889645278,
18
+ "256k": 0.638235296137159
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.6915568234658586,
22
+ "Partial": 0.7699574275278195
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.8523326045847652,
26
+ "Moderate": 0.8231088494697211,
27
+ "Hard": 0.787367547123676,
28
+ "Extreme": 0.4836991814871219
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.9032376385150938,
32
+ "T2. Sequencing & Structure Reconstruction": 0.9075063054229715,
33
+ "T3. Evidence-Grounded QA": 0.6666666666666666,
34
+ "T4. Summarization & Synthesis": 0.5256066584699448,
35
+ "T5. Attribution & Citation Alignment": 0.8116994715897818,
36
+ "T6. Aggregation & Clustering": 0.6716265654111317,
37
+ "T7. Consistency & Compliance Checking": 0.631179283519898,
38
+ "T8. Structured & Numeric Reasoning": 0.7979938271604939,
39
+ "T9. Version & Code Diff Analysis": 0.818404768269679,
40
+ "T10. Rule Induction & In-Context Learning": 0.6802314814814814,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6111111111111112
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.7196645097291159,
45
+ "English": 0.7324416687771269
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.7242860759291603,
49
+ "token_length": {
50
+ "8k": 0.7638228227994025,
51
+ "16k": 0.7511485364018967,
52
+ "32k": 0.7397315002658593,
53
+ "64k": 0.7648062624572959,
54
+ "128k": 0.6947065191324134,
55
+ "256k": 0.6315008145180959
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.6845638507619599,
59
+ "Partial": 0.7748416352328712
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.8419121655420269,
63
+ "Moderate": 0.8140896757444649,
64
+ "Hard": 0.8018107002313927,
65
+ "Extreme": 0.4855278214669571
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.9022908711992736,
69
+ "T2. Sequencing & Structure Reconstruction": 0.9003492803492802,
70
+ "T3. Evidence-Grounded QA": 0.6666666666666666,
71
+ "T4. Summarization & Synthesis": 0.525285592483348,
72
+ "T5. Attribution & Citation Alignment": 0.8350389199886978,
73
+ "T6. Aggregation & Clustering": 0.6728116198035761,
74
+ "T7. Consistency & Compliance Checking": 0.6250527729039961,
75
+ "T8. Structured & Numeric Reasoning": 0.7824074074074074,
76
+ "T9. Version & Code Diff Analysis": 0.8228424738103258,
77
+ "T10. Rule Induction & In-Context Learning": 0.6890277777777778,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5916666666666667
79
+ },
80
+ "language": {
81
+ "Chinese": 0.7225808137285838,
82
+ "English": 0.725991338129738
83
+ }
84
+ },
85
+ "pass@1": 0.5033333333333333,
86
+ "BoN-2": {
87
+ "overall_metric": 0.773365567880672,
88
+ "token_length": {
89
+ "8k": 0.7988567725267066,
90
+ "16k": 0.7953552672252621,
91
+ "32k": 0.7853032014648265,
92
+ "64k": 0.8171591510524335,
93
+ "128k": 0.7387615265550217,
94
+ "256k": 0.7047574884597809
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.740254405395005,
98
+ "Partial": 0.8155070474078848
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.8943471956938479,
102
+ "Moderate": 0.8694949682853881,
103
+ "Hard": 0.8603608124174508,
104
+ "Extreme": 0.5205560974396651
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.9235081407527015,
108
+ "T2. Sequencing & Structure Reconstruction": 0.9270526695526693,
109
+ "T3. Evidence-Grounded QA": 0.7583333333333333,
110
+ "T4. Summarization & Synthesis": 0.5388141391367185,
111
+ "T5. Attribution & Citation Alignment": 0.8662194687189113,
112
+ "T6. Aggregation & Clustering": 0.724952326567939,
113
+ "T7. Consistency & Compliance Checking": 0.6769275451403334,
114
+ "T8. Structured & Numeric Reasoning": 0.837962962962963,
115
+ "T9. Version & Code Diff Analysis": 0.8518498172294341,
116
+ "T10. Rule Induction & In-Context Learning": 0.749861111111111,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6916666666666667
118
+ },
119
+ "language": {
120
+ "Chinese": 0.7653921632804664,
121
+ "English": 0.7813389724808776
122
+ }
123
+ },
124
+ "pass@2": 0.5773333333333334,
125
+ "BoN-3": {
126
+ "overall_metric": 0.7997603117800453,
127
+ "token_length": {
128
+ "8k": 0.8156058789899132,
129
+ "16k": 0.8312258319915683,
130
+ "32k": 0.8146647150412942,
131
+ "64k": 0.8402343004850696,
132
+ "128k": 0.7648319163907665,
133
+ "256k": 0.7319992277816549
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.7681553658992594,
137
+ "Partial": 0.8399847883555894
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.9168344057692764,
141
+ "Moderate": 0.9117105202934518,
142
+ "Hard": 0.8867394849893248,
143
+ "Extreme": 0.5408505285512573
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.9362853987173203,
147
+ "T2. Sequencing & Structure Reconstruction": 0.9359547859547858,
148
+ "T3. Evidence-Grounded QA": 0.7916666666666666,
149
+ "T4. Summarization & Synthesis": 0.5458038576746401,
150
+ "T5. Attribution & Citation Alignment": 0.8823540286034711,
151
+ "T6. Aggregation & Clustering": 0.7446436845926303,
152
+ "T7. Consistency & Compliance Checking": 0.6987021524631377,
153
+ "T8. Structured & Numeric Reasoning": 0.8824074074074073,
154
+ "T9. Version & Code Diff Analysis": 0.8622815151611319,
155
+ "T10. Rule Induction & In-Context Learning": 0.8040277777777778,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.75
157
+ },
158
+ "language": {
159
+ "Chinese": 0.7871986587353806,
160
+ "English": 0.8123219648247083
161
+ }
162
+ },
163
+ "pass@3": 0.6106666666666667
164
+ }
results/GPT-OSS-120B/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.5260760130553013,
9
+ "inference_iteration_1_overall_metric": 0.5251990705311491,
10
+ "inference_iteration_2_overall_metric": 0.5187040802401437,
11
+ "inference_iteration_3_overall_metric": 0.5343248883946079,
12
+ "average_token_length_metric": {
13
+ "8k": 0.6379995894817992,
14
+ "16k": 0.6200629617253591,
15
+ "32k": 0.5668769322787303,
16
+ "64k": 0.5173492904735919,
17
+ "128k": 0.4362186866504548,
18
+ "256k": 0.3779486177218682
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.49116755127749995,
22
+ "Partial": 0.5705049644088642
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.7406025674065024,
26
+ "Moderate": 0.506610347347898,
27
+ "Hard": 0.44966953179643426,
28
+ "Extreme": 0.3540424932279647
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.7441033370277687,
32
+ "T2. Sequencing & Structure Reconstruction": 0.7329905896572565,
33
+ "T3. Evidence-Grounded QA": 0.5333333333333334,
34
+ "T4. Summarization & Synthesis": 0.5106082800845382,
35
+ "T5. Attribution & Citation Alignment": 0.46625824816375694,
36
+ "T6. Aggregation & Clustering": 0.5279484217060981,
37
+ "T7. Consistency & Compliance Checking": 0.31563292534840204,
38
+ "T8. Structured & Numeric Reasoning": 0.5515432098765432,
39
+ "T9. Version & Code Diff Analysis": 0.5119880580465573,
40
+ "T10. Rule Induction & In-Context Learning": 0.5589814814814815,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.425
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.505405756924764,
45
+ "English": 0.5467462691858365
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.5251990705311491,
49
+ "token_length": {
50
+ "8k": 0.6529081123251393,
51
+ "16k": 0.6200667957335821,
52
+ "32k": 0.5763521514454887,
53
+ "64k": 0.49832867440843903,
54
+ "128k": 0.4350202675435077,
55
+ "256k": 0.36851842173074006
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.48686804900920944,
59
+ "Partial": 0.5739840070136185
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.7411278216421984,
63
+ "Moderate": 0.5064062158524808,
64
+ "Hard": 0.4567280838491506,
65
+ "Extreme": 0.34597541625321154
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.737237169976606,
69
+ "T2. Sequencing & Structure Reconstruction": 0.7325725663225663,
70
+ "T3. Evidence-Grounded QA": 0.5166666666666667,
71
+ "T4. Summarization & Synthesis": 0.5064758252528795,
72
+ "T5. Attribution & Citation Alignment": 0.4582161191244139,
73
+ "T6. Aggregation & Clustering": 0.5344712887432478,
74
+ "T7. Consistency & Compliance Checking": 0.3124319752506027,
75
+ "T8. Structured & Numeric Reasoning": 0.5666666666666667,
76
+ "T9. Version & Code Diff Analysis": 0.5286040271943487,
77
+ "T10. Rule Induction & In-Context Learning": 0.5565277777777777,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333
79
+ },
80
+ "language": {
81
+ "Chinese": 0.5156423632056797,
82
+ "English": 0.534755777856619
83
+ }
84
+ },
85
+ "pass@1": 0.2833333333333333,
86
+ "BoN-2": {
87
+ "overall_metric": 0.6024165651661463,
88
+ "token_length": {
89
+ "8k": 0.7164315027505042,
90
+ "16k": 0.700257643310589,
91
+ "32k": 0.6507634459310141,
92
+ "64k": 0.5868187511846459,
93
+ "128k": 0.5172068676740627,
94
+ "256k": 0.4430211801460688
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.5670686758499749,
98
+ "Partial": 0.6474047879321861
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.8314482789657763,
102
+ "Moderate": 0.5941446742173757,
103
+ "Hard": 0.5272666327730237,
104
+ "Extreme": 0.4063157035624808
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.8187672440796736,
108
+ "T2. Sequencing & Structure Reconstruction": 0.7801595626595622,
109
+ "T3. Evidence-Grounded QA": 0.6583333333333333,
110
+ "T4. Summarization & Synthesis": 0.5244604747568062,
111
+ "T5. Attribution & Citation Alignment": 0.5425411826774184,
112
+ "T6. Aggregation & Clustering": 0.5998244613513222,
113
+ "T7. Consistency & Compliance Checking": 0.38919737648291697,
114
+ "T8. Structured & Numeric Reasoning": 0.6231481481481482,
115
+ "T9. Version & Code Diff Analysis": 0.5981069547631331,
116
+ "T10. Rule Induction & In-Context Learning": 0.6645833333333333,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.525
118
+ },
119
+ "language": {
120
+ "Chinese": 0.5840400269819745,
121
+ "English": 0.620793103350321
122
+ }
123
+ },
124
+ "pass@2": 0.3506666666666667,
125
+ "BoN-3": {
126
+ "overall_metric": 0.6337631142743206,
127
+ "token_length": {
128
+ "8k": 0.7401560588202145,
129
+ "16k": 0.7330726977884732,
130
+ "32k": 0.6780170211387931,
131
+ "64k": 0.6366671272752144,
132
+ "128k": 0.543143313800391,
133
+ "256k": 0.47152246682284277
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.5976534402407667,
137
+ "Partial": 0.6797208812261188
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.858812640816625,
141
+ "Moderate": 0.6436207584661483,
142
+ "Hard": 0.5564637457814393,
143
+ "Extreme": 0.43152853820526416
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.841381085118233,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8196503034003031,
148
+ "T3. Evidence-Grounded QA": 0.6916666666666667,
149
+ "T4. Summarization & Synthesis": 0.5298447017215417,
150
+ "T5. Attribution & Citation Alignment": 0.5830791662518957,
151
+ "T6. Aggregation & Clustering": 0.6214286606830465,
152
+ "T7. Consistency & Compliance Checking": 0.41851724869569523,
153
+ "T8. Structured & Numeric Reasoning": 0.6564814814814816,
154
+ "T9. Version & Code Diff Analysis": 0.626359252313376,
155
+ "T10. Rule Induction & In-Context Learning": 0.7104166666666667,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.575
157
+ },
158
+ "language": {
159
+ "Chinese": 0.6166297449202884,
160
+ "English": 0.6508964836283548
161
+ }
162
+ },
163
+ "pass@3": 0.382
164
+ }
results/GPT-OSS-20B/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.4466309565832364,
9
+ "inference_iteration_1_overall_metric": 0.4454625807266656,
10
+ "inference_iteration_2_overall_metric": 0.45246537487177085,
11
+ "inference_iteration_3_overall_metric": 0.44196491415127315,
12
+ "average_token_length_metric": {
13
+ "8k": 0.5748339290561163,
14
+ "16k": 0.520513959710621,
15
+ "32k": 0.4891012266007553,
16
+ "64k": 0.41584677147603494,
17
+ "128k": 0.358630149540046,
18
+ "256k": 0.3208597031158458
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.415365323316177,
22
+ "Partial": 0.4864235807413135
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.650502297368124,
26
+ "Moderate": 0.39329906469313236,
27
+ "Hard": 0.35893228463928295,
28
+ "Extreme": 0.3159306081507978
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.7015484192056548,
32
+ "T2. Sequencing & Structure Reconstruction": 0.685767192683859,
33
+ "T3. Evidence-Grounded QA": 0.45555555555555555,
34
+ "T4. Summarization & Synthesis": 0.4908914699997827,
35
+ "T5. Attribution & Citation Alignment": 0.36677196742848295,
36
+ "T6. Aggregation & Clustering": 0.4730052458390773,
37
+ "T7. Consistency & Compliance Checking": 0.20816491065985157,
38
+ "T8. Structured & Numeric Reasoning": 0.41743827160493835,
39
+ "T9. Version & Code Diff Analysis": 0.447495265816877,
40
+ "T10. Rule Induction & In-Context Learning": 0.4786111111111111,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3083333333333334
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.4149338600461589,
45
+ "English": 0.4783280531203148
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.4454625807266656,
49
+ "token_length": {
50
+ "8k": 0.568519720930888,
51
+ "16k": 0.5262031339471792,
52
+ "32k": 0.4861844372065094,
53
+ "64k": 0.41353173269416393,
54
+ "128k": 0.3591760759962795,
55
+ "256k": 0.3191603835849737
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.4087106311919467,
59
+ "Partial": 0.49223778922539846
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.6501986428504741,
63
+ "Moderate": 0.4020783848645907,
64
+ "Hard": 0.35262004275477665,
65
+ "Extreme": 0.3106597264865292
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.6988816931417082,
69
+ "T2. Sequencing & Structure Reconstruction": 0.6855601343101344,
70
+ "T3. Evidence-Grounded QA": 0.44166666666666665,
71
+ "T4. Summarization & Synthesis": 0.48711155580421095,
72
+ "T5. Attribution & Citation Alignment": 0.3737909056226912,
73
+ "T6. Aggregation & Clustering": 0.47376675235942955,
74
+ "T7. Consistency & Compliance Checking": 0.19543673928650457,
75
+ "T8. Structured & Numeric Reasoning": 0.4592592592592593,
76
+ "T9. Version & Code Diff Analysis": 0.4319105105134516,
77
+ "T10. Rule Induction & In-Context Learning": 0.4483333333333333,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.30833333333333335
79
+ },
80
+ "language": {
81
+ "Chinese": 0.4135726150175988,
82
+ "English": 0.4773525464357322
83
+ }
84
+ },
85
+ "pass@1": 0.22066666666666668,
86
+ "BoN-2": {
87
+ "overall_metric": 0.5272106840251494,
88
+ "token_length": {
89
+ "8k": 0.6646954568347576,
90
+ "16k": 0.5904928422287453,
91
+ "32k": 0.5839842384318158,
92
+ "64k": 0.5063967419452635,
93
+ "128k": 0.43057993780384596,
94
+ "256k": 0.38711488690647056
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.49180621097136906,
98
+ "Partial": 0.5722709224572338
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.7548882368495022,
102
+ "Moderate": 0.4956196438725791,
103
+ "Hard": 0.4286236931227538,
104
+ "Extreme": 0.3633035715559389
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.7734553050177705,
108
+ "T2. Sequencing & Structure Reconstruction": 0.7497803122803125,
109
+ "T3. Evidence-Grounded QA": 0.5833333333333334,
110
+ "T4. Summarization & Synthesis": 0.5045962797540592,
111
+ "T5. Attribution & Citation Alignment": 0.4402260724918136,
112
+ "T6. Aggregation & Clustering": 0.5442968593297539,
113
+ "T7. Consistency & Compliance Checking": 0.271121809138241,
114
+ "T8. Structured & Numeric Reasoning": 0.5342592592592592,
115
+ "T9. Version & Code Diff Analysis": 0.5242253558461997,
116
+ "T10. Rule Induction & In-Context Learning": 0.5733333333333334,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667
118
+ },
119
+ "language": {
120
+ "Chinese": 0.5001287057888837,
121
+ "English": 0.5542926622614154
122
+ }
123
+ },
124
+ "pass@2": 0.288,
125
+ "BoN-3": {
126
+ "overall_metric": 0.5630326206114822,
127
+ "token_length": {
128
+ "8k": 0.696307151429279,
129
+ "16k": 0.6363828018704142,
130
+ "32k": 0.6080757988876709,
131
+ "64k": 0.5447446048393001,
132
+ "128k": 0.4651876442546681,
133
+ "256k": 0.4274977223875639
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.5231691902067062,
137
+ "Partial": 0.6137678956721072
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.7981617653401918,
141
+ "Moderate": 0.5378309814958724,
142
+ "Hard": 0.4581227829612033,
143
+ "Extreme": 0.3909189138526289
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.7979486004255343,
147
+ "T2. Sequencing & Structure Reconstruction": 0.787096468346468,
148
+ "T3. Evidence-Grounded QA": 0.6416666666666667,
149
+ "T4. Summarization & Synthesis": 0.5114673228538231,
150
+ "T5. Attribution & Citation Alignment": 0.4791973901175429,
151
+ "T6. Aggregation & Clustering": 0.5794154259483205,
152
+ "T7. Consistency & Compliance Checking": 0.29817240493883673,
153
+ "T8. Structured & Numeric Reasoning": 0.5800925925925926,
154
+ "T9. Version & Code Diff Analysis": 0.5598440073472041,
155
+ "T10. Rule Induction & In-Context Learning": 0.6325000000000001,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666665
157
+ },
158
+ "language": {
159
+ "Chinese": 0.543096347456943,
160
+ "English": 0.5829688937660222
161
+ }
162
+ },
163
+ "pass@3": 0.316
164
+ }
results/Gemini-2.5-Flash/nonthinking_context-1000000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.5591861836855936,
9
+ "inference_iteration_1_overall_metric": 0.555222408533961,
10
+ "inference_iteration_2_overall_metric": 0.5555746542742924,
11
+ "inference_iteration_3_overall_metric": 0.5667614882485269,
12
+ "average_token_length_metric": {
13
+ "8k": 0.5794836437092291,
14
+ "16k": 0.585038678968723,
15
+ "32k": 0.5764993408909757,
16
+ "64k": 0.5298001757287436,
17
+ "128k": 0.5583690767328653,
18
+ "256k": 0.5259261860830253
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.5219144924948039,
22
+ "Partial": 0.6066228815647814
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.6655203056614667,
26
+ "Moderate": 0.5398880938056573,
27
+ "Hard": 0.5786822600966999,
28
+ "Extreme": 0.4425719452767774
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.7771168693642845,
32
+ "T2. Sequencing & Structure Reconstruction": 0.8032048969548965,
33
+ "T3. Evidence-Grounded QA": 0.6055555555555554,
34
+ "T4. Summarization & Synthesis": 0.5340660905787081,
35
+ "T5. Attribution & Citation Alignment": 0.745788551044203,
36
+ "T6. Aggregation & Clustering": 0.5023487328603856,
37
+ "T7. Consistency & Compliance Checking": 0.4407587176859518,
38
+ "T8. Structured & Numeric Reasoning": 0.2706790123456789,
39
+ "T9. Version & Code Diff Analysis": 0.7292026752712853,
40
+ "T10. Rule Induction & In-Context Learning": 0.5269907407407407,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.44722222222222224
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.5654483346944664,
45
+ "English": 0.5529240326767215
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.555222408533961,
49
+ "token_length": {
50
+ "8k": 0.5888731703826585,
51
+ "16k": 0.5734020363631814,
52
+ "32k": 0.5716727282141728,
53
+ "64k": 0.5201046130976303,
54
+ "128k": 0.5511174700611993,
55
+ "256k": 0.5261644330849247
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.5109299072772209,
59
+ "Partial": 0.6115946828607216
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.6581097400970598,
63
+ "Moderate": 0.5327009022057297,
64
+ "Hard": 0.5733517044448927,
65
+ "Extreme": 0.4453988971639295
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.7847565381129994,
69
+ "T2. Sequencing & Structure Reconstruction": 0.8209740259740258,
70
+ "T3. Evidence-Grounded QA": 0.5833333333333334,
71
+ "T4. Summarization & Synthesis": 0.5367973042699341,
72
+ "T5. Attribution & Citation Alignment": 0.7270779373385174,
73
+ "T6. Aggregation & Clustering": 0.49367147369310305,
74
+ "T7. Consistency & Compliance Checking": 0.444842452883972,
75
+ "T8. Structured & Numeric Reasoning": 0.27037037037037037,
76
+ "T9. Version & Code Diff Analysis": 0.7099867444467561,
77
+ "T10. Rule Induction & In-Context Learning": 0.522361111111111,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666665
79
+ },
80
+ "language": {
81
+ "Chinese": 0.5642007220552895,
82
+ "English": 0.5462440950126328
83
+ }
84
+ },
85
+ "pass@1": 0.2846666666666667,
86
+ "BoN-2": {
87
+ "overall_metric": 0.600215470701642,
88
+ "token_length": {
89
+ "8k": 0.6308006025002828,
90
+ "16k": 0.6218713098115006,
91
+ "32k": 0.6209629478481974,
92
+ "64k": 0.5627834642545474,
93
+ "128k": 0.6034759308063585,
94
+ "256k": 0.5613985689889686
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.5620277169409751,
98
+ "Partial": 0.6488180663970381
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.7092035834801983,
102
+ "Moderate": 0.562063871268018,
103
+ "Hard": 0.6309834472774163,
104
+ "Extreme": 0.4856736448985854
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.8017289450670718,
108
+ "T2. Sequencing & Structure Reconstruction": 0.8388936988936988,
109
+ "T3. Evidence-Grounded QA": 0.6833333333333333,
110
+ "T4. Summarization & Synthesis": 0.5495701538780565,
111
+ "T5. Attribution & Citation Alignment": 0.7927380382693102,
112
+ "T6. Aggregation & Clustering": 0.5444118952178385,
113
+ "T7. Consistency & Compliance Checking": 0.5074368060401414,
114
+ "T8. Structured & Numeric Reasoning": 0.3101851851851852,
115
+ "T9. Version & Code Diff Analysis": 0.751572829108757,
116
+ "T10. Rule Induction & In-Context Learning": 0.5584722222222223,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334
118
+ },
119
+ "language": {
120
+ "Chinese": 0.6052369717987128,
121
+ "English": 0.5951939696045724
122
+ }
123
+ },
124
+ "pass@2": 0.32666666666666666,
125
+ "BoN-3": {
126
+ "overall_metric": 0.6297638749448518,
127
+ "token_length": {
128
+ "8k": 0.6514500993442873,
129
+ "16k": 0.6579228544698497,
130
+ "32k": 0.6569374765913272,
131
+ "64k": 0.5879896058573842,
132
+ "128k": 0.6298609557348528,
133
+ "256k": 0.5944222576714168
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.5933694722646927,
137
+ "Partial": 0.6760840238105122
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.7491261498971669,
141
+ "Moderate": 0.6049484175447234,
142
+ "Hard": 0.6548738677798389,
143
+ "Extreme": 0.49881447206374147
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.8141979383533716,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8490187590187587,
148
+ "T3. Evidence-Grounded QA": 0.7083333333333334,
149
+ "T4. Summarization & Synthesis": 0.5562735797066128,
150
+ "T5. Attribution & Citation Alignment": 0.8231149616655856,
151
+ "T6. Aggregation & Clustering": 0.5745805749141115,
152
+ "T7. Consistency & Compliance Checking": 0.5414909010178753,
153
+ "T8. Structured & Numeric Reasoning": 0.3379629629629629,
154
+ "T9. Version & Code Diff Analysis": 0.7854748730572404,
155
+ "T10. Rule Induction & In-Context Learning": 0.6129166666666667,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666
157
+ },
158
+ "language": {
159
+ "Chinese": 0.6363071790468551,
160
+ "English": 0.6232205708428518
161
+ }
162
+ },
163
+ "pass@3": 0.3606666666666667
164
+ }
results/Gemini-2.5-Flash/thinking_context-1000000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.674056323049449,
9
+ "inference_iteration_1_overall_metric": 0.6759983703704746,
10
+ "inference_iteration_2_overall_metric": 0.6798720821649146,
11
+ "inference_iteration_3_overall_metric": 0.6662985166129568,
12
+ "average_token_length_metric": {
13
+ "8k": 0.7135890855897297,
14
+ "16k": 0.6856610849454701,
15
+ "32k": 0.6818807100418771,
16
+ "64k": 0.7027829296522448,
17
+ "128k": 0.6399411623659867,
18
+ "256k": 0.6204829657013889
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.63656677094683,
22
+ "Partial": 0.7217702984527835
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.7982292314606038,
26
+ "Moderate": 0.7239074886730374,
27
+ "Hard": 0.7218614768527792,
28
+ "Extreme": 0.47388809993909664
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.8642581912430608,
32
+ "T2. Sequencing & Structure Reconstruction": 0.877125374625374,
33
+ "T3. Evidence-Grounded QA": 0.6638888888888889,
34
+ "T4. Summarization & Synthesis": 0.545007270213042,
35
+ "T5. Attribution & Citation Alignment": 0.7904062945014397,
36
+ "T6. Aggregation & Clustering": 0.6528080258554949,
37
+ "T7. Consistency & Compliance Checking": 0.5102049505643657,
38
+ "T8. Structured & Numeric Reasoning": 0.6658950617283949,
39
+ "T9. Version & Code Diff Analysis": 0.8004985540165189,
40
+ "T10. Rule Induction & In-Context Learning": 0.605046296296296,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5361111111111112
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.6759423962737819,
45
+ "English": 0.6721702498251175
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.6759983703704746,
49
+ "token_length": {
50
+ "8k": 0.7102710318952188,
51
+ "16k": 0.6883832260010031,
52
+ "32k": 0.6744900827350674,
53
+ "64k": 0.7119047863552107,
54
+ "128k": 0.6560068019942544,
55
+ "256k": 0.6149342932420981
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.6308759268961498,
59
+ "Partial": 0.733426934792345
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.8006604241201102,
63
+ "Moderate": 0.7377405126394252,
64
+ "Hard": 0.7224536765391667,
65
+ "Extreme": 0.46837070558456173
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.8740026641040577,
69
+ "T2. Sequencing & Structure Reconstruction": 0.8621174196174196,
70
+ "T3. Evidence-Grounded QA": 0.675,
71
+ "T4. Summarization & Synthesis": 0.5455538913362303,
72
+ "T5. Attribution & Citation Alignment": 0.8061189254081019,
73
+ "T6. Aggregation & Clustering": 0.6554340790288137,
74
+ "T7. Consistency & Compliance Checking": 0.5169546620879129,
75
+ "T8. Structured & Numeric Reasoning": 0.6708333333333333,
76
+ "T9. Version & Code Diff Analysis": 0.7797147286011522,
77
+ "T10. Rule Induction & In-Context Learning": 0.5926388888888889,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.55
79
+ },
80
+ "language": {
81
+ "Chinese": 0.6768990117658723,
82
+ "English": 0.6750977289750792
83
+ }
84
+ },
85
+ "pass@1": 0.4493333333333333,
86
+ "BoN-2": {
87
+ "overall_metric": 0.7479747877310018,
88
+ "token_length": {
89
+ "8k": 0.7719367863380663,
90
+ "16k": 0.7848557527755085,
91
+ "32k": 0.7435010923393187,
92
+ "64k": 0.7892992664474747,
93
+ "128k": 0.7226763130932595,
94
+ "256k": 0.6755795153923868
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.720714483291413,
98
+ "Partial": 0.782669720654116
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.8637385553396211,
102
+ "Moderate": 0.8121373016562715,
103
+ "Hard": 0.8141424576405533,
104
+ "Extreme": 0.535557607922782
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.9027833860231022,
108
+ "T2. Sequencing & Structure Reconstruction": 0.9054959854959853,
109
+ "T3. Evidence-Grounded QA": 0.7666666666666667,
110
+ "T4. Summarization & Synthesis": 0.5582670711283242,
111
+ "T5. Attribution & Citation Alignment": 0.8550801223307993,
112
+ "T6. Aggregation & Clustering": 0.7273381143811253,
113
+ "T7. Consistency & Compliance Checking": 0.6357469182183698,
114
+ "T8. Structured & Numeric Reasoning": 0.7416666666666667,
115
+ "T9. Version & Code Diff Analysis": 0.8585696216489627,
116
+ "T10. Rule Induction & In-Context Learning": 0.712361111111111,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6333333333333333
118
+ },
119
+ "language": {
120
+ "Chinese": 0.7535034625567544,
121
+ "English": 0.74244611290525
122
+ }
123
+ },
124
+ "pass@2": 0.5313333333333333,
125
+ "BoN-3": {
126
+ "overall_metric": 0.7790736538060705,
127
+ "token_length": {
128
+ "8k": 0.800474143876513,
129
+ "16k": 0.8203674165938436,
130
+ "32k": 0.767107753488941,
131
+ "64k": 0.8124208427670737,
132
+ "128k": 0.743086283346713,
133
+ "256k": 0.7309854827633406
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.7557801236733843,
137
+ "Partial": 0.808719964884035
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.8954381880917556,
141
+ "Moderate": 0.8399060564631339,
142
+ "Hard": 0.848259677254879,
143
+ "Extreme": 0.5662031295553962
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.9242311389541524,
147
+ "T2. Sequencing & Structure Reconstruction": 0.9315938690938689,
148
+ "T3. Evidence-Grounded QA": 0.7916666666666666,
149
+ "T4. Summarization & Synthesis": 0.5695550592103452,
150
+ "T5. Attribution & Citation Alignment": 0.8809133228802023,
151
+ "T6. Aggregation & Clustering": 0.7566450728547504,
152
+ "T7. Consistency & Compliance Checking": 0.662115306165705,
153
+ "T8. Structured & Numeric Reasoning": 0.7824074074074074,
154
+ "T9. Version & Code Diff Analysis": 0.8867644916844079,
155
+ "T10. Rule Induction & In-Context Learning": 0.7519444444444443,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.7
157
+ },
158
+ "language": {
159
+ "Chinese": 0.7890668786410705,
160
+ "English": 0.7690804289710705
161
+ }
162
+ },
163
+ "pass@3": 0.5786666666666667
164
+ }
results/Gemini-2.5-Pro/thinking_context-1000000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.7342184707317124,
9
+ "inference_iteration_1_overall_metric": 0.7402405885346022,
10
+ "inference_iteration_2_overall_metric": 0.7288378446496467,
11
+ "inference_iteration_3_overall_metric": 0.7335769790108894,
12
+ "average_token_length_metric": {
13
+ "8k": 0.7449778241967657,
14
+ "16k": 0.7478649041506191,
15
+ "32k": 0.7530566835243759,
16
+ "64k": 0.7417918268320294,
17
+ "128k": 0.6999601003776742,
18
+ "256k": 0.7176594853088111
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.7006912685258201,
22
+ "Partial": 0.7768894553573948
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.8440057387459964,
26
+ "Moderate": 0.819848501651939,
27
+ "Hard": 0.8102915033262061,
28
+ "Extreme": 0.5077419967802616
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.900910721502263,
32
+ "T2. Sequencing & Structure Reconstruction": 0.9242053162886497,
33
+ "T3. Evidence-Grounded QA": 0.6500000000000001,
34
+ "T4. Summarization & Synthesis": 0.5430214244860422,
35
+ "T5. Attribution & Citation Alignment": 0.8428063760922413,
36
+ "T6. Aggregation & Clustering": 0.7039837824498163,
37
+ "T7. Consistency & Compliance Checking": 0.6274987753728497,
38
+ "T8. Structured & Numeric Reasoning": 0.7824074074074073,
39
+ "T9. Version & Code Diff Analysis": 0.873498394228399,
40
+ "T10. Rule Induction & In-Context Learning": 0.683564814814815,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5888888888888888
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.7449054000919034,
45
+ "English": 0.7235315413715225
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.7402405885346022,
49
+ "token_length": {
50
+ "8k": 0.7531611316917413,
51
+ "16k": 0.7524897292361332,
52
+ "32k": 0.759794274989024,
53
+ "64k": 0.7435484076033682,
54
+ "128k": 0.6968353298720406,
55
+ "256k": 0.7356146578153082
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.6997313831727073,
59
+ "Partial": 0.7917977589951957
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.8496811955485003,
63
+ "Moderate": 0.8519813060901164,
64
+ "Hard": 0.7987384600115967,
65
+ "Extreme": 0.5085375776002995
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.8998469455079252,
69
+ "T2. Sequencing & Structure Reconstruction": 0.9226606264106262,
70
+ "T3. Evidence-Grounded QA": 0.7083333333333334,
71
+ "T4. Summarization & Synthesis": 0.5451740468187636,
72
+ "T5. Attribution & Citation Alignment": 0.8298635734563725,
73
+ "T6. Aggregation & Clustering": 0.6845419570109973,
74
+ "T7. Consistency & Compliance Checking": 0.6246870318062281,
75
+ "T8. Structured & Numeric Reasoning": 0.7995370370370369,
76
+ "T9. Version & Code Diff Analysis": 0.8931464590407817,
77
+ "T10. Rule Induction & In-Context Learning": 0.6825,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6083333333333333
79
+ },
80
+ "language": {
81
+ "Chinese": 0.7482902342281601,
82
+ "English": 0.7321909428410444
83
+ }
84
+ },
85
+ "pass@1": 0.5366666666666666,
86
+ "BoN-2": {
87
+ "overall_metric": 0.7920257143166666,
88
+ "token_length": {
89
+ "8k": 0.8033887159307824,
90
+ "16k": 0.7898793857734576,
91
+ "32k": 0.8117863707215878,
92
+ "64k": 0.7964277126734358,
93
+ "128k": 0.7607965077861976,
94
+ "256k": 0.7898755930145346
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.7559933032110803,
98
+ "Partial": 0.8378851466328657
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.8992430876416225,
102
+ "Moderate": 0.8976673772472249,
103
+ "Hard": 0.8677753697095277,
104
+ "Extreme": 0.5554328394573533
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.9332256652370721,
108
+ "T2. Sequencing & Structure Reconstruction": 0.9412599437599435,
109
+ "T3. Evidence-Grounded QA": 0.775,
110
+ "T4. Summarization & Synthesis": 0.5583020906699263,
111
+ "T5. Attribution & Citation Alignment": 0.8863658130468474,
112
+ "T6. Aggregation & Clustering": 0.7465324819181888,
113
+ "T7. Consistency & Compliance Checking": 0.7073416442539386,
114
+ "T8. Structured & Numeric Reasoning": 0.8560185185185186,
115
+ "T9. Version & Code Diff Analysis": 0.9071622825418993,
116
+ "T10. Rule Induction & In-Context Learning": 0.7341666666666666,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.7
118
+ },
119
+ "language": {
120
+ "Chinese": 0.795156398417639,
121
+ "English": 0.7888950302156931
122
+ }
123
+ },
124
+ "pass@2": 0.6133333333333333,
125
+ "BoN-3": {
126
+ "overall_metric": 0.8133867657039734,
127
+ "token_length": {
128
+ "8k": 0.8126699858808082,
129
+ "16k": 0.8091582494700531,
130
+ "32k": 0.8359194281957039,
131
+ "64k": 0.816367429901851,
132
+ "128k": 0.7927970026749335,
133
+ "256k": 0.8134084981004824
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.7838454845628277,
137
+ "Partial": 0.85098475988361
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.9143269591461097,
141
+ "Moderate": 0.9126631081599108,
142
+ "Hard": 0.9014414032827288,
143
+ "Extreme": 0.5797689782741141
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.9453427087326437,
147
+ "T2. Sequencing & Structure Reconstruction": 0.9562455137455134,
148
+ "T3. Evidence-Grounded QA": 0.7833333333333333,
149
+ "T4. Summarization & Synthesis": 0.5633383564725131,
150
+ "T5. Attribution & Citation Alignment": 0.9113749401192192,
151
+ "T6. Aggregation & Clustering": 0.7887683935896461,
152
+ "T7. Consistency & Compliance Checking": 0.7334094152440016,
153
+ "T8. Structured & Numeric Reasoning": 0.8671296296296297,
154
+ "T9. Version & Code Diff Analysis": 0.922905227868178,
155
+ "T10. Rule Induction & In-Context Learning": 0.7925,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.7083333333333334
157
+ },
158
+ "language": {
159
+ "Chinese": 0.8203398785387466,
160
+ "English": 0.8064336528691968
161
+ }
162
+ },
163
+ "pass@3": 0.6466666666666666
164
+ }
results/Gemma-3-12B-It/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.3215537579100566,
9
+ "inference_iteration_1_overall_metric": 0.32433034575226716,
10
+ "inference_iteration_2_overall_metric": 0.31844270123543494,
11
+ "inference_iteration_3_overall_metric": 0.3218882267424678,
12
+ "average_token_length_metric": {
13
+ "8k": 0.3884654162699951,
14
+ "16k": 0.3447777819230472,
15
+ "32k": 0.34075916239810233,
16
+ "64k": 0.2830133627729955,
17
+ "128k": 0.2903427416083741,
18
+ "256k": 0.2819640824878266
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.2967079738599621,
22
+ "Partial": 0.3531756648829045
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.43662085782547294,
26
+ "Moderate": 0.23390254455699586,
27
+ "Hard": 0.30432253509954527,
28
+ "Extreme": 0.26439167130106106
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.5521425232273246,
32
+ "T2. Sequencing & Structure Reconstruction": 0.6018784456284454,
33
+ "T3. Evidence-Grounded QA": 0.39722222222222225,
34
+ "T4. Summarization & Synthesis": 0.4889337252719512,
35
+ "T5. Attribution & Citation Alignment": 0.17289417311274363,
36
+ "T6. Aggregation & Clustering": 0.31608511775757386,
37
+ "T7. Consistency & Compliance Checking": 0.18810666022578687,
38
+ "T8. Structured & Numeric Reasoning": 0.06064814814814815,
39
+ "T9. Version & Code Diff Analysis": 0.34441506928983295,
40
+ "T10. Rule Induction & In-Context Learning": 0.39800925925925923,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.21666666666666667
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.31275985343364654,
45
+ "English": 0.3303476623864672
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.32433034575226716,
49
+ "token_length": {
50
+ "8k": 0.3906271090949922,
51
+ "16k": 0.3561137033219127,
52
+ "32k": 0.34838703683305916,
53
+ "64k": 0.2733930672728249,
54
+ "128k": 0.295394704679806,
55
+ "256k": 0.2820664533110089
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.299184653750444,
59
+ "Partial": 0.35633395375458804
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.4375571061543292,
63
+ "Moderate": 0.2456864337252399,
64
+ "Hard": 0.3046160927997635,
65
+ "Extreme": 0.26489900749156403
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.5440288685497278,
69
+ "T2. Sequencing & Structure Reconstruction": 0.595296185296185,
70
+ "T3. Evidence-Grounded QA": 0.4166666666666667,
71
+ "T4. Summarization & Synthesis": 0.4882012721729454,
72
+ "T5. Attribution & Citation Alignment": 0.16495676080926253,
73
+ "T6. Aggregation & Clustering": 0.3232791449049618,
74
+ "T7. Consistency & Compliance Checking": 0.1944817769562575,
75
+ "T8. Structured & Numeric Reasoning": 0.0699074074074074,
76
+ "T9. Version & Code Diff Analysis": 0.345977074505613,
77
+ "T10. Rule Induction & In-Context Learning": 0.4008333333333333,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.21666666666666667
79
+ },
80
+ "language": {
81
+ "Chinese": 0.3144798192828198,
82
+ "English": 0.334180872221715
83
+ }
84
+ },
85
+ "pass@1": 0.11133333333333334,
86
+ "BoN-2": {
87
+ "overall_metric": 0.3467523917671754,
88
+ "token_length": {
89
+ "8k": 0.4142271322547509,
90
+ "16k": 0.377736900046965,
91
+ "32k": 0.37302728004656277,
92
+ "64k": 0.3062912069169447,
93
+ "128k": 0.3073157553107305,
94
+ "256k": 0.3019160760270996
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.32135117390700496,
98
+ "Partial": 0.3790812144983023
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.46615306988638716,
102
+ "Moderate": 0.26112276416188485,
103
+ "Hard": 0.32737280433310617,
104
+ "Extreme": 0.28492633788743743
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.5650465241334548,
108
+ "T2. Sequencing & Structure Reconstruction": 0.6249731287231288,
109
+ "T3. Evidence-Grounded QA": 0.43333333333333335,
110
+ "T4. Summarization & Synthesis": 0.5032712591907067,
111
+ "T5. Attribution & Citation Alignment": 0.20597827766959523,
112
+ "T6. Aggregation & Clustering": 0.346419862594183,
113
+ "T7. Consistency & Compliance Checking": 0.21497503949007407,
114
+ "T8. Structured & Numeric Reasoning": 0.07962962962962963,
115
+ "T9. Version & Code Diff Analysis": 0.3894322431353122,
116
+ "T10. Rule Induction & In-Context Learning": 0.4174999999999999,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.23333333333333334
118
+ },
119
+ "language": {
120
+ "Chinese": 0.3387201257962417,
121
+ "English": 0.3547846577381098
122
+ }
123
+ },
124
+ "pass@2": 0.122,
125
+ "BoN-3": {
126
+ "overall_metric": 0.36208312298536466,
127
+ "token_length": {
128
+ "8k": 0.4291254746141897,
129
+ "16k": 0.3819529993839447,
130
+ "32k": 0.3886685107360013,
131
+ "64k": 0.3331907556566436,
132
+ "128k": 0.3232918491379485,
133
+ "256k": 0.3162691483834608
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.33738791539283736,
137
+ "Partial": 0.39351338719403606
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.4781807777959498,
141
+ "Moderate": 0.2766679371680117,
142
+ "Hard": 0.3532458860700918,
143
+ "Extreme": 0.29681012423769865
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.5905198790888515,
147
+ "T2. Sequencing & Structure Reconstruction": 0.6396154771154773,
148
+ "T3. Evidence-Grounded QA": 0.43333333333333335,
149
+ "T4. Summarization & Synthesis": 0.5090709574388708,
150
+ "T5. Attribution & Citation Alignment": 0.21374662530636385,
151
+ "T6. Aggregation & Clustering": 0.3778424215167421,
152
+ "T7. Consistency & Compliance Checking": 0.2211746842442865,
153
+ "T8. Structured & Numeric Reasoning": 0.09351851851851851,
154
+ "T9. Version & Code Diff Analysis": 0.4124771063926183,
155
+ "T10. Rule Induction & In-Context Learning": 0.43847222222222215,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25
157
+ },
158
+ "language": {
159
+ "Chinese": 0.35324378584530425,
160
+ "English": 0.3709224601254252
161
+ }
162
+ },
163
+ "pass@3": 0.12733333333333333
164
+ }
results/Gemma-3-12B-It/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.3191893284499226,
9
+ "inference_iteration_1_overall_metric": 0.3198032509438655,
10
+ "inference_iteration_2_overall_metric": 0.3185140430839788,
11
+ "inference_iteration_3_overall_metric": 0.3192506913219244,
12
+ "average_token_length_metric": {
13
+ "8k": 0.39039471003518295,
14
+ "16k": 0.36666548253930176,
15
+ "32k": 0.3454075697967419,
16
+ "64k": 0.2965559300528977,
17
+ "128k": 0.2634390883568726,
18
+ "256k": 0.25267318991854026
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.29023833980013936,
22
+ "Partial": 0.3560360412769198
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.45478741131761163,
26
+ "Moderate": 0.2260826308323087,
27
+ "Hard": 0.28024203424188526,
28
+ "Extreme": 0.2573832550303484
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.5500567032378852,
32
+ "T2. Sequencing & Structure Reconstruction": 0.5750085972908512,
33
+ "T3. Evidence-Grounded QA": 0.275,
34
+ "T4. Summarization & Synthesis": 0.4560070506311603,
35
+ "T5. Attribution & Citation Alignment": 0.1618052577444313,
36
+ "T6. Aggregation & Clustering": 0.3308854760173438,
37
+ "T7. Consistency & Compliance Checking": 0.16988798875914354,
38
+ "T8. Structured & Numeric Reasoning": 0.254783950617284,
39
+ "T9. Version & Code Diff Analysis": 0.3307362069623843,
40
+ "T10. Rule Induction & In-Context Learning": 0.3245833333333333,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.18333333333333338
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.29411514963064544,
45
+ "English": 0.3442635072692001
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.3198032509438655,
49
+ "token_length": {
50
+ "8k": 0.3780018581730685,
51
+ "16k": 0.3712141189933461,
52
+ "32k": 0.32728165209503113,
53
+ "64k": 0.30571589667403015,
54
+ "128k": 0.2747567763160667,
55
+ "256k": 0.2618492034116515
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.29216027329955657,
59
+ "Partial": 0.3549852224911681
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.45511408988195684,
63
+ "Moderate": 0.23953366795757702,
64
+ "Hard": 0.27643841537644914,
65
+ "Extreme": 0.25278171138445865
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.5667485461728832,
69
+ "T2. Sequencing & Structure Reconstruction": 0.562965737965738,
70
+ "T3. Evidence-Grounded QA": 0.25,
71
+ "T4. Summarization & Synthesis": 0.4542471570812729,
72
+ "T5. Attribution & Citation Alignment": 0.17077813789682644,
73
+ "T6. Aggregation & Clustering": 0.331058456491463,
74
+ "T7. Consistency & Compliance Checking": 0.17425550039116783,
75
+ "T8. Structured & Numeric Reasoning": 0.25555555555555554,
76
+ "T9. Version & Code Diff Analysis": 0.34802456680209815,
77
+ "T10. Rule Induction & In-Context Learning": 0.32847222222222217,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.175
79
+ },
80
+ "language": {
81
+ "Chinese": 0.29038552736401346,
82
+ "English": 0.34922097452371764
83
+ }
84
+ },
85
+ "pass@1": 0.11466666666666667,
86
+ "BoN-2": {
87
+ "overall_metric": 0.38355638163031974,
88
+ "token_length": {
89
+ "8k": 0.4753389925367892,
90
+ "16k": 0.42098100793684207,
91
+ "32k": 0.4193716441650546,
92
+ "64k": 0.35186094157335357,
93
+ "128k": 0.3173777879671563,
94
+ "256k": 0.3164079156027232
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.3475360182865405,
98
+ "Partial": 0.4294004804314932
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.5523314770092703,
102
+ "Moderate": 0.2771664695259602,
103
+ "Hard": 0.3369385202392175,
104
+ "Extreme": 0.29916922382926153
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.6227581461675343,
108
+ "T2. Sequencing & Structure Reconstruction": 0.6340152902652901,
109
+ "T3. Evidence-Grounded QA": 0.35,
110
+ "T4. Summarization & Synthesis": 0.47044783692403785,
111
+ "T5. Attribution & Citation Alignment": 0.21419149989714506,
112
+ "T6. Aggregation & Clustering": 0.40641717668678456,
113
+ "T7. Consistency & Compliance Checking": 0.22278173964383363,
114
+ "T8. Structured & Numeric Reasoning": 0.31851851851851853,
115
+ "T9. Version & Code Diff Analysis": 0.41479917818461753,
116
+ "T10. Rule Induction & In-Context Learning": 0.4166666666666667,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25
118
+ },
119
+ "language": {
120
+ "Chinese": 0.35221914805982085,
121
+ "English": 0.41489361520081863
122
+ }
123
+ },
124
+ "pass@2": 0.154,
125
+ "BoN-3": {
126
+ "overall_metric": 0.41783270358538355,
127
+ "token_length": {
128
+ "8k": 0.5097501627706587,
129
+ "16k": 0.4551451198013833,
130
+ "32k": 0.4565584105132001,
131
+ "64k": 0.39691938688026035,
132
+ "128k": 0.34453029473355373,
133
+ "256k": 0.3440928468132463
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.37691399280812576,
137
+ "Partial": 0.46991106275643846
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.6081488352407031,
141
+ "Moderate": 0.3030799752726802,
142
+ "Hard": 0.35694868313688616,
143
+ "Extreme": 0.32471144207202707
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.6492859989190713,
147
+ "T2. Sequencing & Structure Reconstruction": 0.673747440830774,
148
+ "T3. Evidence-Grounded QA": 0.4083333333333333,
149
+ "T4. Summarization & Synthesis": 0.47953981981487215,
150
+ "T5. Attribution & Citation Alignment": 0.23614408266148504,
151
+ "T6. Aggregation & Clustering": 0.44385472207531035,
152
+ "T7. Consistency & Compliance Checking": 0.24389274305473457,
153
+ "T8. Structured & Numeric Reasoning": 0.3634259259259259,
154
+ "T9. Version & Code Diff Analysis": 0.4490980326738026,
155
+ "T10. Rule Induction & In-Context Learning": 0.4583333333333333,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2916666666666667
157
+ },
158
+ "language": {
159
+ "Chinese": 0.3946655531503397,
160
+ "English": 0.440999854020427
161
+ }
162
+ },
163
+ "pass@3": 0.174
164
+ }
results/Gemma-3-27B-It/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.3613898319544999,
9
+ "inference_iteration_1_overall_metric": 0.35966281601422384,
10
+ "inference_iteration_2_overall_metric": 0.3610482900444428,
11
+ "inference_iteration_3_overall_metric": 0.3634583898048339,
12
+ "average_token_length_metric": {
13
+ "8k": 0.43644157643949566,
14
+ "16k": 0.3804621509069283,
15
+ "32k": 0.39249485549033103,
16
+ "64k": 0.3508346036478247,
17
+ "128k": 0.30224089028156714,
18
+ "256k": 0.30586491496085444
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.33698292855454914,
22
+ "Partial": 0.3924531635544373
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.49956833239118315,
26
+ "Moderate": 0.2504305317277634,
27
+ "Hard": 0.3319623187177785,
28
+ "Extreme": 0.30223017713736006
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.6186817382099091,
32
+ "T2. Sequencing & Structure Reconstruction": 0.6828640898869156,
33
+ "T3. Evidence-Grounded QA": 0.475,
34
+ "T4. Summarization & Synthesis": 0.4863291170840516,
35
+ "T5. Attribution & Citation Alignment": 0.20718550263597674,
36
+ "T6. Aggregation & Clustering": 0.35878702296933646,
37
+ "T7. Consistency & Compliance Checking": 0.19534470405785112,
38
+ "T8. Structured & Numeric Reasoning": 0.11558641975308641,
39
+ "T9. Version & Code Diff Analysis": 0.3981518981106548,
40
+ "T10. Rule Induction & In-Context Learning": 0.37236111111111114,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.27222222222222225
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.3481327108034932,
45
+ "English": 0.3746469531055068
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.35966281601422384,
49
+ "token_length": {
50
+ "8k": 0.43768168643426314,
51
+ "16k": 0.37280230082410293,
52
+ "32k": 0.39131018375907956,
53
+ "64k": 0.3439379322622389,
54
+ "128k": 0.30914303161909973,
55
+ "256k": 0.30310176118655724
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.33431999039428895,
59
+ "Partial": 0.39191732134868573
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.49522975795181967,
63
+ "Moderate": 0.24738817424396956,
64
+ "Hard": 0.33414580218141465,
65
+ "Extreme": 0.30165945795823235
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.6149553491444276,
69
+ "T2. Sequencing & Structure Reconstruction": 0.6777184081350744,
70
+ "T3. Evidence-Grounded QA": 0.4666666666666667,
71
+ "T4. Summarization & Synthesis": 0.48496676951441003,
72
+ "T5. Attribution & Citation Alignment": 0.2117639703302691,
73
+ "T6. Aggregation & Clustering": 0.3687808030960292,
74
+ "T7. Consistency & Compliance Checking": 0.19168084686723558,
75
+ "T8. Structured & Numeric Reasoning": 0.1189814814814815,
76
+ "T9. Version & Code Diff Analysis": 0.3861048947753823,
77
+ "T10. Rule Induction & In-Context Learning": 0.35944444444444446,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.275
79
+ },
80
+ "language": {
81
+ "Chinese": 0.34703709850451614,
82
+ "English": 0.3722885335239309
83
+ }
84
+ },
85
+ "pass@1": 0.13066666666666665,
86
+ "BoN-2": {
87
+ "overall_metric": 0.38518154623771866,
88
+ "token_length": {
89
+ "8k": 0.45699608191277485,
90
+ "16k": 0.3970070268223878,
91
+ "32k": 0.4155514974235712,
92
+ "64k": 0.3795777424428599,
93
+ "128k": 0.3342775351649673,
94
+ "256k": 0.32767939365975096
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.36193530889736725,
98
+ "Partial": 0.4147676664890755
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.5260736620896884,
102
+ "Moderate": 0.2746052566038148,
103
+ "Hard": 0.3586645680646715,
104
+ "Extreme": 0.32088598840944504
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.6391494300203041,
108
+ "T2. Sequencing & Structure Reconstruction": 0.6911707890874557,
109
+ "T3. Evidence-Grounded QA": 0.49166666666666664,
110
+ "T4. Summarization & Synthesis": 0.4979131525726192,
111
+ "T5. Attribution & Citation Alignment": 0.24413521215425465,
112
+ "T6. Aggregation & Clustering": 0.3897620242575365,
113
+ "T7. Consistency & Compliance Checking": 0.2135542070257249,
114
+ "T8. Structured & Numeric Reasoning": 0.14675925925925926,
115
+ "T9. Version & Code Diff Analysis": 0.4478430638786239,
116
+ "T10. Rule Induction & In-Context Learning": 0.39444444444444443,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2833333333333333
118
+ },
119
+ "language": {
120
+ "Chinese": 0.37305464561658386,
121
+ "English": 0.3973084468588535
122
+ }
123
+ },
124
+ "pass@2": 0.144,
125
+ "BoN-3": {
126
+ "overall_metric": 0.4011866600471999,
127
+ "token_length": {
128
+ "8k": 0.4633867051191647,
129
+ "16k": 0.42281662150014376,
130
+ "32k": 0.44096928608538427,
131
+ "64k": 0.39220811203028316,
132
+ "128k": 0.34412857203690767,
133
+ "256k": 0.34361066351131797
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.37845160215956763,
137
+ "Partial": 0.4301221882678235
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.5455848419176232,
141
+ "Moderate": 0.2912498230785689,
142
+ "Hard": 0.36825828865136306,
143
+ "Extreme": 0.33684148176489936
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.654875485368947,
147
+ "T2. Sequencing & Structure Reconstruction": 0.7061728240894906,
148
+ "T3. Evidence-Grounded QA": 0.5166666666666667,
149
+ "T4. Summarization & Synthesis": 0.5049097313487289,
150
+ "T5. Attribution & Citation Alignment": 0.2565335955530918,
151
+ "T6. Aggregation & Clustering": 0.4143461915069394,
152
+ "T7. Consistency & Compliance Checking": 0.22258770429716687,
153
+ "T8. Structured & Numeric Reasoning": 0.175462962962963,
154
+ "T9. Version & Code Diff Analysis": 0.4681907705235851,
155
+ "T10. Rule Induction & In-Context Learning": 0.40555555555555556,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2833333333333333
157
+ },
158
+ "language": {
159
+ "Chinese": 0.3916620761944417,
160
+ "English": 0.410711243899959
161
+ }
162
+ },
163
+ "pass@3": 0.15533333333333332
164
+ }
results/Gemma-3-27B-It/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.37338415281793874,
9
+ "inference_iteration_1_overall_metric": 0.3756110685938797,
10
+ "inference_iteration_2_overall_metric": 0.3716321861397887,
11
+ "inference_iteration_3_overall_metric": 0.3729092037201496,
12
+ "average_token_length_metric": {
13
+ "8k": 0.44812577930836095,
14
+ "16k": 0.4266217475899872,
15
+ "32k": 0.4074453646105579,
16
+ "64k": 0.35662526806956907,
17
+ "128k": 0.2952141304786102,
18
+ "256k": 0.3062726268505501
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.3443713298159222,
22
+ "Partial": 0.4103095639114165
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.5780767692142667,
26
+ "Moderate": 0.24533089723723267,
27
+ "Hard": 0.3056384367420397,
28
+ "Extreme": 0.27775702033096106
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.6037064651554387,
32
+ "T2. Sequencing & Structure Reconstruction": 0.6278458897510606,
33
+ "T3. Evidence-Grounded QA": 0.3361111111111111,
34
+ "T4. Summarization & Synthesis": 0.45719209902963875,
35
+ "T5. Attribution & Citation Alignment": 0.23234121031762375,
36
+ "T6. Aggregation & Clustering": 0.38387242742350736,
37
+ "T7. Consistency & Compliance Checking": 0.18133975282134737,
38
+ "T8. Structured & Numeric Reasoning": 0.3114197530864198,
39
+ "T9. Version & Code Diff Analysis": 0.44895353115875314,
40
+ "T10. Rule Induction & In-Context Learning": 0.35731481481481475,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.28888888888888886
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.3378195347928154,
45
+ "English": 0.40894877084306375
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.3756110685938797,
49
+ "token_length": {
50
+ "8k": 0.4545820286350837,
51
+ "16k": 0.4352872653228386,
52
+ "32k": 0.3950079365533934,
53
+ "64k": 0.35311614333477187,
54
+ "128k": 0.3114655730243775,
55
+ "256k": 0.3042074646928149
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.35375594987355935,
59
+ "Partial": 0.4034266742379251
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.5844518111059833,
63
+ "Moderate": 0.23662855463799365,
64
+ "Hard": 0.3096811226362121,
65
+ "Extreme": 0.281425757285206
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.6068670183757786,
69
+ "T2. Sequencing & Structure Reconstruction": 0.6163485749654042,
70
+ "T3. Evidence-Grounded QA": 0.36666666666666664,
71
+ "T4. Summarization & Synthesis": 0.4560619845050425,
72
+ "T5. Attribution & Citation Alignment": 0.22005868934189987,
73
+ "T6. Aggregation & Clustering": 0.39305351710005704,
74
+ "T7. Consistency & Compliance Checking": 0.1715251657008684,
75
+ "T8. Structured & Numeric Reasoning": 0.33888888888888885,
76
+ "T9. Version & Code Diff Analysis": 0.44004517714509755,
77
+ "T10. Rule Induction & In-Context Learning": 0.35888888888888887,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.275
79
+ },
80
+ "language": {
81
+ "Chinese": 0.34160224855312066,
82
+ "English": 0.40961988863463966
83
+ }
84
+ },
85
+ "pass@1": 0.15533333333333332,
86
+ "BoN-2": {
87
+ "overall_metric": 0.4363206399942735,
88
+ "token_length": {
89
+ "8k": 0.5234602086730173,
90
+ "16k": 0.5018004637617909,
91
+ "32k": 0.46794475339715547,
92
+ "64k": 0.4185517955545234,
93
+ "128k": 0.3516627837551589,
94
+ "256k": 0.354503834823997
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.4063870314951086,
98
+ "Partial": 0.4744179599023025
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.6923421245761895,
102
+ "Moderate": 0.2843297701377906,
103
+ "Hard": 0.35026505438423683,
104
+ "Extreme": 0.31221398104277637
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.6567331296259676,
108
+ "T2. Sequencing & Structure Reconstruction": 0.6726985976985973,
109
+ "T3. Evidence-Grounded QA": 0.43333333333333335,
110
+ "T4. Summarization & Synthesis": 0.4711965663860271,
111
+ "T5. Attribution & Citation Alignment": 0.27489752350046465,
112
+ "T6. Aggregation & Clustering": 0.44484952313131765,
113
+ "T7. Consistency & Compliance Checking": 0.22198381203531814,
114
+ "T8. Structured & Numeric Reasoning": 0.40925925925925927,
115
+ "T9. Version & Code Diff Analysis": 0.5240655133007459,
116
+ "T10. Rule Induction & In-Context Learning": 0.4486111111111111,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.35833333333333334
118
+ },
119
+ "language": {
120
+ "Chinese": 0.3992628958468014,
121
+ "English": 0.47337838414174643
122
+ }
123
+ },
124
+ "pass@2": 0.19933333333333333,
125
+ "BoN-3": {
126
+ "overall_metric": 0.4678951184844386,
127
+ "token_length": {
128
+ "8k": 0.5595081038285018,
129
+ "16k": 0.5288508385707865,
130
+ "32k": 0.5128049001652387,
131
+ "64k": 0.45281870495664023,
132
+ "128k": 0.37498216355710723,
133
+ "256k": 0.3784059998283628
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.4378657173135933,
137
+ "Partial": 0.5061143563382442
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.7390417273273124,
141
+ "Moderate": 0.31763207699520324,
142
+ "Hard": 0.37625789112044933,
143
+ "Extreme": 0.3297508281124345
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.6917909626535441,
147
+ "T2. Sequencing & Structure Reconstruction": 0.7037670200170199,
148
+ "T3. Evidence-Grounded QA": 0.475,
149
+ "T4. Summarization & Synthesis": 0.4763487195551507,
150
+ "T5. Attribution & Citation Alignment": 0.30687631368745927,
151
+ "T6. Aggregation & Clustering": 0.4772314892310187,
152
+ "T7. Consistency & Compliance Checking": 0.24817243122652502,
153
+ "T8. Structured & Numeric Reasoning": 0.4217592592592592,
154
+ "T9. Version & Code Diff Analysis": 0.5559667511226697,
155
+ "T10. Rule Induction & In-Context Learning": 0.5098611111111111,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333
157
+ },
158
+ "language": {
159
+ "Chinese": 0.4238794242578244,
160
+ "English": 0.511910812711055
161
+ }
162
+ },
163
+ "pass@3": 0.22333333333333333
164
+ }
results/Gemma-3-4B-It/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.2175748220994214,
9
+ "inference_iteration_1_overall_metric": 0.21852109706784154,
10
+ "inference_iteration_2_overall_metric": 0.2163322668515703,
11
+ "inference_iteration_3_overall_metric": 0.21787110237885274,
12
+ "average_token_length_metric": {
13
+ "8k": 0.24656523663590132,
14
+ "16k": 0.2205604877341683,
15
+ "32k": 0.23963284248634728,
16
+ "64k": 0.21111513028758372,
17
+ "128k": 0.19343593120899555,
18
+ "256k": 0.19413930424353257
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.21444597727329182,
22
+ "Partial": 0.22155698824176823
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.28179521332096,
26
+ "Moderate": 0.15821172453341914,
27
+ "Hard": 0.20704875252188354,
28
+ "Extreme": 0.19312877831692504
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.4236351950773965,
32
+ "T2. Sequencing & Structure Reconstruction": 0.45359587924627603,
33
+ "T3. Evidence-Grounded QA": 0.3194444444444444,
34
+ "T4. Summarization & Synthesis": 0.443681734303759,
35
+ "T5. Attribution & Citation Alignment": 0.042786026910229404,
36
+ "T6. Aggregation & Clustering": 0.19637703429305803,
37
+ "T7. Consistency & Compliance Checking": 0.09142599749178396,
38
+ "T8. Structured & Numeric Reasoning": 0.02438271604938272,
39
+ "T9. Version & Code Diff Analysis": 0.1616430041389551,
40
+ "T10. Rule Induction & In-Context Learning": 0.22050925925925927,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.18611111111111114
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.20889372279815283,
45
+ "English": 0.22625592140069015
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.21852109706784154,
49
+ "token_length": {
50
+ "8k": 0.2476866042969573,
51
+ "16k": 0.21620176526480195,
52
+ "32k": 0.24753627633784483,
53
+ "64k": 0.21130221395252485,
54
+ "128k": 0.19685176433002144,
55
+ "256k": 0.19154795822489784
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.21588524494794425,
59
+ "Partial": 0.22187581794771047
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.28399652990881596,
63
+ "Moderate": 0.1580374343562769,
64
+ "Hard": 0.20857790675358978,
65
+ "Extreme": 0.19305337410218493
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.4309006915611223,
69
+ "T2. Sequencing & Structure Reconstruction": 0.4447710584938352,
70
+ "T3. Evidence-Grounded QA": 0.3333333333333333,
71
+ "T4. Summarization & Synthesis": 0.441308668504869,
72
+ "T5. Attribution & Citation Alignment": 0.03940722221903115,
73
+ "T6. Aggregation & Clustering": 0.20337572553598166,
74
+ "T7. Consistency & Compliance Checking": 0.08406090392051238,
75
+ "T8. Structured & Numeric Reasoning": 0.02361111111111111,
76
+ "T9. Version & Code Diff Analysis": 0.16341557282886326,
77
+ "T10. Rule Induction & In-Context Learning": 0.22013888888888886,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668
79
+ },
80
+ "language": {
81
+ "Chinese": 0.2120533839166827,
82
+ "English": 0.22498881021900005
83
+ }
84
+ },
85
+ "pass@1": 0.06733333333333333,
86
+ "BoN-2": {
87
+ "overall_metric": 0.23271347684478086,
88
+ "token_length": {
89
+ "8k": 0.2631313909667352,
90
+ "16k": 0.2312324535272521,
91
+ "32k": 0.25978977734748787,
92
+ "64k": 0.23161875208541116,
93
+ "128k": 0.20537856042145283,
94
+ "256k": 0.20512992672034427
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.2292600969188946,
98
+ "Partial": 0.23710868765954476
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.2946012095946107,
102
+ "Moderate": 0.1735404680186527,
103
+ "Hard": 0.22252519869057716,
104
+ "Extreme": 0.21047603309909335
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.4536230189628364,
108
+ "T2. Sequencing & Structure Reconstruction": 0.4791046371928724,
109
+ "T3. Evidence-Grounded QA": 0.3333333333333333,
110
+ "T4. Summarization & Synthesis": 0.4555393638488407,
111
+ "T5. Attribution & Citation Alignment": 0.04458180082240233,
112
+ "T6. Aggregation & Clustering": 0.22301925498410874,
113
+ "T7. Consistency & Compliance Checking": 0.10512521699205389,
114
+ "T8. Structured & Numeric Reasoning": 0.029166666666666667,
115
+ "T9. Version & Code Diff Analysis": 0.18246404287967255,
116
+ "T10. Rule Induction & In-Context Learning": 0.23263888888888887,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668
118
+ },
119
+ "language": {
120
+ "Chinese": 0.22667394776839053,
121
+ "English": 0.2387530059211707
122
+ }
123
+ },
124
+ "pass@2": 0.06866666666666667,
125
+ "BoN-3": {
126
+ "overall_metric": 0.2401600102211412,
127
+ "token_length": {
128
+ "8k": 0.26882845096050667,
129
+ "16k": 0.24012842113745536,
130
+ "32k": 0.2657505147595882,
131
+ "64k": 0.237575165952952,
132
+ "128k": 0.21797743491880378,
133
+ "256k": 0.21070007359754095
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.23644784183701267,
137
+ "Partial": 0.24488458816457748
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.3019876106242685,
141
+ "Moderate": 0.17780290261890433,
142
+ "Hard": 0.22967603723970634,
143
+ "Extreme": 0.22027403043562677
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.4596998605405389,
147
+ "T2. Sequencing & Structure Reconstruction": 0.5016641610023963,
148
+ "T3. Evidence-Grounded QA": 0.3333333333333333,
149
+ "T4. Summarization & Synthesis": 0.4669275398584054,
150
+ "T5. Attribution & Citation Alignment": 0.051513017753619265,
151
+ "T6. Aggregation & Clustering": 0.23601198455578568,
152
+ "T7. Consistency & Compliance Checking": 0.11322244187918541,
153
+ "T8. Structured & Numeric Reasoning": 0.03148148148148148,
154
+ "T9. Version & Code Diff Analysis": 0.19070502006795878,
155
+ "T10. Rule Induction & In-Context Learning": 0.23541666666666666,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668
157
+ },
158
+ "language": {
159
+ "Chinese": 0.23450699509250633,
160
+ "English": 0.24581302534977606
161
+ }
162
+ },
163
+ "pass@3": 0.07
164
+ }
results/Gemma-3-4B-It/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.2119885030064203,
9
+ "inference_iteration_1_overall_metric": 0.21437249924782262,
10
+ "inference_iteration_2_overall_metric": 0.21469684223951344,
11
+ "inference_iteration_3_overall_metric": 0.20689616753192464,
12
+ "average_token_length_metric": {
13
+ "8k": 0.24366425705090342,
14
+ "16k": 0.2312288563166909,
15
+ "32k": 0.24934489050979397,
16
+ "64k": 0.17455165550407764,
17
+ "128k": 0.18287496802077124,
18
+ "256k": 0.19026639063628464
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.20450724123760522,
22
+ "Partial": 0.221510108894003
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.28661876230483024,
26
+ "Moderate": 0.13848484214026374,
27
+ "Hard": 0.1987191002737751,
28
+ "Extreme": 0.18722857209328425
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.4122355140839005,
32
+ "T2. Sequencing & Structure Reconstruction": 0.4238909558076955,
33
+ "T3. Evidence-Grounded QA": 0.2361111111111111,
34
+ "T4. Summarization & Synthesis": 0.4241502347645632,
35
+ "T5. Attribution & Citation Alignment": 0.04263401505989564,
36
+ "T6. Aggregation & Clustering": 0.23701628551906095,
37
+ "T7. Consistency & Compliance Checking": 0.07531749025982754,
38
+ "T8. Structured & Numeric Reasoning": 0.10648148148148145,
39
+ "T9. Version & Code Diff Analysis": 0.15631527456623664,
40
+ "T10. Rule Induction & In-Context Learning": 0.20962962962962953,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.11666666666666664
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.19120313153416005,
45
+ "English": 0.23277387447868045
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.21437249924782262,
49
+ "token_length": {
50
+ "8k": 0.260899815427902,
51
+ "16k": 0.2556087799887465,
52
+ "32k": 0.23432924859855547,
53
+ "64k": 0.1786455712537597,
54
+ "128k": 0.1740181124720787,
55
+ "256k": 0.18273346774589402
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.2082323257071287,
59
+ "Partial": 0.22218726557234228
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.28518994612968396,
63
+ "Moderate": 0.1458906399979874,
64
+ "Hard": 0.19668309266803527,
65
+ "Extreme": 0.19339405931078624
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.44068453349253894,
69
+ "T2. Sequencing & Structure Reconstruction": 0.42484177261436434,
70
+ "T3. Evidence-Grounded QA": 0.25,
71
+ "T4. Summarization & Synthesis": 0.42183521824061226,
72
+ "T5. Attribution & Citation Alignment": 0.03826565166477447,
73
+ "T6. Aggregation & Clustering": 0.25040130008880007,
74
+ "T7. Consistency & Compliance Checking": 0.08179456674127736,
75
+ "T8. Structured & Numeric Reasoning": 0.08703703703703704,
76
+ "T9. Version & Code Diff Analysis": 0.16976304211815604,
77
+ "T10. Rule Induction & In-Context Learning": 0.19708333333333336,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.10833333333333334
79
+ },
80
+ "language": {
81
+ "Chinese": 0.19621205437007752,
82
+ "English": 0.23253294412556783
83
+ }
84
+ },
85
+ "pass@1": 0.06333333333333334,
86
+ "BoN-2": {
87
+ "overall_metric": 0.2636576810412479,
88
+ "token_length": {
89
+ "8k": 0.3079135549113116,
90
+ "16k": 0.28734590159692097,
91
+ "32k": 0.30413774794877135,
92
+ "64k": 0.21523254007725492,
93
+ "128k": 0.22804447155407118,
94
+ "256k": 0.23927187015915638
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.250251500957686,
98
+ "Partial": 0.28072009205669
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.364020302669296,
102
+ "Moderate": 0.1823512513190994,
103
+ "Hard": 0.23524288009416136,
104
+ "Extreme": 0.22578189022008766
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.5017988282740397,
108
+ "T2. Sequencing & Structure Reconstruction": 0.5025411389881287,
109
+ "T3. Evidence-Grounded QA": 0.275,
110
+ "T4. Summarization & Synthesis": 0.44235758157965793,
111
+ "T5. Attribution & Citation Alignment": 0.07123185213097494,
112
+ "T6. Aggregation & Clustering": 0.3076327414869081,
113
+ "T7. Consistency & Compliance Checking": 0.11226263632567846,
114
+ "T8. Structured & Numeric Reasoning": 0.14583333333333334,
115
+ "T9. Version & Code Diff Analysis": 0.2253096564350266,
116
+ "T10. Rule Induction & In-Context Learning": 0.26222222222222225,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.16666666666666666
118
+ },
119
+ "language": {
120
+ "Chinese": 0.24369127122121384,
121
+ "English": 0.28362409086128176
122
+ }
123
+ },
124
+ "pass@2": 0.086,
125
+ "BoN-3": {
126
+ "overall_metric": 0.29217739130674947,
127
+ "token_length": {
128
+ "8k": 0.3390735712300384,
129
+ "16k": 0.3084943145981662,
130
+ "32k": 0.331141630537573,
131
+ "64k": 0.24932737796255805,
132
+ "128k": 0.25196083809503234,
133
+ "256k": 0.27306661541712973
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.2733927975152606,
137
+ "Partial": 0.3160850561322812
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.4041733526262008,
141
+ "Moderate": 0.2194670736422517,
142
+ "Hard": 0.2580927963130063,
143
+ "Extreme": 0.23963574676642313
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.5332538419210622,
147
+ "T2. Sequencing & Structure Reconstruction": 0.5533578363062058,
148
+ "T3. Evidence-Grounded QA": 0.3416666666666667,
149
+ "T4. Summarization & Synthesis": 0.4493150256833664,
150
+ "T5. Attribution & Citation Alignment": 0.09079534419446701,
151
+ "T6. Aggregation & Clustering": 0.3434824981209785,
152
+ "T7. Consistency & Compliance Checking": 0.12257916371054065,
153
+ "T8. Structured & Numeric Reasoning": 0.16805555555555557,
154
+ "T9. Version & Code Diff Analysis": 0.24931951714865616,
155
+ "T10. Rule Induction & In-Context Learning": 0.30833333333333335,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.175
157
+ },
158
+ "language": {
159
+ "Chinese": 0.27088663804629004,
160
+ "English": 0.3134681445672092
161
+ }
162
+ },
163
+ "pass@3": 0.102
164
+ }
results/Kimi-K2-Instruct-0905/nonthinking_context-224000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 67,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.5009443422920304,
9
+ "inference_iteration_1_overall_metric": 0.5011015308802983,
10
+ "inference_iteration_2_overall_metric": 0.49751406897312744,
11
+ "inference_iteration_3_overall_metric": 0.5042174270226657,
12
+ "average_token_length_metric": {
13
+ "8k": 0.5193469215810047,
14
+ "16k": 0.5532046525085649,
15
+ "32k": 0.5393076869166767,
16
+ "64k": 0.45954315717941974,
17
+ "128k": 0.4753071835553842,
18
+ "256k": 0.4589564520111373
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.45716785858755954,
22
+ "Partial": 0.5566598670068132
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.6491770551060967,
26
+ "Moderate": 0.4905460752618018,
27
+ "Hard": 0.43431147544571097,
28
+ "Extreme": 0.3960536599333797
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.7424451818013483,
32
+ "T2. Sequencing & Structure Reconstruction": 0.7428685203685202,
33
+ "T3. Evidence-Grounded QA": 0.5472222222222223,
34
+ "T4. Summarization & Synthesis": 0.5115863734748296,
35
+ "T5. Attribution & Citation Alignment": 0.5310286936858898,
36
+ "T6. Aggregation & Clustering": 0.481867796853936,
37
+ "T7. Consistency & Compliance Checking": 0.36661627375742456,
38
+ "T8. Structured & Numeric Reasoning": 0.24089506172839517,
39
+ "T9. Version & Code Diff Analysis": 0.607908662662019,
40
+ "T10. Rule Induction & In-Context Learning": 0.5085648148148147,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.43611111111111117
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.502898779221276,
45
+ "English": 0.4989899053627864
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.5011015308802983,
49
+ "token_length": {
50
+ "8k": 0.5266204875905753,
51
+ "16k": 0.5431095505457598,
52
+ "32k": 0.5392474254502099,
53
+ "64k": 0.46556965866611255,
54
+ "128k": 0.48161604654304363,
55
+ "256k": 0.45044601648609367
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.45533929299371073,
59
+ "Partial": 0.5593443790995934
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.6458118886638174,
63
+ "Moderate": 0.4926159658473186,
64
+ "Hard": 0.42892012991547307,
65
+ "Extreme": 0.4021536227747844
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.734408014297419,
69
+ "T2. Sequencing & Structure Reconstruction": 0.7401580826580821,
70
+ "T3. Evidence-Grounded QA": 0.5666666666666667,
71
+ "T4. Summarization & Synthesis": 0.5077956681433179,
72
+ "T5. Attribution & Citation Alignment": 0.5196800787679438,
73
+ "T6. Aggregation & Clustering": 0.4941189417411527,
74
+ "T7. Consistency & Compliance Checking": 0.3706991980056276,
75
+ "T8. Structured & Numeric Reasoning": 0.22268518518518515,
76
+ "T9. Version & Code Diff Analysis": 0.6228334158501364,
77
+ "T10. Rule Induction & In-Context Learning": 0.5243055555555556,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667
79
+ },
80
+ "language": {
81
+ "Chinese": 0.49773496712013804,
82
+ "English": 0.5044680946404603
83
+ }
84
+ },
85
+ "pass@1": 0.23,
86
+ "BoN-2": {
87
+ "overall_metric": 0.5483962332490746,
88
+ "token_length": {
89
+ "8k": 0.5725846797858735,
90
+ "16k": 0.6070506360109902,
91
+ "32k": 0.5785761030801342,
92
+ "64k": 0.5092135264066221,
93
+ "128k": 0.5131800223023555,
94
+ "256k": 0.5097724319084751
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.49887561173193595,
98
+ "Partial": 0.6114224788163434
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.7046240276873015,
102
+ "Moderate": 0.5462638174512109,
103
+ "Hard": 0.4707831873116643,
104
+ "Extreme": 0.4364568187575384
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.7623386988134085,
108
+ "T2. Sequencing & Structure Reconstruction": 0.7700949513449513,
109
+ "T3. Evidence-Grounded QA": 0.6,
110
+ "T4. Summarization & Synthesis": 0.5249959487118497,
111
+ "T5. Attribution & Citation Alignment": 0.5953546052259285,
112
+ "T6. Aggregation & Clustering": 0.5412045721601162,
113
+ "T7. Consistency & Compliance Checking": 0.41849900851913713,
114
+ "T8. Structured & Numeric Reasoning": 0.28935185185185186,
115
+ "T9. Version & Code Diff Analysis": 0.6584466738317519,
116
+ "T10. Rule Induction & In-Context Learning": 0.5701388888888889,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5
118
+ },
119
+ "language": {
120
+ "Chinese": 0.5438314702987085,
121
+ "English": 0.552960996199442
122
+ }
123
+ },
124
+ "pass@2": 0.27666666666666667,
125
+ "BoN-3": {
126
+ "overall_metric": 0.5729921291255787,
127
+ "token_length": {
128
+ "8k": 0.5918227174634976,
129
+ "16k": 0.6198567950677695,
130
+ "32k": 0.6115768945303457,
131
+ "64k": 0.5284625404433138,
132
+ "128k": 0.558149430962295,
133
+ "256k": 0.528084396286257
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.524120078602167,
137
+ "Partial": 0.6351929207008328
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.728216785648972,
141
+ "Moderate": 0.5755547819370206,
142
+ "Hard": 0.5066701999617829,
143
+ "Extreme": 0.45151519711603216
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.7860047126136753,
147
+ "T2. Sequencing & Structure Reconstruction": 0.7853528878528878,
148
+ "T3. Evidence-Grounded QA": 0.625,
149
+ "T4. Summarization & Synthesis": 0.5322270993764735,
150
+ "T5. Attribution & Citation Alignment": 0.6208079019292253,
151
+ "T6. Aggregation & Clustering": 0.5652450687006129,
152
+ "T7. Consistency & Compliance Checking": 0.44884599567602773,
153
+ "T8. Structured & Numeric Reasoning": 0.31157407407407406,
154
+ "T9. Version & Code Diff Analysis": 0.6765946379547446,
155
+ "T10. Rule Induction & In-Context Learning": 0.5895833333333333,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5583333333333333
157
+ },
158
+ "language": {
159
+ "Chinese": 0.5755180331103958,
160
+ "English": 0.5704662251407642
161
+ }
162
+ },
163
+ "pass@3": 0.3
164
+ }
results/Kimi-K2-Instruct-0905/thinking_context-224000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 69,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.5553060678788313,
9
+ "inference_iteration_1_overall_metric": 0.558917739810572,
10
+ "inference_iteration_2_overall_metric": 0.5552262066724464,
11
+ "inference_iteration_3_overall_metric": 0.5517742571534756,
12
+ "average_token_length_metric": {
13
+ "8k": 0.5978532013581613,
14
+ "16k": 0.5816609532803436,
15
+ "32k": 0.5872894997726004,
16
+ "64k": 0.5360933501085343,
17
+ "128k": 0.522886026665569,
18
+ "256k": 0.5060533760877814
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.5076499747745465,
22
+ "Partial": 0.6159592772842868
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.7729188134795828,
26
+ "Moderate": 0.5733088402612271,
27
+ "Hard": 0.4375074213815535,
28
+ "Extreme": 0.38246894115073926
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.758377055109974,
32
+ "T2. Sequencing & Structure Reconstruction": 0.7423694091857211,
33
+ "T3. Evidence-Grounded QA": 0.4861111111111112,
34
+ "T4. Summarization & Synthesis": 0.5011656658098056,
35
+ "T5. Attribution & Citation Alignment": 0.6197584764672828,
36
+ "T6. Aggregation & Clustering": 0.5164556923382113,
37
+ "T7. Consistency & Compliance Checking": 0.3547519606397262,
38
+ "T8. Structured & Numeric Reasoning": 0.5962962962962963,
39
+ "T9. Version & Code Diff Analysis": 0.6299270957790389,
40
+ "T10. Rule Induction & In-Context Learning": 0.5606944444444444,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666654
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.5409680916435047,
45
+ "English": 0.5696440441141594
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.558917739810572,
49
+ "token_length": {
50
+ "8k": 0.6032704706738693,
51
+ "16k": 0.5808397170448323,
52
+ "32k": 0.5927696772272222,
53
+ "64k": 0.5485389223926213,
54
+ "128k": 0.5293584568340762,
55
+ "256k": 0.49872919469081073
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.5058707162683681,
59
+ "Partial": 0.6264321334097424
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.7833558783440772,
63
+ "Moderate": 0.5768808845916201,
64
+ "Hard": 0.4381739958885578,
65
+ "Extreme": 0.3805641270346401
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.7528903747933631,
69
+ "T2. Sequencing & Structure Reconstruction": 0.7690795456301782,
70
+ "T3. Evidence-Grounded QA": 0.525,
71
+ "T4. Summarization & Synthesis": 0.5001505956406523,
72
+ "T5. Attribution & Citation Alignment": 0.6016113058386018,
73
+ "T6. Aggregation & Clustering": 0.5066860396628982,
74
+ "T7. Consistency & Compliance Checking": 0.37783095340307493,
75
+ "T8. Structured & Numeric Reasoning": 0.5875000000000001,
76
+ "T9. Version & Code Diff Analysis": 0.6056866583526187,
77
+ "T10. Rule Induction & In-Context Learning": 0.5740277777777777,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.45
79
+ },
80
+ "language": {
81
+ "Chinese": 0.5344377355706056,
82
+ "English": 0.5833977440505398
83
+ }
84
+ },
85
+ "pass@1": 0.30733333333333335,
86
+ "BoN-2": {
87
+ "overall_metric": 0.6315444476656525,
88
+ "token_length": {
89
+ "8k": 0.6765293955798586,
90
+ "16k": 0.6653958948787116,
91
+ "32k": 0.6614274068144226,
92
+ "64k": 0.6099720675525434,
93
+ "128k": 0.5887552831784975,
94
+ "256k": 0.5871866379898839
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.5906087497944075,
98
+ "Partial": 0.6836444267745122
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.8546630775122501,
102
+ "Moderate": 0.6784971908255039,
103
+ "Hard": 0.5110824387985737,
104
+ "Extreme": 0.4354103526732191
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.7926072573330768,
108
+ "T2. Sequencing & Structure Reconstruction": 0.814456423206423,
109
+ "T3. Evidence-Grounded QA": 0.5666666666666667,
110
+ "T4. Summarization & Synthesis": 0.5156065007848037,
111
+ "T5. Attribution & Citation Alignment": 0.705413217864198,
112
+ "T6. Aggregation & Clustering": 0.6176306197741449,
113
+ "T7. Consistency & Compliance Checking": 0.42951044567324476,
114
+ "T8. Structured & Numeric Reasoning": 0.6861111111111111,
115
+ "T9. Version & Code Diff Analysis": 0.7066217095721875,
116
+ "T10. Rule Induction & In-Context Learning": 0.6680555555555556,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.525
118
+ },
119
+ "language": {
120
+ "Chinese": 0.617638270540212,
121
+ "English": 0.6454506247910954
122
+ }
123
+ },
124
+ "pass@2": 0.37066666666666664,
125
+ "BoN-3": {
126
+ "overall_metric": 0.6596591269299542,
127
+ "token_length": {
128
+ "8k": 0.6994761842119754,
129
+ "16k": 0.6880103332979634,
130
+ "32k": 0.6831628614701334,
131
+ "64k": 0.6414281237947528,
132
+ "128k": 0.6223896794908902,
133
+ "256k": 0.6234875793140215
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.6186943779973499,
137
+ "Partial": 0.7117960801169108
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.865163305432102,
141
+ "Moderate": 0.7223592192252923,
142
+ "Hard": 0.5480741090923853,
143
+ "Extreme": 0.4666471483928459
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.8175679720885881,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8413347300847297,
148
+ "T3. Evidence-Grounded QA": 0.6,
149
+ "T4. Summarization & Synthesis": 0.5241839616339089,
150
+ "T5. Attribution & Citation Alignment": 0.7286969407202896,
151
+ "T6. Aggregation & Clustering": 0.6482898910584165,
152
+ "T7. Consistency & Compliance Checking": 0.47389920059132556,
153
+ "T8. Structured & Numeric Reasoning": 0.7092592592592593,
154
+ "T9. Version & Code Diff Analysis": 0.7448385112889893,
155
+ "T10. Rule Induction & In-Context Learning": 0.7002777777777779,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666
157
+ },
158
+ "language": {
159
+ "Chinese": 0.646426154128056,
160
+ "English": 0.6728920997318576
161
+ }
162
+ },
163
+ "pass@3": 0.4013333333333333
164
+ }
results/Llama-3.1-405B-Instruct/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.4006972406362581,
9
+ "inference_iteration_1_overall_metric": 0.4033767470362484,
10
+ "inference_iteration_2_overall_metric": 0.4037400979033875,
11
+ "inference_iteration_3_overall_metric": 0.3949748769691391,
12
+ "average_token_length_metric": {
13
+ "8k": 0.495611810737427,
14
+ "16k": 0.47999448108480186,
15
+ "32k": 0.4902670612376324,
16
+ "64k": 0.40596651011726403,
17
+ "128k": 0.2929513776114342,
18
+ "256k": 0.23939220302898997
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.3708303649326616,
22
+ "Partial": 0.4387096278953802
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.5544944910188809,
26
+ "Moderate": 0.2906684222492303,
27
+ "Hard": 0.35510805093665054,
28
+ "Extreme": 0.33443208075583397
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.7010722319989281,
32
+ "T2. Sequencing & Structure Reconstruction": 0.6852361434861434,
33
+ "T3. Evidence-Grounded QA": 0.39722222222222225,
34
+ "T4. Summarization & Synthesis": 0.49860889459519425,
35
+ "T5. Attribution & Citation Alignment": 0.33280185377001925,
36
+ "T6. Aggregation & Clustering": 0.4288546372402087,
37
+ "T7. Consistency & Compliance Checking": 0.22662521432331084,
38
+ "T8. Structured & Numeric Reasoning": 0.1651234567901234,
39
+ "T9. Version & Code Diff Analysis": 0.4037951252761809,
40
+ "T10. Rule Induction & In-Context Learning": 0.43962962962962965,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3194444444444445
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.38105191405127925,
45
+ "English": 0.42034256722123686
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.4033767470362484,
49
+ "token_length": {
50
+ "8k": 0.5193261381744707,
51
+ "16k": 0.47458776670121805,
52
+ "32k": 0.4893823156329057,
53
+ "64k": 0.41443272045065593,
54
+ "128k": 0.28371722198812693,
55
+ "256k": 0.2388143192701119
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.3743223052410306,
59
+ "Partial": 0.44035512750288913
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.5734527544487493,
63
+ "Moderate": 0.2799762087674356,
64
+ "Hard": 0.35650549807929216,
65
+ "Extreme": 0.3289038173440243
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.6946832479137959,
69
+ "T2. Sequencing & Structure Reconstruction": 0.6867736892736891,
70
+ "T3. Evidence-Grounded QA": 0.4083333333333333,
71
+ "T4. Summarization & Synthesis": 0.5005294746426308,
72
+ "T5. Attribution & Citation Alignment": 0.3263529995384955,
73
+ "T6. Aggregation & Clustering": 0.4192752041935995,
74
+ "T7. Consistency & Compliance Checking": 0.23609519970144652,
75
+ "T8. Structured & Numeric Reasoning": 0.18101851851851852,
76
+ "T9. Version & Code Diff Analysis": 0.391786542964143,
77
+ "T10. Rule Induction & In-Context Learning": 0.47083333333333327,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.30833333333333335
79
+ },
80
+ "language": {
81
+ "Chinese": 0.3889859691853749,
82
+ "English": 0.41776752488712193
83
+ }
84
+ },
85
+ "pass@1": 0.16266666666666665,
86
+ "BoN-2": {
87
+ "overall_metric": 0.44522764896517386,
88
+ "token_length": {
89
+ "8k": 0.558048061407528,
90
+ "16k": 0.5158119021911939,
91
+ "32k": 0.530639639804942,
92
+ "64k": 0.45350021661205264,
93
+ "128k": 0.3440588360232635,
94
+ "256k": 0.26930723775206405
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.40918538567791846,
98
+ "Partial": 0.49109962042168165
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.6186902100237465,
102
+ "Moderate": 0.32736405593203205,
103
+ "Hard": 0.39034570869637836,
104
+ "Extreme": 0.36866970508796515
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.7336012615268711,
108
+ "T2. Sequencing & Structure Reconstruction": 0.737358012358012,
109
+ "T3. Evidence-Grounded QA": 0.45,
110
+ "T4. Summarization & Synthesis": 0.5147616333746151,
111
+ "T5. Attribution & Citation Alignment": 0.3674427224531845,
112
+ "T6. Aggregation & Clustering": 0.4743354751511784,
113
+ "T7. Consistency & Compliance Checking": 0.26985986799607303,
114
+ "T8. Structured & Numeric Reasoning": 0.20462962962962963,
115
+ "T9. Version & Code Diff Analysis": 0.44727785652000346,
116
+ "T10. Rule Induction & In-Context Learning": 0.5249999999999999,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.36666666666666664
118
+ },
119
+ "language": {
120
+ "Chinese": 0.433626244820898,
121
+ "English": 0.4568290531094507
122
+ }
123
+ },
124
+ "pass@2": 0.19,
125
+ "BoN-3": {
126
+ "overall_metric": 0.4623473432019363,
127
+ "token_length": {
128
+ "8k": 0.5703117586774046,
129
+ "16k": 0.531567048206503,
130
+ "32k": 0.5504750677715584,
131
+ "64k": 0.47066633396240054,
132
+ "128k": 0.3715170610120738,
133
+ "256k": 0.27954678958167684
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.42433070289568703,
137
+ "Partial": 0.510732158137163
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.6297066688011407,
141
+ "Moderate": 0.34570653296360615,
142
+ "Hard": 0.4161126844074583,
143
+ "Extreme": 0.3859923786829926
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.7453735974615648,
147
+ "T2. Sequencing & Structure Reconstruction": 0.7443156843156841,
148
+ "T3. Evidence-Grounded QA": 0.45,
149
+ "T4. Summarization & Synthesis": 0.52546578139621,
150
+ "T5. Attribution & Citation Alignment": 0.4112102181587475,
151
+ "T6. Aggregation & Clustering": 0.5005878653860251,
152
+ "T7. Consistency & Compliance Checking": 0.2880921279803352,
153
+ "T8. Structured & Numeric Reasoning": 0.22685185185185183,
154
+ "T9. Version & Code Diff Analysis": 0.4880120741980102,
155
+ "T10. Rule Induction & In-Context Learning": 0.5249999999999999,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.36666666666666664
157
+ },
158
+ "language": {
159
+ "Chinese": 0.4464982468640505,
160
+ "English": 0.47819643953982244
161
+ }
162
+ },
163
+ "pass@3": 0.20266666666666666
164
+ }
results/Llama-3.1-405B-Instruct/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.40659333298471173,
9
+ "inference_iteration_1_overall_metric": 0.3990307184980547,
10
+ "inference_iteration_2_overall_metric": 0.40746318832781453,
11
+ "inference_iteration_3_overall_metric": 0.4132860921282645,
12
+ "average_token_length_metric": {
13
+ "8k": 0.5237859741060642,
14
+ "16k": 0.517961066700275,
15
+ "32k": 0.4641044292483723,
16
+ "64k": 0.4182090511525332,
17
+ "128k": 0.260073356838944,
18
+ "256k": 0.25542611986208036
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.3729971231698154,
22
+ "Partial": 0.44935214547639707
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.613595833316379,
26
+ "Moderate": 0.2921863382975652,
27
+ "Hard": 0.3409469495726698,
28
+ "Extreme": 0.29809383550926016
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.6642543814799384,
32
+ "T2. Sequencing & Structure Reconstruction": 0.660868776285443,
33
+ "T3. Evidence-Grounded QA": 0.3,
34
+ "T4. Summarization & Synthesis": 0.4699204609681677,
35
+ "T5. Attribution & Citation Alignment": 0.35415278369391223,
36
+ "T6. Aggregation & Clustering": 0.44523374235335905,
37
+ "T7. Consistency & Compliance Checking": 0.2145859605426568,
38
+ "T8. Structured & Numeric Reasoning": 0.36867283950617286,
39
+ "T9. Version & Code Diff Analysis": 0.43511107590777826,
40
+ "T10. Rule Induction & In-Context Learning": 0.37759259259259265,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2777777777777778
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.36859109108368215,
45
+ "English": 0.4445955748857405
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.3990307184980547,
49
+ "token_length": {
50
+ "8k": 0.5319602144193065,
51
+ "16k": 0.495294502488435,
52
+ "32k": 0.44900241142401065,
53
+ "64k": 0.4104061454120568,
54
+ "128k": 0.24974039710926457,
55
+ "256k": 0.2577806401352564
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.3663953815207406,
59
+ "Partial": 0.4405666019237288
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.6016370051267634,
63
+ "Moderate": 0.26872089549030526,
64
+ "Hard": 0.34181177303603094,
65
+ "Extreme": 0.3002570456178896
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.6857190555998647,
69
+ "T2. Sequencing & Structure Reconstruction": 0.6476817164317161,
70
+ "T3. Evidence-Grounded QA": 0.2916666666666667,
71
+ "T4. Summarization & Synthesis": 0.4722890782260215,
72
+ "T5. Attribution & Citation Alignment": 0.3720362720390496,
73
+ "T6. Aggregation & Clustering": 0.4527252876757789,
74
+ "T7. Consistency & Compliance Checking": 0.19573002717715302,
75
+ "T8. Structured & Numeric Reasoning": 0.3111111111111111,
76
+ "T9. Version & Code Diff Analysis": 0.4434471088718592,
77
+ "T10. Rule Induction & In-Context Learning": 0.39402777777777775,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.24166666666666667
79
+ },
80
+ "language": {
81
+ "Chinese": 0.3646393289213864,
82
+ "English": 0.43342210807472437
83
+ }
84
+ },
85
+ "pass@1": 0.17866666666666667,
86
+ "BoN-2": {
87
+ "overall_metric": 0.47482444558969183,
88
+ "token_length": {
89
+ "8k": 0.6017952145070679,
90
+ "16k": 0.591088648529682,
91
+ "32k": 0.5375509398549342,
92
+ "64k": 0.4892562129103632,
93
+ "128k": 0.3164459575452793,
94
+ "256k": 0.3128097001908253
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.43173242380596516,
98
+ "Partial": 0.5296688369507991
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.7232714101187123,
102
+ "Moderate": 0.3321291637725583,
103
+ "Hard": 0.3952399665178113,
104
+ "Extreme": 0.3486594773940952
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.7289955546187882,
108
+ "T2. Sequencing & Structure Reconstruction": 0.7412526825026818,
109
+ "T3. Evidence-Grounded QA": 0.4166666666666667,
110
+ "T4. Summarization & Synthesis": 0.49118408476251574,
111
+ "T5. Attribution & Citation Alignment": 0.40752656877934634,
112
+ "T6. Aggregation & Clustering": 0.5058127366024506,
113
+ "T7. Consistency & Compliance Checking": 0.2513867378923854,
114
+ "T8. Structured & Numeric Reasoning": 0.4481481481481482,
115
+ "T9. Version & Code Diff Analysis": 0.5031863563544513,
116
+ "T10. Rule Induction & In-Context Learning": 0.48847222222222225,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.35
118
+ },
119
+ "language": {
120
+ "Chinese": 0.43768030437389926,
121
+ "English": 0.511968586805485
122
+ }
123
+ },
124
+ "pass@2": 0.24,
125
+ "BoN-3": {
126
+ "overall_metric": 0.5112966568518927,
127
+ "token_length": {
128
+ "8k": 0.638281295176006,
129
+ "16k": 0.6423408107009556,
130
+ "32k": 0.5654314097480697,
131
+ "64k": 0.5253595874826821,
132
+ "128k": 0.3530891462917609,
133
+ "256k": 0.3432776917118821
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.4709592628636537,
137
+ "Partial": 0.5626351582914695
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.7672241772316003,
141
+ "Moderate": 0.375515223666526,
142
+ "Hard": 0.43497012537117896,
143
+ "Extreme": 0.3702560590461619
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.7536336494527094,
147
+ "T2. Sequencing & Structure Reconstruction": 0.7626347726347721,
148
+ "T3. Evidence-Grounded QA": 0.43333333333333335,
149
+ "T4. Summarization & Synthesis": 0.5004237684828063,
150
+ "T5. Attribution & Citation Alignment": 0.44213397827806017,
151
+ "T6. Aggregation & Clustering": 0.5445203343972627,
152
+ "T7. Consistency & Compliance Checking": 0.2924260189234064,
153
+ "T8. Structured & Numeric Reasoning": 0.5148148148148148,
154
+ "T9. Version & Code Diff Analysis": 0.5454347340415279,
155
+ "T10. Rule Induction & In-Context Learning": 0.5093055555555556,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667
157
+ },
158
+ "language": {
159
+ "Chinese": 0.47723166126529054,
160
+ "English": 0.545361652438495
161
+ }
162
+ },
163
+ "pass@3": 0.27466666666666667
164
+ }
results/Llama-3.1-70B-Instruct/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.31531891526483563,
9
+ "inference_iteration_1_overall_metric": 0.3187840899451787,
10
+ "inference_iteration_2_overall_metric": 0.3181153572604232,
11
+ "inference_iteration_3_overall_metric": 0.3090572985889053,
12
+ "average_token_length_metric": {
13
+ "8k": 0.44072420920233435,
14
+ "16k": 0.4154170524608382,
15
+ "32k": 0.39938052404397517,
16
+ "64k": 0.3038357678876172,
17
+ "128k": 0.16668959617065898,
18
+ "256k": 0.16586634182358947
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.28629550819966637,
22
+ "Partial": 0.3522577969841428
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.4401547192469389,
26
+ "Moderate": 0.21463302825178993,
27
+ "Hard": 0.2886301431775311,
28
+ "Extreme": 0.26222895835717064
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.5528929885667343,
32
+ "T2. Sequencing & Structure Reconstruction": 0.5351228410297036,
33
+ "T3. Evidence-Grounded QA": 0.34722222222222227,
34
+ "T4. Summarization & Synthesis": 0.48251270202602564,
35
+ "T5. Attribution & Citation Alignment": 0.2383585369805011,
36
+ "T6. Aggregation & Clustering": 0.3094103218555832,
37
+ "T7. Consistency & Compliance Checking": 0.18317006381802675,
38
+ "T8. Structured & Numeric Reasoning": 0.11111111111111113,
39
+ "T9. Version & Code Diff Analysis": 0.35298805295632424,
40
+ "T10. Rule Induction & In-Context Learning": 0.3324074074074074,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.1944444444444444
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.27963370499725654,
45
+ "English": 0.3510041255324155
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.3187840899451787,
49
+ "token_length": {
50
+ "8k": 0.45086107163290673,
51
+ "16k": 0.42082616936389317,
52
+ "32k": 0.4024958091173256,
53
+ "64k": 0.31412697939763573,
54
+ "128k": 0.16271506723441695,
55
+ "256k": 0.16167944292489206
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.28863047320759827,
59
+ "Partial": 0.35716142033846177
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.4450445268436371,
63
+ "Moderate": 0.23675467668036354,
64
+ "Hard": 0.2806267700317323,
65
+ "Extreme": 0.25941235199849716
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.5578290894275468,
69
+ "T2. Sequencing & Structure Reconstruction": 0.5340221352721353,
70
+ "T3. Evidence-Grounded QA": 0.36666666666666664,
71
+ "T4. Summarization & Synthesis": 0.4824250532281757,
72
+ "T5. Attribution & Citation Alignment": 0.22435495508076153,
73
+ "T6. Aggregation & Clustering": 0.3143695347862014,
74
+ "T7. Consistency & Compliance Checking": 0.1819753578820025,
75
+ "T8. Structured & Numeric Reasoning": 0.11342592592592592,
76
+ "T9. Version & Code Diff Analysis": 0.33790255230380384,
77
+ "T10. Rule Induction & In-Context Learning": 0.3586111111111111,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.20833333333333334
79
+ },
80
+ "language": {
81
+ "Chinese": 0.29265028040862706,
82
+ "English": 0.34491789948172935
83
+ }
84
+ },
85
+ "pass@1": 0.116,
86
+ "BoN-2": {
87
+ "overall_metric": 0.3704962078912661,
88
+ "token_length": {
89
+ "8k": 0.5007140188951568,
90
+ "16k": 0.471793141596409,
91
+ "32k": 0.4656246883405684,
92
+ "64k": 0.3555228262932919,
93
+ "128k": 0.22278285337549444,
94
+ "256k": 0.20653971884667408
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.33766430263043345,
98
+ "Partial": 0.41228226913232546
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.505817704967975,
102
+ "Moderate": 0.26539475589230027,
103
+ "Hard": 0.33902165222515473,
104
+ "Extreme": 0.3119632638554326
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.6325785537327107,
108
+ "T2. Sequencing & Structure Reconstruction": 0.6024494976700858,
109
+ "T3. Evidence-Grounded QA": 0.4083333333333333,
110
+ "T4. Summarization & Synthesis": 0.5068628671369674,
111
+ "T5. Attribution & Citation Alignment": 0.27736426767676775,
112
+ "T6. Aggregation & Clustering": 0.39468559218559207,
113
+ "T7. Consistency & Compliance Checking": 0.23698781644898675,
114
+ "T8. Structured & Numeric Reasoning": 0.1412037037037037,
115
+ "T9. Version & Code Diff Analysis": 0.42207618836131144,
116
+ "T10. Rule Induction & In-Context Learning": 0.3888888888888889,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.23333333333333334
118
+ },
119
+ "language": {
120
+ "Chinese": 0.3386454170505693,
121
+ "English": 0.4023469987319626
122
+ }
123
+ },
124
+ "pass@2": 0.144,
125
+ "BoN-3": {
126
+ "overall_metric": 0.39272719549993146,
127
+ "token_length": {
128
+ "8k": 0.5352185196666011,
129
+ "16k": 0.48774731079355566,
130
+ "32k": 0.4914359262892438,
131
+ "64k": 0.38089490889752375,
132
+ "128k": 0.23106162164056493,
133
+ "256k": 0.23000488571210054
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.3601448943540025,
137
+ "Partial": 0.434195578776569
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.5296491139243411,
141
+ "Moderate": 0.28398341296374197,
142
+ "Hard": 0.3678982241467773,
143
+ "Extreme": 0.3304684709396022
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.6534337905352087,
147
+ "T2. Sequencing & Structure Reconstruction": 0.6269316794316794,
148
+ "T3. Evidence-Grounded QA": 0.43333333333333335,
149
+ "T4. Summarization & Synthesis": 0.5138761303847846,
150
+ "T5. Attribution & Citation Alignment": 0.30993542257550877,
151
+ "T6. Aggregation & Clustering": 0.4248105135605135,
152
+ "T7. Consistency & Compliance Checking": 0.25284892696838046,
153
+ "T8. Structured & Numeric Reasoning": 0.1523148148148148,
154
+ "T9. Version & Code Diff Analysis": 0.4435626489175109,
155
+ "T10. Rule Induction & In-Context Learning": 0.4330555555555556,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25
157
+ },
158
+ "language": {
159
+ "Chinese": 0.3526743750393581,
160
+ "English": 0.4327800159605054
161
+ }
162
+ },
163
+ "pass@3": 0.16266666666666665
164
+ }
results/Llama-3.1-70B-Instruct/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.3212355454655496,
9
+ "inference_iteration_1_overall_metric": 0.3168848382877898,
10
+ "inference_iteration_2_overall_metric": 0.3235694833261471,
11
+ "inference_iteration_3_overall_metric": 0.32325231478271244,
12
+ "average_token_length_metric": {
13
+ "8k": 0.44607201777886324,
14
+ "16k": 0.43551597572008266,
15
+ "32k": 0.40532179664339874,
16
+ "64k": 0.3372735574136524,
17
+ "128k": 0.14963906519016765,
18
+ "256k": 0.15359086004713354
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.2920710323592253,
22
+ "Partial": 0.3583540166917813
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.4845503461095368,
26
+ "Moderate": 0.21437000317705404,
27
+ "Hard": 0.28040007010006374,
28
+ "Extreme": 0.2393443186282747
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.5203183043430563,
32
+ "T2. Sequencing & Structure Reconstruction": 0.5048829192634423,
33
+ "T3. Evidence-Grounded QA": 0.2972222222222221,
34
+ "T4. Summarization & Synthesis": 0.46869982746667466,
35
+ "T5. Attribution & Citation Alignment": 0.24051072181402314,
36
+ "T6. Aggregation & Clustering": 0.3352447566072413,
37
+ "T7. Consistency & Compliance Checking": 0.16831334519236535,
38
+ "T8. Structured & Numeric Reasoning": 0.24182098765432103,
39
+ "T9. Version & Code Diff Analysis": 0.3504639112512836,
40
+ "T10. Rule Induction & In-Context Learning": 0.3041666666666666,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.21111111111111108
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.27396983223922683,
45
+ "English": 0.368501258691873
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.3168848382877898,
49
+ "token_length": {
50
+ "8k": 0.4288512287020653,
51
+ "16k": 0.431350020549353,
52
+ "32k": 0.41805229286031653,
53
+ "64k": 0.3101016124828498,
54
+ "128k": 0.16200619520020057,
55
+ "256k": 0.1509476799319523
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.28780546371048366,
59
+ "Partial": 0.3538949513861792
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.48546407013873055,
63
+ "Moderate": 0.1945019101312392,
64
+ "Hard": 0.2709484383728626,
65
+ "Extreme": 0.24276914751620637
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.48095324443543325,
69
+ "T2. Sequencing & Structure Reconstruction": 0.4873936033102699,
70
+ "T3. Evidence-Grounded QA": 0.275,
71
+ "T4. Summarization & Synthesis": 0.46817349027530025,
72
+ "T5. Attribution & Citation Alignment": 0.21679364691461467,
73
+ "T6. Aggregation & Clustering": 0.3425653712663516,
74
+ "T7. Consistency & Compliance Checking": 0.17647214053474497,
75
+ "T8. Structured & Numeric Reasoning": 0.25462962962962965,
76
+ "T9. Version & Code Diff Analysis": 0.347662448182329,
77
+ "T10. Rule Induction & In-Context Learning": 0.31625,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.20833333333333334
79
+ },
80
+ "language": {
81
+ "Chinese": 0.2661805594268294,
82
+ "English": 0.36758911714875
83
+ }
84
+ },
85
+ "pass@1": 0.13733333333333334,
86
+ "BoN-2": {
87
+ "overall_metric": 0.39714621701263747,
88
+ "token_length": {
89
+ "8k": 0.5101215226314286,
90
+ "16k": 0.5452446091546882,
91
+ "32k": 0.5033835517180962,
92
+ "64k": 0.40629425415229514,
93
+ "128k": 0.20367087402793838,
94
+ "256k": 0.21416249039138233
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.3628015637327161,
98
+ "Partial": 0.4408575939143573
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.6103509331261675,
102
+ "Moderate": 0.2739929015343184,
103
+ "Hard": 0.33264890818640663,
104
+ "Extreme": 0.2868424835064885
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.6190525866570429,
108
+ "T2. Sequencing & Structure Reconstruction": 0.5863848211097236,
109
+ "T3. Evidence-Grounded QA": 0.4,
110
+ "T4. Summarization & Synthesis": 0.48619725203488445,
111
+ "T5. Attribution & Citation Alignment": 0.290019754922306,
112
+ "T6. Aggregation & Clustering": 0.4354357416367219,
113
+ "T7. Consistency & Compliance Checking": 0.21250441110110105,
114
+ "T8. Structured & Numeric Reasoning": 0.3388888888888889,
115
+ "T9. Version & Code Diff Analysis": 0.4174297354939513,
116
+ "T10. Rule Induction & In-Context Learning": 0.42666666666666675,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25833333333333336
118
+ },
119
+ "language": {
120
+ "Chinese": 0.3417787747514596,
121
+ "English": 0.45251365927381704
122
+ }
123
+ },
124
+ "pass@2": 0.19133333333333333,
125
+ "BoN-3": {
126
+ "overall_metric": 0.43806378317359107,
127
+ "token_length": {
128
+ "8k": 0.5718190971548373,
129
+ "16k": 0.5932030702540269,
130
+ "32k": 0.5401634596906353,
131
+ "64k": 0.45712267932316264,
132
+ "128k": 0.2296143794444798,
133
+ "256k": 0.23646001317440798
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.39971488113838277,
137
+ "Partial": 0.48687147667294856
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.6591788734633496,
141
+ "Moderate": 0.3103483415629278,
142
+ "Hard": 0.38512512860737047,
143
+ "Extreme": 0.3144991474359921
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.6739172902122323,
147
+ "T2. Sequencing & Structure Reconstruction": 0.6553062801836215,
148
+ "T3. Evidence-Grounded QA": 0.475,
149
+ "T4. Summarization & Synthesis": 0.5017174053624821,
150
+ "T5. Attribution & Citation Alignment": 0.31632987998243095,
151
+ "T6. Aggregation & Clustering": 0.4726401785278596,
152
+ "T7. Consistency & Compliance Checking": 0.24365040952524628,
153
+ "T8. Structured & Numeric Reasoning": 0.36944444444444446,
154
+ "T9. Version & Code Diff Analysis": 0.44797944073835727,
155
+ "T10. Rule Induction & In-Context Learning": 0.46027777777777784,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665
157
+ },
158
+ "language": {
159
+ "Chinese": 0.386238080047615,
160
+ "English": 0.48988948629956824
161
+ }
162
+ },
163
+ "pass@3": 0.21933333333333332
164
+ }
results/Llama-3.1-8B-Instruct/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.21094590782574696,
9
+ "inference_iteration_1_overall_metric": 0.20814425242445228,
10
+ "inference_iteration_2_overall_metric": 0.213015185500322,
11
+ "inference_iteration_3_overall_metric": 0.21167828555246626,
12
+ "average_token_length_metric": {
13
+ "8k": 0.24549737122739362,
14
+ "16k": 0.2608710428868677,
15
+ "32k": 0.2249354240045269,
16
+ "64k": 0.18691854981764278,
17
+ "128k": 0.18010527298228765,
18
+ "256k": 0.16734778603576234
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.1813007104244737,
22
+ "Partial": 0.24867615906373072
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.254652823450579,
26
+ "Moderate": 0.13823151671179162,
27
+ "Hard": 0.21215047305696197,
28
+ "Extreme": 0.21003592225516218
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.4667231034949644,
32
+ "T2. Sequencing & Structure Reconstruction": 0.4213756913607651,
33
+ "T3. Evidence-Grounded QA": 0.15000000000000002,
34
+ "T4. Summarization & Synthesis": 0.47465443881044483,
35
+ "T5. Attribution & Citation Alignment": 0.08095709533952888,
36
+ "T6. Aggregation & Clustering": 0.1895252817222955,
37
+ "T7. Consistency & Compliance Checking": 0.12098051997071714,
38
+ "T8. Structured & Numeric Reasoning": 0.04969135802469136,
39
+ "T9. Version & Code Diff Analysis": 0.21578074220253873,
40
+ "T10. Rule Induction & In-Context Learning": 0.16759259259259265,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.11944444444444445
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.17905397547001678,
45
+ "English": 0.24283784018147686
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.20814425242445228,
49
+ "token_length": {
50
+ "8k": 0.22656456269661684,
51
+ "16k": 0.25006742264480875,
52
+ "32k": 0.22983820975916858,
53
+ "64k": 0.17867648708372652,
54
+ "128k": 0.1893868580863639,
55
+ "256k": 0.17433197427602828
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.1770456924597648,
59
+ "Partial": 0.24772423783405423
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.2566570988180275,
63
+ "Moderate": 0.1324196374719661,
64
+ "Hard": 0.20899042479807298,
65
+ "Extreme": 0.2041821568416995
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.4630702505342932,
69
+ "T2. Sequencing & Structure Reconstruction": 0.4120432796938637,
70
+ "T3. Evidence-Grounded QA": 0.13333333333333333,
71
+ "T4. Summarization & Synthesis": 0.47400084179308405,
72
+ "T5. Attribution & Citation Alignment": 0.0690637373143065,
73
+ "T6. Aggregation & Clustering": 0.19812149190741143,
74
+ "T7. Consistency & Compliance Checking": 0.11267079352137302,
75
+ "T8. Structured & Numeric Reasoning": 0.04490740740740741,
76
+ "T9. Version & Code Diff Analysis": 0.19979772893803915,
77
+ "T10. Rule Induction & In-Context Learning": 0.18361111111111109,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.13333333333333333
79
+ },
80
+ "language": {
81
+ "Chinese": 0.17869850347586846,
82
+ "English": 0.2375900013730359
83
+ }
84
+ },
85
+ "pass@1": 0.052,
86
+ "BoN-2": {
87
+ "overall_metric": 0.24847856382430364,
88
+ "token_length": {
89
+ "8k": 0.28288621820265775,
90
+ "16k": 0.30147254873111595,
91
+ "32k": 0.2728297990630907,
92
+ "64k": 0.21066076283394117,
93
+ "128k": 0.21820825855623124,
94
+ "256k": 0.2048137955587849
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.2142227788917067,
98
+ "Partial": 0.29207683555669967
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.3015279708931293,
102
+ "Moderate": 0.1575465527579005,
103
+ "Hard": 0.2601383702201079,
104
+ "Extreme": 0.24243277505755167
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.5278328421453239,
108
+ "T2. Sequencing & Structure Reconstruction": 0.4858897020853544,
109
+ "T3. Evidence-Grounded QA": 0.18333333333333332,
110
+ "T4. Summarization & Synthesis": 0.4923937781072418,
111
+ "T5. Attribution & Citation Alignment": 0.10321693015669395,
112
+ "T6. Aggregation & Clustering": 0.24202529361366723,
113
+ "T7. Consistency & Compliance Checking": 0.1496372306528247,
114
+ "T8. Structured & Numeric Reasoning": 0.06435185185185185,
115
+ "T9. Version & Code Diff Analysis": 0.2640161200205543,
116
+ "T10. Rule Induction & In-Context Learning": 0.22361111111111112,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666
118
+ },
119
+ "language": {
120
+ "Chinese": 0.21521719416329613,
121
+ "English": 0.2817399334853111
122
+ }
123
+ },
124
+ "pass@2": 0.068,
125
+ "BoN-3": {
126
+ "overall_metric": 0.27478587280873845,
127
+ "token_length": {
128
+ "8k": 0.32764512992755657,
129
+ "16k": 0.3226696445637056,
130
+ "32k": 0.29882237152612906,
131
+ "64k": 0.24198009489674097,
132
+ "128k": 0.23991327950411545,
133
+ "256k": 0.21768471643418316
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.2383436635644805,
137
+ "Partial": 0.32116686639233955
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.33550759893050514,
141
+ "Moderate": 0.18287651202405897,
142
+ "Hard": 0.28514599318022166,
143
+ "Extreme": 0.26183100573765283
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.5538072480868509,
147
+ "T2. Sequencing & Structure Reconstruction": 0.5290554353054353,
148
+ "T3. Evidence-Grounded QA": 0.2,
149
+ "T4. Summarization & Synthesis": 0.5032056223860172,
150
+ "T5. Attribution & Citation Alignment": 0.1333833560986498,
151
+ "T6. Aggregation & Clustering": 0.2737365591929987,
152
+ "T7. Consistency & Compliance Checking": 0.17502387399050007,
153
+ "T8. Structured & Numeric Reasoning": 0.08750000000000001,
154
+ "T9. Version & Code Diff Analysis": 0.3068144317903629,
155
+ "T10. Rule Induction & In-Context Learning": 0.2625,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666
157
+ },
158
+ "language": {
159
+ "Chinese": 0.23247692481372026,
160
+ "English": 0.3170948208037568
161
+ }
162
+ },
163
+ "pass@3": 0.078
164
+ }
results/Llama-3.1-8B-Instruct/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.20055372622856252,
9
+ "inference_iteration_1_overall_metric": 0.20930536348826667,
10
+ "inference_iteration_2_overall_metric": 0.19814963328771615,
11
+ "inference_iteration_3_overall_metric": 0.19420618190970523,
12
+ "average_token_length_metric": {
13
+ "8k": 0.25813666494087695,
14
+ "16k": 0.2584728735658432,
15
+ "32k": 0.22849707778354275,
16
+ "64k": 0.18730191383793596,
17
+ "128k": 0.12903377597359986,
18
+ "256k": 0.14188005126957745
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.1871659292157323,
22
+ "Partial": 0.21759274060852876
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.2628031197661334,
26
+ "Moderate": 0.12316795604778738,
27
+ "Hard": 0.17987117557385604,
28
+ "Extreme": 0.19677540131116525
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.4057709287198289,
32
+ "T2. Sequencing & Structure Reconstruction": 0.34219177348181007,
33
+ "T3. Evidence-Grounded QA": 0.12222222222222225,
34
+ "T4. Summarization & Synthesis": 0.4554008546339488,
35
+ "T5. Attribution & Citation Alignment": 0.09150573083223558,
36
+ "T6. Aggregation & Clustering": 0.2205869310535933,
37
+ "T7. Consistency & Compliance Checking": 0.10384155329285091,
38
+ "T8. Structured & Numeric Reasoning": 0.10123456790123456,
39
+ "T9. Version & Code Diff Analysis": 0.17017808218806194,
40
+ "T10. Rule Induction & In-Context Learning": 0.1756018518518519,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.10555555555555557
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.14712355199329824,
45
+ "English": 0.2539839004638273
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.20930536348826667,
49
+ "token_length": {
50
+ "8k": 0.2793229214860049,
51
+ "16k": 0.271683549702249,
52
+ "32k": 0.23348902534053415,
53
+ "64k": 0.19344554869301367,
54
+ "128k": 0.12028061678065591,
55
+ "256k": 0.15761051892714206
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.18602878507502096,
59
+ "Partial": 0.2389300996505795
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.28424670669624624,
63
+ "Moderate": 0.1253791912220988,
64
+ "Hard": 0.17541923812572474,
65
+ "Extreme": 0.20459178755292767
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.4385523704155171,
69
+ "T2. Sequencing & Structure Reconstruction": 0.33931459481923876,
70
+ "T3. Evidence-Grounded QA": 0.11666666666666667,
71
+ "T4. Summarization & Synthesis": 0.45953789724020144,
72
+ "T5. Attribution & Citation Alignment": 0.0722496761788766,
73
+ "T6. Aggregation & Clustering": 0.21044140806468803,
74
+ "T7. Consistency & Compliance Checking": 0.1283895768466609,
75
+ "T8. Structured & Numeric Reasoning": 0.1125,
76
+ "T9. Version & Code Diff Analysis": 0.2038326942491424,
77
+ "T10. Rule Induction & In-Context Learning": 0.19249999999999998,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.11666666666666667
79
+ },
80
+ "language": {
81
+ "Chinese": 0.15245924739914177,
82
+ "English": 0.26615147957739155
83
+ }
84
+ },
85
+ "pass@1": 0.06,
86
+ "BoN-2": {
87
+ "overall_metric": 0.2668017232451993,
88
+ "token_length": {
89
+ "8k": 0.3461725655993255,
90
+ "16k": 0.34802165706779187,
91
+ "32k": 0.31037609052110204,
92
+ "64k": 0.2389649239073623,
93
+ "128k": 0.1748998439051164,
94
+ "256k": 0.18237525847049635
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.2499435538255567,
98
+ "Partial": 0.28825757523383516
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.3588173402056394,
102
+ "Moderate": 0.16793760263800153,
103
+ "Hard": 0.24236535579298768,
104
+ "Extreme": 0.246988799777556
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.5197176048343898,
108
+ "T2. Sequencing & Structure Reconstruction": 0.48746135503643234,
109
+ "T3. Evidence-Grounded QA": 0.16666666666666666,
110
+ "T4. Summarization & Synthesis": 0.4794299302901568,
111
+ "T5. Attribution & Citation Alignment": 0.15065800813584843,
112
+ "T6. Aggregation & Clustering": 0.3052547184784026,
113
+ "T7. Consistency & Compliance Checking": 0.15302354691124467,
114
+ "T8. Structured & Numeric Reasoning": 0.14444444444444443,
115
+ "T9. Version & Code Diff Analysis": 0.2507539108503574,
116
+ "T10. Rule Induction & In-Context Learning": 0.23458333333333334,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666
118
+ },
119
+ "language": {
120
+ "Chinese": 0.19465748810477185,
121
+ "English": 0.33894595838562674
122
+ }
123
+ },
124
+ "pass@2": 0.082,
125
+ "BoN-3": {
126
+ "overall_metric": 0.29957877493017654,
127
+ "token_length": {
128
+ "8k": 0.3888483252966101,
129
+ "16k": 0.39060272417227027,
130
+ "32k": 0.34430797611932984,
131
+ "64k": 0.2620211212858359,
132
+ "128k": 0.19544431047747687,
133
+ "256k": 0.21624819222953548
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.2794795135612338,
137
+ "Partial": 0.3251596530361036
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.4075586546242195,
141
+ "Moderate": 0.19423764623204026,
142
+ "Hard": 0.27460228265341846,
143
+ "Extreme": 0.26688670776930307
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.5783680297471054,
147
+ "T2. Sequencing & Structure Reconstruction": 0.5321006964107361,
148
+ "T3. Evidence-Grounded QA": 0.20833333333333334,
149
+ "T4. Summarization & Synthesis": 0.4907425557642246,
150
+ "T5. Attribution & Citation Alignment": 0.1724297780962502,
151
+ "T6. Aggregation & Clustering": 0.3362168666306595,
152
+ "T7. Consistency & Compliance Checking": 0.16743722857143514,
153
+ "T8. Structured & Numeric Reasoning": 0.1921296296296296,
154
+ "T9. Version & Code Diff Analysis": 0.2933902615835246,
155
+ "T10. Rule Induction & In-Context Learning": 0.2673611111111111,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.15833333333333333
157
+ },
158
+ "language": {
159
+ "Chinese": 0.22893100795326488,
160
+ "English": 0.37022654190708804
161
+ }
162
+ },
163
+ "pass@3": 0.09866666666666667
164
+ }
results/Llama-3.2-3B-Instruct/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.15708345836639478,
9
+ "inference_iteration_1_overall_metric": 0.1522499746285183,
10
+ "inference_iteration_2_overall_metric": 0.16292071303675215,
11
+ "inference_iteration_3_overall_metric": 0.15607968743391326,
12
+ "average_token_length_metric": {
13
+ "8k": 0.19074751544937305,
14
+ "16k": 0.18311448739692587,
15
+ "32k": 0.15849934191444578,
16
+ "64k": 0.13711682904337855,
17
+ "128k": 0.13633492271322553,
18
+ "256k": 0.13668765368101904
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.15215903834853456,
22
+ "Partial": 0.16335090202548921
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.18487754128195516,
26
+ "Moderate": 0.10373568516116806,
27
+ "Hard": 0.15009434290092064,
28
+ "Extreme": 0.16626666941305895
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.3024747670482544,
32
+ "T2. Sequencing & Structure Reconstruction": 0.30977286822552064,
33
+ "T3. Evidence-Grounded QA": 0.11388888888888889,
34
+ "T4. Summarization & Synthesis": 0.4341408375280332,
35
+ "T5. Attribution & Citation Alignment": 0.0344779091920434,
36
+ "T6. Aggregation & Clustering": 0.15408667402382106,
37
+ "T7. Consistency & Compliance Checking": 0.07795789080091817,
38
+ "T8. Structured & Numeric Reasoning": 0.04398148148148148,
39
+ "T9. Version & Code Diff Analysis": 0.12905444479341674,
40
+ "T10. Rule Induction & In-Context Learning": 0.1479166666666667,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.07777777777777778
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.10513615313151659,
45
+ "English": 0.20903076360127287
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.1522499746285183,
49
+ "token_length": {
50
+ "8k": 0.18381886404580258,
51
+ "16k": 0.1763640779318065,
52
+ "32k": 0.1530403977981516,
53
+ "64k": 0.1336449793468121,
54
+ "128k": 0.13665833409911293,
55
+ "256k": 0.1299731945494239
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.14616662224309246,
59
+ "Partial": 0.15999242311906023
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.17269103520136797,
63
+ "Moderate": 0.10010586621394478,
64
+ "Hard": 0.1522959770717054,
65
+ "Extreme": 0.16407670515037534
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.31616711715222934,
69
+ "T2. Sequencing & Structure Reconstruction": 0.30997657033436127,
70
+ "T3. Evidence-Grounded QA": 0.075,
71
+ "T4. Summarization & Synthesis": 0.433290104470351,
72
+ "T5. Attribution & Citation Alignment": 0.040719448989792455,
73
+ "T6. Aggregation & Clustering": 0.1420105345821374,
74
+ "T7. Consistency & Compliance Checking": 0.0818116476937141,
75
+ "T8. Structured & Numeric Reasoning": 0.04583333333333333,
76
+ "T9. Version & Code Diff Analysis": 0.12251594627374464,
77
+ "T10. Rule Induction & In-Context Learning": 0.12597222222222224,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.075
79
+ },
80
+ "language": {
81
+ "Chinese": 0.10313580635298143,
82
+ "English": 0.2013641429040551
83
+ }
84
+ },
85
+ "pass@1": 0.03333333333333333,
86
+ "BoN-2": {
87
+ "overall_metric": 0.1921634438744948,
88
+ "token_length": {
89
+ "8k": 0.21983364549684287,
90
+ "16k": 0.21955667723795985,
91
+ "32k": 0.18995358273392637,
92
+ "64k": 0.16240795470998307,
93
+ "128k": 0.17591239432067907,
94
+ "256k": 0.18531640874757638
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.183991826715688,
98
+ "Partial": 0.20256368389479382
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.23050124787481288,
102
+ "Moderate": 0.13024657792029828,
103
+ "Hard": 0.18670800635922485,
104
+ "Extreme": 0.19441747608931817
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.3586662331587295,
108
+ "T2. Sequencing & Structure Reconstruction": 0.3847515389630534,
109
+ "T3. Evidence-Grounded QA": 0.16666666666666666,
110
+ "T4. Summarization & Synthesis": 0.4623366120534231,
111
+ "T5. Attribution & Citation Alignment": 0.04912393153232278,
112
+ "T6. Aggregation & Clustering": 0.19169925535926752,
113
+ "T7. Consistency & Compliance Checking": 0.09784140674649551,
114
+ "T8. Structured & Numeric Reasoning": 0.06805555555555555,
115
+ "T9. Version & Code Diff Analysis": 0.17896485067612064,
116
+ "T10. Rule Induction & In-Context Learning": 0.17347222222222225,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.09166666666666666
118
+ },
119
+ "language": {
120
+ "Chinese": 0.1340947957448097,
121
+ "English": 0.25023209200417945
122
+ }
123
+ },
124
+ "pass@2": 0.052,
125
+ "BoN-3": {
126
+ "overall_metric": 0.2126068902708674,
127
+ "token_length": {
128
+ "8k": 0.24600389478810633,
129
+ "16k": 0.23558970928733117,
130
+ "32k": 0.21547322231985894,
131
+ "64k": 0.18904442314365716,
132
+ "128k": 0.19236619778234643,
133
+ "256k": 0.19716389430390446
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.20471895219678654,
137
+ "Partial": 0.22264608418333381
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.2591916862859481,
141
+ "Moderate": 0.1495495342063559,
142
+ "Hard": 0.20304294133612527,
143
+ "Extreme": 0.20927435723794804
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.3790383678432267,
147
+ "T2. Sequencing & Structure Reconstruction": 0.41972832869160254,
148
+ "T3. Evidence-Grounded QA": 0.19166666666666668,
149
+ "T4. Summarization & Synthesis": 0.47495636922941775,
150
+ "T5. Attribution & Citation Alignment": 0.058968622858899346,
151
+ "T6. Aggregation & Clustering": 0.22320851430069694,
152
+ "T7. Consistency & Compliance Checking": 0.10913808800759871,
153
+ "T8. Structured & Numeric Reasoning": 0.08194444444444444,
154
+ "T9. Version & Code Diff Analysis": 0.19665231407803055,
155
+ "T10. Rule Induction & In-Context Learning": 0.21513888888888888,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.1
157
+ },
158
+ "language": {
159
+ "Chinese": 0.1489641004694211,
160
+ "English": 0.27624968007231393
161
+ }
162
+ },
163
+ "pass@3": 0.059333333333333335
164
+ }
results/Llama-3.2-3B-Instruct/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.12579532534277046,
9
+ "inference_iteration_1_overall_metric": 0.12189144574406009,
10
+ "inference_iteration_2_overall_metric": 0.1293014670146224,
11
+ "inference_iteration_3_overall_metric": 0.12619306326962915,
12
+ "average_token_length_metric": {
13
+ "8k": 0.15520842809468816,
14
+ "16k": 0.1484137730398096,
15
+ "32k": 0.13566213672791996,
16
+ "64k": 0.10339908100452032,
17
+ "128k": 0.11523413736015264,
18
+ "256k": 0.09685439582953186
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.11497129858976732,
22
+ "Partial": 0.13957135939204723
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.14352977294197386,
26
+ "Moderate": 0.07165328990150331,
27
+ "Hard": 0.10476288088788434,
28
+ "Extreme": 0.15574400931361643
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.18027953901797078,
32
+ "T2. Sequencing & Structure Reconstruction": 0.2032463362205395,
33
+ "T3. Evidence-Grounded QA": 0.11944444444444445,
34
+ "T4. Summarization & Synthesis": 0.42455853122133613,
35
+ "T5. Attribution & Citation Alignment": 0.03812352880578544,
36
+ "T6. Aggregation & Clustering": 0.12499067187507265,
37
+ "T7. Consistency & Compliance Checking": 0.06067661860338902,
38
+ "T8. Structured & Numeric Reasoning": 0.029629629629629627,
39
+ "T9. Version & Code Diff Analysis": 0.09370491802352847,
40
+ "T10. Rule Induction & In-Context Learning": 0.09847222222222225,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.09166666666666666
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.08710157372538106,
45
+ "English": 0.16448907696016007
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.12189144574406009,
49
+ "token_length": {
50
+ "8k": 0.14926024363329887,
51
+ "16k": 0.14140607600695068,
52
+ "32k": 0.12814754734467546,
53
+ "64k": 0.09111044280283606,
54
+ "128k": 0.1221228193452808,
55
+ "256k": 0.09930154533131832
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.11202506506940242,
59
+ "Partial": 0.13444865751180604
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.14039643942766825,
63
+ "Moderate": 0.056507474186827354,
64
+ "Hard": 0.10349493747731114,
65
+ "Extreme": 0.15664860872958616
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.16389165825733903,
69
+ "T2. Sequencing & Structure Reconstruction": 0.1787449973585681,
70
+ "T3. Evidence-Grounded QA": 0.09166666666666666,
71
+ "T4. Summarization & Synthesis": 0.42251209889763974,
72
+ "T5. Attribution & Citation Alignment": 0.029073247426826414,
73
+ "T6. Aggregation & Clustering": 0.13197902480468204,
74
+ "T7. Consistency & Compliance Checking": 0.06122238755841022,
75
+ "T8. Structured & Numeric Reasoning": 0.044444444444444446,
76
+ "T9. Version & Code Diff Analysis": 0.08781339576018317,
77
+ "T10. Rule Induction & In-Context Learning": 0.10180555555555555,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.09166666666666666
79
+ },
80
+ "language": {
81
+ "Chinese": 0.07947872192042414,
82
+ "English": 0.16430416956769595
83
+ }
84
+ },
85
+ "pass@1": 0.03333333333333333,
86
+ "BoN-2": {
87
+ "overall_metric": 0.16673174742075972,
88
+ "token_length": {
89
+ "8k": 0.21235596929835865,
90
+ "16k": 0.19139324741892721,
91
+ "32k": 0.17847356237746767,
92
+ "64k": 0.1323473405869264,
93
+ "128k": 0.15654182190882684,
94
+ "256k": 0.12927854293405058
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.15641485621192497,
98
+ "Partial": 0.1798623362320038
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.20556729410789223,
102
+ "Moderate": 0.09843476269293053,
103
+ "Hard": 0.13991984386404396,
104
+ "Extreme": 0.1866584183549311
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.23716632921131517,
108
+ "T2. Sequencing & Structure Reconstruction": 0.283700678823595,
109
+ "T3. Evidence-Grounded QA": 0.18333333333333332,
110
+ "T4. Summarization & Synthesis": 0.44817949561782316,
111
+ "T5. Attribution & Citation Alignment": 0.07044117047262258,
112
+ "T6. Aggregation & Clustering": 0.1774625349060259,
113
+ "T7. Consistency & Compliance Checking": 0.0801161795021524,
114
+ "T8. Structured & Numeric Reasoning": 0.05,
115
+ "T9. Version & Code Diff Analysis": 0.14648554146631595,
116
+ "T10. Rule Induction & In-Context Learning": 0.14513888888888887,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.10833333333333334
118
+ },
119
+ "language": {
120
+ "Chinese": 0.11086445603583714,
121
+ "English": 0.22259903880568205
122
+ }
123
+ },
124
+ "pass@2": 0.048,
125
+ "BoN-3": {
126
+ "overall_metric": 0.19563257554771482,
127
+ "token_length": {
128
+ "8k": 0.2530151029327433,
129
+ "16k": 0.2145335526961687,
130
+ "32k": 0.21507949798738746,
131
+ "64k": 0.15928462128906565,
132
+ "128k": 0.18536894288047534,
133
+ "256k": 0.14651373550044794
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.18678441171314086,
137
+ "Partial": 0.20689387497353642
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.24756743178790308,
141
+ "Moderate": 0.1259095390094924,
142
+ "Hard": 0.1628721973860393,
143
+ "Extreme": 0.20605327132157794
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.3006781841143776,
147
+ "T2. Sequencing & Structure Reconstruction": 0.34291202176503843,
148
+ "T3. Evidence-Grounded QA": 0.20833333333333334,
149
+ "T4. Summarization & Synthesis": 0.46018159312540347,
150
+ "T5. Attribution & Citation Alignment": 0.0815602367002352,
151
+ "T6. Aggregation & Clustering": 0.20248617019109905,
152
+ "T7. Consistency & Compliance Checking": 0.08636604532829192,
153
+ "T8. Structured & Numeric Reasoning": 0.07777777777777778,
154
+ "T9. Version & Code Diff Analysis": 0.17054683536229345,
155
+ "T10. Rule Induction & In-Context Learning": 0.18958333333333333,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666
157
+ },
158
+ "language": {
159
+ "Chinese": 0.13358536850999,
160
+ "English": 0.25767978258543983
161
+ }
162
+ },
163
+ "pass@3": 0.06133333333333333
164
+ }
results/Llama-3.3-70B-Instruct/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.3189017909858673,
9
+ "inference_iteration_1_overall_metric": 0.3156125985423413,
10
+ "inference_iteration_2_overall_metric": 0.32123074093436993,
11
+ "inference_iteration_3_overall_metric": 0.31986203348089143,
12
+ "average_token_length_metric": {
13
+ "8k": 0.4593043661575621,
14
+ "16k": 0.4357005279195819,
15
+ "32k": 0.4042163423898818,
16
+ "64k": 0.33700892278371447,
17
+ "128k": 0.1425979596199625,
18
+ "256k": 0.13458262704450225
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.291318303255093,
22
+ "Partial": 0.35400804809776254
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.440396417724264,
26
+ "Moderate": 0.22606271262921054,
27
+ "Hard": 0.29071041097108385,
28
+ "Extreme": 0.2653049554891388
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.5698993177933688,
32
+ "T2. Sequencing & Structure Reconstruction": 0.5207791047484032,
33
+ "T3. Evidence-Grounded QA": 0.33055555555555555,
34
+ "T4. Summarization & Synthesis": 0.4889794661436827,
35
+ "T5. Attribution & Citation Alignment": 0.24622344831110268,
36
+ "T6. Aggregation & Clustering": 0.31517757482240366,
37
+ "T7. Consistency & Compliance Checking": 0.19554447151545906,
38
+ "T8. Structured & Numeric Reasoning": 0.11126543209876544,
39
+ "T9. Version & Code Diff Analysis": 0.3449561289681397,
40
+ "T10. Rule Induction & In-Context Learning": 0.3185648148148148,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.23333333333333334
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.28656733073968105,
45
+ "English": 0.35123625123205404
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.3156125985423413,
49
+ "token_length": {
50
+ "8k": 0.4598857003901339,
51
+ "16k": 0.43280589089049554,
52
+ "32k": 0.4031552748630893,
53
+ "64k": 0.32858688094254546,
54
+ "128k": 0.13499648051535806,
55
+ "256k": 0.1342453636524253
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.28665010278500924,
59
+ "Partial": 0.3524739567789461
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.4365832341037067,
63
+ "Moderate": 0.2165303315022285,
64
+ "Hard": 0.2928446211124342,
65
+ "Extreme": 0.26312822197701774
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.5433299618590155,
69
+ "T2. Sequencing & Structure Reconstruction": 0.526070226070226,
70
+ "T3. Evidence-Grounded QA": 0.325,
71
+ "T4. Summarization & Synthesis": 0.49127697260080533,
72
+ "T5. Attribution & Citation Alignment": 0.24102740431192388,
73
+ "T6. Aggregation & Clustering": 0.31258403804363555,
74
+ "T7. Consistency & Compliance Checking": 0.20132235403207502,
75
+ "T8. Structured & Numeric Reasoning": 0.11481481481481483,
76
+ "T9. Version & Code Diff Analysis": 0.34245443993484,
77
+ "T10. Rule Induction & In-Context Learning": 0.30791666666666667,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.225
79
+ },
80
+ "language": {
81
+ "Chinese": 0.28075822912737913,
82
+ "English": 0.3504669679573037
83
+ }
84
+ },
85
+ "pass@1": 0.11266666666666666,
86
+ "BoN-2": {
87
+ "overall_metric": 0.34649569529069124,
88
+ "token_length": {
89
+ "8k": 0.47647116314268717,
90
+ "16k": 0.4687523820747641,
91
+ "32k": 0.4305852114495849,
92
+ "64k": 0.3618707734054544,
93
+ "128k": 0.17753523030724935,
94
+ "256k": 0.16375941136440833
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.3168861362630559,
98
+ "Partial": 0.3841805885985913
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.4708835305116446,
102
+ "Moderate": 0.2536880874829281,
103
+ "Hard": 0.31598267575873745,
104
+ "Extreme": 0.29123370602859766
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.613795052914502,
108
+ "T2. Sequencing & Structure Reconstruction": 0.572449494949495,
109
+ "T3. Evidence-Grounded QA": 0.35833333333333334,
110
+ "T4. Summarization & Synthesis": 0.5065243852070945,
111
+ "T5. Attribution & Citation Alignment": 0.26010872017700837,
112
+ "T6. Aggregation & Clustering": 0.3543572676296878,
113
+ "T7. Consistency & Compliance Checking": 0.2180957885923682,
114
+ "T8. Structured & Numeric Reasoning": 0.1287037037037037,
115
+ "T9. Version & Code Diff Analysis": 0.3871389535524576,
116
+ "T10. Rule Induction & In-Context Learning": 0.3311111111111111,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25
118
+ },
119
+ "language": {
120
+ "Chinese": 0.3170227577667401,
121
+ "English": 0.3759686328146428
122
+ }
123
+ },
124
+ "pass@2": 0.126,
125
+ "BoN-3": {
126
+ "overall_metric": 0.358435386899099,
127
+ "token_length": {
128
+ "8k": 0.48710556132647576,
129
+ "16k": 0.4795308481526823,
130
+ "32k": 0.4449722178774269,
131
+ "64k": 0.37149023172449164,
132
+ "128k": 0.19457817433016786,
133
+ "256k": 0.17293528798335014
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.33005566769629513,
137
+ "Partial": 0.39455502952084975
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.48736321739782795,
141
+ "Moderate": 0.2624427701465103,
142
+ "Hard": 0.3272030727162918,
143
+ "Extreme": 0.30076445676260405
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.6366814621432787,
147
+ "T2. Sequencing & Structure Reconstruction": 0.5821049783549782,
148
+ "T3. Evidence-Grounded QA": 0.36666666666666664,
149
+ "T4. Summarization & Synthesis": 0.5152102198399457,
150
+ "T5. Attribution & Citation Alignment": 0.27428748176856066,
151
+ "T6. Aggregation & Clustering": 0.36977067074988085,
152
+ "T7. Consistency & Compliance Checking": 0.22440889882489728,
153
+ "T8. Structured & Numeric Reasoning": 0.1300925925925926,
154
+ "T9. Version & Code Diff Analysis": 0.39630550643647333,
155
+ "T10. Rule Induction & In-Context Learning": 0.37277777777777776,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.25
157
+ },
158
+ "language": {
159
+ "Chinese": 0.33234090111179226,
160
+ "English": 0.3845298726864061
161
+ }
162
+ },
163
+ "pass@3": 0.13466666666666666
164
+ }
results/Llama-3.3-70B-Instruct/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.3368788983987977,
9
+ "inference_iteration_1_overall_metric": 0.3346445205602255,
10
+ "inference_iteration_2_overall_metric": 0.34105124981338825,
11
+ "inference_iteration_3_overall_metric": 0.3349409248227798,
12
+ "average_token_length_metric": {
13
+ "8k": 0.48257436937624887,
14
+ "16k": 0.4570891611420083,
15
+ "32k": 0.43164967032208246,
16
+ "64k": 0.37974005621997625,
17
+ "128k": 0.14494662029982153,
18
+ "256k": 0.1252735130326494
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.3148605467957358,
22
+ "Partial": 0.3649022549845135
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.5193942390521589,
26
+ "Moderate": 0.22606077851307846,
27
+ "Hard": 0.2859477654450278,
28
+ "Extreme": 0.2431814890253722
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.5716856431399633,
32
+ "T2. Sequencing & Structure Reconstruction": 0.5180708023900146,
33
+ "T3. Evidence-Grounded QA": 0.3222222222222222,
34
+ "T4. Summarization & Synthesis": 0.4675755720039648,
35
+ "T5. Attribution & Citation Alignment": 0.2472281333613823,
36
+ "T6. Aggregation & Clustering": 0.36222233277682875,
37
+ "T7. Consistency & Compliance Checking": 0.16955304749357702,
38
+ "T8. Structured & Numeric Reasoning": 0.2820987654320987,
39
+ "T9. Version & Code Diff Analysis": 0.34705004572107623,
40
+ "T10. Rule Induction & In-Context Learning": 0.31356481481481485,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2027777777777778
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.28230462501190684,
45
+ "English": 0.39145317178568884
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.3346445205602255,
49
+ "token_length": {
50
+ "8k": 0.4542953722813034,
51
+ "16k": 0.44790102117035924,
52
+ "32k": 0.4589206338089406,
53
+ "64k": 0.3903508423882444,
54
+ "128k": 0.1390327640597696,
55
+ "256k": 0.11736648965273745
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.3229561312874356,
59
+ "Partial": 0.34952065236195895
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.5259881421298087,
63
+ "Moderate": 0.21937379810223762,
64
+ "Hard": 0.272924266331999,
65
+ "Extreme": 0.24128717207335554
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.5894507286480519,
69
+ "T2. Sequencing & Structure Reconstruction": 0.49666058774955946,
70
+ "T3. Evidence-Grounded QA": 0.31666666666666665,
71
+ "T4. Summarization & Synthesis": 0.4602747124060712,
72
+ "T5. Attribution & Citation Alignment": 0.2717574105274334,
73
+ "T6. Aggregation & Clustering": 0.34752011368589375,
74
+ "T7. Consistency & Compliance Checking": 0.15669004787219404,
75
+ "T8. Structured & Numeric Reasoning": 0.3101851851851852,
76
+ "T9. Version & Code Diff Analysis": 0.3494311586679074,
77
+ "T10. Rule Induction & In-Context Learning": 0.2688888888888889,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.20833333333333334
79
+ },
80
+ "language": {
81
+ "Chinese": 0.28273488892882487,
82
+ "English": 0.3865541521916267
83
+ }
84
+ },
85
+ "pass@1": 0.14066666666666666,
86
+ "BoN-2": {
87
+ "overall_metric": 0.3958348563743987,
88
+ "token_length": {
89
+ "8k": 0.5375237165063974,
90
+ "16k": 0.5279796876480686,
91
+ "32k": 0.5128361604334021,
92
+ "64k": 0.44897938061236586,
93
+ "128k": 0.18481857856663617,
94
+ "256k": 0.16287161447952125
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.37597657573399246,
98
+ "Partial": 0.4211090317349159
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.6024719774255488,
102
+ "Moderate": 0.2783382985783699,
103
+ "Hard": 0.3365596754926361,
104
+ "Extreme": 0.28558017485446086
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.6560921957788185,
108
+ "T2. Sequencing & Structure Reconstruction": 0.590698368130755,
109
+ "T3. Evidence-Grounded QA": 0.38333333333333336,
110
+ "T4. Summarization & Synthesis": 0.4863977172263582,
111
+ "T5. Attribution & Citation Alignment": 0.29159793863898964,
112
+ "T6. Aggregation & Clustering": 0.424325611415665,
113
+ "T7. Consistency & Compliance Checking": 0.20595551459420022,
114
+ "T8. Structured & Numeric Reasoning": 0.3527777777777778,
115
+ "T9. Version & Code Diff Analysis": 0.3998111292235962,
116
+ "T10. Rule Induction & In-Context Learning": 0.42375,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.24166666666666667
118
+ },
119
+ "language": {
120
+ "Chinese": 0.3369511703916512,
121
+ "English": 0.4547185423571466
122
+ }
123
+ },
124
+ "pass@2": 0.18533333333333332,
125
+ "BoN-3": {
126
+ "overall_metric": 0.4231932102533526,
127
+ "token_length": {
128
+ "8k": 0.5825111026223746,
129
+ "16k": 0.5470777230462098,
130
+ "32k": 0.5441468137547846,
131
+ "64k": 0.476981081390983,
132
+ "128k": 0.19851254776601635,
133
+ "256k": 0.189929992939747
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.4008399164475989,
137
+ "Partial": 0.4516428569152216
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.6383633989735824,
141
+ "Moderate": 0.29622937081220263,
142
+ "Hard": 0.3631324161592812,
143
+ "Extreme": 0.31032522872728074
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.6823167917941978,
147
+ "T2. Sequencing & Structure Reconstruction": 0.6268177888335086,
148
+ "T3. Evidence-Grounded QA": 0.38333333333333336,
149
+ "T4. Summarization & Synthesis": 0.49909323423974894,
150
+ "T5. Attribution & Citation Alignment": 0.30308485751721276,
151
+ "T6. Aggregation & Clustering": 0.4702913178424183,
152
+ "T7. Consistency & Compliance Checking": 0.22495072081956943,
153
+ "T8. Structured & Numeric Reasoning": 0.38055555555555554,
154
+ "T9. Version & Code Diff Analysis": 0.4307393977892574,
155
+ "T10. Rule Induction & In-Context Learning": 0.47583333333333333,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.275
157
+ },
158
+ "language": {
159
+ "Chinese": 0.36349427675262524,
160
+ "English": 0.48289214375408057
161
+ }
162
+ },
163
+ "pass@3": 0.20866666666666667
164
+ }
results/Magistral-Small-2509/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.38398116897357343,
9
+ "inference_iteration_1_overall_metric": 0.3775125226933459,
10
+ "inference_iteration_2_overall_metric": 0.3891788972494985,
11
+ "inference_iteration_3_overall_metric": 0.38525208697787594,
12
+ "average_token_length_metric": {
13
+ "8k": 0.5449912961150511,
14
+ "16k": 0.47444243155128724,
15
+ "32k": 0.4264163670766384,
16
+ "64k": 0.3029575907045873,
17
+ "128k": 0.302750216644183,
18
+ "256k": 0.2523291117496938
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.3475865533347475,
22
+ "Partial": 0.4303015888775337
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.5425168562606535,
26
+ "Moderate": 0.2943739375717643,
27
+ "Hard": 0.3291735952793175,
28
+ "Extreme": 0.30516679942900543
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.6519835369865774,
32
+ "T2. Sequencing & Structure Reconstruction": 0.6259667241346109,
33
+ "T3. Evidence-Grounded QA": 0.438888888888889,
34
+ "T4. Summarization & Synthesis": 0.5181402784690045,
35
+ "T5. Attribution & Citation Alignment": 0.2964646210831104,
36
+ "T6. Aggregation & Clustering": 0.3430731095683128,
37
+ "T7. Consistency & Compliance Checking": 0.1940507194662843,
38
+ "T8. Structured & Numeric Reasoning": 0.23070987654320987,
39
+ "T9. Version & Code Diff Analysis": 0.40948667090743257,
40
+ "T10. Rule Induction & In-Context Learning": 0.4654166666666667,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.2416666666666666
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.3639911930284364,
45
+ "English": 0.40397114491871056
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.3775125226933459,
49
+ "token_length": {
50
+ "8k": 0.5435818039060978,
51
+ "16k": 0.4750140116790471,
52
+ "32k": 0.40129268121677775,
53
+ "64k": 0.28670320465634697,
54
+ "128k": 0.3074867892397562,
55
+ "256k": 0.25099664546205136
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.33866755109330426,
59
+ "Partial": 0.42695157745703605
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.5415570936201182,
63
+ "Moderate": 0.2821803430686346,
64
+ "Hard": 0.32058466511584804,
65
+ "Extreme": 0.29781631261319746
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.6532513121925126,
69
+ "T2. Sequencing & Structure Reconstruction": 0.6111094803277807,
70
+ "T3. Evidence-Grounded QA": 0.425,
71
+ "T4. Summarization & Synthesis": 0.5176378460686947,
72
+ "T5. Attribution & Citation Alignment": 0.2913492653600891,
73
+ "T6. Aggregation & Clustering": 0.33400917389507784,
74
+ "T7. Consistency & Compliance Checking": 0.1887200677483396,
75
+ "T8. Structured & Numeric Reasoning": 0.22129629629629627,
76
+ "T9. Version & Code Diff Analysis": 0.41785365614151276,
77
+ "T10. Rule Induction & In-Context Learning": 0.445,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.24166666666666667
79
+ },
80
+ "language": {
81
+ "Chinese": 0.34935070830064097,
82
+ "English": 0.4056743370860514
83
+ }
84
+ },
85
+ "pass@1": 0.15666666666666668,
86
+ "BoN-2": {
87
+ "overall_metric": 0.4410818050309274,
88
+ "token_length": {
89
+ "8k": 0.6043547908072043,
90
+ "16k": 0.5353872646463821,
91
+ "32k": 0.50503501278225,
92
+ "64k": 0.35415051008140713,
93
+ "128k": 0.3590897028976117,
94
+ "256k": 0.28847354897071054
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.4052447124894285,
98
+ "Partial": 0.48669265008374507
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.6224120794085322,
102
+ "Moderate": 0.35760055550726977,
103
+ "Hard": 0.3739866973836827,
104
+ "Extreme": 0.3413440208772583
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.7183527826830406,
108
+ "T2. Sequencing & Structure Reconstruction": 0.6865933482781307,
109
+ "T3. Evidence-Grounded QA": 0.5333333333333333,
110
+ "T4. Summarization & Synthesis": 0.5317396047677334,
111
+ "T5. Attribution & Citation Alignment": 0.34712734561624925,
112
+ "T6. Aggregation & Clustering": 0.4084056110628732,
113
+ "T7. Consistency & Compliance Checking": 0.2396263809618489,
114
+ "T8. Structured & Numeric Reasoning": 0.28148148148148144,
115
+ "T9. Version & Code Diff Analysis": 0.4526614935043571,
116
+ "T10. Rule Induction & In-Context Learning": 0.5494444444444445,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3
118
+ },
119
+ "language": {
120
+ "Chinese": 0.4165337844683723,
121
+ "English": 0.4656298255934835
122
+ }
123
+ },
124
+ "pass@2": 0.202,
125
+ "BoN-3": {
126
+ "overall_metric": 0.47447373045805247,
127
+ "token_length": {
128
+ "8k": 0.6472581172783606,
129
+ "16k": 0.572013026736313,
130
+ "32k": 0.5323134358536711,
131
+ "64k": 0.38848449233451177,
132
+ "128k": 0.38802278615487296,
133
+ "256k": 0.31875052439058804
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.43884732486807027,
137
+ "Partial": 0.5198164284816681
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.6584366917924486,
141
+ "Moderate": 0.38635877266139323,
142
+ "Hard": 0.40375289537189885,
143
+ "Extreme": 0.3772769049579495
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.7364126543991428,
147
+ "T2. Sequencing & Structure Reconstruction": 0.718547042731825,
148
+ "T3. Evidence-Grounded QA": 0.5666666666666667,
149
+ "T4. Summarization & Synthesis": 0.5424760798885467,
150
+ "T5. Attribution & Citation Alignment": 0.3887278739210803,
151
+ "T6. Aggregation & Clustering": 0.45972240993615016,
152
+ "T7. Consistency & Compliance Checking": 0.258469847158553,
153
+ "T8. Structured & Numeric Reasoning": 0.32592592592592595,
154
+ "T9. Version & Code Diff Analysis": 0.48746959414301055,
155
+ "T10. Rule Induction & In-Context Learning": 0.5911111111111111,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3333333333333333
157
+ },
158
+ "language": {
159
+ "Chinese": 0.45726250895870557,
160
+ "English": 0.4916849519574011
161
+ }
162
+ },
163
+ "pass@3": 0.22866666666666666
164
+ }
results/MiniMax-M2/thinking_context-1000000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.5320685707653132,
9
+ "inference_iteration_1_overall_metric": 0.535180398833494,
10
+ "inference_iteration_2_overall_metric": 0.5311849506804371,
11
+ "inference_iteration_3_overall_metric": 0.5298403627820072,
12
+ "average_token_length_metric": {
13
+ "8k": 0.654795970947119,
14
+ "16k": 0.5832041701523042,
15
+ "32k": 0.5830505446766833,
16
+ "64k": 0.5201561955794758,
17
+ "128k": 0.5060838591020447,
18
+ "256k": 0.3451206841342513
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.4938467607068266,
22
+ "Partial": 0.5807145108397509
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.7219874781362817,
26
+ "Moderate": 0.599199335465557,
27
+ "Hard": 0.4257653962693645,
28
+ "Extreme": 0.34975019139747615
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.767571983047427,
32
+ "T2. Sequencing & Structure Reconstruction": 0.7186696477094124,
33
+ "T3. Evidence-Grounded QA": 0.4972222222222222,
34
+ "T4. Summarization & Synthesis": 0.4696599254603241,
35
+ "T5. Attribution & Citation Alignment": 0.54344042963745,
36
+ "T6. Aggregation & Clustering": 0.5123089198769455,
37
+ "T7. Consistency & Compliance Checking": 0.31381086481875964,
38
+ "T8. Structured & Numeric Reasoning": 0.6038580246913581,
39
+ "T9. Version & Code Diff Analysis": 0.5619188050015754,
40
+ "T10. Rule Induction & In-Context Learning": 0.5529629629629632,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.39444444444444454
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.5354797348772966,
45
+ "English": 0.5286574066533296
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.535180398833494,
49
+ "token_length": {
50
+ "8k": 0.6571274187960493,
51
+ "16k": 0.5855452098864022,
52
+ "32k": 0.6094638772285274,
53
+ "64k": 0.5094373867375244,
54
+ "128k": 0.5028727484199556,
55
+ "256k": 0.34663575193250185
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.4983496461876354,
59
+ "Partial": 0.5820559022009494
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.7348029222520711,
63
+ "Moderate": 0.6076522249303262,
64
+ "Hard": 0.4165082385065274,
65
+ "Extreme": 0.3468482177079349
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.7483964432006224,
69
+ "T2. Sequencing & Structure Reconstruction": 0.6994320017261199,
70
+ "T3. Evidence-Grounded QA": 0.5,
71
+ "T4. Summarization & Synthesis": 0.46659438196842223,
72
+ "T5. Attribution & Citation Alignment": 0.5466093432829364,
73
+ "T6. Aggregation & Clustering": 0.5244645023077399,
74
+ "T7. Consistency & Compliance Checking": 0.32026132009110975,
75
+ "T8. Structured & Numeric Reasoning": 0.6097222222222223,
76
+ "T9. Version & Code Diff Analysis": 0.5581618594200692,
77
+ "T10. Rule Induction & In-Context Learning": 0.5638888888888888,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.425
79
+ },
80
+ "language": {
81
+ "Chinese": 0.5363273796225,
82
+ "English": 0.5340334180444871
83
+ }
84
+ },
85
+ "pass@1": 0.30133333333333334,
86
+ "BoN-2": {
87
+ "overall_metric": 0.631079137318095,
88
+ "token_length": {
89
+ "8k": 0.7607802022833232,
90
+ "16k": 0.6922523863155777,
91
+ "32k": 0.696226877019834,
92
+ "64k": 0.6061494664574102,
93
+ "128k": 0.5803070617163435,
94
+ "256k": 0.45075883011608214
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.5863061857618213,
98
+ "Partial": 0.6880628938442622
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.8429412379045521,
102
+ "Moderate": 0.7260325873797594,
103
+ "Hard": 0.5037591992235261,
104
+ "Extreme": 0.42025273404272534
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.854967268320159,
108
+ "T2. Sequencing & Structure Reconstruction": 0.7950413527388575,
109
+ "T3. Evidence-Grounded QA": 0.6333333333333333,
110
+ "T4. Summarization & Synthesis": 0.48790929716965253,
111
+ "T5. Attribution & Citation Alignment": 0.6628232709674524,
112
+ "T6. Aggregation & Clustering": 0.6070531962911038,
113
+ "T7. Consistency & Compliance Checking": 0.4273910542891768,
114
+ "T8. Structured & Numeric Reasoning": 0.6976851851851852,
115
+ "T9. Version & Code Diff Analysis": 0.6589983180763119,
116
+ "T10. Rule Induction & In-Context Learning": 0.663888888888889,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333333
118
+ },
119
+ "language": {
120
+ "Chinese": 0.6243713115466586,
121
+ "English": 0.6377869630895318
122
+ }
123
+ },
124
+ "pass@2": 0.38133333333333336,
125
+ "BoN-3": {
126
+ "overall_metric": 0.6838483190204042,
127
+ "token_length": {
128
+ "8k": 0.8056053063071357,
129
+ "16k": 0.754121676530954,
130
+ "32k": 0.7309373525434467,
131
+ "64k": 0.6802921620132278,
132
+ "128k": 0.6428076963616377,
133
+ "256k": 0.48932572036602695
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.6418563038987366,
137
+ "Partial": 0.7372927019025284
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.8932565923719491,
141
+ "Moderate": 0.7958991372015439,
142
+ "Hard": 0.5571346872299623,
143
+ "Extreme": 0.46408187669686923
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.8856810957992655,
147
+ "T2. Sequencing & Structure Reconstruction": 0.8257039309014353,
148
+ "T3. Evidence-Grounded QA": 0.7083333333333334,
149
+ "T4. Summarization & Synthesis": 0.5020042367803014,
150
+ "T5. Attribution & Citation Alignment": 0.729994986816228,
151
+ "T6. Aggregation & Clustering": 0.6651081533166491,
152
+ "T7. Consistency & Compliance Checking": 0.49051401515979876,
153
+ "T8. Structured & Numeric Reasoning": 0.7680555555555556,
154
+ "T9. Version & Code Diff Analysis": 0.7036475958542688,
155
+ "T10. Rule Induction & In-Context Learning": 0.7072222222222223,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6
157
+ },
158
+ "language": {
159
+ "Chinese": 0.6727473750313149,
160
+ "English": 0.694949263009495
161
+ }
162
+ },
163
+ "pass@3": 0.444
164
+ }
results/MiniMax-Text-01/nonthinking_context-1000000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.4113523778378889,
9
+ "inference_iteration_1_overall_metric": 0.4026546189679395,
10
+ "inference_iteration_2_overall_metric": 0.41422198000018023,
11
+ "inference_iteration_3_overall_metric": 0.41718053454554826,
12
+ "average_token_length_metric": {
13
+ "8k": 0.45750122785552744,
14
+ "16k": 0.40648581074103435,
15
+ "32k": 0.41953181726499883,
16
+ "64k": 0.3963813527019971,
17
+ "128k": 0.41323756281622565,
18
+ "256k": 0.3749764956475515
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.37732447212646125,
22
+ "Partial": 0.45466062147061553
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.5125950929989945,
26
+ "Moderate": 0.38228847113922254,
27
+ "Hard": 0.3867421547849868,
28
+ "Extreme": 0.33569972963459577
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.7019344870456294,
32
+ "T2. Sequencing & Structure Reconstruction": 0.6935557265385518,
33
+ "T3. Evidence-Grounded QA": 0.5000000000000001,
34
+ "T4. Summarization & Synthesis": 0.525289467915154,
35
+ "T5. Attribution & Citation Alignment": 0.40960389859884994,
36
+ "T6. Aggregation & Clustering": 0.3855189408594916,
37
+ "T7. Consistency & Compliance Checking": 0.2570183735053335,
38
+ "T8. Structured & Numeric Reasoning": 0.16126543209876543,
39
+ "T9. Version & Code Diff Analysis": 0.3763262824393013,
40
+ "T10. Rule Induction & In-Context Learning": 0.3850462962962962,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.34444444444444444
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.4206201869029405,
45
+ "English": 0.40208456877283766
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.4026546189679395,
49
+ "token_length": {
50
+ "8k": 0.4484815946744958,
51
+ "16k": 0.40023341947584756,
52
+ "32k": 0.39365195091822286,
53
+ "64k": 0.4050265329266902,
54
+ "128k": 0.40626760527764794,
55
+ "256k": 0.3622666105347326
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.3618050193216267,
59
+ "Partial": 0.45464501851779227
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.49189201078601713,
63
+ "Moderate": 0.3847984308236515,
64
+ "Hard": 0.39113612973801154,
65
+ "Extreme": 0.32419293466074633
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.7071317552591362,
69
+ "T2. Sequencing & Structure Reconstruction": 0.692217342415818,
70
+ "T3. Evidence-Grounded QA": 0.475,
71
+ "T4. Summarization & Synthesis": 0.5252872492452957,
72
+ "T5. Attribution & Citation Alignment": 0.3965042482839467,
73
+ "T6. Aggregation & Clustering": 0.38900319686695384,
74
+ "T7. Consistency & Compliance Checking": 0.24881818692821855,
75
+ "T8. Structured & Numeric Reasoning": 0.1462962962962963,
76
+ "T9. Version & Code Diff Analysis": 0.35572673286895423,
77
+ "T10. Rule Induction & In-Context Learning": 0.36347222222222214,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667
79
+ },
80
+ "language": {
81
+ "Chinese": 0.4101250015901318,
82
+ "English": 0.3951842363457473
83
+ }
84
+ },
85
+ "pass@1": 0.15533333333333332,
86
+ "BoN-2": {
87
+ "overall_metric": 0.4807038949944852,
88
+ "token_length": {
89
+ "8k": 0.5303281501019884,
90
+ "16k": 0.4819497908714715,
91
+ "32k": 0.47954691765928337,
92
+ "64k": 0.48083012165065453,
93
+ "128k": 0.465293133114307,
94
+ "256k": 0.44627525656921
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.44656912126099607,
98
+ "Partial": 0.5241481524734732
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.596920990471646,
102
+ "Moderate": 0.4603818463137054,
103
+ "Hard": 0.4590470067460482,
104
+ "Extreme": 0.3809658785230146
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.7449388025193358,
108
+ "T2. Sequencing & Structure Reconstruction": 0.7399444536944532,
109
+ "T3. Evidence-Grounded QA": 0.6166666666666667,
110
+ "T4. Summarization & Synthesis": 0.54417036696111,
111
+ "T5. Attribution & Citation Alignment": 0.5088222013004289,
112
+ "T6. Aggregation & Clustering": 0.4705462063266301,
113
+ "T7. Consistency & Compliance Checking": 0.3211976903039678,
114
+ "T8. Structured & Numeric Reasoning": 0.2083333333333333,
115
+ "T9. Version & Code Diff Analysis": 0.4528903513431796,
116
+ "T10. Rule Induction & In-Context Learning": 0.45958333333333334,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666665
118
+ },
119
+ "language": {
120
+ "Chinese": 0.48759226495133245,
121
+ "English": 0.47381552503763963
122
+ }
123
+ },
124
+ "pass@2": 0.206,
125
+ "BoN-3": {
126
+ "overall_metric": 0.5286875532565248,
127
+ "token_length": {
128
+ "8k": 0.5760868208214227,
129
+ "16k": 0.5315447995369911,
130
+ "32k": 0.5297979513353553,
131
+ "64k": 0.5139951126923608,
132
+ "128k": 0.5285145377275431,
133
+ "256k": 0.49218609742548064
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.4922563052647642,
137
+ "Partial": 0.5750545961551319
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.6571166886876132,
141
+ "Moderate": 0.5008690293131257,
142
+ "Hard": 0.5070400734318661,
143
+ "Extreme": 0.42048944373649116
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.7851733453547192,
147
+ "T2. Sequencing & Structure Reconstruction": 0.7758457283457282,
148
+ "T3. Evidence-Grounded QA": 0.7083333333333334,
149
+ "T4. Summarization & Synthesis": 0.5534698517064113,
150
+ "T5. Attribution & Citation Alignment": 0.5639720868179612,
151
+ "T6. Aggregation & Clustering": 0.503918026189678,
152
+ "T7. Consistency & Compliance Checking": 0.34945026972752397,
153
+ "T8. Structured & Numeric Reasoning": 0.25277777777777777,
154
+ "T9. Version & Code Diff Analysis": 0.5104976262726122,
155
+ "T10. Rule Induction & In-Context Learning": 0.5270833333333333,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.525
157
+ },
158
+ "language": {
159
+ "Chinese": 0.5330279709661675,
160
+ "English": 0.5243471355468845
161
+ }
162
+ },
163
+ "pass@3": 0.24266666666666667
164
+ }
results/MiniMax-Text-01/thinking_context-1000000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.4499528005964066,
9
+ "inference_iteration_1_overall_metric": 0.4519835462001885,
10
+ "inference_iteration_2_overall_metric": 0.4481755772504262,
11
+ "inference_iteration_3_overall_metric": 0.4496992783386054,
12
+ "average_token_length_metric": {
13
+ "8k": 0.485225729559654,
14
+ "16k": 0.4524723240855649,
15
+ "32k": 0.46920448352940436,
16
+ "64k": 0.44046374240515457,
17
+ "128k": 0.4133092627171987,
18
+ "256k": 0.43904126128146514
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.4116545212336913,
22
+ "Partial": 0.49869606523986354
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.6191934548978654,
26
+ "Moderate": 0.4082147550465631,
27
+ "Hard": 0.3801988071084879,
28
+ "Extreme": 0.33778735493415807
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.6830330861399296,
32
+ "T2. Sequencing & Structure Reconstruction": 0.6403219944448011,
33
+ "T3. Evidence-Grounded QA": 0.4833333333333333,
34
+ "T4. Summarization & Synthesis": 0.5086176566073063,
35
+ "T5. Attribution & Citation Alignment": 0.416914270509611,
36
+ "T6. Aggregation & Clustering": 0.4334853794839026,
37
+ "T7. Consistency & Compliance Checking": 0.27119391146489646,
38
+ "T8. Structured & Numeric Reasoning": 0.38966049382716056,
39
+ "T9. Version & Code Diff Analysis": 0.4348929522191275,
40
+ "T10. Rule Induction & In-Context Learning": 0.41300925925925924,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4027777777777778
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.45819903421860664,
45
+ "English": 0.4417065669742075
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.4519835462001885,
49
+ "token_length": {
50
+ "8k": 0.4879779124929164,
51
+ "16k": 0.4554840853531918,
52
+ "32k": 0.4648286187996774,
53
+ "64k": 0.42985632449506034,
54
+ "128k": 0.4307020670264534,
55
+ "256k": 0.443052269033835
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.41228070711895354,
59
+ "Partial": 0.5025144323035801
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.6285595261886431,
63
+ "Moderate": 0.4057015689049336,
64
+ "Hard": 0.37791019658117175,
65
+ "Extreme": 0.33760415329971205
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.6904671153390595,
69
+ "T2. Sequencing & Structure Reconstruction": 0.6319390331890332,
70
+ "T3. Evidence-Grounded QA": 0.44166666666666665,
71
+ "T4. Summarization & Synthesis": 0.5079368349605524,
72
+ "T5. Attribution & Citation Alignment": 0.3963567606333699,
73
+ "T6. Aggregation & Clustering": 0.4315669444489273,
74
+ "T7. Consistency & Compliance Checking": 0.26717481095169254,
75
+ "T8. Structured & Numeric Reasoning": 0.40648148148148144,
76
+ "T9. Version & Code Diff Analysis": 0.4533152836127507,
77
+ "T10. Rule Induction & In-Context Learning": 0.4119444444444444,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4583333333333333
79
+ },
80
+ "language": {
81
+ "Chinese": 0.47043321599568005,
82
+ "English": 0.43353387640469826
83
+ }
84
+ },
85
+ "pass@1": 0.21,
86
+ "BoN-2": {
87
+ "overall_metric": 0.5523435379453717,
88
+ "token_length": {
89
+ "8k": 0.6041368153338821,
90
+ "16k": 0.553143416205592,
91
+ "32k": 0.5547357356840433,
92
+ "64k": 0.5474714891955119,
93
+ "128k": 0.5080001305944092,
94
+ "256k": 0.5465736406587951
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.5094377873914124,
98
+ "Partial": 0.6069508568322307
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.7582302423860908,
102
+ "Moderate": 0.5069058318579235,
103
+ "Hard": 0.47636905813527697,
104
+ "Extreme": 0.40654974290892704
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.7753422134076285,
108
+ "T2. Sequencing & Structure Reconstruction": 0.7188864376364378,
109
+ "T3. Evidence-Grounded QA": 0.6083333333333333,
110
+ "T4. Summarization & Synthesis": 0.5276281423571613,
111
+ "T5. Attribution & Citation Alignment": 0.573374443874177,
112
+ "T6. Aggregation & Clustering": 0.5278895685136558,
113
+ "T7. Consistency & Compliance Checking": 0.35338346649949204,
114
+ "T8. Structured & Numeric Reasoning": 0.5027777777777779,
115
+ "T9. Version & Code Diff Analysis": 0.552570101188694,
116
+ "T10. Rule Induction & In-Context Learning": 0.5220833333333333,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.55
118
+ },
119
+ "language": {
120
+ "Chinese": 0.5649919855932303,
121
+ "English": 0.5396950902975145
122
+ }
123
+ },
124
+ "pass@2": 0.2753333333333333,
125
+ "BoN-3": {
126
+ "overall_metric": 0.5997056103547938,
127
+ "token_length": {
128
+ "8k": 0.6457585156659336,
129
+ "16k": 0.6123141997231359,
130
+ "32k": 0.6242961953070552,
131
+ "64k": 0.5876928890236057,
132
+ "128k": 0.5497742714361217,
133
+ "256k": 0.5783975909729129
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.5540426758661396,
137
+ "Partial": 0.6578220724312633
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.8056166900447767,
141
+ "Moderate": 0.5614066990728871,
142
+ "Hard": 0.5178805893116146,
143
+ "Extreme": 0.45303896497156343
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.8083334868935526,
147
+ "T2. Sequencing & Structure Reconstruction": 0.7593085155585156,
148
+ "T3. Evidence-Grounded QA": 0.6833333333333333,
149
+ "T4. Summarization & Synthesis": 0.5344316475303361,
150
+ "T5. Attribution & Citation Alignment": 0.6383957562170883,
151
+ "T6. Aggregation & Clustering": 0.5743997782942697,
152
+ "T7. Consistency & Compliance Checking": 0.39861351698347697,
153
+ "T8. Structured & Numeric Reasoning": 0.5527777777777778,
154
+ "T9. Version & Code Diff Analysis": 0.5853585580965909,
155
+ "T10. Rule Induction & In-Context Learning": 0.5984722222222222,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.6
157
+ },
158
+ "language": {
159
+ "Chinese": 0.6144058958264887,
160
+ "English": 0.5850053248830991
161
+ }
162
+ },
163
+ "pass@3": 0.31933333333333336
164
+ }
results/Ministral-3-14B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.40137741846169367,
9
+ "inference_iteration_1_overall_metric": 0.40238223426918757,
10
+ "inference_iteration_2_overall_metric": 0.4002952301959775,
11
+ "inference_iteration_3_overall_metric": 0.4014547909199151,
12
+ "average_token_length_metric": {
13
+ "8k": 0.43375517745982745,
14
+ "16k": 0.4624624276537502,
15
+ "32k": 0.4225675952668474,
16
+ "64k": 0.39702430034744873,
17
+ "128k": 0.3821065286583368,
18
+ "256k": 0.3103484813839513
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.3639638275660222,
22
+ "Partial": 0.4489947159652748
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.5260224045368944,
26
+ "Moderate": 0.3401707697729334,
27
+ "Hard": 0.35036961025265867,
28
+ "Extreme": 0.33853899745082133
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.7189334347722878,
32
+ "T2. Sequencing & Structure Reconstruction": 0.6668259209925876,
33
+ "T3. Evidence-Grounded QA": 0.4583333333333333,
34
+ "T4. Summarization & Synthesis": 0.5221962563770883,
35
+ "T5. Attribution & Citation Alignment": 0.3136369167899536,
36
+ "T6. Aggregation & Clustering": 0.39798286937745314,
37
+ "T7. Consistency & Compliance Checking": 0.21946000799150195,
38
+ "T8. Structured & Numeric Reasoning": 0.14151234567901233,
39
+ "T9. Version & Code Diff Analysis": 0.47709977467470854,
40
+ "T10. Rule Induction & In-Context Learning": 0.4217592592592592,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.30000000000000004
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.40560935669062276,
45
+ "English": 0.397145480232764
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.40238223426918757,
49
+ "token_length": {
50
+ "8k": 0.433884589163934,
51
+ "16k": 0.46516909415802493,
52
+ "32k": 0.42676245820656594,
53
+ "64k": 0.3945954502711027,
54
+ "128k": 0.38087905781855047,
55
+ "256k": 0.31300275599695
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.36618501990567515,
59
+ "Partial": 0.44845141618638706
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.5313007708587812,
63
+ "Moderate": 0.33905574344991224,
64
+ "Hard": 0.35279460774167126,
65
+ "Extreme": 0.33532188262609514
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.7231148921645373,
69
+ "T2. Sequencing & Structure Reconstruction": 0.6661998186998181,
70
+ "T3. Evidence-Grounded QA": 0.4666666666666667,
71
+ "T4. Summarization & Synthesis": 0.5230622050479962,
72
+ "T5. Attribution & Citation Alignment": 0.30176586689960616,
73
+ "T6. Aggregation & Clustering": 0.3921100544588917,
74
+ "T7. Consistency & Compliance Checking": 0.219946553696774,
75
+ "T8. Structured & Numeric Reasoning": 0.14675925925925926,
76
+ "T9. Version & Code Diff Analysis": 0.4747724555416158,
77
+ "T10. Rule Induction & In-Context Learning": 0.4193055555555556,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665
79
+ },
80
+ "language": {
81
+ "Chinese": 0.40159850236109407,
82
+ "English": 0.4031659661772829
83
+ }
84
+ },
85
+ "pass@1": 0.158,
86
+ "BoN-2": {
87
+ "overall_metric": 0.4167918691419028,
88
+ "token_length": {
89
+ "8k": 0.4448864222174602,
90
+ "16k": 0.481483520242527,
91
+ "32k": 0.43929332248789354,
92
+ "64k": 0.4125546400739453,
93
+ "128k": 0.39499054247940135,
94
+ "256k": 0.3275427673501904
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.37697160991132156,
98
+ "Partial": 0.4674721990717337
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.543547780022054,
102
+ "Moderate": 0.3573487107807427,
103
+ "Hard": 0.3645996932831712,
104
+ "Extreme": 0.3512606476539851
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.7338149078373205,
108
+ "T2. Sequencing & Structure Reconstruction": 0.6898572261072259,
109
+ "T3. Evidence-Grounded QA": 0.475,
110
+ "T4. Summarization & Synthesis": 0.5324649457390374,
111
+ "T5. Attribution & Citation Alignment": 0.3305062690008576,
112
+ "T6. Aggregation & Clustering": 0.41269190069190054,
113
+ "T7. Consistency & Compliance Checking": 0.24063695524418569,
114
+ "T8. Structured & Numeric Reasoning": 0.14953703703703705,
115
+ "T9. Version & Code Diff Analysis": 0.4968728427963255,
116
+ "T10. Rule Induction & In-Context Learning": 0.43041666666666667,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665
118
+ },
119
+ "language": {
120
+ "Chinese": 0.42183857405387454,
121
+ "English": 0.4117451642299312
122
+ }
123
+ },
124
+ "pass@2": 0.164,
125
+ "BoN-3": {
126
+ "overall_metric": 0.42795282912937055,
127
+ "token_length": {
128
+ "8k": 0.45584907332800734,
129
+ "16k": 0.4930808335792047,
130
+ "32k": 0.45565921785915897,
131
+ "64k": 0.41883977659018873,
132
+ "128k": 0.4041598036532658,
133
+ "256k": 0.34012826976639843
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.3885912574254956,
137
+ "Partial": 0.47804937493430194
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.554784793655902,
141
+ "Moderate": 0.37124888788597155,
142
+ "Hard": 0.37318051863597795,
143
+ "Extreme": 0.36223380606151817
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.7412124373820532,
147
+ "T2. Sequencing & Structure Reconstruction": 0.6988453213453213,
148
+ "T3. Evidence-Grounded QA": 0.48333333333333334,
149
+ "T4. Summarization & Synthesis": 0.538030953852775,
150
+ "T5. Attribution & Citation Alignment": 0.34888079906957237,
151
+ "T6. Aggregation & Clustering": 0.4339707150850415,
152
+ "T7. Consistency & Compliance Checking": 0.24529018187024293,
153
+ "T8. Structured & Numeric Reasoning": 0.15046296296296297,
154
+ "T9. Version & Code Diff Analysis": 0.5141050625900392,
155
+ "T10. Rule Induction & In-Context Learning": 0.44708333333333333,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.3333333333333333
157
+ },
158
+ "language": {
159
+ "Chinese": 0.43352032726356865,
160
+ "English": 0.42238533099517206
161
+ }
162
+ },
163
+ "pass@3": 0.16933333333333334
164
+ }
results/Ministral-3-14B-Instruct-2512/thinking_context-224000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "date": "2025-12-08",
3
+ "total_questions_num": 1500,
4
+ "inference_iterations": 3,
5
+ "total_samples_num": 4500,
6
+ "fail_samples_num": 0,
7
+ "inference_inconsistent_samples_num": 0,
8
+ "average_overall_metric": 0.45799174186842606,
9
+ "inference_iteration_1_overall_metric": 0.45848259675544495,
10
+ "inference_iteration_2_overall_metric": 0.46152550901732137,
11
+ "inference_iteration_3_overall_metric": 0.453967119832514,
12
+ "average_token_length_metric": {
13
+ "8k": 0.5187827600069922,
14
+ "16k": 0.48518599796025474,
15
+ "32k": 0.48745678732020276,
16
+ "64k": 0.4570416898883375,
17
+ "128k": 0.42361656559174016,
18
+ "256k": 0.3758666504430352
19
+ },
20
+ "average_contextual_requirement_metric": {
21
+ "Full": 0.4248803053301027,
22
+ "Partial": 0.500133570189931
23
+ },
24
+ "average_difficulty_metric": {
25
+ "Easy": 0.6756101555622144,
26
+ "Moderate": 0.39349242818800056,
27
+ "Hard": 0.37479147914388156,
28
+ "Extreme": 0.31661242864258926
29
+ },
30
+ "average_primary_task_metric": {
31
+ "T1. Retrieval & Ranking": 0.7608265794743194,
32
+ "T2. Sequencing & Structure Reconstruction": 0.6976467766457445,
33
+ "T3. Evidence-Grounded QA": 0.4138888888888888,
34
+ "T4. Summarization & Synthesis": 0.49943611989270814,
35
+ "T5. Attribution & Citation Alignment": 0.3460922902510361,
36
+ "T6. Aggregation & Clustering": 0.45809776319010986,
37
+ "T7. Consistency & Compliance Checking": 0.2280021697407482,
38
+ "T8. Structured & Numeric Reasoning": 0.41836419753086435,
39
+ "T9. Version & Code Diff Analysis": 0.5354951076952419,
40
+ "T10. Rule Induction & In-Context Learning": 0.462037037037037,
41
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.35277777777777775
42
+ },
43
+ "average_language_metric": {
44
+ "Chinese": 0.43851283791556633,
45
+ "English": 0.4774706458212879
46
+ },
47
+ "BoN-1": {
48
+ "overall_metric": 0.45848259675544495,
49
+ "token_length": {
50
+ "8k": 0.538436412123529,
51
+ "16k": 0.47816429220213824,
52
+ "32k": 0.4805712657434,
53
+ "64k": 0.44911476875649875,
54
+ "128k": 0.4390480988508506,
55
+ "256k": 0.3655607428562557
56
+ },
57
+ "contextual_requirement": {
58
+ "Full": 0.42758275308934174,
59
+ "Partial": 0.49780967051230435
60
+ },
61
+ "difficulty": {
62
+ "Easy": 0.6788797898422243,
63
+ "Moderate": 0.39582478863697335,
64
+ "Hard": 0.37581391782555706,
65
+ "Extreme": 0.3125005687762375
66
+ },
67
+ "primary_task": {
68
+ "T1. Retrieval & Ranking": 0.7602730263789274,
69
+ "T2. Sequencing & Structure Reconstruction": 0.7038347568610727,
70
+ "T3. Evidence-Grounded QA": 0.4583333333333333,
71
+ "T4. Summarization & Synthesis": 0.4990357866977353,
72
+ "T5. Attribution & Citation Alignment": 0.35405443033443207,
73
+ "T6. Aggregation & Clustering": 0.43771712805012347,
74
+ "T7. Consistency & Compliance Checking": 0.21275716732831734,
75
+ "T8. Structured & Numeric Reasoning": 0.42546296296296293,
76
+ "T9. Version & Code Diff Analysis": 0.5013174605476809,
77
+ "T10. Rule Induction & In-Context Learning": 0.48194444444444445,
78
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.35833333333333334
79
+ },
80
+ "language": {
81
+ "Chinese": 0.43269825076497226,
82
+ "English": 0.48426694274591864
83
+ }
84
+ },
85
+ "pass@1": 0.22733333333333333,
86
+ "BoN-2": {
87
+ "overall_metric": 0.5227530650530186,
88
+ "token_length": {
89
+ "8k": 0.5815903903274882,
90
+ "16k": 0.5495634888346752,
91
+ "32k": 0.5446963722215565,
92
+ "64k": 0.5299811967182008,
93
+ "128k": 0.4888075896458958,
94
+ "256k": 0.4418793525702981
95
+ },
96
+ "contextual_requirement": {
97
+ "Full": 0.489360650067773,
98
+ "Partial": 0.565252502306969
99
+ },
100
+ "difficulty": {
101
+ "Easy": 0.7604456302850787,
102
+ "Moderate": 0.463797787486306,
103
+ "Hard": 0.43728803588921916,
104
+ "Extreme": 0.35722954733316814
105
+ },
106
+ "primary_task": {
107
+ "T1. Retrieval & Ranking": 0.8036702370810763,
108
+ "T2. Sequencing & Structure Reconstruction": 0.7488194240400123,
109
+ "T3. Evidence-Grounded QA": 0.5083333333333333,
110
+ "T4. Summarization & Synthesis": 0.5120697680827194,
111
+ "T5. Attribution & Citation Alignment": 0.4029499793922618,
112
+ "T6. Aggregation & Clustering": 0.5304624812643679,
113
+ "T7. Consistency & Compliance Checking": 0.27354479208047294,
114
+ "T8. Structured & Numeric Reasoning": 0.4962962962962963,
115
+ "T9. Version & Code Diff Analysis": 0.6272818834382945,
116
+ "T10. Rule Induction & In-Context Learning": 0.5725,
117
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333
118
+ },
119
+ "language": {
120
+ "Chinese": 0.5068736065410564,
121
+ "English": 0.5386325235649823
122
+ }
123
+ },
124
+ "pass@2": 0.2833333333333333,
125
+ "BoN-3": {
126
+ "overall_metric": 0.5542260073321058,
127
+ "token_length": {
128
+ "8k": 0.6154421507239413,
129
+ "16k": 0.5842317006106588,
130
+ "32k": 0.5699342902198496,
131
+ "64k": 0.5576866952391514,
132
+ "128k": 0.5184072720538686,
133
+ "256k": 0.4796539351451706
134
+ },
135
+ "contextual_requirement": {
136
+ "Full": 0.5204393104129562,
137
+ "Partial": 0.5972272579564804
138
+ },
139
+ "difficulty": {
140
+ "Easy": 0.7905996585472251,
141
+ "Moderate": 0.5012307381116559,
142
+ "Hard": 0.46914010908714937,
143
+ "Extreme": 0.3859836380407792
144
+ },
145
+ "primary_task": {
146
+ "T1. Retrieval & Ranking": 0.8214474184096282,
147
+ "T2. Sequencing & Structure Reconstruction": 0.7666511293717175,
148
+ "T3. Evidence-Grounded QA": 0.525,
149
+ "T4. Summarization & Synthesis": 0.5197606048087388,
150
+ "T5. Attribution & Citation Alignment": 0.4423027060091871,
151
+ "T6. Aggregation & Clustering": 0.5772430200792206,
152
+ "T7. Consistency & Compliance Checking": 0.3040682257365206,
153
+ "T8. Structured & Numeric Reasoning": 0.5402777777777777,
154
+ "T9. Version & Code Diff Analysis": 0.661946364328448,
155
+ "T10. Rule Induction & In-Context Learning": 0.5916666666666668,
156
+ "T11. Dialogue Memory & Long-Horizon Tracking": 0.4666666666666667
157
+ },
158
+ "language": {
159
+ "Chinese": 0.5366930860176997,
160
+ "English": 0.5717589286465138
161
+ }
162
+ },
163
+ "pass@3": 0.31133333333333335
164
+ }