Spaces:
Sleeping
Sleeping
add application file
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- README.md +1 -3
- app.py +877 -4
- results/Claude-3.7-Sonnet/nonthinking_context-120000_bon-3_summary.json +164 -0
- results/Claude-3.7-Sonnet/thinking_context-120000_bon-3_summary.json +164 -0
- results/Claude-4-Sonnet/nonthinking_context-1000000_bon-3_summary.json +164 -0
- results/Claude-4-Sonnet/thinking_context-1000000_bon-3_summary.json +164 -0
- results/DeepSeek-R1-0528/thinking_context-120000_bon-3_summary.json +164 -0
- results/DeepSeek-R1/thinking_context-120000_bon-3_summary.json +164 -0
- results/DeepSeek-V3-0324/nonthinking_context-120000_bon-3_summary.json +164 -0
- results/DeepSeek-V3-0324/thinking_context-120000_bon-3_summary.json +164 -0
- results/DeepSeek-V3.1/nonthinking_context-120000_bon-3_summary.json +164 -0
- results/DeepSeek-V3.1/thinking_context-120000_bon-3_summary.json +164 -0
- results/DeepSeek-V3.2/nonthinking_context-120000_bon-3_summary.json +164 -0
- results/DeepSeek-V3.2/thinking_context-120000_bon-3_summary.json +164 -0
- results/GLM-4.5/nonthinking_context-120000_bon-3_summary.json +164 -0
- results/GLM-4.5/thinking_context-120000_bon-3_summary.json +164 -0
- results/GLM-4.6/nonthinking_context-120000_bon-3_summary.json +164 -0
- results/GLM-4.6/thinking_context-120000_bon-3_summary.json +164 -0
- results/GPT-4o/nonthinking_context-120000_bon-3_summary.json +164 -0
- results/GPT-4o/thinking_context-120000_bon-3_summary.json +164 -0
- results/GPT-5/thinking_context-272000_bon-3_summary.json +164 -0
- results/GPT-OSS-120B/thinking_context-120000_bon-3_summary.json +164 -0
- results/GPT-OSS-20B/thinking_context-120000_bon-3_summary.json +164 -0
- results/Gemini-2.5-Flash/nonthinking_context-1000000_bon-3_summary.json +164 -0
- results/Gemini-2.5-Flash/thinking_context-1000000_bon-3_summary.json +164 -0
- results/Gemini-2.5-Pro/thinking_context-1000000_bon-3_summary.json +164 -0
- results/Gemma-3-12B-It/nonthinking_context-120000_bon-3_summary.json +164 -0
- results/Gemma-3-12B-It/thinking_context-120000_bon-3_summary.json +164 -0
- results/Gemma-3-27B-It/nonthinking_context-120000_bon-3_summary.json +164 -0
- results/Gemma-3-27B-It/thinking_context-120000_bon-3_summary.json +164 -0
- results/Gemma-3-4B-It/nonthinking_context-120000_bon-3_summary.json +164 -0
- results/Gemma-3-4B-It/thinking_context-120000_bon-3_summary.json +164 -0
- results/Kimi-K2-Instruct-0905/nonthinking_context-224000_bon-3_summary.json +164 -0
- results/Kimi-K2-Instruct-0905/thinking_context-224000_bon-3_summary.json +164 -0
- results/Llama-3.1-405B-Instruct/nonthinking_context-120000_bon-3_summary.json +164 -0
- results/Llama-3.1-405B-Instruct/thinking_context-120000_bon-3_summary.json +164 -0
- results/Llama-3.1-70B-Instruct/nonthinking_context-120000_bon-3_summary.json +164 -0
- results/Llama-3.1-70B-Instruct/thinking_context-120000_bon-3_summary.json +164 -0
- results/Llama-3.1-8B-Instruct/nonthinking_context-120000_bon-3_summary.json +164 -0
- results/Llama-3.1-8B-Instruct/thinking_context-120000_bon-3_summary.json +164 -0
- results/Llama-3.2-3B-Instruct/nonthinking_context-120000_bon-3_summary.json +164 -0
- results/Llama-3.2-3B-Instruct/thinking_context-120000_bon-3_summary.json +164 -0
- results/Llama-3.3-70B-Instruct/nonthinking_context-120000_bon-3_summary.json +164 -0
- results/Llama-3.3-70B-Instruct/thinking_context-120000_bon-3_summary.json +164 -0
- results/Magistral-Small-2509/thinking_context-120000_bon-3_summary.json +164 -0
- results/MiniMax-M2/thinking_context-1000000_bon-3_summary.json +164 -0
- results/MiniMax-Text-01/nonthinking_context-1000000_bon-3_summary.json +164 -0
- results/MiniMax-Text-01/thinking_context-1000000_bon-3_summary.json +164 -0
- results/Ministral-3-14B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json +164 -0
- results/Ministral-3-14B-Instruct-2512/thinking_context-224000_bon-3_summary.json +164 -0
README.md
CHANGED
|
@@ -9,6 +9,4 @@ app_file: app.py
|
|
| 9 |
pinned: false
|
| 10 |
license: apache-2.0
|
| 11 |
short_description: Realistic and Comprehensive Bilingual Long-Context Benchmark
|
| 12 |
-
---
|
| 13 |
-
|
| 14 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 9 |
pinned: false
|
| 10 |
license: apache-2.0
|
| 11 |
short_description: Realistic and Comprehensive Bilingual Long-Context Benchmark
|
| 12 |
+
---
|
|
|
|
|
|
app.py
CHANGED
|
@@ -1,7 +1,880 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
def
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
LongBenchmark 结果可视化
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
import re
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from pathlib import Path
|
| 11 |
import gradio as gr
|
| 12 |
+
import plotly.graph_objects as go
|
| 13 |
+
|
| 14 |
+
with open('./results/model_info.json', 'r', encoding='utf-8') as f:
|
| 15 |
+
MODLE_INFO_DICT = json.load(f)
|
| 16 |
+
|
| 17 |
+
def get_color(index):
|
| 18 |
+
"""基于索引生成颜色,使用黄金角度确保颜色分布均匀且无限"""
|
| 19 |
+
# 黄金角度约 137.508 度,确保颜色在色环上分布均匀
|
| 20 |
+
hue = (index * 137.508) % 360
|
| 21 |
+
# 固定饱和度为70%,亮度为60%,确保颜色既鲜艳又不刺眼
|
| 22 |
+
return f"hsl({hue}, 70%, 60%)"
|
| 23 |
+
|
| 24 |
+
class ResultParser:
|
| 25 |
+
def __init__(self, output_dir: str):
|
| 26 |
+
self.output_dir = Path(output_dir)
|
| 27 |
+
self.results = []
|
| 28 |
+
|
| 29 |
+
def parse_filename(self, filename: str):
|
| 30 |
+
"""解析文件名,提取context长度和是否包含thinking或nonthinking"""
|
| 31 |
+
# 提取context长度
|
| 32 |
+
context_match = re.search(r'context-(\d+)', filename)
|
| 33 |
+
context_length = int(context_match.group(1)) if context_match else 0
|
| 34 |
+
|
| 35 |
+
filename_lower = filename.lower()
|
| 36 |
+
# 检查是否包含nonthinking(优先检查,因为nonthinking也包含thinking)
|
| 37 |
+
has_nonthinking = 'nonthinking' in filename_lower
|
| 38 |
+
# 检查是否包含thinking(但不包含nonthinking)
|
| 39 |
+
has_thinking = 'thinking' in filename_lower and not has_nonthinking
|
| 40 |
+
|
| 41 |
+
return context_length, has_thinking, has_nonthinking
|
| 42 |
+
|
| 43 |
+
def parse_result_file(self, model_name: str, file_path: Path):
|
| 44 |
+
"""解析单个结果文件"""
|
| 45 |
+
try:
|
| 46 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
| 47 |
+
data = json.load(f)
|
| 48 |
+
|
| 49 |
+
context_length, has_thinking, has_nonthinking = self.parse_filename(file_path.name)
|
| 50 |
+
# 使用JSON文件中的date字段作为评估日期
|
| 51 |
+
eval_date = data.get('date', "未知")
|
| 52 |
+
|
| 53 |
+
# 提取BoN数据
|
| 54 |
+
bon_data = {}
|
| 55 |
+
for bon_key in ['BoN-1', 'BoN-2', 'BoN-3']:
|
| 56 |
+
if bon_key in data and 'overall_metric' in data[bon_key]:
|
| 57 |
+
bon_data[bon_key] = data[bon_key]['overall_metric']
|
| 58 |
+
|
| 59 |
+
result = {
|
| 60 |
+
'model_name': model_name,
|
| 61 |
+
'eval_date': eval_date,
|
| 62 |
+
'context_length': context_length,
|
| 63 |
+
'has_thinking': has_thinking,
|
| 64 |
+
'has_nonthinking': has_nonthinking,
|
| 65 |
+
'overall_metric': data.get('average_overall_metric', 0.0),
|
| 66 |
+
'token_length_metrics': data.get('average_token_length_metric', {}),
|
| 67 |
+
'contextual_requirement': data.get('average_contextual_requirement_metric', {}),
|
| 68 |
+
'difficulty': data.get('average_difficulty_metric', {}),
|
| 69 |
+
'primary_task': data.get('average_primary_task_metric', {}),
|
| 70 |
+
'language': data.get('average_language_metric', {}),
|
| 71 |
+
'bon_data': bon_data, # 存储BoN-1, BoN-2, BoN-3的overall_metric
|
| 72 |
+
'pass_at_k': {
|
| 73 |
+
'pass@1': data.get('pass@1'),
|
| 74 |
+
'pass@2': data.get('pass@2'),
|
| 75 |
+
'pass@3': data.get('pass@3')
|
| 76 |
+
}
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
return result
|
| 80 |
+
|
| 81 |
+
except Exception as e:
|
| 82 |
+
print(f"解析文件 {file_path} 时出错: {e}")
|
| 83 |
+
return None
|
| 84 |
+
|
| 85 |
+
def scan_all_results(self):
|
| 86 |
+
"""扫描所有模型的结果文件"""
|
| 87 |
+
self.results = []
|
| 88 |
+
|
| 89 |
+
if not self.output_dir.exists():
|
| 90 |
+
print(f"输出目录不存在: {self.output_dir}")
|
| 91 |
+
return
|
| 92 |
+
|
| 93 |
+
# 遍历所有模型目录
|
| 94 |
+
for model_dir in self.output_dir.iterdir():
|
| 95 |
+
if not model_dir.is_dir():
|
| 96 |
+
continue
|
| 97 |
+
|
| 98 |
+
model_name = model_dir.name
|
| 99 |
+
print(f"扫描模型: {model_name}")
|
| 100 |
+
|
| 101 |
+
# 查找该模型下的所有_summary.json文件
|
| 102 |
+
for file_path in model_dir.glob("*_summary.json"):
|
| 103 |
+
print(f" 解析文件: {file_path.name}")
|
| 104 |
+
result = self.parse_result_file(model_name, file_path)
|
| 105 |
+
if result:
|
| 106 |
+
self.results.append(result)
|
| 107 |
+
|
| 108 |
+
print(f"总共解析了 {len(self.results)} 个结果文件")
|
| 109 |
+
|
| 110 |
+
def get_leaderboard_data(self):
|
| 111 |
+
"""获取排行榜数据"""
|
| 112 |
+
if not self.results:
|
| 113 |
+
return pd.DataFrame()
|
| 114 |
+
|
| 115 |
+
# 按模型名称聚合数据
|
| 116 |
+
model_groups = {}
|
| 117 |
+
for result in self.results:
|
| 118 |
+
model_name = result['model_name']
|
| 119 |
+
if model_name not in model_groups:
|
| 120 |
+
model_groups[model_name] = {
|
| 121 |
+
'dates': [],
|
| 122 |
+
'contexts': [],
|
| 123 |
+
'thinking_scores': [],
|
| 124 |
+
'non_thinking_scores': []
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
group = model_groups[model_name]
|
| 128 |
+
group['dates'].append(result['eval_date'])
|
| 129 |
+
group['contexts'].append(result['context_length'])
|
| 130 |
+
|
| 131 |
+
score = result['overall_metric']
|
| 132 |
+
if result['has_thinking']:
|
| 133 |
+
group['thinking_scores'].append(score)
|
| 134 |
+
else:
|
| 135 |
+
group['non_thinking_scores'].append(score)
|
| 136 |
+
|
| 137 |
+
leaderboard_data = []
|
| 138 |
+
for model_name, group in model_groups.items():
|
| 139 |
+
# 获取最新日期
|
| 140 |
+
valid_dates = [d for d in group['dates'] if d != "未知"]
|
| 141 |
+
latest_date = max(valid_dates) if valid_dates else "未知"
|
| 142 |
+
|
| 143 |
+
# 获取最大Context Window
|
| 144 |
+
max_context = max(group['contexts']) if group['contexts'] else 0
|
| 145 |
+
|
| 146 |
+
# 格式化截断长度
|
| 147 |
+
if max_context >= 1000000:
|
| 148 |
+
context_str = f"{max_context/1000000:.0f}M" if max_context % 1000000 == 0 else f"{max_context/1000000:.1f}M"
|
| 149 |
+
elif max_context >= 1000:
|
| 150 |
+
context_str = f"{max_context/1000:.0f}k" if max_context % 1000 == 0 else f"{max_context/1000:.1f}k"
|
| 151 |
+
else:
|
| 152 |
+
context_str = str(max_context)
|
| 153 |
+
|
| 154 |
+
# 获取模型类型和上下文长度
|
| 155 |
+
model_context = "-"
|
| 156 |
+
model_url = ""
|
| 157 |
+
if model_name in MODLE_INFO_DICT:
|
| 158 |
+
model_info = MODLE_INFO_DICT[model_name]
|
| 159 |
+
if isinstance(model_info, dict):
|
| 160 |
+
model_type = model_info.get("type", "Unknown")
|
| 161 |
+
model_context = model_info.get("context_length", "-")
|
| 162 |
+
model_url = model_info.get("url", "")
|
| 163 |
+
else:
|
| 164 |
+
model_type = str(model_info)
|
| 165 |
+
else:
|
| 166 |
+
model_type = "Unknown"
|
| 167 |
+
|
| 168 |
+
# 处理模型名称链接和图标
|
| 169 |
+
display_model_name = model_name
|
| 170 |
+
|
| 171 |
+
if model_url:
|
| 172 |
+
display_model_name = f"[{display_model_name}]({model_url})"
|
| 173 |
+
|
| 174 |
+
# 计算平均分
|
| 175 |
+
nt_score_val = 0
|
| 176 |
+
nt_score_str = "-"
|
| 177 |
+
if group['non_thinking_scores']:
|
| 178 |
+
nt_score_val = sum(group['non_thinking_scores']) / len(group['non_thinking_scores'])
|
| 179 |
+
nt_score_str = f"{nt_score_val * 100:.2f}"
|
| 180 |
+
|
| 181 |
+
t_score_val = 0
|
| 182 |
+
t_score_str = "-"
|
| 183 |
+
if group['thinking_scores']:
|
| 184 |
+
t_score_val = sum(group['thinking_scores']) / len(group['thinking_scores'])
|
| 185 |
+
t_score_str = f"{t_score_val * 100:.2f}"
|
| 186 |
+
|
| 187 |
+
leaderboard_data.append({
|
| 188 |
+
'模型名称': display_model_name,
|
| 189 |
+
'模型类型': model_type,
|
| 190 |
+
'上下文长度': model_context,
|
| 191 |
+
'截断长度': context_str,
|
| 192 |
+
'非思考得分': nt_score_str,
|
| 193 |
+
'思考得分': t_score_str,
|
| 194 |
+
'_sort_score': max(nt_score_val, t_score_val)
|
| 195 |
+
})
|
| 196 |
+
|
| 197 |
+
df = pd.DataFrame(leaderboard_data)
|
| 198 |
+
# 按最高分降序排列
|
| 199 |
+
if not df.empty:
|
| 200 |
+
df = df.sort_values('_sort_score', ascending=False).drop(columns=['_sort_score']).reset_index(drop=True)
|
| 201 |
+
|
| 202 |
+
return df
|
| 203 |
+
|
| 204 |
+
def get_display_name_for_result(result):
|
| 205 |
+
"""获取模型的显示名称(根据是否包含thinking或nonthinking添加后缀)"""
|
| 206 |
+
if result.get('has_nonthinking'):
|
| 207 |
+
return f"{result['model_name']}_nonthinking"
|
| 208 |
+
elif result.get('has_thinking'):
|
| 209 |
+
return f"{result['model_name']}_thinking"
|
| 210 |
+
else:
|
| 211 |
+
return result['model_name']
|
| 212 |
+
|
| 213 |
+
def get_model_color_index(model_name, all_models):
|
| 214 |
+
"""获取模型在颜色列表中的索引"""
|
| 215 |
+
try:
|
| 216 |
+
return all_models.index(model_name)
|
| 217 |
+
except ValueError:
|
| 218 |
+
return 0
|
| 219 |
+
|
| 220 |
+
def create_contextual_requirement_chart(results, selected_models):
|
| 221 |
+
"""创建上下文需求对比柱状图"""
|
| 222 |
+
if not selected_models:
|
| 223 |
+
return go.Figure()
|
| 224 |
+
|
| 225 |
+
# 收集数据 - 直接使用summary中的值,不需要计算平均值
|
| 226 |
+
chart_data = {}
|
| 227 |
+
|
| 228 |
+
for result in results:
|
| 229 |
+
display_name = get_display_name_for_result(result)
|
| 230 |
+
if display_name in selected_models:
|
| 231 |
+
model_name = display_name
|
| 232 |
+
contextual_requirement = result['contextual_requirement']
|
| 233 |
+
|
| 234 |
+
# 直接存储每个模型的结果,不需要计算平均值
|
| 235 |
+
if model_name not in chart_data:
|
| 236 |
+
chart_data[model_name] = {}
|
| 237 |
+
|
| 238 |
+
for req_type, score in contextual_requirement.items():
|
| 239 |
+
chart_data[model_name][req_type] = score * 100 # 乘以100
|
| 240 |
+
|
| 241 |
+
# 创建图表
|
| 242 |
+
fig = go.Figure()
|
| 243 |
+
|
| 244 |
+
# 获取所有需求类型 - 保持原始顺序,不排序
|
| 245 |
+
all_req_types = []
|
| 246 |
+
for result in results:
|
| 247 |
+
display_name = get_display_name_for_result(result)
|
| 248 |
+
if display_name in selected_models:
|
| 249 |
+
contextual_requirement = result['contextual_requirement']
|
| 250 |
+
for req_type in contextual_requirement.keys():
|
| 251 |
+
if req_type not in all_req_types:
|
| 252 |
+
all_req_types.append(req_type)
|
| 253 |
+
|
| 254 |
+
for model_name in selected_models:
|
| 255 |
+
if model_name in chart_data:
|
| 256 |
+
scores = [chart_data[model_name].get(req_type, 0) for req_type in all_req_types]
|
| 257 |
+
color_index = get_model_color_index(model_name, selected_models)
|
| 258 |
+
|
| 259 |
+
fig.add_trace(go.Bar(
|
| 260 |
+
name=model_name,
|
| 261 |
+
x=all_req_types,
|
| 262 |
+
y=scores,
|
| 263 |
+
marker_color=get_color(color_index),
|
| 264 |
+
text=[f"{score:.2f}" for score in scores], # 保留2位小数
|
| 265 |
+
textposition='auto'
|
| 266 |
+
))
|
| 267 |
+
|
| 268 |
+
fig.update_layout(
|
| 269 |
+
title='模型在不同上下文需求上的性能对比',
|
| 270 |
+
xaxis_title='上下文需求类型',
|
| 271 |
+
yaxis_title='平均得分',
|
| 272 |
+
barmode='group',
|
| 273 |
+
autosize=True, # 自动调整大小
|
| 274 |
+
legend=dict(
|
| 275 |
+
orientation="h",
|
| 276 |
+
yanchor="top",
|
| 277 |
+
y=-0.25, # 调整到更下方
|
| 278 |
+
xanchor="center",
|
| 279 |
+
x=0.5
|
| 280 |
+
),
|
| 281 |
+
margin=dict(b=100) # 增加底部边距
|
| 282 |
+
)
|
| 283 |
+
|
| 284 |
+
return fig
|
| 285 |
+
|
| 286 |
+
def create_primary_task_radar_chart(results, selected_models):
|
| 287 |
+
"""创建主要任务雷达图(按任务前缀聚合,使用'.'前缀,绘制最多11个任务)"""
|
| 288 |
+
if not selected_models:
|
| 289 |
+
return go.Figure()
|
| 290 |
+
|
| 291 |
+
# 收集所有模型下的任务前缀,保持出现顺序
|
| 292 |
+
prefix_order = []
|
| 293 |
+
# 为每个模型构建 前缀 -> [scores] 的映射
|
| 294 |
+
model_prefix_scores = {}
|
| 295 |
+
|
| 296 |
+
for result in results:
|
| 297 |
+
display_name = get_display_name_for_result(result)
|
| 298 |
+
if display_name not in selected_models:
|
| 299 |
+
continue
|
| 300 |
+
primary_task = result.get('primary_task', {})
|
| 301 |
+
if display_name not in model_prefix_scores:
|
| 302 |
+
model_prefix_scores[display_name] = {}
|
| 303 |
+
for task_key, score in primary_task.items():
|
| 304 |
+
prefix = task_key.split('.')[0].strip() if isinstance(task_key, str) else str(task_key)
|
| 305 |
+
if prefix not in prefix_order:
|
| 306 |
+
prefix_order.append(prefix)
|
| 307 |
+
if prefix not in model_prefix_scores[display_name]:
|
| 308 |
+
model_prefix_scores[display_name][prefix] = []
|
| 309 |
+
model_prefix_scores[display_name][prefix].append(score * 100)
|
| 310 |
+
|
| 311 |
+
# 只取前11个前缀用于绘制
|
| 312 |
+
categories = prefix_order[:11]
|
| 313 |
+
|
| 314 |
+
# 创建雷达图
|
| 315 |
+
fig = go.Figure()
|
| 316 |
+
|
| 317 |
+
for model_name in selected_models:
|
| 318 |
+
if model_name not in model_prefix_scores:
|
| 319 |
+
continue
|
| 320 |
+
# 对每个前缀做均值聚合;缺失则为0
|
| 321 |
+
values = []
|
| 322 |
+
for prefix in categories:
|
| 323 |
+
scores = model_prefix_scores[model_name].get(prefix, [])
|
| 324 |
+
if scores:
|
| 325 |
+
values.append(sum(scores) / len(scores))
|
| 326 |
+
else:
|
| 327 |
+
values.append(0)
|
| 328 |
+
# 闭合多边形
|
| 329 |
+
r_values = values + ([values[0]] if values else [])
|
| 330 |
+
theta_values = categories + ([categories[0]] if categories else [])
|
| 331 |
+
color_index = get_model_color_index(model_name, selected_models)
|
| 332 |
+
fig.add_trace(go.Scatterpolar(
|
| 333 |
+
r=r_values,
|
| 334 |
+
theta=theta_values,
|
| 335 |
+
mode='lines+markers',
|
| 336 |
+
name=model_name,
|
| 337 |
+
line=dict(color=get_color(color_index), width=3),
|
| 338 |
+
marker=dict(size=6),
|
| 339 |
+
fill='toself'
|
| 340 |
+
))
|
| 341 |
+
|
| 342 |
+
fig.update_layout(
|
| 343 |
+
title='模型在不同主要任务上的性能对比',
|
| 344 |
+
polar=dict(
|
| 345 |
+
radialaxis=dict(visible=True, range=[0, 100])
|
| 346 |
+
),
|
| 347 |
+
legend=dict(
|
| 348 |
+
orientation="h",
|
| 349 |
+
yanchor="top",
|
| 350 |
+
y=-0.2,
|
| 351 |
+
xanchor="center",
|
| 352 |
+
x=0.5
|
| 353 |
+
),
|
| 354 |
+
margin=dict(b=100)
|
| 355 |
+
)
|
| 356 |
+
|
| 357 |
+
return fig
|
| 358 |
+
|
| 359 |
+
def create_language_chart(results, selected_models):
|
| 360 |
+
"""创建语言对比柱状图"""
|
| 361 |
+
if not selected_models:
|
| 362 |
+
return go.Figure()
|
| 363 |
+
|
| 364 |
+
# 收集数据 - 直接使用summary中的值,不需要计算平均值
|
| 365 |
+
chart_data = {}
|
| 366 |
+
|
| 367 |
+
for result in results:
|
| 368 |
+
display_name = get_display_name_for_result(result)
|
| 369 |
+
if display_name in selected_models:
|
| 370 |
+
model_name = display_name
|
| 371 |
+
language = result['language']
|
| 372 |
+
|
| 373 |
+
# 直接存储每个模型的结果,不需要计算平均值
|
| 374 |
+
if model_name not in chart_data:
|
| 375 |
+
chart_data[model_name] = {}
|
| 376 |
+
|
| 377 |
+
for lang_type, score in language.items():
|
| 378 |
+
chart_data[model_name][lang_type] = score * 100 # 乘以100
|
| 379 |
+
|
| 380 |
+
# 创建图表
|
| 381 |
+
fig = go.Figure()
|
| 382 |
+
|
| 383 |
+
# 获取所有语言类型 - 保持原始顺序,不排序
|
| 384 |
+
all_lang_types = []
|
| 385 |
+
for result in results:
|
| 386 |
+
display_name = get_display_name_for_result(result)
|
| 387 |
+
if display_name in selected_models:
|
| 388 |
+
language = result['language']
|
| 389 |
+
for lang_type in language.keys():
|
| 390 |
+
if lang_type not in all_lang_types:
|
| 391 |
+
all_lang_types.append(lang_type)
|
| 392 |
+
|
| 393 |
+
for model_name in selected_models:
|
| 394 |
+
if model_name in chart_data:
|
| 395 |
+
scores = [chart_data[model_name].get(lang_type, 0) for lang_type in all_lang_types]
|
| 396 |
+
color_index = get_model_color_index(model_name, selected_models)
|
| 397 |
+
|
| 398 |
+
fig.add_trace(go.Bar(
|
| 399 |
+
name=model_name,
|
| 400 |
+
x=all_lang_types,
|
| 401 |
+
y=scores,
|
| 402 |
+
marker_color=get_color(color_index),
|
| 403 |
+
text=[f"{score:.2f}" for score in scores], # 保留2位小数
|
| 404 |
+
textposition='auto'
|
| 405 |
+
))
|
| 406 |
+
|
| 407 |
+
fig.update_layout(
|
| 408 |
+
title='模型在不同语言上的性能对比',
|
| 409 |
+
xaxis_title='语言类型',
|
| 410 |
+
yaxis_title='平均得分',
|
| 411 |
+
barmode='group',
|
| 412 |
+
autosize=True, # 自动调整大小
|
| 413 |
+
legend=dict(
|
| 414 |
+
orientation="h",
|
| 415 |
+
yanchor="top",
|
| 416 |
+
y=-0.25, # 调整到更下方
|
| 417 |
+
xanchor="center",
|
| 418 |
+
x=0.5
|
| 419 |
+
),
|
| 420 |
+
margin=dict(b=100) # 增加底部边距
|
| 421 |
+
)
|
| 422 |
+
|
| 423 |
+
return fig
|
| 424 |
+
|
| 425 |
+
def create_difficulty_chart(results, selected_models):
|
| 426 |
+
"""创建难度对比柱状图"""
|
| 427 |
+
if not selected_models:
|
| 428 |
+
return go.Figure()
|
| 429 |
+
|
| 430 |
+
# 收集数据 - 直接使用summary中的值,不需要计算平均值
|
| 431 |
+
chart_data = {}
|
| 432 |
+
|
| 433 |
+
for result in results:
|
| 434 |
+
display_name = get_display_name_for_result(result)
|
| 435 |
+
if display_name in selected_models:
|
| 436 |
+
model_name = display_name
|
| 437 |
+
difficulty = result['difficulty']
|
| 438 |
+
|
| 439 |
+
# 直接存储每个模型的结果,不需要计算平均值
|
| 440 |
+
if model_name not in chart_data:
|
| 441 |
+
chart_data[model_name] = {}
|
| 442 |
+
|
| 443 |
+
for diff_type, score in difficulty.items():
|
| 444 |
+
chart_data[model_name][diff_type] = score * 100 # 乘以100
|
| 445 |
+
|
| 446 |
+
# 创建图表
|
| 447 |
+
fig = go.Figure()
|
| 448 |
+
|
| 449 |
+
# 获取所有难度类型 - 保持原始顺序,不排序
|
| 450 |
+
all_diff_types = []
|
| 451 |
+
for result in results:
|
| 452 |
+
display_name = get_display_name_for_result(result)
|
| 453 |
+
if display_name in selected_models:
|
| 454 |
+
difficulty = result['difficulty']
|
| 455 |
+
for diff_type in difficulty.keys():
|
| 456 |
+
if diff_type not in all_diff_types:
|
| 457 |
+
all_diff_types.append(diff_type)
|
| 458 |
+
|
| 459 |
+
for model_name in selected_models:
|
| 460 |
+
if model_name in chart_data:
|
| 461 |
+
scores = [chart_data[model_name].get(diff_type, 0) for diff_type in all_diff_types]
|
| 462 |
+
color_index = get_model_color_index(model_name, selected_models)
|
| 463 |
+
|
| 464 |
+
fig.add_trace(go.Bar(
|
| 465 |
+
name=model_name,
|
| 466 |
+
x=all_diff_types,
|
| 467 |
+
y=scores,
|
| 468 |
+
marker_color=get_color(color_index),
|
| 469 |
+
text=[f"{score:.2f}" for score in scores], # 保留2位小数
|
| 470 |
+
textposition='auto'
|
| 471 |
+
))
|
| 472 |
+
|
| 473 |
+
fig.update_layout(
|
| 474 |
+
title='模型在不同难度上的性能对比',
|
| 475 |
+
xaxis_title='难度类型',
|
| 476 |
+
yaxis_title='平均得分',
|
| 477 |
+
barmode='group',
|
| 478 |
+
autosize=True, # 自动调整大小
|
| 479 |
+
legend=dict(
|
| 480 |
+
orientation="h",
|
| 481 |
+
yanchor="top",
|
| 482 |
+
y=-0.25, # 调整到更下方
|
| 483 |
+
xanchor="center",
|
| 484 |
+
x=0.5
|
| 485 |
+
),
|
| 486 |
+
margin=dict(b=100) # 增加底部边距
|
| 487 |
+
)
|
| 488 |
+
|
| 489 |
+
return fig
|
| 490 |
+
|
| 491 |
+
def create_length_heatmap(results, selected_models):
|
| 492 |
+
"""创建长度热力图:横坐标为长度,纵坐标为模型"""
|
| 493 |
+
if not selected_models:
|
| 494 |
+
return go.Figure()
|
| 495 |
+
|
| 496 |
+
# 定义标准的context长度范围:8k, 16k, 32k, 64k, 128k, 256k
|
| 497 |
+
standard_lengths = [8000, 16000, 32000, 64000, 128000, 256000]
|
| 498 |
+
standard_length_keys = ['8k', '16k', '32k', '64k', '128k', '256k']
|
| 499 |
+
|
| 500 |
+
# 准备热力图数据
|
| 501 |
+
heatmap_data = []
|
| 502 |
+
model_names = []
|
| 503 |
+
|
| 504 |
+
for result in results:
|
| 505 |
+
display_name = get_display_name_for_result(result)
|
| 506 |
+
if display_name in selected_models:
|
| 507 |
+
model_names.append(display_name)
|
| 508 |
+
|
| 509 |
+
# 从token_length_metrics中获取数据
|
| 510 |
+
token_length_metrics = result.get('token_length_metrics', {})
|
| 511 |
+
row_data = []
|
| 512 |
+
|
| 513 |
+
for key in standard_length_keys:
|
| 514 |
+
if key in token_length_metrics:
|
| 515 |
+
row_data.append(token_length_metrics[key] * 100) # 乘以100转换为百分比
|
| 516 |
+
else:
|
| 517 |
+
row_data.append(None) # 没有数据点
|
| 518 |
+
|
| 519 |
+
heatmap_data.append(row_data)
|
| 520 |
+
|
| 521 |
+
# 创建热力图
|
| 522 |
+
fig = go.Figure(data=go.Heatmap(
|
| 523 |
+
z=heatmap_data,
|
| 524 |
+
x=[f"{length//1000}k" for length in standard_lengths], # x轴标签
|
| 525 |
+
y=model_names, # y轴标签
|
| 526 |
+
colorscale='RdYlBu_r', # 颜色映射:红色表示低分,蓝色表示高分
|
| 527 |
+
showscale=True,
|
| 528 |
+
text=[[f"{val:.2f}" if val is not None else "N/A" for val in row] for row in heatmap_data], # 显示数值
|
| 529 |
+
texttemplate="%{text}",
|
| 530 |
+
textfont={"size": 10},
|
| 531 |
+
hoverongaps=False
|
| 532 |
+
))
|
| 533 |
+
|
| 534 |
+
fig.update_layout(
|
| 535 |
+
title='模型在不同Context长度上的性能热力图',
|
| 536 |
+
xaxis_title='Context长度 (tokens)',
|
| 537 |
+
yaxis_title='模型名称',
|
| 538 |
+
autosize=True,
|
| 539 |
+
height=max(400, len(model_names) * 50), # 根据模型数量调整高度
|
| 540 |
+
margin=dict(l=150, r=50, t=80, b=80) # 调整边距,左侧留更多空间给模型名称
|
| 541 |
+
)
|
| 542 |
+
|
| 543 |
+
return fig
|
| 544 |
+
|
| 545 |
+
def create_bon_chart(results, selected_models):
|
| 546 |
+
"""创建BoN 1-3折线图,显示overall_metric"""
|
| 547 |
+
if not selected_models:
|
| 548 |
+
return go.Figure()
|
| 549 |
+
|
| 550 |
+
# BoN 标签
|
| 551 |
+
bon_labels = ['BoN-1', 'BoN-2', 'BoN-3']
|
| 552 |
+
bon_indices = [1, 2, 3]
|
| 553 |
+
|
| 554 |
+
# 为每个模型准备数据
|
| 555 |
+
model_data = {}
|
| 556 |
+
for result in results:
|
| 557 |
+
display_name = get_display_name_for_result(result)
|
| 558 |
+
if display_name in selected_models:
|
| 559 |
+
if display_name not in model_data:
|
| 560 |
+
model_data[display_name] = {}
|
| 561 |
+
|
| 562 |
+
# 从bon_data中获取数据
|
| 563 |
+
bon_data = result.get('bon_data', {})
|
| 564 |
+
for bon_key in bon_labels:
|
| 565 |
+
if bon_key in bon_data:
|
| 566 |
+
bon_index = bon_labels.index(bon_key) + 1
|
| 567 |
+
model_data[display_name][bon_index] = bon_data[bon_key] * 100 # 乘以100转换为百分比
|
| 568 |
+
|
| 569 |
+
# 创建图表
|
| 570 |
+
fig = go.Figure()
|
| 571 |
+
|
| 572 |
+
for model_name, data in model_data.items():
|
| 573 |
+
if not data:
|
| 574 |
+
continue
|
| 575 |
+
|
| 576 |
+
# 为每个BoN准备数据
|
| 577 |
+
x_values = []
|
| 578 |
+
y_values = []
|
| 579 |
+
text_values = []
|
| 580 |
+
|
| 581 |
+
for bon_index in bon_indices:
|
| 582 |
+
x_values.append(bon_index)
|
| 583 |
+
if bon_index in data:
|
| 584 |
+
y_values.append(data[bon_index])
|
| 585 |
+
text_values.append(f"{data[bon_index]:.2f}")
|
| 586 |
+
else:
|
| 587 |
+
y_values.append(None)
|
| 588 |
+
text_values.append("")
|
| 589 |
+
|
| 590 |
+
# 获取模型颜色索引
|
| 591 |
+
color_index = get_model_color_index(model_name, selected_models)
|
| 592 |
+
|
| 593 |
+
fig.add_trace(go.Scatter(
|
| 594 |
+
x=x_values,
|
| 595 |
+
y=y_values,
|
| 596 |
+
mode='lines+markers',
|
| 597 |
+
name=model_name,
|
| 598 |
+
line=dict(color=get_color(color_index), width=3),
|
| 599 |
+
marker=dict(size=10),
|
| 600 |
+
text=text_values,
|
| 601 |
+
textposition='top center',
|
| 602 |
+
connectgaps=False
|
| 603 |
+
))
|
| 604 |
+
|
| 605 |
+
# 设置x轴
|
| 606 |
+
fig.update_layout(
|
| 607 |
+
title='模型在不同Best-of-N下的对比',
|
| 608 |
+
xaxis_title='N',
|
| 609 |
+
yaxis_title='平均得分',
|
| 610 |
+
autosize=True,
|
| 611 |
+
xaxis=dict(
|
| 612 |
+
tickmode='array',
|
| 613 |
+
tickvals=bon_indices,
|
| 614 |
+
ticktext=bon_labels,
|
| 615 |
+
tickangle=0
|
| 616 |
+
),
|
| 617 |
+
legend=dict(
|
| 618 |
+
orientation="h",
|
| 619 |
+
yanchor="top",
|
| 620 |
+
y=-0.25,
|
| 621 |
+
xanchor="center",
|
| 622 |
+
x=0.5
|
| 623 |
+
),
|
| 624 |
+
margin=dict(b=100)
|
| 625 |
+
)
|
| 626 |
+
|
| 627 |
+
return fig
|
| 628 |
+
|
| 629 |
+
def create_pass_k_chart(results, selected_models):
|
| 630 |
+
"""创建Pass@N 折线图"""
|
| 631 |
+
if not selected_models:
|
| 632 |
+
return go.Figure()
|
| 633 |
+
|
| 634 |
+
# Pass@K 标签
|
| 635 |
+
k_labels = ['pass@1', 'pass@2', 'pass@3']
|
| 636 |
+
k_indices = [1, 2, 3]
|
| 637 |
+
|
| 638 |
+
# 为每个模型准备数据
|
| 639 |
+
model_data = {}
|
| 640 |
+
for result in results:
|
| 641 |
+
display_name = get_display_name_for_result(result)
|
| 642 |
+
if display_name in selected_models:
|
| 643 |
+
if display_name not in model_data:
|
| 644 |
+
model_data[display_name] = {}
|
| 645 |
+
|
| 646 |
+
# 从pass_at_k中获取数据
|
| 647 |
+
pass_data = result.get('pass_at_k', {})
|
| 648 |
+
for i, k_key in enumerate(k_labels):
|
| 649 |
+
val = pass_data.get(k_key)
|
| 650 |
+
if val is not None:
|
| 651 |
+
k_index = k_indices[i]
|
| 652 |
+
model_data[display_name][k_index] = val * 100 # 乘以100转换为百分比
|
| 653 |
+
|
| 654 |
+
# 创建图表
|
| 655 |
+
fig = go.Figure()
|
| 656 |
+
|
| 657 |
+
for model_name, data in model_data.items():
|
| 658 |
+
if not data:
|
| 659 |
+
continue
|
| 660 |
+
|
| 661 |
+
# 为每个Pass@K准备数据
|
| 662 |
+
x_values = []
|
| 663 |
+
y_values = []
|
| 664 |
+
text_values = []
|
| 665 |
+
|
| 666 |
+
for k_index in k_indices:
|
| 667 |
+
x_values.append(k_index)
|
| 668 |
+
if k_index in data:
|
| 669 |
+
y_values.append(data[k_index])
|
| 670 |
+
text_values.append(f"{data[k_index]:.2f}")
|
| 671 |
+
else:
|
| 672 |
+
y_values.append(None)
|
| 673 |
+
text_values.append("")
|
| 674 |
+
|
| 675 |
+
# 获取模型颜色索引
|
| 676 |
+
color_index = get_model_color_index(model_name, selected_models)
|
| 677 |
+
|
| 678 |
+
fig.add_trace(go.Scatter(
|
| 679 |
+
x=x_values,
|
| 680 |
+
y=y_values,
|
| 681 |
+
mode='lines+markers',
|
| 682 |
+
name=model_name,
|
| 683 |
+
line=dict(color=get_color(color_index), width=3),
|
| 684 |
+
marker=dict(size=10),
|
| 685 |
+
text=text_values,
|
| 686 |
+
textposition='top center',
|
| 687 |
+
connectgaps=False
|
| 688 |
+
))
|
| 689 |
+
|
| 690 |
+
# 设置x轴
|
| 691 |
+
fig.update_layout(
|
| 692 |
+
title='模型在不同Pass@N下的对比',
|
| 693 |
+
xaxis_title='N',
|
| 694 |
+
yaxis_title='Pass@N (%)',
|
| 695 |
+
autosize=True,
|
| 696 |
+
xaxis=dict(
|
| 697 |
+
tickmode='array',
|
| 698 |
+
tickvals=k_indices,
|
| 699 |
+
ticktext=k_labels,
|
| 700 |
+
tickangle=0
|
| 701 |
+
),
|
| 702 |
+
legend=dict(
|
| 703 |
+
orientation="h",
|
| 704 |
+
yanchor="top",
|
| 705 |
+
y=-0.25,
|
| 706 |
+
xanchor="center",
|
| 707 |
+
x=0.5
|
| 708 |
+
),
|
| 709 |
+
margin=dict(b=100)
|
| 710 |
+
)
|
| 711 |
+
|
| 712 |
+
return fig
|
| 713 |
+
|
| 714 |
+
def create_gradio_interface(parser: ResultParser):
|
| 715 |
+
"""创建Gradio界面"""
|
| 716 |
+
|
| 717 |
+
def refresh_data():
|
| 718 |
+
"""刷新数据"""
|
| 719 |
+
parser.scan_all_results()
|
| 720 |
+
return parser.get_leaderboard_data()
|
| 721 |
+
|
| 722 |
+
def get_model_choices():
|
| 723 |
+
"""获取模型选择列表(按是否包含Thinking或NonThinking区分,以相应后缀标识)"""
|
| 724 |
+
if not parser.results:
|
| 725 |
+
return []
|
| 726 |
+
display_names = set()
|
| 727 |
+
for r in parser.results:
|
| 728 |
+
name = get_display_name_for_result(r)
|
| 729 |
+
display_names.add(name)
|
| 730 |
+
models = sorted(list(display_names))
|
| 731 |
+
return models
|
| 732 |
+
|
| 733 |
+
def update_charts(selected_models):
|
| 734 |
+
"""更新所有图表"""
|
| 735 |
+
if not selected_models:
|
| 736 |
+
return None, None, None, None, None, None, None
|
| 737 |
+
|
| 738 |
+
length_heatmap = create_length_heatmap(parser.results, selected_models)
|
| 739 |
+
contextual_chart = create_contextual_requirement_chart(parser.results, selected_models)
|
| 740 |
+
primary_task_radar_chart = create_primary_task_radar_chart(parser.results, selected_models)
|
| 741 |
+
language_chart = create_language_chart(parser.results, selected_models)
|
| 742 |
+
difficulty_chart = create_difficulty_chart(parser.results, selected_models)
|
| 743 |
+
bon_chart = create_bon_chart(parser.results, selected_models)
|
| 744 |
+
pass_k_chart = create_pass_k_chart(parser.results, selected_models)
|
| 745 |
+
|
| 746 |
+
return length_heatmap, contextual_chart, primary_task_radar_chart, language_chart, difficulty_chart, bon_chart, pass_k_chart
|
| 747 |
+
|
| 748 |
+
# 自定义CSS:
|
| 749 |
+
# 1. 强制所有表头居中(包括内部的按钮或文本容器)
|
| 750 |
+
# 2. 除了第一列(模型名称),其他列内容居中
|
| 751 |
+
custom_css = """
|
| 752 |
+
/* 强制标题居中 */
|
| 753 |
+
h1 {
|
| 754 |
+
text-align: center;
|
| 755 |
+
display: block;
|
| 756 |
+
}
|
| 757 |
+
|
| 758 |
+
/* 表头居中 */
|
| 759 |
+
#leaderboard_table th,
|
| 760 |
+
#leaderboard_table th button,
|
| 761 |
+
#leaderboard_table th span {
|
| 762 |
+
text-align: center !important;
|
| 763 |
+
justify-content: center !important;
|
| 764 |
+
}
|
| 765 |
+
|
| 766 |
+
/* 内容列居中:从第3列开始(跳过行号和模型名称) */
|
| 767 |
+
#leaderboard_table td:nth-child(n+3) {
|
| 768 |
+
text-align: center !important;
|
| 769 |
+
}
|
| 770 |
+
"""
|
| 771 |
+
|
| 772 |
+
# 创建界面
|
| 773 |
+
with gr.Blocks(title="LongBench Pro 结果可视化", theme=gr.themes.Soft(), css=custom_css) as demo:
|
| 774 |
+
gr.Markdown("# LongBench Pro 结果可视化")
|
| 775 |
+
|
| 776 |
+
gr.HTML("""
|
| 777 |
+
<div style="text-align: center; display: flex; justify-content: center; gap: 10px; margin-bottom: 20px;">
|
| 778 |
+
<a href="https://huggingface.co/datasets/caskcsg/LongBench-Pro" target="_blank"><img src="https://img.shields.io/badge/HF-Dataset-yellow?logo=huggingface&logoColor=white" alt="HF Dataset"></a>
|
| 779 |
+
<a href="https://github.com/caskcsg/longcontext/tree/main/LongBench_Pro" target="_blank"><img src="https://img.shields.io/badge/Github-Code-blue?logo=github&logoColor=white" alt="Github Code"></a>
|
| 780 |
+
<a href="https://huggingface.co/spaces/caskcsg/LongBench-Pro-Leaderboard" target="_blank"><img src="https://img.shields.io/badge/🏆-Leaderboard-red" alt="Leaderboard"></a>
|
| 781 |
+
<a href="#" target="_blank"><img src="https://img.shields.io/badge/📄-Arxiv_Paper-green" alt="Paper"></a>
|
| 782 |
+
</div>
|
| 783 |
+
""")
|
| 784 |
+
|
| 785 |
+
# 排行榜区域
|
| 786 |
+
gr.Markdown("## 🏆 总体性能排行榜")
|
| 787 |
+
gr.Markdown("""
|
| 788 |
+
- *思考模型和混合思考模型的思考得分,使用本身的思考能力(Non-Thinking Prompt)*
|
| 789 |
+
- *指令模型的思考得分,使用思考提示获得(Thinking Prompt)*
|
| 790 |
+
""")
|
| 791 |
+
leaderboard_df = gr.Dataframe(
|
| 792 |
+
headers=["模型名称", "模型类型", "上下文长度", "截断长度", "非思考得分", "思考得分"],
|
| 793 |
+
datatype=["markdown", "str", "str", "str", "str", "str"],
|
| 794 |
+
interactive=False,
|
| 795 |
+
wrap=True,
|
| 796 |
+
show_row_numbers=True,
|
| 797 |
+
show_search="filter",
|
| 798 |
+
show_fullscreen_button=True,
|
| 799 |
+
max_height=800,
|
| 800 |
+
column_widths=["250px", "100px", "100px", "100px", "120px", "120px"],
|
| 801 |
+
elem_id="leaderboard_table"
|
| 802 |
+
)
|
| 803 |
+
|
| 804 |
+
# 模型筛选和图表区域
|
| 805 |
+
gr.HTML("<br>")
|
| 806 |
+
gr.Markdown("## 📊 特定维度对比")
|
| 807 |
+
with gr.Row():
|
| 808 |
+
with gr.Column(scale=4):
|
| 809 |
+
model_selector = gr.Dropdown(
|
| 810 |
+
choices=[],
|
| 811 |
+
label="选择模型",
|
| 812 |
+
value=[],
|
| 813 |
+
multiselect=True,
|
| 814 |
+
interactive=True
|
| 815 |
+
)
|
| 816 |
+
with gr.Column(scale=1):
|
| 817 |
+
update_charts_btn = gr.Button("更新图表", variant="primary", size="lg")
|
| 818 |
+
|
| 819 |
+
with gr.Tabs():
|
| 820 |
+
with gr.TabItem("语言维度"):
|
| 821 |
+
language_plot = gr.Plot()
|
| 822 |
+
|
| 823 |
+
with gr.TabItem("难度维度"):
|
| 824 |
+
difficulty_plot = gr.Plot()
|
| 825 |
+
|
| 826 |
+
with gr.TabItem("长度维度"):
|
| 827 |
+
length_heatmap = gr.Plot()
|
| 828 |
+
|
| 829 |
+
with gr.TabItem("主要任务维度"):
|
| 830 |
+
primary_task_radar_plot = gr.Plot()
|
| 831 |
+
|
| 832 |
+
with gr.TabItem("上下文需求维度"):
|
| 833 |
+
contextual_plot = gr.Plot()
|
| 834 |
+
|
| 835 |
+
with gr.TabItem("BoN维度"):
|
| 836 |
+
bon_plot = gr.Plot()
|
| 837 |
+
|
| 838 |
+
with gr.TabItem("Pass@N维度"):
|
| 839 |
+
pass_k_plot = gr.Plot()
|
| 840 |
+
|
| 841 |
+
# 事件处理
|
| 842 |
+
def update_model_choices():
|
| 843 |
+
models = get_model_choices()
|
| 844 |
+
return gr.Dropdown(choices=models, value=[])
|
| 845 |
+
|
| 846 |
+
update_charts_btn.click(
|
| 847 |
+
fn=update_charts,
|
| 848 |
+
inputs=[model_selector],
|
| 849 |
+
outputs=[length_heatmap, contextual_plot, primary_task_radar_plot, language_plot, difficulty_plot, bon_plot, pass_k_plot]
|
| 850 |
+
)
|
| 851 |
+
|
| 852 |
+
# 初始化 - 页面加载时自动刷新数据
|
| 853 |
+
demo.load(
|
| 854 |
+
fn=refresh_data,
|
| 855 |
+
outputs=[leaderboard_df]
|
| 856 |
+
).then(
|
| 857 |
+
fn=update_model_choices,
|
| 858 |
+
outputs=[model_selector]
|
| 859 |
+
)
|
| 860 |
+
|
| 861 |
+
return demo
|
| 862 |
|
| 863 |
+
def main():
|
| 864 |
+
"""主函数"""
|
| 865 |
+
output_dir = "./results"
|
| 866 |
+
|
| 867 |
+
print("初始化结果解析器...")
|
| 868 |
+
parser = ResultParser(output_dir)
|
| 869 |
+
|
| 870 |
+
print("扫描结果文件...")
|
| 871 |
+
parser.scan_all_results()
|
| 872 |
+
|
| 873 |
+
print("创建Gradio界面...")
|
| 874 |
+
demo = create_gradio_interface(parser)
|
| 875 |
+
|
| 876 |
+
print("启动服务器...")
|
| 877 |
+
demo.launch()
|
| 878 |
|
| 879 |
+
if __name__ == "__main__":
|
| 880 |
+
main()
|
results/Claude-3.7-Sonnet/nonthinking_context-120000_bon-3_summary.json
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"date": "2025-12-08",
|
| 3 |
+
"total_questions_num": 1500,
|
| 4 |
+
"inference_iterations": 3,
|
| 5 |
+
"total_samples_num": 4500,
|
| 6 |
+
"fail_samples_num": 0,
|
| 7 |
+
"inference_inconsistent_samples_num": 0,
|
| 8 |
+
"average_overall_metric": 0.5144730485997339,
|
| 9 |
+
"inference_iteration_1_overall_metric": 0.5192628714494713,
|
| 10 |
+
"inference_iteration_2_overall_metric": 0.5090899475543829,
|
| 11 |
+
"inference_iteration_3_overall_metric": 0.515066326795347,
|
| 12 |
+
"average_token_length_metric": {
|
| 13 |
+
"8k": 0.5927607589235461,
|
| 14 |
+
"16k": 0.5922491004183165,
|
| 15 |
+
"32k": 0.5555486925170308,
|
| 16 |
+
"64k": 0.4991997081584744,
|
| 17 |
+
"128k": 0.45285894052515324,
|
| 18 |
+
"256k": 0.39422109105588254
|
| 19 |
+
},
|
| 20 |
+
"average_contextual_requirement_metric": {
|
| 21 |
+
"Full": 0.47584256909012274,
|
| 22 |
+
"Partial": 0.5636391134301498
|
| 23 |
+
},
|
| 24 |
+
"average_difficulty_metric": {
|
| 25 |
+
"Easy": 0.6868572950582806,
|
| 26 |
+
"Moderate": 0.48375113564429373,
|
| 27 |
+
"Hard": 0.4728683670759167,
|
| 28 |
+
"Extreme": 0.3731393645349295
|
| 29 |
+
},
|
| 30 |
+
"average_primary_task_metric": {
|
| 31 |
+
"T1. Retrieval & Ranking": 0.7586982224527067,
|
| 32 |
+
"T2. Sequencing & Structure Reconstruction": 0.7545327049493711,
|
| 33 |
+
"T3. Evidence-Grounded QA": 0.5277777777777779,
|
| 34 |
+
"T4. Summarization & Synthesis": 0.5250996637138268,
|
| 35 |
+
"T5. Attribution & Citation Alignment": 0.5254132304220211,
|
| 36 |
+
"T6. Aggregation & Clustering": 0.47394883159992857,
|
| 37 |
+
"T7. Consistency & Compliance Checking": 0.3040021982475052,
|
| 38 |
+
"T8. Structured & Numeric Reasoning": 0.41188271604938276,
|
| 39 |
+
"T9. Version & Code Diff Analysis": 0.6042705189653765,
|
| 40 |
+
"T10. Rule Induction & In-Context Learning": 0.5114814814814815,
|
| 41 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.43888888888888894
|
| 42 |
+
},
|
| 43 |
+
"average_language_metric": {
|
| 44 |
+
"Chinese": 0.5100199196277736,
|
| 45 |
+
"English": 0.5189261775716956
|
| 46 |
+
},
|
| 47 |
+
"BoN-1": {
|
| 48 |
+
"overall_metric": 0.5192628714494713,
|
| 49 |
+
"token_length": {
|
| 50 |
+
"8k": 0.5937761874854375,
|
| 51 |
+
"16k": 0.606154781504802,
|
| 52 |
+
"32k": 0.5701163293545726,
|
| 53 |
+
"64k": 0.49747085680734393,
|
| 54 |
+
"128k": 0.4476635155122931,
|
| 55 |
+
"256k": 0.40039555803238175
|
| 56 |
+
},
|
| 57 |
+
"contextual_requirement": {
|
| 58 |
+
"Full": 0.47604900489990065,
|
| 59 |
+
"Partial": 0.5742623379671082
|
| 60 |
+
},
|
| 61 |
+
"difficulty": {
|
| 62 |
+
"Easy": 0.6993477849390543,
|
| 63 |
+
"Moderate": 0.4824572359285609,
|
| 64 |
+
"Hard": 0.47426941765067004,
|
| 65 |
+
"Extreme": 0.37571516352087697
|
| 66 |
+
},
|
| 67 |
+
"primary_task": {
|
| 68 |
+
"T1. Retrieval & Ranking": 0.7482072090157142,
|
| 69 |
+
"T2. Sequencing & Structure Reconstruction": 0.7523087560587559,
|
| 70 |
+
"T3. Evidence-Grounded QA": 0.5333333333333333,
|
| 71 |
+
"T4. Summarization & Synthesis": 0.5249609995597003,
|
| 72 |
+
"T5. Attribution & Citation Alignment": 0.5307787048666445,
|
| 73 |
+
"T6. Aggregation & Clustering": 0.47337460590728553,
|
| 74 |
+
"T7. Consistency & Compliance Checking": 0.30808530916861365,
|
| 75 |
+
"T8. Structured & Numeric Reasoning": 0.40740740740740744,
|
| 76 |
+
"T9. Version & Code Diff Analysis": 0.6209514621148434,
|
| 77 |
+
"T10. Rule Induction & In-Context Learning": 0.5469444444444443,
|
| 78 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.45
|
| 79 |
+
},
|
| 80 |
+
"language": {
|
| 81 |
+
"Chinese": 0.5151862466463957,
|
| 82 |
+
"English": 0.5233394962525484
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"pass@1": 0.2673333333333333,
|
| 86 |
+
"BoN-2": {
|
| 87 |
+
"overall_metric": 0.5638452937577435,
|
| 88 |
+
"token_length": {
|
| 89 |
+
"8k": 0.6334745299899752,
|
| 90 |
+
"16k": 0.6535009894669588,
|
| 91 |
+
"32k": 0.6109603205298609,
|
| 92 |
+
"64k": 0.5566414838337063,
|
| 93 |
+
"128k": 0.5030500434216845,
|
| 94 |
+
"256k": 0.4254443953042751
|
| 95 |
+
},
|
| 96 |
+
"contextual_requirement": {
|
| 97 |
+
"Full": 0.523183868231744,
|
| 98 |
+
"Partial": 0.6155961989726518
|
| 99 |
+
},
|
| 100 |
+
"difficulty": {
|
| 101 |
+
"Easy": 0.7554808778953406,
|
| 102 |
+
"Moderate": 0.5292531239954449,
|
| 103 |
+
"Hard": 0.5124984336029716,
|
| 104 |
+
"Extreme": 0.41036353942072434
|
| 105 |
+
},
|
| 106 |
+
"primary_task": {
|
| 107 |
+
"T1. Retrieval & Ranking": 0.8018540164669292,
|
| 108 |
+
"T2. Sequencing & Structure Reconstruction": 0.7873358123358121,
|
| 109 |
+
"T3. Evidence-Grounded QA": 0.5666666666666667,
|
| 110 |
+
"T4. Summarization & Synthesis": 0.5413926274234249,
|
| 111 |
+
"T5. Attribution & Citation Alignment": 0.5675265408978069,
|
| 112 |
+
"T6. Aggregation & Clustering": 0.5409163851157314,
|
| 113 |
+
"T7. Consistency & Compliance Checking": 0.34558583778191654,
|
| 114 |
+
"T8. Structured & Numeric Reasoning": 0.4685185185185185,
|
| 115 |
+
"T9. Version & Code Diff Analysis": 0.6508982849457906,
|
| 116 |
+
"T10. Rule Induction & In-Context Learning": 0.6081944444444445,
|
| 117 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.49166666666666664
|
| 118 |
+
},
|
| 119 |
+
"language": {
|
| 120 |
+
"Chinese": 0.5545031574164562,
|
| 121 |
+
"English": 0.5731874300990306
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"pass@2": 0.30533333333333335,
|
| 125 |
+
"BoN-3": {
|
| 126 |
+
"overall_metric": 0.5987860690936202,
|
| 127 |
+
"token_length": {
|
| 128 |
+
"8k": 0.6767030215651172,
|
| 129 |
+
"16k": 0.6801366595488965,
|
| 130 |
+
"32k": 0.6345247903374839,
|
| 131 |
+
"64k": 0.6039272497657204,
|
| 132 |
+
"128k": 0.5233376257525678,
|
| 133 |
+
"256k": 0.47408706759193875
|
| 134 |
+
},
|
| 135 |
+
"contextual_requirement": {
|
| 136 |
+
"Full": 0.5614328850984434,
|
| 137 |
+
"Partial": 0.6463264850874838
|
| 138 |
+
},
|
| 139 |
+
"difficulty": {
|
| 140 |
+
"Easy": 0.7882244354415668,
|
| 141 |
+
"Moderate": 0.5766160107480841,
|
| 142 |
+
"Hard": 0.5655081578600745,
|
| 143 |
+
"Extreme": 0.42768418415872295
|
| 144 |
+
},
|
| 145 |
+
"primary_task": {
|
| 146 |
+
"T1. Retrieval & Ranking": 0.8178727620501064,
|
| 147 |
+
"T2. Sequencing & Structure Reconstruction": 0.8166698116698113,
|
| 148 |
+
"T3. Evidence-Grounded QA": 0.6166666666666667,
|
| 149 |
+
"T4. Summarization & Synthesis": 0.5469270547891242,
|
| 150 |
+
"T5. Attribution & Citation Alignment": 0.581425502230592,
|
| 151 |
+
"T6. Aggregation & Clustering": 0.5699768544212985,
|
| 152 |
+
"T7. Consistency & Compliance Checking": 0.3875679491876082,
|
| 153 |
+
"T8. Structured & Numeric Reasoning": 0.5462962962962963,
|
| 154 |
+
"T9. Version & Code Diff Analysis": 0.6792246386283735,
|
| 155 |
+
"T10. Rule Induction & In-Context Learning": 0.6452777777777778,
|
| 156 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.525
|
| 157 |
+
},
|
| 158 |
+
"language": {
|
| 159 |
+
"Chinese": 0.5867683815613326,
|
| 160 |
+
"English": 0.6108037566259096
|
| 161 |
+
}
|
| 162 |
+
},
|
| 163 |
+
"pass@3": 0.3433333333333333
|
| 164 |
+
}
|
results/Claude-3.7-Sonnet/thinking_context-120000_bon-3_summary.json
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"date": "2025-12-08",
|
| 3 |
+
"total_questions_num": 1500,
|
| 4 |
+
"inference_iterations": 3,
|
| 5 |
+
"total_samples_num": 4500,
|
| 6 |
+
"fail_samples_num": 0,
|
| 7 |
+
"inference_inconsistent_samples_num": 0,
|
| 8 |
+
"average_overall_metric": 0.5966078087059191,
|
| 9 |
+
"inference_iteration_1_overall_metric": 0.5938171820634314,
|
| 10 |
+
"inference_iteration_2_overall_metric": 0.5955816438384393,
|
| 11 |
+
"inference_iteration_3_overall_metric": 0.6004246002158852,
|
| 12 |
+
"average_token_length_metric": {
|
| 13 |
+
"8k": 0.6997135645823386,
|
| 14 |
+
"16k": 0.6577212798228894,
|
| 15 |
+
"32k": 0.6419035800281319,
|
| 16 |
+
"64k": 0.6238264957040918,
|
| 17 |
+
"128k": 0.523846643485212,
|
| 18 |
+
"256k": 0.43263528861285133
|
| 19 |
+
},
|
| 20 |
+
"average_contextual_requirement_metric": {
|
| 21 |
+
"Full": 0.5527640561663963,
|
| 22 |
+
"Partial": 0.652408948301675
|
| 23 |
+
},
|
| 24 |
+
"average_difficulty_metric": {
|
| 25 |
+
"Easy": 0.7825568834883242,
|
| 26 |
+
"Moderate": 0.6155843766907921,
|
| 27 |
+
"Hard": 0.5658238514809286,
|
| 28 |
+
"Extreme": 0.4006643574451805
|
| 29 |
+
},
|
| 30 |
+
"average_primary_task_metric": {
|
| 31 |
+
"T1. Retrieval & Ranking": 0.8486700113628681,
|
| 32 |
+
"T2. Sequencing & Structure Reconstruction": 0.8000456983629063,
|
| 33 |
+
"T3. Evidence-Grounded QA": 0.5027777777777777,
|
| 34 |
+
"T4. Summarization & Synthesis": 0.5309981037882555,
|
| 35 |
+
"T5. Attribution & Citation Alignment": 0.5878801280848494,
|
| 36 |
+
"T6. Aggregation & Clustering": 0.5732629573374424,
|
| 37 |
+
"T7. Consistency & Compliance Checking": 0.3939611740106759,
|
| 38 |
+
"T8. Structured & Numeric Reasoning": 0.6212962962962962,
|
| 39 |
+
"T9. Version & Code Diff Analysis": 0.69342672946219,
|
| 40 |
+
"T10. Rule Induction & In-Context Learning": 0.5610185185185187,
|
| 41 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.5499999999999997
|
| 42 |
+
},
|
| 43 |
+
"average_language_metric": {
|
| 44 |
+
"Chinese": 0.588350975552306,
|
| 45 |
+
"English": 0.6048646418595319
|
| 46 |
+
},
|
| 47 |
+
"BoN-1": {
|
| 48 |
+
"overall_metric": 0.5938171820634314,
|
| 49 |
+
"token_length": {
|
| 50 |
+
"8k": 0.7191856133745022,
|
| 51 |
+
"16k": 0.6402554520543442,
|
| 52 |
+
"32k": 0.6341044882273853,
|
| 53 |
+
"64k": 0.6259136300211012,
|
| 54 |
+
"128k": 0.5177437687626877,
|
| 55 |
+
"256k": 0.4257001399405685
|
| 56 |
+
},
|
| 57 |
+
"contextual_requirement": {
|
| 58 |
+
"Full": 0.5550251903269423,
|
| 59 |
+
"Partial": 0.6431888079098727
|
| 60 |
+
},
|
| 61 |
+
"difficulty": {
|
| 62 |
+
"Easy": 0.7745826629942724,
|
| 63 |
+
"Moderate": 0.6051939662024207,
|
| 64 |
+
"Hard": 0.5720458613452599,
|
| 65 |
+
"Extreme": 0.40262413493324634
|
| 66 |
+
},
|
| 67 |
+
"primary_task": {
|
| 68 |
+
"T1. Retrieval & Ranking": 0.8535130690589529,
|
| 69 |
+
"T2. Sequencing & Structure Reconstruction": 0.7808294450361011,
|
| 70 |
+
"T3. Evidence-Grounded QA": 0.5,
|
| 71 |
+
"T4. Summarization & Synthesis": 0.5297721568932299,
|
| 72 |
+
"T5. Attribution & Citation Alignment": 0.5849862779597039,
|
| 73 |
+
"T6. Aggregation & Clustering": 0.5655938313957183,
|
| 74 |
+
"T7. Consistency & Compliance Checking": 0.3878929800328972,
|
| 75 |
+
"T8. Structured & Numeric Reasoning": 0.6314814814814815,
|
| 76 |
+
"T9. Version & Code Diff Analysis": 0.6772724985908704,
|
| 77 |
+
"T10. Rule Induction & In-Context Learning": 0.5855555555555555,
|
| 78 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333333
|
| 79 |
+
},
|
| 80 |
+
"language": {
|
| 81 |
+
"Chinese": 0.5905648959587577,
|
| 82 |
+
"English": 0.5970694681681056
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"pass@1": 0.37266666666666665,
|
| 86 |
+
"BoN-2": {
|
| 87 |
+
"overall_metric": 0.6528466974974912,
|
| 88 |
+
"token_length": {
|
| 89 |
+
"8k": 0.7638003125719123,
|
| 90 |
+
"16k": 0.7101654799888735,
|
| 91 |
+
"32k": 0.7028814358570394,
|
| 92 |
+
"64k": 0.6870948174265773,
|
| 93 |
+
"128k": 0.565071084237125,
|
| 94 |
+
"256k": 0.4880670549034259
|
| 95 |
+
},
|
| 96 |
+
"contextual_requirement": {
|
| 97 |
+
"Full": 0.6177398094156152,
|
| 98 |
+
"Partial": 0.6975281914198818
|
| 99 |
+
},
|
| 100 |
+
"difficulty": {
|
| 101 |
+
"Easy": 0.8281250142111442,
|
| 102 |
+
"Moderate": 0.6919969423858446,
|
| 103 |
+
"Hard": 0.6370194614390577,
|
| 104 |
+
"Extreme": 0.4455182924797047
|
| 105 |
+
},
|
| 106 |
+
"primary_task": {
|
| 107 |
+
"T1. Retrieval & Ranking": 0.8893518121473104,
|
| 108 |
+
"T2. Sequencing & Structure Reconstruction": 0.8424985248669454,
|
| 109 |
+
"T3. Evidence-Grounded QA": 0.6,
|
| 110 |
+
"T4. Summarization & Synthesis": 0.544816527949887,
|
| 111 |
+
"T5. Attribution & Citation Alignment": 0.6380851790586051,
|
| 112 |
+
"T6. Aggregation & Clustering": 0.6286177167059521,
|
| 113 |
+
"T7. Consistency & Compliance Checking": 0.45873422235423905,
|
| 114 |
+
"T8. Structured & Numeric Reasoning": 0.6578703703703704,
|
| 115 |
+
"T9. Version & Code Diff Analysis": 0.7510537661056169,
|
| 116 |
+
"T10. Rule Induction & In-Context Learning": 0.618611111111111,
|
| 117 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.6583333333333333
|
| 118 |
+
},
|
| 119 |
+
"language": {
|
| 120 |
+
"Chinese": 0.6376749344767387,
|
| 121 |
+
"English": 0.6680184605182464
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"pass@2": 0.44333333333333336,
|
| 125 |
+
"BoN-3": {
|
| 126 |
+
"overall_metric": 0.6830970097115105,
|
| 127 |
+
"token_length": {
|
| 128 |
+
"8k": 0.7837282981657816,
|
| 129 |
+
"16k": 0.7434736805597991,
|
| 130 |
+
"32k": 0.7293201301488285,
|
| 131 |
+
"64k": 0.7233152422607191,
|
| 132 |
+
"128k": 0.6019561639441493,
|
| 133 |
+
"256k": 0.5167885431897875
|
| 134 |
+
},
|
| 135 |
+
"contextual_requirement": {
|
| 136 |
+
"Full": 0.6474666039819896,
|
| 137 |
+
"Partial": 0.7284447988218108
|
| 138 |
+
},
|
| 139 |
+
"difficulty": {
|
| 140 |
+
"Easy": 0.8607457235279341,
|
| 141 |
+
"Moderate": 0.7303808913214545,
|
| 142 |
+
"Hard": 0.6594975210053281,
|
| 143 |
+
"Extreme": 0.47293457878264067
|
| 144 |
+
},
|
| 145 |
+
"primary_task": {
|
| 146 |
+
"T1. Retrieval & Ranking": 0.9066883296818435,
|
| 147 |
+
"T2. Sequencing & Structure Reconstruction": 0.8643315529499735,
|
| 148 |
+
"T3. Evidence-Grounded QA": 0.6166666666666667,
|
| 149 |
+
"T4. Summarization & Synthesis": 0.5522881837833883,
|
| 150 |
+
"T5. Attribution & Citation Alignment": 0.6507666087400348,
|
| 151 |
+
"T6. Aggregation & Clustering": 0.6674614963830652,
|
| 152 |
+
"T7. Consistency & Compliance Checking": 0.48674980053348876,
|
| 153 |
+
"T8. Structured & Numeric Reasoning": 0.7092592592592593,
|
| 154 |
+
"T9. Version & Code Diff Analysis": 0.7716543341971472,
|
| 155 |
+
"T10. Rule Induction & In-Context Learning": 0.6727777777777777,
|
| 156 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.7083333333333334
|
| 157 |
+
},
|
| 158 |
+
"language": {
|
| 159 |
+
"Chinese": 0.6635013624534305,
|
| 160 |
+
"English": 0.7026926569695916
|
| 161 |
+
}
|
| 162 |
+
},
|
| 163 |
+
"pass@3": 0.4826666666666667
|
| 164 |
+
}
|
results/Claude-4-Sonnet/nonthinking_context-1000000_bon-3_summary.json
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"date": "2025-12-08",
|
| 3 |
+
"total_questions_num": 1500,
|
| 4 |
+
"inference_iterations": 3,
|
| 5 |
+
"total_samples_num": 4500,
|
| 6 |
+
"fail_samples_num": 0,
|
| 7 |
+
"inference_inconsistent_samples_num": 0,
|
| 8 |
+
"average_overall_metric": 0.5606565628619046,
|
| 9 |
+
"inference_iteration_1_overall_metric": 0.5620036050651629,
|
| 10 |
+
"inference_iteration_2_overall_metric": 0.5631248059457928,
|
| 11 |
+
"inference_iteration_3_overall_metric": 0.5568412775747574,
|
| 12 |
+
"average_token_length_metric": {
|
| 13 |
+
"8k": 0.6071528197414233,
|
| 14 |
+
"16k": 0.5816154959256097,
|
| 15 |
+
"32k": 0.5612446325027117,
|
| 16 |
+
"64k": 0.5254403501645888,
|
| 17 |
+
"128k": 0.5465188702735214,
|
| 18 |
+
"256k": 0.541967208563576
|
| 19 |
+
},
|
| 20 |
+
"average_contextual_requirement_metric": {
|
| 21 |
+
"Full": 0.5264359183584719,
|
| 22 |
+
"Partial": 0.6042101104117302
|
| 23 |
+
},
|
| 24 |
+
"average_difficulty_metric": {
|
| 25 |
+
"Easy": 0.6842286640015465,
|
| 26 |
+
"Moderate": 0.5396282888053806,
|
| 27 |
+
"Hard": 0.5757154269645611,
|
| 28 |
+
"Extreme": 0.4292097599695439
|
| 29 |
+
},
|
| 30 |
+
"average_primary_task_metric": {
|
| 31 |
+
"T1. Retrieval & Ranking": 0.8091570544332052,
|
| 32 |
+
"T2. Sequencing & Structure Reconstruction": 0.8207229190562522,
|
| 33 |
+
"T3. Evidence-Grounded QA": 0.5027777777777778,
|
| 34 |
+
"T4. Summarization & Synthesis": 0.5379205858992453,
|
| 35 |
+
"T5. Attribution & Citation Alignment": 0.6634794615844591,
|
| 36 |
+
"T6. Aggregation & Clustering": 0.5193980953038955,
|
| 37 |
+
"T7. Consistency & Compliance Checking": 0.41333574812040713,
|
| 38 |
+
"T8. Structured & Numeric Reasoning": 0.3430555555555555,
|
| 39 |
+
"T9. Version & Code Diff Analysis": 0.7423632867012375,
|
| 40 |
+
"T10. Rule Induction & In-Context Learning": 0.5125462962962964,
|
| 41 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.5055555555555556
|
| 42 |
+
},
|
| 43 |
+
"average_language_metric": {
|
| 44 |
+
"Chinese": 0.5499191967861178,
|
| 45 |
+
"English": 0.5713939289376934
|
| 46 |
+
},
|
| 47 |
+
"BoN-1": {
|
| 48 |
+
"overall_metric": 0.5620036050651629,
|
| 49 |
+
"token_length": {
|
| 50 |
+
"8k": 0.6116650341526972,
|
| 51 |
+
"16k": 0.5789009200875381,
|
| 52 |
+
"32k": 0.564884526756138,
|
| 53 |
+
"64k": 0.5150575277083638,
|
| 54 |
+
"128k": 0.5512053153279016,
|
| 55 |
+
"256k": 0.5503083063583438
|
| 56 |
+
},
|
| 57 |
+
"contextual_requirement": {
|
| 58 |
+
"Full": 0.5327035707633319,
|
| 59 |
+
"Partial": 0.5992945578129499
|
| 60 |
+
},
|
| 61 |
+
"difficulty": {
|
| 62 |
+
"Easy": 0.6903796852415736,
|
| 63 |
+
"Moderate": 0.5418819297458669,
|
| 64 |
+
"Hard": 0.5712079185155163,
|
| 65 |
+
"Extreme": 0.4285441662812896
|
| 66 |
+
},
|
| 67 |
+
"primary_task": {
|
| 68 |
+
"T1. Retrieval & Ranking": 0.8022069774152061,
|
| 69 |
+
"T2. Sequencing & Structure Reconstruction": 0.8176220076220074,
|
| 70 |
+
"T3. Evidence-Grounded QA": 0.5333333333333333,
|
| 71 |
+
"T4. Summarization & Synthesis": 0.5377531684167628,
|
| 72 |
+
"T5. Attribution & Citation Alignment": 0.6702134910265403,
|
| 73 |
+
"T6. Aggregation & Clustering": 0.509165168922684,
|
| 74 |
+
"T7. Consistency & Compliance Checking": 0.4132211721134369,
|
| 75 |
+
"T8. Structured & Numeric Reasoning": 0.336574074074074,
|
| 76 |
+
"T9. Version & Code Diff Analysis": 0.755058796168736,
|
| 77 |
+
"T10. Rule Induction & In-Context Learning": 0.5204166666666666,
|
| 78 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.5
|
| 79 |
+
},
|
| 80 |
+
"language": {
|
| 81 |
+
"Chinese": 0.5525156622096248,
|
| 82 |
+
"English": 0.5714915479207029
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"pass@1": 0.2986666666666667,
|
| 86 |
+
"BoN-2": {
|
| 87 |
+
"overall_metric": 0.6070413152649775,
|
| 88 |
+
"token_length": {
|
| 89 |
+
"8k": 0.648428309386666,
|
| 90 |
+
"16k": 0.6138312363156012,
|
| 91 |
+
"32k": 0.6050527198490672,
|
| 92 |
+
"64k": 0.5709459527531863,
|
| 93 |
+
"128k": 0.6043903989020145,
|
| 94 |
+
"256k": 0.5995992743833368
|
| 95 |
+
},
|
| 96 |
+
"contextual_requirement": {
|
| 97 |
+
"Full": 0.5725088578790201,
|
| 98 |
+
"Partial": 0.6509917155743812
|
| 99 |
+
},
|
| 100 |
+
"difficulty": {
|
| 101 |
+
"Easy": 0.7433988294310949,
|
| 102 |
+
"Moderate": 0.5959678783418261,
|
| 103 |
+
"Hard": 0.6127141026903381,
|
| 104 |
+
"Extreme": 0.461215101348602
|
| 105 |
+
},
|
| 106 |
+
"primary_task": {
|
| 107 |
+
"T1. Retrieval & Ranking": 0.8352274736404431,
|
| 108 |
+
"T2. Sequencing & Structure Reconstruction": 0.8427542827542828,
|
| 109 |
+
"T3. Evidence-Grounded QA": 0.6,
|
| 110 |
+
"T4. Summarization & Synthesis": 0.5511890444295781,
|
| 111 |
+
"T5. Attribution & Citation Alignment": 0.709250434569062,
|
| 112 |
+
"T6. Aggregation & Clustering": 0.5590519848415796,
|
| 113 |
+
"T7. Consistency & Compliance Checking": 0.4538880045271121,
|
| 114 |
+
"T8. Structured & Numeric Reasoning": 0.400462962962963,
|
| 115 |
+
"T9. Version & Code Diff Analysis": 0.7896296658102724,
|
| 116 |
+
"T10. Rule Induction & In-Context Learning": 0.5898611111111111,
|
| 117 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.55
|
| 118 |
+
},
|
| 119 |
+
"language": {
|
| 120 |
+
"Chinese": 0.6018498726474579,
|
| 121 |
+
"English": 0.6122327578825001
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"pass@2": 0.3486666666666667,
|
| 125 |
+
"BoN-3": {
|
| 126 |
+
"overall_metric": 0.6359674546066088,
|
| 127 |
+
"token_length": {
|
| 128 |
+
"8k": 0.6791089276748106,
|
| 129 |
+
"16k": 0.6344789904129863,
|
| 130 |
+
"32k": 0.6382754639404674,
|
| 131 |
+
"64k": 0.6155932927177382,
|
| 132 |
+
"128k": 0.6246025171745627,
|
| 133 |
+
"256k": 0.6237455357190941
|
| 134 |
+
},
|
| 135 |
+
"contextual_requirement": {
|
| 136 |
+
"Full": 0.5997037562420825,
|
| 137 |
+
"Partial": 0.6821212525251004
|
| 138 |
+
},
|
| 139 |
+
"difficulty": {
|
| 140 |
+
"Easy": 0.7810114340069705,
|
| 141 |
+
"Moderate": 0.6237269051463531,
|
| 142 |
+
"Hard": 0.6375808018524918,
|
| 143 |
+
"Extreme": 0.4840585077179298
|
| 144 |
+
},
|
| 145 |
+
"primary_task": {
|
| 146 |
+
"T1. Retrieval & Ranking": 0.8531556276526557,
|
| 147 |
+
"T2. Sequencing & Structure Reconstruction": 0.8630080105080103,
|
| 148 |
+
"T3. Evidence-Grounded QA": 0.6416666666666667,
|
| 149 |
+
"T4. Summarization & Synthesis": 0.5557327849332047,
|
| 150 |
+
"T5. Attribution & Citation Alignment": 0.7262293044645985,
|
| 151 |
+
"T6. Aggregation & Clustering": 0.5966415249311195,
|
| 152 |
+
"T7. Consistency & Compliance Checking": 0.47966164058768634,
|
| 153 |
+
"T8. Structured & Numeric Reasoning": 0.4393518518518518,
|
| 154 |
+
"T9. Version & Code Diff Analysis": 0.7950682623015005,
|
| 155 |
+
"T10. Rule Induction & In-Context Learning": 0.6329166666666667,
|
| 156 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.6083333333333333
|
| 157 |
+
},
|
| 158 |
+
"language": {
|
| 159 |
+
"Chinese": 0.6245809400158543,
|
| 160 |
+
"English": 0.6473539691973664
|
| 161 |
+
}
|
| 162 |
+
},
|
| 163 |
+
"pass@3": 0.38
|
| 164 |
+
}
|
results/Claude-4-Sonnet/thinking_context-1000000_bon-3_summary.json
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"date": "2025-12-08",
|
| 3 |
+
"total_questions_num": 1500,
|
| 4 |
+
"inference_iterations": 3,
|
| 5 |
+
"total_samples_num": 4500,
|
| 6 |
+
"fail_samples_num": 3,
|
| 7 |
+
"inference_inconsistent_samples_num": 0,
|
| 8 |
+
"average_overall_metric": 0.6987364832054667,
|
| 9 |
+
"inference_iteration_1_overall_metric": 0.7019992434982991,
|
| 10 |
+
"inference_iteration_2_overall_metric": 0.6978899327024527,
|
| 11 |
+
"inference_iteration_3_overall_metric": 0.6963202734156487,
|
| 12 |
+
"average_token_length_metric": {
|
| 13 |
+
"8k": 0.7273068305229948,
|
| 14 |
+
"16k": 0.7148161402734813,
|
| 15 |
+
"32k": 0.7282156837997693,
|
| 16 |
+
"64k": 0.7051754330841736,
|
| 17 |
+
"128k": 0.6642984844940268,
|
| 18 |
+
"256k": 0.6526063270583587
|
| 19 |
+
},
|
| 20 |
+
"average_contextual_requirement_metric": {
|
| 21 |
+
"Full": 0.6617044440872328,
|
| 22 |
+
"Partial": 0.7458681693559481
|
| 23 |
+
},
|
| 24 |
+
"average_difficulty_metric": {
|
| 25 |
+
"Easy": 0.8377531760390221,
|
| 26 |
+
"Moderate": 0.7658446956684767,
|
| 27 |
+
"Hard": 0.7472224806628969,
|
| 28 |
+
"Extreme": 0.4705256363582413
|
| 29 |
+
},
|
| 30 |
+
"average_primary_task_metric": {
|
| 31 |
+
"T1. Retrieval & Ranking": 0.907681462321177,
|
| 32 |
+
"T2. Sequencing & Structure Reconstruction": 0.8890326186159514,
|
| 33 |
+
"T3. Evidence-Grounded QA": 0.661111111111111,
|
| 34 |
+
"T4. Summarization & Synthesis": 0.5383660231848545,
|
| 35 |
+
"T5. Attribution & Citation Alignment": 0.7860152219301051,
|
| 36 |
+
"T6. Aggregation & Clustering": 0.6671470819716809,
|
| 37 |
+
"T7. Consistency & Compliance Checking": 0.5518199768375653,
|
| 38 |
+
"T8. Structured & Numeric Reasoning": 0.6859567901234568,
|
| 39 |
+
"T9. Version & Code Diff Analysis": 0.8575767924690506,
|
| 40 |
+
"T10. Rule Induction & In-Context Learning": 0.6481481481481483,
|
| 41 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.5888888888888888
|
| 42 |
+
},
|
| 43 |
+
"average_language_metric": {
|
| 44 |
+
"Chinese": 0.6865315143381795,
|
| 45 |
+
"English": 0.7109414520727555
|
| 46 |
+
},
|
| 47 |
+
"BoN-1": {
|
| 48 |
+
"overall_metric": 0.7019992434982991,
|
| 49 |
+
"token_length": {
|
| 50 |
+
"8k": 0.7253942984634015,
|
| 51 |
+
"16k": 0.7347686831241128,
|
| 52 |
+
"32k": 0.7405843026072749,
|
| 53 |
+
"64k": 0.6852109109698611,
|
| 54 |
+
"128k": 0.6670707753146399,
|
| 55 |
+
"256k": 0.6589664905105064
|
| 56 |
+
},
|
| 57 |
+
"contextual_requirement": {
|
| 58 |
+
"Full": 0.6664641511659741,
|
| 59 |
+
"Partial": 0.747225724648532
|
| 60 |
+
},
|
| 61 |
+
"difficulty": {
|
| 62 |
+
"Easy": 0.8574789158116471,
|
| 63 |
+
"Moderate": 0.7537775763318981,
|
| 64 |
+
"Hard": 0.7422849474268556,
|
| 65 |
+
"Extreme": 0.47120899649989867
|
| 66 |
+
},
|
| 67 |
+
"primary_task": {
|
| 68 |
+
"T1. Retrieval & Ranking": 0.9235851079801015,
|
| 69 |
+
"T2. Sequencing & Structure Reconstruction": 0.8866411828911827,
|
| 70 |
+
"T3. Evidence-Grounded QA": 0.7166666666666667,
|
| 71 |
+
"T4. Summarization & Synthesis": 0.5397488846450174,
|
| 72 |
+
"T5. Attribution & Citation Alignment": 0.7968571187727533,
|
| 73 |
+
"T6. Aggregation & Clustering": 0.6625413639156731,
|
| 74 |
+
"T7. Consistency & Compliance Checking": 0.5348173324914223,
|
| 75 |
+
"T8. Structured & Numeric Reasoning": 0.7013888888888888,
|
| 76 |
+
"T9. Version & Code Diff Analysis": 0.8605924270512658,
|
| 77 |
+
"T10. Rule Induction & In-Context Learning": 0.6277777777777778,
|
| 78 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.575
|
| 79 |
+
},
|
| 80 |
+
"language": {
|
| 81 |
+
"Chinese": 0.7011338968631923,
|
| 82 |
+
"English": 0.7028645901334066
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"pass@1": 0.4786666666666667,
|
| 86 |
+
"BoN-2": {
|
| 87 |
+
"overall_metric": 0.7662921811826703,
|
| 88 |
+
"token_length": {
|
| 89 |
+
"8k": 0.7864626566034867,
|
| 90 |
+
"16k": 0.7903224578099031,
|
| 91 |
+
"32k": 0.7829418518359753,
|
| 92 |
+
"64k": 0.776573418015718,
|
| 93 |
+
"128k": 0.7369784925906059,
|
| 94 |
+
"256k": 0.724474210240336
|
| 95 |
+
},
|
| 96 |
+
"contextual_requirement": {
|
| 97 |
+
"Full": 0.7387739445109663,
|
| 98 |
+
"Partial": 0.8013153914921128
|
| 99 |
+
},
|
| 100 |
+
"difficulty": {
|
| 101 |
+
"Easy": 0.9044185942106279,
|
| 102 |
+
"Moderate": 0.8489744455895216,
|
| 103 |
+
"Hard": 0.8300963355442819,
|
| 104 |
+
"Extreme": 0.5187660309473054
|
| 105 |
+
},
|
| 106 |
+
"primary_task": {
|
| 107 |
+
"T1. Retrieval & Ranking": 0.9337455622019767,
|
| 108 |
+
"T2. Sequencing & Structure Reconstruction": 0.920133061383061,
|
| 109 |
+
"T3. Evidence-Grounded QA": 0.8,
|
| 110 |
+
"T4. Summarization & Synthesis": 0.5512587507775879,
|
| 111 |
+
"T5. Attribution & Citation Alignment": 0.851460921546061,
|
| 112 |
+
"T6. Aggregation & Clustering": 0.7263341037175177,
|
| 113 |
+
"T7. Consistency & Compliance Checking": 0.6403373459035246,
|
| 114 |
+
"T8. Structured & Numeric Reasoning": 0.7763888888888889,
|
| 115 |
+
"T9. Version & Code Diff Analysis": 0.8952412388875778,
|
| 116 |
+
"T10. Rule Induction & In-Context Learning": 0.7372222222222222,
|
| 117 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.675
|
| 118 |
+
},
|
| 119 |
+
"language": {
|
| 120 |
+
"Chinese": 0.7559027655426326,
|
| 121 |
+
"English": 0.7766815968227087
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"pass@2": 0.5586666666666666,
|
| 125 |
+
"BoN-3": {
|
| 126 |
+
"overall_metric": 0.7907893209368455,
|
| 127 |
+
"token_length": {
|
| 128 |
+
"8k": 0.8018918816218435,
|
| 129 |
+
"16k": 0.8122398647423218,
|
| 130 |
+
"32k": 0.8052123235968958,
|
| 131 |
+
"64k": 0.8059913515132954,
|
| 132 |
+
"128k": 0.7658095463175019,
|
| 133 |
+
"256k": 0.7535909578292158
|
| 134 |
+
},
|
| 135 |
+
"contextual_requirement": {
|
| 136 |
+
"Full": 0.7602540929686584,
|
| 137 |
+
"Partial": 0.8296523383509014
|
| 138 |
+
},
|
| 139 |
+
"difficulty": {
|
| 140 |
+
"Easy": 0.9273787695935313,
|
| 141 |
+
"Moderate": 0.8740334338695857,
|
| 142 |
+
"Hard": 0.8656496547603084,
|
| 143 |
+
"Extreme": 0.5373159132889705
|
| 144 |
+
},
|
| 145 |
+
"primary_task": {
|
| 146 |
+
"T1. Retrieval & Ranking": 0.9422092659363656,
|
| 147 |
+
"T2. Sequencing & Structure Reconstruction": 0.9295053557553554,
|
| 148 |
+
"T3. Evidence-Grounded QA": 0.8416666666666667,
|
| 149 |
+
"T4. Summarization & Synthesis": 0.5573876341160701,
|
| 150 |
+
"T5. Attribution & Citation Alignment": 0.8621875560226953,
|
| 151 |
+
"T6. Aggregation & Clustering": 0.7544804518953127,
|
| 152 |
+
"T7. Consistency & Compliance Checking": 0.6657566567798248,
|
| 153 |
+
"T8. Structured & Numeric Reasoning": 0.8055555555555556,
|
| 154 |
+
"T9. Version & Code Diff Analysis": 0.9043321479784867,
|
| 155 |
+
"T10. Rule Induction & In-Context Learning": 0.7838888888888889,
|
| 156 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.725
|
| 157 |
+
},
|
| 158 |
+
"language": {
|
| 159 |
+
"Chinese": 0.7791329047815275,
|
| 160 |
+
"English": 0.8024457370921636
|
| 161 |
+
}
|
| 162 |
+
},
|
| 163 |
+
"pass@3": 0.598
|
| 164 |
+
}
|
results/DeepSeek-R1-0528/thinking_context-120000_bon-3_summary.json
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"date": "2025-12-08",
|
| 3 |
+
"total_questions_num": 1500,
|
| 4 |
+
"inference_iterations": 3,
|
| 5 |
+
"total_samples_num": 4500,
|
| 6 |
+
"fail_samples_num": 1,
|
| 7 |
+
"inference_inconsistent_samples_num": 0,
|
| 8 |
+
"average_overall_metric": 0.61893761586453,
|
| 9 |
+
"inference_iteration_1_overall_metric": 0.6270481753191374,
|
| 10 |
+
"inference_iteration_2_overall_metric": 0.6115350419668117,
|
| 11 |
+
"inference_iteration_3_overall_metric": 0.6182296303076407,
|
| 12 |
+
"average_token_length_metric": {
|
| 13 |
+
"8k": 0.7165288754873151,
|
| 14 |
+
"16k": 0.6828199674990499,
|
| 15 |
+
"32k": 0.6181133860648209,
|
| 16 |
+
"64k": 0.6286866574208946,
|
| 17 |
+
"128k": 0.5812085902020846,
|
| 18 |
+
"256k": 0.48626821851301744
|
| 19 |
+
},
|
| 20 |
+
"average_contextual_requirement_metric": {
|
| 21 |
+
"Full": 0.5840521215265608,
|
| 22 |
+
"Partial": 0.6633373359310379
|
| 23 |
+
},
|
| 24 |
+
"average_difficulty_metric": {
|
| 25 |
+
"Easy": 0.8266775116602355,
|
| 26 |
+
"Moderate": 0.6653109531944901,
|
| 27 |
+
"Hard": 0.5367579918507135,
|
| 28 |
+
"Extreme": 0.41488622286022364
|
| 29 |
+
},
|
| 30 |
+
"average_primary_task_metric": {
|
| 31 |
+
"T1. Retrieval & Ranking": 0.8218247532961501,
|
| 32 |
+
"T2. Sequencing & Structure Reconstruction": 0.8170893828393826,
|
| 33 |
+
"T3. Evidence-Grounded QA": 0.5638888888888887,
|
| 34 |
+
"T4. Summarization & Synthesis": 0.5505430321723027,
|
| 35 |
+
"T5. Attribution & Citation Alignment": 0.5766419140172199,
|
| 36 |
+
"T6. Aggregation & Clustering": 0.5593864809441226,
|
| 37 |
+
"T7. Consistency & Compliance Checking": 0.44029922393263027,
|
| 38 |
+
"T8. Structured & Numeric Reasoning": 0.6998456790123458,
|
| 39 |
+
"T9. Version & Code Diff Analysis": 0.7073888549627423,
|
| 40 |
+
"T10. Rule Induction & In-Context Learning": 0.613935185185185,
|
| 41 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.5361111111111111
|
| 42 |
+
},
|
| 43 |
+
"average_language_metric": {
|
| 44 |
+
"Chinese": 0.6388577912805221,
|
| 45 |
+
"English": 0.5990174404485396
|
| 46 |
+
},
|
| 47 |
+
"BoN-1": {
|
| 48 |
+
"overall_metric": 0.6270481753191374,
|
| 49 |
+
"token_length": {
|
| 50 |
+
"8k": 0.7085495640791158,
|
| 51 |
+
"16k": 0.6852891854029057,
|
| 52 |
+
"32k": 0.6523714114738722,
|
| 53 |
+
"64k": 0.6349276261566131,
|
| 54 |
+
"128k": 0.5990851877476868,
|
| 55 |
+
"256k": 0.48206607705463184
|
| 56 |
+
},
|
| 57 |
+
"contextual_requirement": {
|
| 58 |
+
"Full": 0.5954038691856903,
|
| 59 |
+
"Partial": 0.6673227467617081
|
| 60 |
+
},
|
| 61 |
+
"difficulty": {
|
| 62 |
+
"Easy": 0.842688379979359,
|
| 63 |
+
"Moderate": 0.6856872320332753,
|
| 64 |
+
"Hard": 0.5371806890772243,
|
| 65 |
+
"Extreme": 0.4113355332448204
|
| 66 |
+
},
|
| 67 |
+
"primary_task": {
|
| 68 |
+
"T1. Retrieval & Ranking": 0.8085658637642787,
|
| 69 |
+
"T2. Sequencing & Structure Reconstruction": 0.8118172105672106,
|
| 70 |
+
"T3. Evidence-Grounded QA": 0.6083333333333333,
|
| 71 |
+
"T4. Summarization & Synthesis": 0.5488292098363808,
|
| 72 |
+
"T5. Attribution & Citation Alignment": 0.5873852854613878,
|
| 73 |
+
"T6. Aggregation & Clustering": 0.5544843141923278,
|
| 74 |
+
"T7. Consistency & Compliance Checking": 0.4479896544786345,
|
| 75 |
+
"T8. Structured & Numeric Reasoning": 0.7194444444444444,
|
| 76 |
+
"T9. Version & Code Diff Analysis": 0.704738113297963,
|
| 77 |
+
"T10. Rule Induction & In-Context Learning": 0.6438888888888888,
|
| 78 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666
|
| 79 |
+
},
|
| 80 |
+
"language": {
|
| 81 |
+
"Chinese": 0.6382004484945908,
|
| 82 |
+
"English": 0.6158959021436853
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"pass@1": 0.39066666666666666,
|
| 86 |
+
"BoN-2": {
|
| 87 |
+
"overall_metric": 0.6908377344426165,
|
| 88 |
+
"token_length": {
|
| 89 |
+
"8k": 0.7595240418657973,
|
| 90 |
+
"16k": 0.7570136230136708,
|
| 91 |
+
"32k": 0.7096605299566262,
|
| 92 |
+
"64k": 0.7135170329369038,
|
| 93 |
+
"128k": 0.6539368768633997,
|
| 94 |
+
"256k": 0.5513743020193068
|
| 95 |
+
},
|
| 96 |
+
"contextual_requirement": {
|
| 97 |
+
"Full": 0.6581544826620869,
|
| 98 |
+
"Partial": 0.7324346003451124
|
| 99 |
+
},
|
| 100 |
+
"difficulty": {
|
| 101 |
+
"Easy": 0.900422588090864,
|
| 102 |
+
"Moderate": 0.7645982045756979,
|
| 103 |
+
"Hard": 0.6173528116491919,
|
| 104 |
+
"Extreme": 0.46106606935258343
|
| 105 |
+
},
|
| 106 |
+
"primary_task": {
|
| 107 |
+
"T1. Retrieval & Ranking": 0.8656746491642889,
|
| 108 |
+
"T2. Sequencing & Structure Reconstruction": 0.8850374162874161,
|
| 109 |
+
"T3. Evidence-Grounded QA": 0.6833333333333333,
|
| 110 |
+
"T4. Summarization & Synthesis": 0.5661734768963829,
|
| 111 |
+
"T5. Attribution & Citation Alignment": 0.6630436055268355,
|
| 112 |
+
"T6. Aggregation & Clustering": 0.6284207499424888,
|
| 113 |
+
"T7. Consistency & Compliance Checking": 0.520982919531347,
|
| 114 |
+
"T8. Structured & Numeric Reasoning": 0.7888888888888889,
|
| 115 |
+
"T9. Version & Code Diff Analysis": 0.7536592506692632,
|
| 116 |
+
"T10. Rule Induction & In-Context Learning": 0.7194444444444444,
|
| 117 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.5916666666666667
|
| 118 |
+
},
|
| 119 |
+
"language": {
|
| 120 |
+
"Chinese": 0.6996429460445758,
|
| 121 |
+
"English": 0.6820325228406605
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"pass@2": 0.4533333333333333,
|
| 125 |
+
"BoN-3": {
|
| 126 |
+
"overall_metric": 0.7254771584689355,
|
| 127 |
+
"token_length": {
|
| 128 |
+
"8k": 0.7994316297691582,
|
| 129 |
+
"16k": 0.7937454228456009,
|
| 130 |
+
"32k": 0.7297465435731235,
|
| 131 |
+
"64k": 0.7326783552830499,
|
| 132 |
+
"128k": 0.6951047141031044,
|
| 133 |
+
"256k": 0.6021562852395752
|
| 134 |
+
},
|
| 135 |
+
"contextual_requirement": {
|
| 136 |
+
"Full": 0.6913631461546793,
|
| 137 |
+
"Partial": 0.7688949923234434
|
| 138 |
+
},
|
| 139 |
+
"difficulty": {
|
| 140 |
+
"Easy": 0.9337047416686951,
|
| 141 |
+
"Moderate": 0.8141591847806626,
|
| 142 |
+
"Hard": 0.6485103013588345,
|
| 143 |
+
"Extreme": 0.4896785698290395
|
| 144 |
+
},
|
| 145 |
+
"primary_task": {
|
| 146 |
+
"T1. Retrieval & Ranking": 0.8968239339375051,
|
| 147 |
+
"T2. Sequencing & Structure Reconstruction": 0.8988072575572574,
|
| 148 |
+
"T3. Evidence-Grounded QA": 0.7583333333333333,
|
| 149 |
+
"T4. Summarization & Synthesis": 0.5734497230075248,
|
| 150 |
+
"T5. Attribution & Citation Alignment": 0.6928069332150858,
|
| 151 |
+
"T6. Aggregation & Clustering": 0.6670419649341216,
|
| 152 |
+
"T7. Consistency & Compliance Checking": 0.5624760870987693,
|
| 153 |
+
"T8. Structured & Numeric Reasoning": 0.8055555555555556,
|
| 154 |
+
"T9. Version & Code Diff Analysis": 0.7817439995394256,
|
| 155 |
+
"T10. Rule Induction & In-Context Learning": 0.7638888888888888,
|
| 156 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.65
|
| 157 |
+
},
|
| 158 |
+
"language": {
|
| 159 |
+
"Chinese": 0.7321739274049238,
|
| 160 |
+
"English": 0.7187803895329472
|
| 161 |
+
}
|
| 162 |
+
},
|
| 163 |
+
"pass@3": 0.49666666666666665
|
| 164 |
+
}
|
results/DeepSeek-R1/thinking_context-120000_bon-3_summary.json
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"date": "2025-12-08",
|
| 3 |
+
"total_questions_num": 1500,
|
| 4 |
+
"inference_iterations": 3,
|
| 5 |
+
"total_samples_num": 4500,
|
| 6 |
+
"fail_samples_num": 0,
|
| 7 |
+
"inference_inconsistent_samples_num": 0,
|
| 8 |
+
"average_overall_metric": 0.6006714049681133,
|
| 9 |
+
"inference_iteration_1_overall_metric": 0.6007584621917721,
|
| 10 |
+
"inference_iteration_2_overall_metric": 0.5960043654782469,
|
| 11 |
+
"inference_iteration_3_overall_metric": 0.6052513872343173,
|
| 12 |
+
"average_token_length_metric": {
|
| 13 |
+
"8k": 0.6896237775198697,
|
| 14 |
+
"16k": 0.66847824761939,
|
| 15 |
+
"32k": 0.6242811862728697,
|
| 16 |
+
"64k": 0.5907117819226532,
|
| 17 |
+
"128k": 0.526720556197483,
|
| 18 |
+
"256k": 0.5042128802764103
|
| 19 |
+
},
|
| 20 |
+
"average_contextual_requirement_metric": {
|
| 21 |
+
"Full": 0.5734808170096616,
|
| 22 |
+
"Partial": 0.6352776078243236
|
| 23 |
+
},
|
| 24 |
+
"average_difficulty_metric": {
|
| 25 |
+
"Easy": 0.8244195631460464,
|
| 26 |
+
"Moderate": 0.5882837964508552,
|
| 27 |
+
"Hard": 0.5338546774181954,
|
| 28 |
+
"Extreme": 0.4075883160627708
|
| 29 |
+
},
|
| 30 |
+
"average_primary_task_metric": {
|
| 31 |
+
"T1. Retrieval & Ranking": 0.8460279171139484,
|
| 32 |
+
"T2. Sequencing & Structure Reconstruction": 0.7927840387644306,
|
| 33 |
+
"T3. Evidence-Grounded QA": 0.5666666666666665,
|
| 34 |
+
"T4. Summarization & Synthesis": 0.5315482688091906,
|
| 35 |
+
"T5. Attribution & Citation Alignment": 0.46763122932017526,
|
| 36 |
+
"T6. Aggregation & Clustering": 0.5661396588973091,
|
| 37 |
+
"T7. Consistency & Compliance Checking": 0.4411785360364781,
|
| 38 |
+
"T8. Structured & Numeric Reasoning": 0.6290123456790124,
|
| 39 |
+
"T9. Version & Code Diff Analysis": 0.7118775193966861,
|
| 40 |
+
"T10. Rule Induction & In-Context Learning": 0.6290277777777776,
|
| 41 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.5083333333333333
|
| 42 |
+
},
|
| 43 |
+
"average_language_metric": {
|
| 44 |
+
"Chinese": 0.5813030699136731,
|
| 45 |
+
"English": 0.6200397400225522
|
| 46 |
+
},
|
| 47 |
+
"BoN-1": {
|
| 48 |
+
"overall_metric": 0.6007584621917721,
|
| 49 |
+
"token_length": {
|
| 50 |
+
"8k": 0.6794004597378533,
|
| 51 |
+
"16k": 0.6605152745514365,
|
| 52 |
+
"32k": 0.637696287010787,
|
| 53 |
+
"64k": 0.6015965809771497,
|
| 54 |
+
"128k": 0.5259184504039809,
|
| 55 |
+
"256k": 0.49942372046943184
|
| 56 |
+
},
|
| 57 |
+
"contextual_requirement": {
|
| 58 |
+
"Full": 0.5724096385120696,
|
| 59 |
+
"Partial": 0.6368387832386698
|
| 60 |
+
},
|
| 61 |
+
"difficulty": {
|
| 62 |
+
"Easy": 0.8239541273708798,
|
| 63 |
+
"Moderate": 0.5859117167110014,
|
| 64 |
+
"Hard": 0.541012801830159,
|
| 65 |
+
"Extreme": 0.40525140462840953
|
| 66 |
+
},
|
| 67 |
+
"primary_task": {
|
| 68 |
+
"T1. Retrieval & Ranking": 0.8346607474979665,
|
| 69 |
+
"T2. Sequencing & Structure Reconstruction": 0.7960157843246078,
|
| 70 |
+
"T3. Evidence-Grounded QA": 0.5916666666666667,
|
| 71 |
+
"T4. Summarization & Synthesis": 0.5314348105743746,
|
| 72 |
+
"T5. Attribution & Citation Alignment": 0.46439938615714244,
|
| 73 |
+
"T6. Aggregation & Clustering": 0.5590113115895492,
|
| 74 |
+
"T7. Consistency & Compliance Checking": 0.4443221207730568,
|
| 75 |
+
"T8. Structured & Numeric Reasoning": 0.612962962962963,
|
| 76 |
+
"T9. Version & Code Diff Analysis": 0.7031087891880523,
|
| 77 |
+
"T10. Rule Induction & In-Context Learning": 0.6470833333333333,
|
| 78 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.5166666666666667
|
| 79 |
+
},
|
| 80 |
+
"language": {
|
| 81 |
+
"Chinese": 0.5732559357352887,
|
| 82 |
+
"English": 0.6282609886482589
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"pass@1": 0.3433333333333333,
|
| 86 |
+
"BoN-2": {
|
| 87 |
+
"overall_metric": 0.6591761776037405,
|
| 88 |
+
"token_length": {
|
| 89 |
+
"8k": 0.7392791599059808,
|
| 90 |
+
"16k": 0.7298238663653037,
|
| 91 |
+
"32k": 0.6777532203191601,
|
| 92 |
+
"64k": 0.6787515982515117,
|
| 93 |
+
"128k": 0.5821796500623371,
|
| 94 |
+
"256k": 0.5472695707181553
|
| 95 |
+
},
|
| 96 |
+
"contextual_requirement": {
|
| 97 |
+
"Full": 0.6319274531975012,
|
| 98 |
+
"Partial": 0.6938563723025926
|
| 99 |
+
},
|
| 100 |
+
"difficulty": {
|
| 101 |
+
"Easy": 0.8783150831551243,
|
| 102 |
+
"Moderate": 0.678502573566839,
|
| 103 |
+
"Hard": 0.5963536523109759,
|
| 104 |
+
"Extreme": 0.4476885160139848
|
| 105 |
+
},
|
| 106 |
+
"primary_task": {
|
| 107 |
+
"T1. Retrieval & Ranking": 0.8861689034562782,
|
| 108 |
+
"T2. Sequencing & Structure Reconstruction": 0.8374043195366726,
|
| 109 |
+
"T3. Evidence-Grounded QA": 0.625,
|
| 110 |
+
"T4. Summarization & Synthesis": 0.5455058096240588,
|
| 111 |
+
"T5. Attribution & Citation Alignment": 0.5369499475317325,
|
| 112 |
+
"T6. Aggregation & Clustering": 0.6381104251141014,
|
| 113 |
+
"T7. Consistency & Compliance Checking": 0.5019132623573087,
|
| 114 |
+
"T8. Structured & Numeric Reasoning": 0.699537037037037,
|
| 115 |
+
"T9. Version & Code Diff Analysis": 0.7605821531353517,
|
| 116 |
+
"T10. Rule Induction & In-Context Learning": 0.7220833333333334,
|
| 117 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.5666666666666667
|
| 118 |
+
},
|
| 119 |
+
"language": {
|
| 120 |
+
"Chinese": 0.6390711207410246,
|
| 121 |
+
"English": 0.6792812344664584
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"pass@2": 0.4093333333333333,
|
| 125 |
+
"BoN-3": {
|
| 126 |
+
"overall_metric": 0.6982292810132508,
|
| 127 |
+
"token_length": {
|
| 128 |
+
"8k": 0.783866781971667,
|
| 129 |
+
"16k": 0.7689573555512769,
|
| 130 |
+
"32k": 0.7150318293064207,
|
| 131 |
+
"64k": 0.707631405403529,
|
| 132 |
+
"128k": 0.6291537694328186,
|
| 133 |
+
"256k": 0.5847345444137952
|
| 134 |
+
},
|
| 135 |
+
"contextual_requirement": {
|
| 136 |
+
"Full": 0.6747138288729008,
|
| 137 |
+
"Partial": 0.7281580382827888
|
| 138 |
+
},
|
| 139 |
+
"difficulty": {
|
| 140 |
+
"Easy": 0.9164006912153285,
|
| 141 |
+
"Moderate": 0.7293351521236777,
|
| 142 |
+
"Hard": 0.6332713918179128,
|
| 143 |
+
"Extreme": 0.48146703898856563
|
| 144 |
+
},
|
| 145 |
+
"primary_task": {
|
| 146 |
+
"T1. Retrieval & Ranking": 0.9025424539049985,
|
| 147 |
+
"T2. Sequencing & Structure Reconstruction": 0.8813529526029528,
|
| 148 |
+
"T3. Evidence-Grounded QA": 0.6666666666666666,
|
| 149 |
+
"T4. Summarization & Synthesis": 0.5516760401143904,
|
| 150 |
+
"T5. Attribution & Citation Alignment": 0.5770877507616411,
|
| 151 |
+
"T6. Aggregation & Clustering": 0.6710035196738627,
|
| 152 |
+
"T7. Consistency & Compliance Checking": 0.5581333121650413,
|
| 153 |
+
"T8. Structured & Numeric Reasoning": 0.7560185185185184,
|
| 154 |
+
"T9. Version & Code Diff Analysis": 0.8001126786344129,
|
| 155 |
+
"T10. Rule Induction & In-Context Learning": 0.7456944444444444,
|
| 156 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.625
|
| 157 |
+
},
|
| 158 |
+
"language": {
|
| 159 |
+
"Chinese": 0.6861725652789407,
|
| 160 |
+
"English": 0.7102859967475628
|
| 161 |
+
}
|
| 162 |
+
},
|
| 163 |
+
"pass@3": 0.45866666666666667
|
| 164 |
+
}
|
results/DeepSeek-V3-0324/nonthinking_context-120000_bon-3_summary.json
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"date": "2025-12-08",
|
| 3 |
+
"total_questions_num": 1500,
|
| 4 |
+
"inference_iterations": 3,
|
| 5 |
+
"total_samples_num": 4500,
|
| 6 |
+
"fail_samples_num": 0,
|
| 7 |
+
"inference_inconsistent_samples_num": 0,
|
| 8 |
+
"average_overall_metric": 0.5169762636111047,
|
| 9 |
+
"inference_iteration_1_overall_metric": 0.5181528065498966,
|
| 10 |
+
"inference_iteration_2_overall_metric": 0.5148683077997773,
|
| 11 |
+
"inference_iteration_3_overall_metric": 0.5179076764836414,
|
| 12 |
+
"average_token_length_metric": {
|
| 13 |
+
"8k": 0.5649502252093578,
|
| 14 |
+
"16k": 0.5347800008319371,
|
| 15 |
+
"32k": 0.5556420045489457,
|
| 16 |
+
"64k": 0.5214603320658495,
|
| 17 |
+
"128k": 0.4864319755387441,
|
| 18 |
+
"256k": 0.4385930434717982
|
| 19 |
+
},
|
| 20 |
+
"average_contextual_requirement_metric": {
|
| 21 |
+
"Full": 0.477916384525216,
|
| 22 |
+
"Partial": 0.566688836993147
|
| 23 |
+
},
|
| 24 |
+
"average_difficulty_metric": {
|
| 25 |
+
"Easy": 0.6526369307346341,
|
| 26 |
+
"Moderate": 0.4967551723461267,
|
| 27 |
+
"Hard": 0.48299903154456436,
|
| 28 |
+
"Extreme": 0.4039646133594433
|
| 29 |
+
},
|
| 30 |
+
"average_primary_task_metric": {
|
| 31 |
+
"T1. Retrieval & Ranking": 0.7877679677233363,
|
| 32 |
+
"T2. Sequencing & Structure Reconstruction": 0.7655772360963239,
|
| 33 |
+
"T3. Evidence-Grounded QA": 0.5416666666666666,
|
| 34 |
+
"T4. Summarization & Synthesis": 0.5446412810076181,
|
| 35 |
+
"T5. Attribution & Citation Alignment": 0.5358172435200781,
|
| 36 |
+
"T6. Aggregation & Clustering": 0.4889882988114357,
|
| 37 |
+
"T7. Consistency & Compliance Checking": 0.3557205172395749,
|
| 38 |
+
"T8. Structured & Numeric Reasoning": 0.24367283950617283,
|
| 39 |
+
"T9. Version & Code Diff Analysis": 0.6358733797519817,
|
| 40 |
+
"T10. Rule Induction & In-Context Learning": 0.6043981481481482,
|
| 41 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.4138888888888889
|
| 42 |
+
},
|
| 43 |
+
"average_language_metric": {
|
| 44 |
+
"Chinese": 0.5177833454262655,
|
| 45 |
+
"English": 0.5161691817959461
|
| 46 |
+
},
|
| 47 |
+
"BoN-1": {
|
| 48 |
+
"overall_metric": 0.5181528065498966,
|
| 49 |
+
"token_length": {
|
| 50 |
+
"8k": 0.5625899513794126,
|
| 51 |
+
"16k": 0.5301053854931655,
|
| 52 |
+
"32k": 0.5507157770563107,
|
| 53 |
+
"64k": 0.5301772699202785,
|
| 54 |
+
"128k": 0.4898481902149404,
|
| 55 |
+
"256k": 0.44548026523527584
|
| 56 |
+
},
|
| 57 |
+
"contextual_requirement": {
|
| 58 |
+
"Full": 0.4761374399130544,
|
| 59 |
+
"Partial": 0.571626909542243
|
| 60 |
+
},
|
| 61 |
+
"difficulty": {
|
| 62 |
+
"Easy": 0.6469710156877221,
|
| 63 |
+
"Moderate": 0.5054097919629513,
|
| 64 |
+
"Hard": 0.48366558283696526,
|
| 65 |
+
"Extreme": 0.4080599930595185
|
| 66 |
+
},
|
| 67 |
+
"primary_task": {
|
| 68 |
+
"T1. Retrieval & Ranking": 0.7857412502114394,
|
| 69 |
+
"T2. Sequencing & Structure Reconstruction": 0.7680012781036988,
|
| 70 |
+
"T3. Evidence-Grounded QA": 0.5416666666666666,
|
| 71 |
+
"T4. Summarization & Synthesis": 0.5417591122015827,
|
| 72 |
+
"T5. Attribution & Citation Alignment": 0.5399240477985636,
|
| 73 |
+
"T6. Aggregation & Clustering": 0.4939990937730982,
|
| 74 |
+
"T7. Consistency & Compliance Checking": 0.3541605841917901,
|
| 75 |
+
"T8. Structured & Numeric Reasoning": 0.24212962962962964,
|
| 76 |
+
"T9. Version & Code Diff Analysis": 0.6325782099444311,
|
| 77 |
+
"T10. Rule Induction & In-Context Learning": 0.6151388888888889,
|
| 78 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667
|
| 79 |
+
},
|
| 80 |
+
"language": {
|
| 81 |
+
"Chinese": 0.5160793653421631,
|
| 82 |
+
"English": 0.5202262477576317
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"pass@1": 0.23666666666666666,
|
| 86 |
+
"BoN-2": {
|
| 87 |
+
"overall_metric": 0.5442270665213056,
|
| 88 |
+
"token_length": {
|
| 89 |
+
"8k": 0.594568981140541,
|
| 90 |
+
"16k": 0.5576674233487121,
|
| 91 |
+
"32k": 0.5818313481616337,
|
| 92 |
+
"64k": 0.5446955915354488,
|
| 93 |
+
"128k": 0.5245780781334816,
|
| 94 |
+
"256k": 0.4620209768080174
|
| 95 |
+
},
|
| 96 |
+
"contextual_requirement": {
|
| 97 |
+
"Full": 0.5071422110822568,
|
| 98 |
+
"Partial": 0.591425973443732
|
| 99 |
+
},
|
| 100 |
+
"difficulty": {
|
| 101 |
+
"Easy": 0.6783617155027842,
|
| 102 |
+
"Moderate": 0.5372286722751785,
|
| 103 |
+
"Hard": 0.5038179540985808,
|
| 104 |
+
"Extreme": 0.4284267679263639
|
| 105 |
+
},
|
| 106 |
+
"primary_task": {
|
| 107 |
+
"T1. Retrieval & Ranking": 0.8015300122926023,
|
| 108 |
+
"T2. Sequencing & Structure Reconstruction": 0.787322466174887,
|
| 109 |
+
"T3. Evidence-Grounded QA": 0.5666666666666667,
|
| 110 |
+
"T4. Summarization & Synthesis": 0.5591478963061529,
|
| 111 |
+
"T5. Attribution & Citation Alignment": 0.5635377708713467,
|
| 112 |
+
"T6. Aggregation & Clustering": 0.5151496972716835,
|
| 113 |
+
"T7. Consistency & Compliance Checking": 0.37875998396209754,
|
| 114 |
+
"T8. Structured & Numeric Reasoning": 0.29953703703703705,
|
| 115 |
+
"T9. Version & Code Diff Analysis": 0.6509912195762164,
|
| 116 |
+
"T10. Rule Induction & In-Context Learning": 0.6334722222222222,
|
| 117 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.45
|
| 118 |
+
},
|
| 119 |
+
"language": {
|
| 120 |
+
"Chinese": 0.5466945351249929,
|
| 121 |
+
"English": 0.5417595979176187
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"pass@2": 0.26066666666666666,
|
| 125 |
+
"BoN-3": {
|
| 126 |
+
"overall_metric": 0.5555999285609496,
|
| 127 |
+
"token_length": {
|
| 128 |
+
"8k": 0.6111433308773455,
|
| 129 |
+
"16k": 0.5701468581090299,
|
| 130 |
+
"32k": 0.5948383611006594,
|
| 131 |
+
"64k": 0.5601715262733609,
|
| 132 |
+
"128k": 0.5307382241597844,
|
| 133 |
+
"256k": 0.4665612708455233
|
| 134 |
+
},
|
| 135 |
+
"contextual_requirement": {
|
| 136 |
+
"Full": 0.5192835049000674,
|
| 137 |
+
"Partial": 0.6018208314020757
|
| 138 |
+
},
|
| 139 |
+
"difficulty": {
|
| 140 |
+
"Easy": 0.6899946609782567,
|
| 141 |
+
"Moderate": 0.55112703775723,
|
| 142 |
+
"Hard": 0.5166174713215149,
|
| 143 |
+
"Extreme": 0.4369188707412474
|
| 144 |
+
},
|
| 145 |
+
"primary_task": {
|
| 146 |
+
"T1. Retrieval & Ranking": 0.8147581778505403,
|
| 147 |
+
"T2. Sequencing & Structure Reconstruction": 0.8027788153812361,
|
| 148 |
+
"T3. Evidence-Grounded QA": 0.5916666666666667,
|
| 149 |
+
"T4. Summarization & Synthesis": 0.5641241115145229,
|
| 150 |
+
"T5. Attribution & Citation Alignment": 0.5716268051520625,
|
| 151 |
+
"T6. Aggregation & Clustering": 0.5308177346459271,
|
| 152 |
+
"T7. Consistency & Compliance Checking": 0.3957991644610091,
|
| 153 |
+
"T8. Structured & Numeric Reasoning": 0.29953703703703705,
|
| 154 |
+
"T9. Version & Code Diff Analysis": 0.6606747373420039,
|
| 155 |
+
"T10. Rule Induction & In-Context Learning": 0.6418055555555555,
|
| 156 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.4583333333333333
|
| 157 |
+
},
|
| 158 |
+
"language": {
|
| 159 |
+
"Chinese": 0.5602606340750268,
|
| 160 |
+
"English": 0.550939223046875
|
| 161 |
+
}
|
| 162 |
+
},
|
| 163 |
+
"pass@3": 0.272
|
| 164 |
+
}
|
results/DeepSeek-V3-0324/thinking_context-120000_bon-3_summary.json
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"date": "2025-12-08",
|
| 3 |
+
"total_questions_num": 1500,
|
| 4 |
+
"inference_iterations": 3,
|
| 5 |
+
"total_samples_num": 4500,
|
| 6 |
+
"fail_samples_num": 0,
|
| 7 |
+
"inference_inconsistent_samples_num": 0,
|
| 8 |
+
"average_overall_metric": 0.5670800708470047,
|
| 9 |
+
"inference_iteration_1_overall_metric": 0.5592880863605422,
|
| 10 |
+
"inference_iteration_2_overall_metric": 0.5704394040472569,
|
| 11 |
+
"inference_iteration_3_overall_metric": 0.5715127221332137,
|
| 12 |
+
"average_token_length_metric": {
|
| 13 |
+
"8k": 0.6286936796561847,
|
| 14 |
+
"16k": 0.6309027535519853,
|
| 15 |
+
"32k": 0.5969011989319307,
|
| 16 |
+
"64k": 0.5427727165403452,
|
| 17 |
+
"128k": 0.5275644070173147,
|
| 18 |
+
"256k": 0.4756456693842662
|
| 19 |
+
},
|
| 20 |
+
"average_contextual_requirement_metric": {
|
| 21 |
+
"Full": 0.5368141482212665,
|
| 22 |
+
"Partial": 0.605600336007035
|
| 23 |
+
},
|
| 24 |
+
"average_difficulty_metric": {
|
| 25 |
+
"Easy": 0.7920271132046242,
|
| 26 |
+
"Moderate": 0.5713738824882528,
|
| 27 |
+
"Hard": 0.4620098327210685,
|
| 28 |
+
"Extreme": 0.3868526000236004
|
| 29 |
+
},
|
| 30 |
+
"average_primary_task_metric": {
|
| 31 |
+
"T1. Retrieval & Ranking": 0.8062267128065881,
|
| 32 |
+
"T2. Sequencing & Structure Reconstruction": 0.7434196920363564,
|
| 33 |
+
"T3. Evidence-Grounded QA": 0.5111111111111112,
|
| 34 |
+
"T4. Summarization & Synthesis": 0.5146898974811284,
|
| 35 |
+
"T5. Attribution & Citation Alignment": 0.537979547816877,
|
| 36 |
+
"T6. Aggregation & Clustering": 0.5248106585111237,
|
| 37 |
+
"T7. Consistency & Compliance Checking": 0.3681804918962003,
|
| 38 |
+
"T8. Structured & Numeric Reasoning": 0.5841049382716048,
|
| 39 |
+
"T9. Version & Code Diff Analysis": 0.6466057172430281,
|
| 40 |
+
"T10. Rule Induction & In-Context Learning": 0.6183796296296297,
|
| 41 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.4944444444444444
|
| 42 |
+
},
|
| 43 |
+
"average_language_metric": {
|
| 44 |
+
"Chinese": 0.5527161660546936,
|
| 45 |
+
"English": 0.5814439756393159
|
| 46 |
+
},
|
| 47 |
+
"BoN-1": {
|
| 48 |
+
"overall_metric": 0.5592880863605422,
|
| 49 |
+
"token_length": {
|
| 50 |
+
"8k": 0.6182266355968743,
|
| 51 |
+
"16k": 0.6224911215102044,
|
| 52 |
+
"32k": 0.6006190818475612,
|
| 53 |
+
"64k": 0.5386968289399401,
|
| 54 |
+
"128k": 0.5299641908835836,
|
| 55 |
+
"256k": 0.4457306593850912
|
| 56 |
+
},
|
| 57 |
+
"contextual_requirement": {
|
| 58 |
+
"Full": 0.530042096148415,
|
| 59 |
+
"Partial": 0.5965102557214322
|
| 60 |
+
},
|
| 61 |
+
"difficulty": {
|
| 62 |
+
"Easy": 0.7861925110638937,
|
| 63 |
+
"Moderate": 0.563744851626512,
|
| 64 |
+
"Hard": 0.4535437025457247,
|
| 65 |
+
"Extreme": 0.377252152391456
|
| 66 |
+
},
|
| 67 |
+
"primary_task": {
|
| 68 |
+
"T1. Retrieval & Ranking": 0.7951398867629517,
|
| 69 |
+
"T2. Sequencing & Structure Reconstruction": 0.751023606023606,
|
| 70 |
+
"T3. Evidence-Grounded QA": 0.4666666666666667,
|
| 71 |
+
"T4. Summarization & Synthesis": 0.5136736975702932,
|
| 72 |
+
"T5. Attribution & Citation Alignment": 0.510566852786565,
|
| 73 |
+
"T6. Aggregation & Clustering": 0.5116905358775884,
|
| 74 |
+
"T7. Consistency & Compliance Checking": 0.37914696318940605,
|
| 75 |
+
"T8. Structured & Numeric Reasoning": 0.5847222222222223,
|
| 76 |
+
"T9. Version & Code Diff Analysis": 0.6690241210962063,
|
| 77 |
+
"T10. Rule Induction & In-Context Learning": 0.5966666666666666,
|
| 78 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.475
|
| 79 |
+
},
|
| 80 |
+
"language": {
|
| 81 |
+
"Chinese": 0.5492580351840802,
|
| 82 |
+
"English": 0.569318137537005
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"pass@1": 0.30666666666666664,
|
| 86 |
+
"BoN-2": {
|
| 87 |
+
"overall_metric": 0.6262109782348795,
|
| 88 |
+
"token_length": {
|
| 89 |
+
"8k": 0.6883768372773764,
|
| 90 |
+
"16k": 0.7011104482454619,
|
| 91 |
+
"32k": 0.647383679165818,
|
| 92 |
+
"64k": 0.6092322863406843,
|
| 93 |
+
"128k": 0.5965297187489229,
|
| 94 |
+
"256k": 0.5146328996310173
|
| 95 |
+
},
|
| 96 |
+
"contextual_requirement": {
|
| 97 |
+
"Full": 0.5949295492853987,
|
| 98 |
+
"Partial": 0.6660237059887659
|
| 99 |
+
},
|
| 100 |
+
"difficulty": {
|
| 101 |
+
"Easy": 0.8509257637159784,
|
| 102 |
+
"Moderate": 0.66205670168914,
|
| 103 |
+
"Hard": 0.5144444684176784,
|
| 104 |
+
"Extreme": 0.42991229790988206
|
| 105 |
+
},
|
| 106 |
+
"primary_task": {
|
| 107 |
+
"T1. Retrieval & Ranking": 0.8583478908910738,
|
| 108 |
+
"T2. Sequencing & Structure Reconstruction": 0.7814144189144188,
|
| 109 |
+
"T3. Evidence-Grounded QA": 0.5916666666666667,
|
| 110 |
+
"T4. Summarization & Synthesis": 0.5277169061289491,
|
| 111 |
+
"T5. Attribution & Citation Alignment": 0.5960013921722541,
|
| 112 |
+
"T6. Aggregation & Clustering": 0.5992683212004715,
|
| 113 |
+
"T7. Consistency & Compliance Checking": 0.4404830107556436,
|
| 114 |
+
"T8. Structured & Numeric Reasoning": 0.6166666666666667,
|
| 115 |
+
"T9. Version & Code Diff Analysis": 0.7117518441173546,
|
| 116 |
+
"T10. Rule Induction & In-Context Learning": 0.7094444444444444,
|
| 117 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.5666666666666667
|
| 118 |
+
},
|
| 119 |
+
"language": {
|
| 120 |
+
"Chinese": 0.6172660802474611,
|
| 121 |
+
"English": 0.6351558762222995
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"pass@2": 0.37133333333333335,
|
| 125 |
+
"BoN-3": {
|
| 126 |
+
"overall_metric": 0.656766091226133,
|
| 127 |
+
"token_length": {
|
| 128 |
+
"8k": 0.7136143676151803,
|
| 129 |
+
"16k": 0.723535463035206,
|
| 130 |
+
"32k": 0.676630607848943,
|
| 131 |
+
"64k": 0.6371662004749992,
|
| 132 |
+
"128k": 0.6230203333837121,
|
| 133 |
+
"256k": 0.5666295749987625
|
| 134 |
+
},
|
| 135 |
+
"contextual_requirement": {
|
| 136 |
+
"Full": 0.6274188340913682,
|
| 137 |
+
"Partial": 0.6941171457612914
|
| 138 |
+
},
|
| 139 |
+
"difficulty": {
|
| 140 |
+
"Easy": 0.8771215787680936,
|
| 141 |
+
"Moderate": 0.7073341252772278,
|
| 142 |
+
"Hard": 0.5445129149531545,
|
| 143 |
+
"Extreme": 0.4558925937418165
|
| 144 |
+
},
|
| 145 |
+
"primary_task": {
|
| 146 |
+
"T1. Retrieval & Ranking": 0.8681438231282913,
|
| 147 |
+
"T2. Sequencing & Structure Reconstruction": 0.8017822455322453,
|
| 148 |
+
"T3. Evidence-Grounded QA": 0.625,
|
| 149 |
+
"T4. Summarization & Synthesis": 0.5352432221044823,
|
| 150 |
+
"T5. Attribution & Citation Alignment": 0.6492191718976088,
|
| 151 |
+
"T6. Aggregation & Clustering": 0.6200836170157672,
|
| 152 |
+
"T7. Consistency & Compliance Checking": 0.4675370424175185,
|
| 153 |
+
"T8. Structured & Numeric Reasoning": 0.6592592592592593,
|
| 154 |
+
"T9. Version & Code Diff Analysis": 0.7376455774030053,
|
| 155 |
+
"T10. Rule Induction & In-Context Learning": 0.7722222222222221,
|
| 156 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.6
|
| 157 |
+
},
|
| 158 |
+
"language": {
|
| 159 |
+
"Chinese": 0.6466196520294557,
|
| 160 |
+
"English": 0.6669125304228131
|
| 161 |
+
}
|
| 162 |
+
},
|
| 163 |
+
"pass@3": 0.39866666666666667
|
| 164 |
+
}
|
results/DeepSeek-V3.1/nonthinking_context-120000_bon-3_summary.json
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"date": "2025-12-08",
|
| 3 |
+
"total_questions_num": 1500,
|
| 4 |
+
"inference_iterations": 3,
|
| 5 |
+
"total_samples_num": 4500,
|
| 6 |
+
"fail_samples_num": 0,
|
| 7 |
+
"inference_inconsistent_samples_num": 0,
|
| 8 |
+
"average_overall_metric": 0.513858634133048,
|
| 9 |
+
"inference_iteration_1_overall_metric": 0.5123343209652136,
|
| 10 |
+
"inference_iteration_2_overall_metric": 0.5169477472023125,
|
| 11 |
+
"inference_iteration_3_overall_metric": 0.5122938342316177,
|
| 12 |
+
"average_token_length_metric": {
|
| 13 |
+
"8k": 0.5798800160519532,
|
| 14 |
+
"16k": 0.557162234839459,
|
| 15 |
+
"32k": 0.5231647768475723,
|
| 16 |
+
"64k": 0.5020895430155518,
|
| 17 |
+
"128k": 0.47482295470763564,
|
| 18 |
+
"256k": 0.44603227933611866
|
| 19 |
+
},
|
| 20 |
+
"average_contextual_requirement_metric": {
|
| 21 |
+
"Full": 0.4799762392602454,
|
| 22 |
+
"Partial": 0.556981682152979
|
| 23 |
+
},
|
| 24 |
+
"average_difficulty_metric": {
|
| 25 |
+
"Easy": 0.6361477184952151,
|
| 26 |
+
"Moderate": 0.4879546435756716,
|
| 27 |
+
"Hard": 0.4929175841249406,
|
| 28 |
+
"Extreme": 0.4106651751804595
|
| 29 |
+
},
|
| 30 |
+
"average_primary_task_metric": {
|
| 31 |
+
"T1. Retrieval & Ranking": 0.7899439606505894,
|
| 32 |
+
"T2. Sequencing & Structure Reconstruction": 0.7633837790575241,
|
| 33 |
+
"T3. Evidence-Grounded QA": 0.5472222222222223,
|
| 34 |
+
"T4. Summarization & Synthesis": 0.5504457320532966,
|
| 35 |
+
"T5. Attribution & Citation Alignment": 0.5405950417654154,
|
| 36 |
+
"T6. Aggregation & Clustering": 0.4826886948879801,
|
| 37 |
+
"T7. Consistency & Compliance Checking": 0.3782532668616311,
|
| 38 |
+
"T8. Structured & Numeric Reasoning": 0.20864197530864195,
|
| 39 |
+
"T9. Version & Code Diff Analysis": 0.6426366556970474,
|
| 40 |
+
"T10. Rule Induction & In-Context Learning": 0.5235185185185185,
|
| 41 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.4611111111111112
|
| 42 |
+
},
|
| 43 |
+
"average_language_metric": {
|
| 44 |
+
"Chinese": 0.5242036714829956,
|
| 45 |
+
"English": 0.5035135967831006
|
| 46 |
+
},
|
| 47 |
+
"BoN-1": {
|
| 48 |
+
"overall_metric": 0.5123343209652136,
|
| 49 |
+
"token_length": {
|
| 50 |
+
"8k": 0.5775546994411811,
|
| 51 |
+
"16k": 0.5639037168302903,
|
| 52 |
+
"32k": 0.5253784942631851,
|
| 53 |
+
"64k": 0.4921258363031359,
|
| 54 |
+
"128k": 0.46961700213634144,
|
| 55 |
+
"256k": 0.4454261768171497
|
| 56 |
+
},
|
| 57 |
+
"contextual_requirement": {
|
| 58 |
+
"Full": 0.4805828046453364,
|
| 59 |
+
"Partial": 0.5527453417359676
|
| 60 |
+
},
|
| 61 |
+
"difficulty": {
|
| 62 |
+
"Easy": 0.6289501731523275,
|
| 63 |
+
"Moderate": 0.48863452317355005,
|
| 64 |
+
"Hard": 0.49859679584956984,
|
| 65 |
+
"Extreme": 0.4091764699789028
|
| 66 |
+
},
|
| 67 |
+
"primary_task": {
|
| 68 |
+
"T1. Retrieval & Ranking": 0.7832093443146274,
|
| 69 |
+
"T2. Sequencing & Structure Reconstruction": 0.758390884975293,
|
| 70 |
+
"T3. Evidence-Grounded QA": 0.525,
|
| 71 |
+
"T4. Summarization & Synthesis": 0.5526444826651147,
|
| 72 |
+
"T5. Attribution & Citation Alignment": 0.550721576382896,
|
| 73 |
+
"T6. Aggregation & Clustering": 0.49314760273206115,
|
| 74 |
+
"T7. Consistency & Compliance Checking": 0.36201890631349554,
|
| 75 |
+
"T8. Structured & Numeric Reasoning": 0.19351851851851853,
|
| 76 |
+
"T9. Version & Code Diff Analysis": 0.6396574046033499,
|
| 77 |
+
"T10. Rule Induction & In-Context Learning": 0.5548611111111109,
|
| 78 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.4666666666666667
|
| 79 |
+
},
|
| 80 |
+
"language": {
|
| 81 |
+
"Chinese": 0.5269577477981102,
|
| 82 |
+
"English": 0.49771089413231795
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"pass@1": 0.24,
|
| 86 |
+
"BoN-2": {
|
| 87 |
+
"overall_metric": 0.5575639036526701,
|
| 88 |
+
"token_length": {
|
| 89 |
+
"8k": 0.6295556911757817,
|
| 90 |
+
"16k": 0.600201088330424,
|
| 91 |
+
"32k": 0.568079050385207,
|
| 92 |
+
"64k": 0.540874671214598,
|
| 93 |
+
"128k": 0.5142278846286528,
|
| 94 |
+
"256k": 0.4924450361813656
|
| 95 |
+
},
|
| 96 |
+
"contextual_requirement": {
|
| 97 |
+
"Full": 0.5270669681600962,
|
| 98 |
+
"Partial": 0.596378185188677
|
| 99 |
+
},
|
| 100 |
+
"difficulty": {
|
| 101 |
+
"Easy": 0.6861712816104073,
|
| 102 |
+
"Moderate": 0.5385181729697447,
|
| 103 |
+
"Hard": 0.5320539370618487,
|
| 104 |
+
"Extreme": 0.4459453589628649
|
| 105 |
+
},
|
| 106 |
+
"primary_task": {
|
| 107 |
+
"T1. Retrieval & Ranking": 0.7998040423032986,
|
| 108 |
+
"T2. Sequencing & Structure Reconstruction": 0.7974421612945821,
|
| 109 |
+
"T3. Evidence-Grounded QA": 0.6083333333333333,
|
| 110 |
+
"T4. Summarization & Synthesis": 0.5644516725623473,
|
| 111 |
+
"T5. Attribution & Citation Alignment": 0.603125221836478,
|
| 112 |
+
"T6. Aggregation & Clustering": 0.5375390883957035,
|
| 113 |
+
"T7. Consistency & Compliance Checking": 0.42616915873421196,
|
| 114 |
+
"T8. Structured & Numeric Reasoning": 0.2569444444444444,
|
| 115 |
+
"T9. Version & Code Diff Analysis": 0.6918022158557025,
|
| 116 |
+
"T10. Rule Induction & In-Context Learning": 0.5736111111111112,
|
| 117 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.5
|
| 118 |
+
},
|
| 119 |
+
"language": {
|
| 120 |
+
"Chinese": 0.5701747476373757,
|
| 121 |
+
"English": 0.5449530596679679
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"pass@2": 0.278,
|
| 125 |
+
"BoN-3": {
|
| 126 |
+
"overall_metric": 0.5783189478755758,
|
| 127 |
+
"token_length": {
|
| 128 |
+
"8k": 0.6444603085006093,
|
| 129 |
+
"16k": 0.6168368410260194,
|
| 130 |
+
"32k": 0.591356956659339,
|
| 131 |
+
"64k": 0.5741881409846333,
|
| 132 |
+
"128k": 0.5329008044819139,
|
| 133 |
+
"256k": 0.5101706356009489
|
| 134 |
+
},
|
| 135 |
+
"contextual_requirement": {
|
| 136 |
+
"Full": 0.5453502508711524,
|
| 137 |
+
"Partial": 0.6202791076993917
|
| 138 |
+
},
|
| 139 |
+
"difficulty": {
|
| 140 |
+
"Easy": 0.700691507162887,
|
| 141 |
+
"Moderate": 0.5586892040625308,
|
| 142 |
+
"Hard": 0.5577221017016739,
|
| 143 |
+
"Extreme": 0.47068692726136246
|
| 144 |
+
},
|
| 145 |
+
"primary_task": {
|
| 146 |
+
"T1. Retrieval & Ranking": 0.8323002597882068,
|
| 147 |
+
"T2. Sequencing & Structure Reconstruction": 0.8148032724056932,
|
| 148 |
+
"T3. Evidence-Grounded QA": 0.625,
|
| 149 |
+
"T4. Summarization & Synthesis": 0.5726119582175254,
|
| 150 |
+
"T5. Attribution & Citation Alignment": 0.625566004790705,
|
| 151 |
+
"T6. Aggregation & Clustering": 0.5576354028999125,
|
| 152 |
+
"T7. Consistency & Compliance Checking": 0.4599095603777464,
|
| 153 |
+
"T8. Structured & Numeric Reasoning": 0.2736111111111111,
|
| 154 |
+
"T9. Version & Code Diff Analysis": 0.7066934638816516,
|
| 155 |
+
"T10. Rule Induction & In-Context Learning": 0.5819444444444444,
|
| 156 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333333
|
| 157 |
+
},
|
| 158 |
+
"language": {
|
| 159 |
+
"Chinese": 0.5882921354915299,
|
| 160 |
+
"English": 0.5683457602596252
|
| 161 |
+
}
|
| 162 |
+
},
|
| 163 |
+
"pass@3": 0.3
|
| 164 |
+
}
|
results/DeepSeek-V3.1/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 8,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.6621817899708398,
  "inference_iteration_1_overall_metric": 0.6612230154154042,
  "inference_iteration_2_overall_metric": 0.6610111426397741,
  "inference_iteration_3_overall_metric": 0.6643112118573413,
  "average_token_length_metric": {
    "8k": 0.7494820895775017,
    "16k": 0.7158886748078707,
    "32k": 0.668616684861116,
    "64k": 0.7028333128738413,
    "128k": 0.6150251691532579,
    "256k": 0.5212448085514543
  },
  "average_contextual_requirement_metric": {
    "Full": 0.6306995662884327,
    "Partial": 0.7022500746575411
  },
  "average_difficulty_metric": {
    "Easy": 0.8572162949660228,
    "Moderate": 0.7353266184513482,
    "Hard": 0.622190936275892,
    "Extreme": 0.4267542215146938
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.8547667520039813,
    "T2. Sequencing & Structure Reconstruction": 0.8502990373823708,
    "T3. Evidence-Grounded QA": 0.5944444444444446,
    "T4. Summarization & Synthesis": 0.5592502973941748,
    "T5. Attribution & Citation Alignment": 0.664951753589773,
    "T6. Aggregation & Clustering": 0.6143401320362227,
    "T7. Consistency & Compliance Checking": 0.5020434872004602,
    "T8. Structured & Numeric Reasoning": 0.7200617283950619,
    "T9. Version & Code Diff Analysis": 0.7327346609657337,
    "T10. Rule Induction & In-Context Learning": 0.6883796296296296,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.5777777777777778
  },
  "average_language_metric": {
    "Chinese": 0.6626168849921547,
    "English": 0.6617466949495263
  },
  "BoN-1": {
    "overall_metric": 0.6612230154154042,
    "token_length": {
      "8k": 0.7464911564030304,
      "16k": 0.7250299866543051,
      "32k": 0.658322634935698,
      "64k": 0.7169507057254954,
      "128k": 0.6020278750216188,
      "256k": 0.5185157337522811
    },
    "contextual_requirement": {
      "Full": 0.6306907002796491,
      "Partial": 0.7000823255881854
    },
    "difficulty": {
      "Easy": 0.8474131666511834,
      "Moderate": 0.7235563816620875,
      "Hard": 0.6392066996365914,
      "Extreme": 0.4307791961407244
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.858023793743841,
      "T2. Sequencing & Structure Reconstruction": 0.8565782365782365,
      "T3. Evidence-Grounded QA": 0.6083333333333333,
      "T4. Summarization & Synthesis": 0.5579680390729822,
      "T5. Attribution & Citation Alignment": 0.6494356501600668,
      "T6. Aggregation & Clustering": 0.6299150042042198,
      "T7. Consistency & Compliance Checking": 0.5158224119304962,
      "T8. Structured & Numeric Reasoning": 0.6930555555555555,
      "T9. Version & Code Diff Analysis": 0.7300925156020261,
      "T10. Rule Induction & In-Context Learning": 0.6966666666666667,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.55
    },
    "language": {
      "Chinese": 0.6664246885361451,
      "English": 0.6560213422946652
    }
  },
  "pass@1": 0.44533333333333336,
  "BoN-2": {
    "overall_metric": 0.7247597749822192,
    "token_length": {
      "8k": 0.7863113902541472,
      "16k": 0.7674558325218811,
      "32k": 0.7281178415241295,
      "64k": 0.7821445768686631,
      "128k": 0.6970380441541918,
      "256k": 0.5874909645703058
    },
    "contextual_requirement": {
      "Full": 0.6960065402743163,
      "Partial": 0.7613548009740968
    },
    "difficulty": {
      "Easy": 0.9116793007094617,
      "Moderate": 0.8131496116751571,
      "Hard": 0.7011503473822069,
      "Extreme": 0.47744898037225214
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.8830939332648011,
      "T2. Sequencing & Structure Reconstruction": 0.8806919931919931,
      "T3. Evidence-Grounded QA": 0.6916666666666667,
      "T4. Summarization & Synthesis": 0.5727671170579177,
      "T5. Attribution & Citation Alignment": 0.7446276254629812,
      "T6. Aggregation & Clustering": 0.683646617964251,
      "T7. Consistency & Compliance Checking": 0.5837976903893948,
      "T8. Structured & Numeric Reasoning": 0.7902777777777777,
      "T9. Version & Code Diff Analysis": 0.7892333891029187,
      "T10. Rule Induction & In-Context Learning": 0.7691666666666667,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.6416666666666667
    },
    "language": {
      "Chinese": 0.7206824886096936,
      "English": 0.7288370613547459
    }
  },
  "pass@2": 0.5166666666666667,
  "BoN-3": {
    "overall_metric": 0.7535692341250043,
    "token_length": {
      "8k": 0.8201462848118084,
      "16k": 0.7948417017172265,
      "32k": 0.7537778254297324,
      "64k": 0.8032112869269382,
      "128k": 0.7298931033645517,
      "256k": 0.6195452024997666
    },
    "contextual_requirement": {
      "Full": 0.726098496655499,
      "Partial": 0.7885319909043751
    },
    "difficulty": {
      "Easy": 0.9344410438349966,
      "Moderate": 0.844684360754373,
      "Hard": 0.7404763342771397,
      "Extreme": 0.5041859708975693
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.9022474827372764,
      "T2. Sequencing & Structure Reconstruction": 0.8967201779701777,
      "T3. Evidence-Grounded QA": 0.7333333333333333,
      "T4. Summarization & Synthesis": 0.5812947594742044,
      "T5. Attribution & Citation Alignment": 0.7893345155800883,
      "T6. Aggregation & Clustering": 0.7043090297107774,
      "T7. Consistency & Compliance Checking": 0.6227060379432763,
      "T8. Structured & Numeric Reasoning": 0.8129629629629629,
      "T9. Version & Code Diff Analysis": 0.8129681115419459,
      "T10. Rule Induction & In-Context Learning": 0.79375,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.7
    },
    "language": {
      "Chinese": 0.7447219230644647,
      "English": 0.7624165451855438
    }
  },
  "pass@3": 0.552
}
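Every summary file added in this commit follows the same schema: run metadata and sample counts at the top, averaged metrics broken down by token length, contextual requirement, difficulty, primary task, and language, plus per-budget `BoN-1`/`BoN-2`/`BoN-3` blocks and `pass@k` scores. The sketch below is not taken from the repository's app.py; it is a minimal, assumed example of how such a file could be flattened into a leaderboard row, and the helper name `load_summary_row` and the chosen columns are illustrative only.

```python
# Minimal sketch (assumed schema as shown in the diffs above); the helper name
# and selected columns are illustrative and not part of the Space's app.py.
import json
from pathlib import Path


def load_summary_row(path: Path) -> dict:
    """Flatten one *_summary.json into a single leaderboard row."""
    data = json.loads(path.read_text(encoding="utf-8"))
    return {
        "model": path.parent.name,   # e.g. "DeepSeek-V3.1"
        "setting": path.stem,        # e.g. "thinking_context-120000_bon-3_summary"
        "overall": data["average_overall_metric"],
        "pass@1": data["pass@1"],
        "bon3_overall": data["BoN-3"]["overall_metric"],
    }


if __name__ == "__main__":
    rows = [load_summary_row(p) for p in Path("results").glob("*/*_summary.json")]
    for row in sorted(rows, key=lambda r: r["overall"], reverse=True):
        print(f"{row['model']:<28} overall={row['overall']:.4f} pass@1={row['pass@1']:.3f}")
```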
results/DeepSeek-V3.2/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 0,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.5167049903246114,
  "inference_iteration_1_overall_metric": 0.5175993160365915,
  "inference_iteration_2_overall_metric": 0.5135807596157895,
  "inference_iteration_3_overall_metric": 0.5189348953214519,
  "average_token_length_metric": {
    "8k": 0.5691616296699119,
    "16k": 0.5676134549372556,
    "32k": 0.5289760437098003,
    "64k": 0.5015696259811485,
    "128k": 0.4857001095282947,
    "256k": 0.44720907812125815
  },
  "average_contextual_requirement_metric": {
    "Full": 0.48158748704864085,
    "Partial": 0.5613999944940294
  },
  "average_difficulty_metric": {
    "Easy": 0.6212240545937193,
    "Moderate": 0.5136346834318135,
    "Hard": 0.5163049021668649,
    "Extreme": 0.4044885248516518
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.7713866456415331,
    "T2. Sequencing & Structure Reconstruction": 0.762687420604087,
    "T3. Evidence-Grounded QA": 0.5333333333333333,
    "T4. Summarization & Synthesis": 0.5515992544844097,
    "T5. Attribution & Citation Alignment": 0.5944535310423185,
    "T6. Aggregation & Clustering": 0.4789878465188747,
    "T7. Consistency & Compliance Checking": 0.39465891182344254,
    "T8. Structured & Numeric Reasoning": 0.2149691358024691,
    "T9. Version & Code Diff Analysis": 0.6451135379199672,
    "T10. Rule Induction & In-Context Learning": 0.49787037037037035,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.46944444444444444
  },
  "average_language_metric": {
    "Chinese": 0.5273254322066815,
    "English": 0.5060845484425422
  },
  "BoN-1": {
    "overall_metric": 0.5175993160365915,
    "token_length": {
      "8k": 0.5773249164903291,
      "16k": 0.5604627326667504,
      "32k": 0.5374659261246943,
      "64k": 0.5074991434398594,
      "128k": 0.4702901190904467,
      "256k": 0.45255305840747634
    },
    "contextual_requirement": {
      "Full": 0.4801143243240927,
      "Partial": 0.5653074873070485
    },
    "difficulty": {
      "Easy": 0.6321632804118208,
      "Moderate": 0.5036215268809261,
      "Hard": 0.5164886663122836,
      "Extreme": 0.40201006150807705
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.7671950408606149,
      "T2. Sequencing & Structure Reconstruction": 0.7640499777999779,
      "T3. Evidence-Grounded QA": 0.5833333333333334,
      "T4. Summarization & Synthesis": 0.5488326711816013,
      "T5. Attribution & Citation Alignment": 0.58422686569879,
      "T6. Aggregation & Clustering": 0.47446486157270457,
      "T7. Consistency & Compliance Checking": 0.3846286380881271,
      "T8. Structured & Numeric Reasoning": 0.2226851851851852,
      "T9. Version & Code Diff Analysis": 0.6587133120918426,
      "T10. Rule Induction & In-Context Learning": 0.5076388888888889,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.43333333333333335
    },
    "language": {
      "Chinese": 0.5202198683535451,
      "English": 0.5149787637196416
    }
  },
  "pass@1": 0.24533333333333332,
  "BoN-2": {
    "overall_metric": 0.5858993921620037,
    "token_length": {
      "8k": 0.6332517244583726,
      "16k": 0.6310887297252004,
      "32k": 0.6114096243459353,
      "64k": 0.5723573466459502,
      "128k": 0.5523069746695444,
      "256k": 0.5149819531270248
    },
    "contextual_requirement": {
      "Full": 0.5499254609160144,
      "Partial": 0.6316843955659928
    },
    "difficulty": {
      "Easy": 0.6997370485006186,
      "Moderate": 0.578931807315116,
      "Hard": 0.59328992133015,
      "Extreme": 0.46091761656187924
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.8076556299673503,
      "T2. Sequencing & Structure Reconstruction": 0.8115853128353127,
      "T3. Evidence-Grounded QA": 0.65,
      "T4. Summarization & Synthesis": 0.5673030178914519,
      "T5. Attribution & Citation Alignment": 0.6729248677257385,
      "T6. Aggregation & Clustering": 0.5463889541830715,
      "T7. Consistency & Compliance Checking": 0.4736347042901523,
      "T8. Structured & Numeric Reasoning": 0.2773148148148148,
      "T9. Version & Code Diff Analysis": 0.7103491970064771,
      "T10. Rule Induction & In-Context Learning": 0.5745833333333332,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.5833333333333334
    },
    "language": {
      "Chinese": 0.5980713707246905,
      "English": 0.5737274135993191
    }
  },
  "pass@2": 0.312,
  "BoN-3": {
    "overall_metric": 0.6250975643039252,
    "token_length": {
      "8k": 0.6624896013016575,
      "16k": 0.667371637798531,
      "32k": 0.6458484887458542,
      "64k": 0.619893995338142,
      "128k": 0.5995137452171235,
      "256k": 0.5554679174222451
    },
    "contextual_requirement": {
      "Full": 0.5850937206249248,
      "Partial": 0.6760115471681097
    },
    "difficulty": {
      "Easy": 0.7532527253719965,
      "Moderate": 0.6215376875723243,
      "Hard": 0.6270141217985696,
      "Extreme": 0.48578877254181324
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.8312391426962816,
      "T2. Sequencing & Structure Reconstruction": 0.8401638639138637,
      "T3. Evidence-Grounded QA": 0.725,
      "T4. Summarization & Synthesis": 0.5737811360033489,
      "T5. Attribution & Citation Alignment": 0.7032234702446885,
      "T6. Aggregation & Clustering": 0.5739851542792719,
      "T7. Consistency & Compliance Checking": 0.5085181572307016,
      "T8. Structured & Numeric Reasoning": 0.33425925925925926,
      "T9. Version & Code Diff Analysis": 0.7396125292314827,
      "T10. Rule Induction & In-Context Learning": 0.6588888888888889,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.6166666666666667
    },
    "language": {
      "Chinese": 0.6402707582944462,
      "English": 0.6099243703134061
    }
  },
  "pass@3": 0.3526666666666667
}
results/DeepSeek-V3.2/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 0,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.6782077426413915,
  "inference_iteration_1_overall_metric": 0.671629754030229,
  "inference_iteration_2_overall_metric": 0.6777556491690084,
  "inference_iteration_3_overall_metric": 0.6852378247249357,
  "average_token_length_metric": {
    "8k": 0.755369154280727,
    "16k": 0.7449467265987637,
    "32k": 0.6953336880653428,
    "64k": 0.6946800210314833,
    "128k": 0.6477035080898761,
    "256k": 0.5312133577821595
  },
  "average_contextual_requirement_metric": {
    "Full": 0.6459619783148297,
    "Partial": 0.7192478063297452
  },
  "average_difficulty_metric": {
    "Easy": 0.8502179380964533,
    "Moderate": 0.7507860067400632,
    "Hard": 0.6772551692268365,
    "Extreme": 0.4427333362390087
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.8628300416379104,
    "T2. Sequencing & Structure Reconstruction": 0.8633894500561163,
    "T3. Evidence-Grounded QA": 0.6277777777777778,
    "T4. Summarization & Synthesis": 0.5645627813985595,
    "T5. Attribution & Citation Alignment": 0.7367830500533472,
    "T6. Aggregation & Clustering": 0.6168551563610202,
    "T7. Consistency & Compliance Checking": 0.5431477714039084,
    "T8. Structured & Numeric Reasoning": 0.6640432098765434,
    "T9. Version & Code Diff Analysis": 0.7821104015574073,
    "T10. Rule Induction & In-Context Learning": 0.6818518518518517,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.622222222222222
  },
  "average_language_metric": {
    "Chinese": 0.6775197474946019,
    "English": 0.6788957377881832
  },
  "BoN-1": {
    "overall_metric": 0.671629754030229,
    "token_length": {
      "8k": 0.7397920433374541,
      "16k": 0.7269924173975423,
      "32k": 0.7007145536231846,
      "64k": 0.6696695962094932,
      "128k": 0.655131428243527,
      "256k": 0.5374784853701818
    },
    "contextual_requirement": {
      "Full": 0.6384885103680042,
      "Partial": 0.7138095186912471
    },
    "difficulty": {
      "Easy": 0.8524722619644284,
      "Moderate": 0.7391126766592921,
      "Hard": 0.6576844523580323,
      "Extreme": 0.4383605238465565
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.8608914983834914,
      "T2. Sequencing & Structure Reconstruction": 0.8649913049913043,
      "T3. Evidence-Grounded QA": 0.6166666666666667,
      "T4. Summarization & Synthesis": 0.5629979913624931,
      "T5. Attribution & Citation Alignment": 0.727192234350903,
      "T6. Aggregation & Clustering": 0.6142105299342737,
      "T7. Consistency & Compliance Checking": 0.5448247044942024,
      "T8. Structured & Numeric Reasoning": 0.6222222222222223,
      "T9. Version & Code Diff Analysis": 0.7868571557580843,
      "T10. Rule Induction & In-Context Learning": 0.7038888888888889,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.6
    },
    "language": {
      "Chinese": 0.6695591460858414,
      "English": 0.6737003619746216
    }
  },
  "pass@1": 0.442,
  "BoN-2": {
    "overall_metric": 0.739062545668539,
    "token_length": {
      "8k": 0.7985986496818439,
      "16k": 0.8081758798304621,
      "32k": 0.7769565912085228,
      "64k": 0.7312553059908137,
      "128k": 0.7215196949254835,
      "256k": 0.5978691523741125
    },
    "contextual_requirement": {
      "Full": 0.7083303283149,
      "Partial": 0.7781762768459
    },
    "difficulty": {
      "Easy": 0.9085052195225997,
      "Moderate": 0.8289156864861236,
      "Hard": 0.7590532764337315,
      "Extreme": 0.4812983463842697
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.8977317620746387,
      "T2. Sequencing & Structure Reconstruction": 0.9110066322566314,
      "T3. Evidence-Grounded QA": 0.725,
      "T4. Summarization & Synthesis": 0.5777725397440469,
      "T5. Attribution & Citation Alignment": 0.807338444836897,
      "T6. Aggregation & Clustering": 0.6747522573464864,
      "T7. Consistency & Compliance Checking": 0.6096717826867489,
      "T8. Structured & Numeric Reasoning": 0.7398148148148148,
      "T9. Version & Code Diff Analysis": 0.8172408263391231,
      "T10. Rule Induction & In-Context Learning": 0.7575,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.7083333333333334
    },
    "language": {
      "Chinese": 0.7379779669884229,
      "English": 0.7401471243486567
    }
  },
  "pass@2": 0.5286666666666666,
  "BoN-3": {
    "overall_metric": 0.769484517884144,
    "token_length": {
      "8k": 0.8183989834849181,
      "16k": 0.8287125912826261,
      "32k": 0.8000120047613876,
      "64k": 0.7929440904260506,
      "128k": 0.761968193898885,
      "256k": 0.6148712434509996
    },
    "contextual_requirement": {
      "Full": 0.7396412839189731,
      "Partial": 0.8074668156579995
    },
    "difficulty": {
      "Easy": 0.9315740410553816,
      "Moderate": 0.8685767211690967,
      "Hard": 0.8013938214601453,
      "Extreme": 0.5058786414037999
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.904701887970005,
      "T2. Sequencing & Structure Reconstruction": 0.9204745254745246,
      "T3. Evidence-Grounded QA": 0.7583333333333333,
      "T4. Summarization & Synthesis": 0.5854934073644178,
      "T5. Attribution & Citation Alignment": 0.8343370156947629,
      "T6. Aggregation & Clustering": 0.6936792642304824,
      "T7. Consistency & Compliance Checking": 0.6489698568119598,
      "T8. Structured & Numeric Reasoning": 0.789814814814815,
      "T9. Version & Code Diff Analysis": 0.8323537332622081,
      "T10. Rule Induction & In-Context Learning": 0.8091666666666666,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.775
    },
    "language": {
      "Chinese": 0.76992326145198,
      "English": 0.7690457743163093
    }
  },
  "pass@3": 0.572
}
results/GLM-4.5/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 0,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.43035083788419254,
  "inference_iteration_1_overall_metric": 0.4323298899239496,
  "inference_iteration_2_overall_metric": 0.42711968234411496,
  "inference_iteration_3_overall_metric": 0.43160294138451205,
  "average_token_length_metric": {
    "8k": 0.5314683958937002,
    "16k": 0.49409349830535854,
    "32k": 0.5016963643416883,
    "64k": 0.41773091498134723,
    "128k": 0.3415048455402667,
    "256k": 0.2956110082427921
  },
  "average_contextual_requirement_metric": {
    "Full": 0.40615033864921796,
    "Partial": 0.46115147327415856
  },
  "average_difficulty_metric": {
    "Easy": 0.5567819400237102,
    "Moderate": 0.36920960557839383,
    "Hard": 0.4020802519560651,
    "Extreme": 0.3505786202440932
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.698931526590424,
    "T2. Sequencing & Structure Reconstruction": 0.6796812769380679,
    "T3. Evidence-Grounded QA": 0.4333333333333333,
    "T4. Summarization & Synthesis": 0.5382688702946171,
    "T5. Attribution & Citation Alignment": 0.46682998936207576,
    "T6. Aggregation & Clustering": 0.3861931118799582,
    "T7. Consistency & Compliance Checking": 0.2699805385521367,
    "T8. Structured & Numeric Reasoning": 0.19089506172839513,
    "T9. Version & Code Diff Analysis": 0.5555800013857407,
    "T10. Rule Induction & In-Context Learning": 0.4250462962962961,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.3111111111111111
  },
  "average_language_metric": {
    "Chinese": 0.4302365847397278,
    "English": 0.4304650910286559
  },
  "BoN-1": {
    "overall_metric": 0.4323298899239496,
    "token_length": {
      "8k": 0.5493363038285968,
      "16k": 0.5003136640414932,
      "32k": 0.48767860893851595,
      "64k": 0.427526424572054,
      "128k": 0.3355553710370271,
      "256k": 0.29356896712600833
    },
    "contextual_requirement": {
      "Full": 0.4069098702566493,
      "Partial": 0.4646826422277853
    },
    "difficulty": {
      "Easy": 0.549387464064302,
      "Moderate": 0.38289221280334496,
      "Hard": 0.4057019361680662,
      "Extreme": 0.35405992762316457
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.6989719229147292,
      "T2. Sequencing & Structure Reconstruction": 0.6906849631849629,
      "T3. Evidence-Grounded QA": 0.4583333333333333,
      "T4. Summarization & Synthesis": 0.5384461382258066,
      "T5. Attribution & Citation Alignment": 0.4818936352277487,
      "T6. Aggregation & Clustering": 0.37379481108384327,
      "T7. Consistency & Compliance Checking": 0.26307329406050667,
      "T8. Structured & Numeric Reasoning": 0.19120370370370374,
      "T9. Version & Code Diff Analysis": 0.5459081401129251,
      "T10. Rule Induction & In-Context Learning": 0.43111111111111106,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665
    },
    "language": {
      "Chinese": 0.42606506993377413,
      "English": 0.4385947099141243
    }
  },
  "pass@1": 0.188,
  "BoN-2": {
    "overall_metric": 0.5016598810015709,
    "token_length": {
      "8k": 0.6177853878601806,
      "16k": 0.5606584472775635,
      "32k": 0.5705417377303723,
      "64k": 0.49224051142500497,
      "128k": 0.4148610132238459,
      "256k": 0.3538721884924622
    },
    "contextual_requirement": {
      "Full": 0.4781661168763403,
      "Partial": 0.531561035342776
    },
    "difficulty": {
      "Easy": 0.6469329658307705,
      "Moderate": 0.4476620656378584,
      "Hard": 0.47548119381384835,
      "Extreme": 0.39518120452359784
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.7612773821912874,
      "T2. Sequencing & Structure Reconstruction": 0.7455342342842339,
      "T3. Evidence-Grounded QA": 0.5416666666666666,
      "T4. Summarization & Synthesis": 0.5515008189706921,
      "T5. Attribution & Citation Alignment": 0.5415377453921782,
      "T6. Aggregation & Clustering": 0.46179351546743685,
      "T7. Consistency & Compliance Checking": 0.33666684704302524,
      "T8. Structured & Numeric Reasoning": 0.24537037037037035,
      "T9. Version & Code Diff Analysis": 0.6269577879155583,
      "T10. Rule Induction & In-Context Learning": 0.5281944444444444,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333
    },
    "language": {
      "Chinese": 0.48816204809318214,
      "English": 0.5151577139099619
    }
  },
  "pass@2": 0.23933333333333334,
  "BoN-3": {
    "overall_metric": 0.5384969000760513,
    "token_length": {
      "8k": 0.6356799930117969,
      "16k": 0.5929083561253189,
      "32k": 0.6076554475255213,
      "64k": 0.5450784408185039,
      "128k": 0.4563507575799897,
      "256k": 0.3933084053951815
    },
    "contextual_requirement": {
      "Full": 0.5134959721986003,
      "Partial": 0.5703162628291728
    },
    "difficulty": {
      "Easy": 0.6991002895184023,
      "Moderate": 0.4873593462709992,
      "Hard": 0.49915491813734847,
      "Extreme": 0.4219917912549902
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.7936494394174134,
      "T2. Sequencing & Structure Reconstruction": 0.7682061919561918,
      "T3. Evidence-Grounded QA": 0.6,
      "T4. Summarization & Synthesis": 0.5604787610213925,
      "T5. Attribution & Citation Alignment": 0.5757231079007503,
      "T6. Aggregation & Clustering": 0.4903242104980166,
      "T7. Consistency & Compliance Checking": 0.3811432372555578,
      "T8. Structured & Numeric Reasoning": 0.274537037037037,
      "T9. Version & Code Diff Analysis": 0.6567859123578721,
      "T10. Rule Induction & In-Context Learning": 0.5740277777777778,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334
    },
    "language": {
      "Chinese": 0.5274718845754229,
      "English": 0.5495219155766814
    }
  },
  "pass@3": 0.27466666666666667
}
results/GLM-4.5/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 2,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.5547937875815533,
  "inference_iteration_1_overall_metric": 0.5516398292433491,
  "inference_iteration_2_overall_metric": 0.5535867950098665,
  "inference_iteration_3_overall_metric": 0.559154738491441,
  "average_token_length_metric": {
    "8k": 0.6972820277601768,
    "16k": 0.6560112539595868,
    "32k": 0.6029656036576351,
    "64k": 0.5486294944947675,
    "128k": 0.4403706772307823,
    "256k": 0.38350366838636923
  },
  "average_contextual_requirement_metric": {
    "Full": 0.5245120995377551,
    "Partial": 0.593334117819114
  },
  "average_difficulty_metric": {
    "Easy": 0.7655351117837857,
    "Moderate": 0.5513197070461822,
    "Hard": 0.473832729215578,
    "Extreme": 0.37939478048385467
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.81334590378291,
    "T2. Sequencing & Structure Reconstruction": 0.7505160653892534,
    "T3. Evidence-Grounded QA": 0.5027777777777779,
    "T4. Summarization & Synthesis": 0.5448942860165678,
    "T5. Attribution & Citation Alignment": 0.5301683776855378,
    "T6. Aggregation & Clustering": 0.5045905245734419,
    "T7. Consistency & Compliance Checking": 0.3519808997100079,
    "T8. Structured & Numeric Reasoning": 0.6371913580246912,
    "T9. Version & Code Diff Analysis": 0.6439646495440424,
    "T10. Rule Induction & In-Context Learning": 0.5808333333333334,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.3277777777777777
  },
  "average_language_metric": {
    "Chinese": 0.5739046099726642,
    "English": 0.5356829651904418
  },
  "BoN-1": {
    "overall_metric": 0.5516398292433491,
    "token_length": {
      "8k": 0.7055429088605052,
      "16k": 0.6578481199386829,
      "32k": 0.5956492591185892,
      "64k": 0.5401522196952867,
      "128k": 0.42395107911700697,
      "256k": 0.386695388730029
    },
    "contextual_requirement": {
      "Full": 0.5177580971479406,
      "Partial": 0.5947620337284167
    },
    "difficulty": {
      "Easy": 0.752722499925911,
      "Moderate": 0.5341124686360985,
      "Hard": 0.4868060322041808,
      "Extreme": 0.3854592094497618
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.802009002313031,
      "T2. Sequencing & Structure Reconstruction": 0.7465347152847152,
      "T3. Evidence-Grounded QA": 0.5,
      "T4. Summarization & Synthesis": 0.545377281117479,
      "T5. Attribution & Citation Alignment": 0.5499549412575729,
      "T6. Aggregation & Clustering": 0.5019983393746833,
      "T7. Consistency & Compliance Checking": 0.36021434419736664,
      "T8. Structured & Numeric Reasoning": 0.6138888888888889,
      "T9. Version & Code Diff Analysis": 0.6552473446554453,
      "T10. Rule Induction & In-Context Learning": 0.5405555555555556,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667
    },
    "language": {
      "Chinese": 0.5760533640670983,
      "English": 0.5272262944196019
    }
  },
  "pass@1": 0.304,
  "BoN-2": {
    "overall_metric": 0.634985055308529,
    "token_length": {
      "8k": 0.7829957270581746,
      "16k": 0.720868655519701,
      "32k": 0.6955532292535292,
      "64k": 0.6191323026708053,
      "128k": 0.5371685233314611,
      "256k": 0.45419189401750604
    },
    "contextual_requirement": {
      "Full": 0.6004623267171497,
      "Partial": 0.67892307351574
    },
    "difficulty": {
      "Easy": 0.8494656930118193,
      "Moderate": 0.6521983830806934,
      "Hard": 0.561519594238419,
      "Extreme": 0.43697868974062276
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.8580534190067889,
      "T2. Sequencing & Structure Reconstruction": 0.8009605209605205,
      "T3. Evidence-Grounded QA": 0.6166666666666667,
      "T4. Summarization & Synthesis": 0.5623513196545108,
      "T5. Attribution & Citation Alignment": 0.6128841050494817,
      "T6. Aggregation & Clustering": 0.5858431630098295,
      "T7. Consistency & Compliance Checking": 0.42985731617797807,
      "T8. Structured & Numeric Reasoning": 0.7472222222222222,
      "T9. Version & Code Diff Analysis": 0.7035408856813825,
      "T10. Rule Induction & In-Context Learning": 0.6718055555555555,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.4666666666666667
    },
    "language": {
      "Chinese": 0.6487038966161633,
      "English": 0.6212662140008957
    }
  },
  "pass@2": 0.4,
  "BoN-3": {
    "overall_metric": 0.6753829492782434,
    "token_length": {
      "8k": 0.8185554078642306,
      "16k": 0.7599530890518512,
      "32k": 0.7311990883460826,
      "64k": 0.6754068771726057,
      "128k": 0.5730612976856176,
      "256k": 0.49412193554907724
    },
    "contextual_requirement": {
      "Full": 0.6412352384515387,
      "Partial": 0.718843672148596
    },
    "difficulty": {
      "Easy": 0.8844767087888782,
      "Moderate": 0.717837978263145,
      "Hard": 0.600883682680195,
      "Extreme": 0.4673774778829585
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.896714960990196,
      "T2. Sequencing & Structure Reconstruction": 0.8421179283679281,
      "T3. Evidence-Grounded QA": 0.675,
      "T4. Summarization & Synthesis": 0.5712469046961466,
      "T5. Attribution & Citation Alignment": 0.6300235179241888,
      "T6. Aggregation & Clustering": 0.6348800775177585,
      "T7. Consistency & Compliance Checking": 0.4736376213352886,
      "T8. Structured & Numeric Reasoning": 0.7824074074074073,
      "T9. Version & Code Diff Analysis": 0.7424625612755764,
      "T10. Rule Induction & In-Context Learning": 0.765,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334
    },
    "language": {
      "Chinese": 0.6895150466449506,
      "English": 0.6612508519115379
    }
  },
  "pass@3": 0.444
}
results/GLM-4.6/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 0,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.45854238430368943,
  "inference_iteration_1_overall_metric": 0.44890397156188516,
  "inference_iteration_2_overall_metric": 0.4676756901179884,
  "inference_iteration_3_overall_metric": 0.45904749123119587,
  "average_token_length_metric": {
    "8k": 0.539826456202734,
    "16k": 0.49883990878468565,
    "32k": 0.5226004279628154,
    "64k": 0.4617605114078172,
    "128k": 0.3868307627999842,
    "256k": 0.34139623866410224
  },
  "average_contextual_requirement_metric": {
    "Full": 0.43758451425043066,
    "Partial": 0.4852160370987465
  },
  "average_difficulty_metric": {
    "Easy": 0.5881815896217503,
    "Moderate": 0.40045739164324096,
    "Hard": 0.43071501653405486,
    "Extreme": 0.3729573279423014
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.7391294273870368,
    "T2. Sequencing & Structure Reconstruction": 0.7168876379566528,
    "T3. Evidence-Grounded QA": 0.48333333333333334,
    "T4. Summarization & Synthesis": 0.54430615481068,
    "T5. Attribution & Citation Alignment": 0.5036524549754317,
    "T6. Aggregation & Clustering": 0.4230515938862694,
    "T7. Consistency & Compliance Checking": 0.28024190494681,
    "T8. Structured & Numeric Reasoning": 0.204783950617284,
    "T9. Version & Code Diff Analysis": 0.5493453618981832,
    "T10. Rule Induction & In-Context Learning": 0.48856481481481484,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.3444444444444445
  },
  "average_language_metric": {
    "Chinese": 0.4607293656684521,
    "English": 0.45635540293892746
  },
  "BoN-1": {
    "overall_metric": 0.44890397156188516,
    "token_length": {
      "8k": 0.5369079824065298,
      "16k": 0.4950184758232463,
      "32k": 0.5119378052749619,
      "64k": 0.4359616248204175,
      "128k": 0.3900490912931948,
      "256k": 0.323548849752962
    },
    "contextual_requirement": {
      "Full": 0.42475510461227917,
      "Partial": 0.47963889313411256
    },
    "difficulty": {
      "Easy": 0.5839417584803529,
      "Moderate": 0.38543423470966093,
      "Hard": 0.4251833017579716,
      "Extreme": 0.3582444584458006
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.7552782575667283,
      "T2. Sequencing & Structure Reconstruction": 0.72015059015059,
      "T3. Evidence-Grounded QA": 0.425,
      "T4. Summarization & Synthesis": 0.5415035414147457,
      "T5. Attribution & Citation Alignment": 0.49621823831328055,
      "T6. Aggregation & Clustering": 0.41038094275648807,
      "T7. Consistency & Compliance Checking": 0.27110571290315905,
      "T8. Structured & Numeric Reasoning": 0.1953703703703704,
      "T9. Version & Code Diff Analysis": 0.5256412558109732,
      "T10. Rule Induction & In-Context Learning": 0.4822222222222222,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.35
    },
    "language": {
      "Chinese": 0.44671558878002343,
      "English": 0.45109235434374795
    }
  },
  "pass@1": 0.19266666666666668,
  "BoN-2": {
    "overall_metric": 0.5266630332287439,
    "token_length": {
      "8k": 0.604236555945032,
      "16k": 0.5564543546519637,
      "32k": 0.5870438443448399,
      "64k": 0.5387565602861155,
      "128k": 0.46692762105066604,
      "256k": 0.4065592630938467
    },
    "contextual_requirement": {
      "Full": 0.5015019335956029,
      "Partial": 0.5586862509436522
    },
    "difficulty": {
      "Easy": 0.6773656505276844,
      "Moderate": 0.47328676466113107,
      "Hard": 0.49837787163941455,
      "Extreme": 0.4152118781770787
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.7867939033692525,
      "T2. Sequencing & Structure Reconstruction": 0.7854105213890441,
      "T3. Evidence-Grounded QA": 0.5666666666666667,
      "T4. Summarization & Synthesis": 0.5572174880251567,
      "T5. Attribution & Citation Alignment": 0.573956288098141,
      "T6. Aggregation & Clustering": 0.4915724205045227,
      "T7. Consistency & Compliance Checking": 0.33460325146257974,
      "T8. Structured & Numeric Reasoning": 0.2578703703703704,
      "T9. Version & Code Diff Analysis": 0.6423128731937172,
      "T10. Rule Induction & In-Context Learning": 0.6198611111111111,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.425
    },
    "language": {
      "Chinese": 0.518790207475404,
      "English": 0.5345358589820849
    }
  },
  "pass@2": 0.25466666666666665,
  "BoN-3": {
    "overall_metric": 0.5609610306922399,
    "token_length": {
      "8k": 0.6386076789983793,
      "16k": 0.5860184400135818,
      "32k": 0.6210153091497477,
      "64k": 0.5871225156541778,
      "128k": 0.4913536643924111,
      "256k": 0.4416485759451446
    },
    "contextual_requirement": {
      "Full": 0.5370910562726694,
      "Partial": 0.5913409981353319
    },
    "difficulty": {
      "Easy": 0.7100406216614994,
      "Moderate": 0.5174437099567866,
      "Hard": 0.5265960486680222,
      "Extreme": 0.44880562762488274
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.815117939706351,
      "T2. Sequencing & Structure Reconstruction": 0.8064498061783285,
      "T3. Evidence-Grounded QA": 0.6416666666666667,
      "T4. Summarization & Synthesis": 0.5680717589841464,
      "T5. Attribution & Citation Alignment": 0.604565984118908,
      "T6. Aggregation & Clustering": 0.5283992168363204,
      "T7. Consistency & Compliance Checking": 0.35872230397543203,
      "T8. Structured & Numeric Reasoning": 0.28935185185185186,
      "T9. Version & Code Diff Analysis": 0.6707362245587523,
      "T10. Rule Induction & In-Context Learning": 0.6406944444444443,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.5
    },
    "language": {
      "Chinese": 0.5609588769304155,
      "English": 0.5609631844540666
    }
  },
  "pass@3": 0.286
}
results/GLM-4.6/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 0,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.5820993757625644,
  "inference_iteration_1_overall_metric": 0.5900318347862288,
  "inference_iteration_2_overall_metric": 0.5774825139114689,
  "inference_iteration_3_overall_metric": 0.5787837785899949,
  "average_token_length_metric": {
    "8k": 0.7122784818137915,
    "16k": 0.6603518496747058,
    "32k": 0.6352743108645184,
    "64k": 0.5897286272690893,
    "128k": 0.475467875017661,
    "256k": 0.4194951099356217
  },
  "average_contextual_requirement_metric": {
    "Full": 0.5470143190278319,
    "Partial": 0.6267530843340428
  },
  "average_difficulty_metric": {
    "Easy": 0.7978473417092227,
    "Moderate": 0.6094768922677877,
    "Hard": 0.4892370620605133,
    "Extreme": 0.3887688912252786
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.8197327977970514,
    "T2. Sequencing & Structure Reconstruction": 0.8006519321293782,
    "T3. Evidence-Grounded QA": 0.538888888888889,
    "T4. Summarization & Synthesis": 0.5408566771607968,
    "T5. Attribution & Citation Alignment": 0.5337112988588841,
    "T6. Aggregation & Clustering": 0.5397680321862239,
    "T7. Consistency & Compliance Checking": 0.380513781495624,
    "T8. Structured & Numeric Reasoning": 0.6123456790123456,
    "T9. Version & Code Diff Analysis": 0.6754501038965057,
    "T10. Rule Induction & In-Context Learning": 0.6013425925925924,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.46666666666666673
  },
  "average_language_metric": {
    "Chinese": 0.5991918774535788,
    "English": 0.5650068740715505
  },
  "BoN-1": {
    "overall_metric": 0.5900318347862288,
    "token_length": {
      "8k": 0.7190910013269595,
      "16k": 0.6680291983169964,
      "32k": 0.6447298296516131,
      "64k": 0.5905857251798682,
      "128k": 0.4766512837488421,
      "256k": 0.4411039704930917
    },
    "contextual_requirement": {
      "Full": 0.5455741223917344,
      "Partial": 0.6466143778337665
    },
    "difficulty": {
      "Easy": 0.8187580590040169,
      "Moderate": 0.6184177733393077,
      "Hard": 0.4890540077544932,
      "Extreme": 0.38715232500749663
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.8264146064969754,
      "T2. Sequencing & Structure Reconstruction": 0.8039418272654707,
      "T3. Evidence-Grounded QA": 0.55,
      "T4. Summarization & Synthesis": 0.5427878753470068,
      "T5. Attribution & Citation Alignment": 0.5419657251498339,
      "T6. Aggregation & Clustering": 0.5399584229450126,
      "T7. Consistency & Compliance Checking": 0.37011974365960826,
      "T8. Structured & Numeric Reasoning": 0.6416666666666667,
      "T9. Version & Code Diff Analysis": 0.693087317328304,
      "T10. Rule Induction & In-Context Learning": 0.6145833333333334,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.475
    },
    "language": {
      "Chinese": 0.6075617792305364,
      "English": 0.5725018903419208
    }
  },
  "pass@1": 0.36466666666666664,
  "BoN-2": {
    "overall_metric": 0.6659160211594685,
    "token_length": {
      "8k": 0.7935291454736249,
      "16k": 0.7469698033059613,
      "32k": 0.7147019641303554,
      "64k": 0.6716314717341791,
      "128k": 0.5707409059251077,
      "256k": 0.497922836387586
    },
    "contextual_requirement": {
      "Full": 0.6309312423015924,
      "Partial": 0.7104421033422219
    },
    "difficulty": {
      "Easy": 0.8923409007483016,
      "Moderate": 0.7327459154582487,
      "Hard": 0.5632013922542815,
      "Extreme": 0.4414476037490934
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.880882952904511,
      "T2. Sequencing & Structure Reconstruction": 0.8476685999185997,
      "T3. Evidence-Grounded QA": 0.6666666666666666,
      "T4. Summarization & Synthesis": 0.5583277247093672,
      "T5. Attribution & Citation Alignment": 0.6177785331058078,
      "T6. Aggregation & Clustering": 0.6268647075743846,
      "T7. Consistency & Compliance Checking": 0.4566279411326888,
      "T8. Structured & Numeric Reasoning": 0.7217592592592593,
      "T9. Version & Code Diff Analysis": 0.7464145919055787,
      "T10. Rule Induction & In-Context Learning": 0.7400000000000001,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.5583333333333333
    },
    "language": {
      "Chinese": 0.6881369643762367,
      "English": 0.6436950779427024
    }
  },
  "pass@2": 0.43866666666666665,
  "BoN-3": {
    "overall_metric": 0.706844154531168,
    "token_length": {
      "8k": 0.8222844269368486,
      "16k": 0.7772591556882481,
      "32k": 0.7466099276083229,
      "64k": 0.7234900151981559,
      "128k": 0.62014724878678,
      "256k": 0.551274152968658
    },
    "contextual_requirement": {
      "Full": 0.6725119993190923,
      "Partial": 0.7505396248010854
    },
    "difficulty": {
      "Easy": 0.9285919042851156,
      "Moderate": 0.7918869916930842,
      "Hard": 0.6026672046571206,
      "Extreme": 0.4764972072411785
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.9055704999975606,
      "T2. Sequencing & Structure Reconstruction": 0.8679013780100734,
      "T3. Evidence-Grounded QA": 0.7416666666666667,
      "T4. Summarization & Synthesis": 0.5683880603721403,
      "T5. Attribution & Citation Alignment": 0.6545164467516165,
      "T6. Aggregation & Clustering": 0.6671955091257745,
      "T7. Consistency & Compliance Checking": 0.5102082868832639,
      "T8. Structured & Numeric Reasoning": 0.7717592592592593,
      "T9. Version & Code Diff Analysis": 0.7612642969391072,
      "T10. Rule Induction & In-Context Learning": 0.7541666666666667,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.6583333333333333
    },
    "language": {
      "Chinese": 0.7225009620618484,
      "English": 0.6911873470004898
    }
  },
  "pass@3": 0.48133333333333334
}
results/GPT-4o/nonthinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.46665010092977977,
    "inference_iteration_1_overall_metric": 0.4658401067882854,
    "inference_iteration_2_overall_metric": 0.46753496394327626,
    "inference_iteration_3_overall_metric": 0.466575232057776,
    "average_token_length_metric": {"8k": 0.5113488376851383, "16k": 0.4997009141224516, "32k": 0.5251055066966325, "64k": 0.45692433752384126, "128k": 0.4357776587958875, "256k": 0.37104335075472816},
    "average_contextual_requirement_metric": {"Full": 0.4340554247509889, "Partial": 0.5081342342482407},
    "average_difficulty_metric": {"Easy": 0.5937760942513476, "Moderate": 0.4302951347009006, "Hard": 0.4487990540053617, "Extreme": 0.3629928487032055},
    "average_primary_task_metric": {"T1. Retrieval & Ranking": 0.7349022081832958, "T2. Sequencing & Structure Reconstruction": 0.7250800279966945, "T3. Evidence-Grounded QA": 0.522222222222222, "T4. Summarization & Synthesis": 0.5082664460738612, "T5. Attribution & Citation Alignment": 0.5342878338439898, "T6. Aggregation & Clustering": 0.4265212400920285, "T7. Consistency & Compliance Checking": 0.27964395302062434, "T8. Structured & Numeric Reasoning": 0.21157407407407405, "T9. Version & Code Diff Analysis": 0.5920364002998717, "T10. Rule Induction & In-Context Learning": 0.4702777777777778, "T11. Dialogue Memory & Long-Horizon Tracking": 0.36944444444444435},
    "average_language_metric": {"Chinese": 0.4566116864733059, "English": 0.47668851538625334},
    "BoN-1": {
        "overall_metric": 0.4658401067882854,
        "token_length": {"8k": 0.49942481947489, "16k": 0.5075293232399243, "32k": 0.535471182543319, "64k": 0.4466006941173288, "128k": 0.43362259025283534, "256k": 0.37239203110142105},
        "contextual_requirement": {"Full": 0.434998076864218, "Partial": 0.5050935994189195},
        "difficulty": {"Easy": 0.5852393493724929, "Moderate": 0.43887833780663893, "Hard": 0.4558206214636842, "Extreme": 0.3593336239903742},
        "primary_task": {"T1. Retrieval & Ranking": 0.7408543245056541, "T2. Sequencing & Structure Reconstruction": 0.7499819162319159, "T3. Evidence-Grounded QA": 0.5166666666666667, "T4. Summarization & Synthesis": 0.5048929077004661, "T5. Attribution & Citation Alignment": 0.5359141447270879, "T6. Aggregation & Clustering": 0.42758922871826105, "T7. Consistency & Compliance Checking": 0.27436727289246843, "T8. Structured & Numeric Reasoning": 0.2152777777777778, "T9. Version & Code Diff Analysis": 0.592312178161249, "T10. Rule Induction & In-Context Learning": 0.44819444444444445, "T11. Dialogue Memory & Long-Horizon Tracking": 0.35833333333333334},
        "language": {"Chinese": 0.4424240788239187, "English": 0.48925613475265434}
    },
    "pass@1": 0.194,
    "BoN-2": {
        "overall_metric": 0.5398849816112041,
        "token_length": {"8k": 0.5759687409807404, "16k": 0.549906030476181, "32k": 0.6066920241775022, "64k": 0.5317817512096467, "128k": 0.5214872601681293, "256k": 0.4534740826550301},
        "contextual_requirement": {"Full": 0.5083509910328655, "Partial": 0.5800191514381833},
        "difficulty": {"Easy": 0.6782240284299118, "Moderate": 0.5120970500900949, "Hard": 0.5256120585356611, "Extreme": 0.41596717800169664},
        "primary_task": {"T1. Retrieval & Ranking": 0.7993838319887435, "T2. Sequencing & Structure Reconstruction": 0.7934912309912311, "T3. Evidence-Grounded QA": 0.6666666666666666, "T4. Summarization & Synthesis": 0.5276777173176291, "T5. Attribution & Citation Alignment": 0.6042284709538017, "T6. Aggregation & Clustering": 0.4920141039764696, "T7. Consistency & Compliance Checking": 0.34899250013259275, "T8. Structured & Numeric Reasoning": 0.26296296296296295, "T9. Version & Code Diff Analysis": 0.6671322238361709, "T10. Rule Induction & In-Context Learning": 0.5840277777777777, "T11. Dialogue Memory & Long-Horizon Tracking": 0.45},
        "language": {"Chinese": 0.525731927704756, "English": 0.5540380355176548}
    },
    "pass@2": 0.25466666666666665,
    "BoN-3": {
        "overall_metric": 0.5744593750081309,
        "token_length": {"8k": 0.606512147802086, "16k": 0.5958370007923852, "32k": 0.6384127504911996, "64k": 0.5707759455131253, "128k": 0.5568976109117445, "256k": 0.4783207945382497},
        "contextual_requirement": {"Full": 0.5391592397427731, "Partial": 0.6193868198913163},
        "difficulty": {"Easy": 0.7196856800163218, "Moderate": 0.540664277732673, "Hard": 0.5601282438717835, "Extreme": 0.44698074091055134},
        "primary_task": {"T1. Retrieval & Ranking": 0.8103075764869724, "T2. Sequencing & Structure Reconstruction": 0.8166357716357717, "T3. Evidence-Grounded QA": 0.7, "T4. Summarization & Synthesis": 0.5377648665242384, "T5. Attribution & Citation Alignment": 0.6423496422537093, "T6. Aggregation & Clustering": 0.532547329213996, "T7. Consistency & Compliance Checking": 0.40458496685251083, "T8. Structured & Numeric Reasoning": 0.30046296296296293, "T9. Version & Code Diff Analysis": 0.6915969977123051, "T10. Rule Induction & In-Context Learning": 0.6340277777777777, "T11. Dialogue Memory & Long-Horizon Tracking": 0.49166666666666664},
        "language": {"Chinese": 0.563099563818372, "English": 0.5858191861978923}
    },
    "pass@3": 0.286
}
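Every `*_summary.json` added in this commit shares the schema shown above: run metadata, an `average_overall_metric`, breakdowns by token length, contextual requirement, difficulty, primary task, and language, plus `BoN-1/2/3` blocks and `pass@k` values. As a rough illustration of how these files can be consumed, here is a minimal sketch that only assumes the directory layout visible in this diff; the `load_summaries` helper is hypothetical and is not the code in `app.py`.

```python
# Minimal sketch (not the actual app.py logic): collect every
# results/<Model>/<setting>_summary.json and build a flat leaderboard table.
# Assumption: each file contains the keys shown in the diff above.
import json
from pathlib import Path


def load_summaries(results_dir: str = "results"):
    rows = []
    for path in sorted(Path(results_dir).glob("*/*_summary.json")):
        with path.open(encoding="utf-8") as f:
            summary = json.load(f)
        # File names look like "thinking_context-272000_bon-3_summary.json",
        # so everything before "_context-" is the thinking/nonthinking mode.
        mode = path.stem.split("_context-")[0]
        rows.append(
            {
                "model": path.parent.name,
                "mode": mode,
                "overall": summary["average_overall_metric"],
                "pass@1": summary.get("pass@1"),
                "pass@3": summary.get("pass@3"),
                "bon3_overall": summary.get("BoN-3", {}).get("overall_metric"),
            }
        )
    # Rank by the averaged overall metric, best first.
    return sorted(rows, key=lambda r: r["overall"], reverse=True)


if __name__ == "__main__":
    for row in load_summaries():
        print(f'{row["model"]:<28} {row["mode"]:<12} {row["overall"]:.4f}')
```

Keying the ranking on `average_overall_metric` keeps the table construction independent of which models, modes, or context lengths happen to be present in `results/`.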
results/GPT-4o/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.4943586900400841,
    "inference_iteration_1_overall_metric": 0.4968481802669354,
    "inference_iteration_2_overall_metric": 0.4953906178052376,
    "inference_iteration_3_overall_metric": 0.49083727204807814,
    "average_token_length_metric": {"8k": 0.5879811335998006, "16k": 0.5326400416753286, "32k": 0.512948102728002, "64k": 0.4721690409999518, "128k": 0.44658724759711643, "256k": 0.41382657364030534},
    "average_contextual_requirement_metric": {"Full": 0.46488316610562114, "Partial": 0.5318729932294004},
    "average_difficulty_metric": {"Easy": 0.7183848834775743, "Moderate": 0.43069679620968054, "Hard": 0.41352044386464876, "Extreme": 0.34385849736921437},
    "average_primary_task_metric": {"T1. Retrieval & Ranking": 0.7592784040430812, "T2. Sequencing & Structure Reconstruction": 0.6992785706820794, "T3. Evidence-Grounded QA": 0.41944444444444456, "T4. Summarization & Synthesis": 0.4904128501144795, "T5. Attribution & Citation Alignment": 0.5555445495468067, "T6. Aggregation & Clustering": 0.46463843332938043, "T7. Consistency & Compliance Checking": 0.26892122251820344, "T8. Structured & Numeric Reasoning": 0.4810185185185185, "T9. Version & Code Diff Analysis": 0.5265279154913766, "T10. Rule Induction & In-Context Learning": 0.48212962962962963, "T11. Dialogue Memory & Long-Horizon Tracking": 0.4250000000000001},
    "average_language_metric": {"Chinese": 0.46257560421163496, "English": 0.526141775868533},
    "BoN-1": {
        "overall_metric": 0.4968481802669354,
        "token_length": {"8k": 0.5741867935281576, "16k": 0.535374756046493, "32k": 0.5202544808789716, "64k": 0.47647427072064424, "128k": 0.4412042204935842, "256k": 0.4335945599337654},
        "contextual_requirement": {"Full": 0.47247358832994213, "Partial": 0.5278703881867464},
        "difficulty": {"Easy": 0.7337781908034996, "Moderate": 0.4336235016656763, "Hard": 0.39945319018108144, "Extreme": 0.3428000420213732},
        "primary_task": {"T1. Retrieval & Ranking": 0.75748852621513, "T2. Sequencing & Structure Reconstruction": 0.7128378857984122, "T3. Evidence-Grounded QA": 0.4166666666666667, "T4. Summarization & Synthesis": 0.4915611304736425, "T5. Attribution & Citation Alignment": 0.5573645055278006, "T6. Aggregation & Clustering": 0.4608094132930736, "T7. Consistency & Compliance Checking": 0.2337605934000851, "T8. Structured & Numeric Reasoning": 0.5046296296296297, "T9. Version & Code Diff Analysis": 0.5617174175041987, "T10. Rule Induction & In-Context Learning": 0.5141666666666667, "T11. Dialogue Memory & Long-Horizon Tracking": 0.4},
        "language": {"Chinese": 0.46378379039245493, "English": 0.5299125701414172}
    },
    "pass@1": 0.25266666666666665,
    "BoN-2": {
        "overall_metric": 0.5733973874130963,
        "token_length": {"8k": 0.6607408168420358, "16k": 0.6168176180801352, "32k": 0.6026001586229682, "64k": 0.5549119793217003, "128k": 0.5219672618111347, "256k": 0.4833464898006067},
        "contextual_requirement": {"Full": 0.5468729804164447, "Partial": 0.6071557235906547},
        "difficulty": {"Easy": 0.832354149230665, "Moderate": 0.5035087598472016, "Hard": 0.482911736547395, "Extreme": 0.39505876757369657},
        "primary_task": {"T1. Retrieval & Ranking": 0.8194314757440243, "T2. Sequencing & Structure Reconstruction": 0.7647144522144521, "T3. Evidence-Grounded QA": 0.5166666666666667, "T4. Summarization & Synthesis": 0.507756894787762, "T5. Attribution & Citation Alignment": 0.6431090618973574, "T6. Aggregation & Clustering": 0.5325685690744169, "T7. Consistency & Compliance Checking": 0.31928948521783374, "T8. Structured & Numeric Reasoning": 0.5949074074074073, "T9. Version & Code Diff Analysis": 0.5981405988039616, "T10. Rule Induction & In-Context Learning": 0.6058333333333333, "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666},
        "language": {"Chinese": 0.5418805636874472, "English": 0.6049142111387469}
    },
    "pass@2": 0.33,
    "BoN-3": {
        "overall_metric": 0.6133026799603023,
        "token_length": {"8k": 0.7092280973251848, "16k": 0.6573848999983455, "32k": 0.6374049433271965, "64k": 0.5884360272634136, "128k": 0.5736764233126421, "256k": 0.513685688535035},
        "contextual_requirement": {"Full": 0.585559935676702, "Partial": 0.6486116272303409},
        "difficulty": {"Easy": 0.8579938866329095, "Moderate": 0.5531379960724733, "Hard": 0.5375404413799146, "Extreme": 0.4345338594537549},
        "primary_task": {"T1. Retrieval & Ranking": 0.8365910773415853, "T2. Sequencing & Structure Reconstruction": 0.7977771302771303, "T3. Evidence-Grounded QA": 0.5916666666666667, "T4. Summarization & Synthesis": 0.5159806774505266, "T5. Attribution & Citation Alignment": 0.6935011362804332, "T6. Aggregation & Clustering": 0.5705653708431486, "T7. Consistency & Compliance Checking": 0.381137305153889, "T8. Structured & Numeric Reasoning": 0.6296296296296297, "T9. Version & Code Diff Analysis": 0.6584905752696673, "T10. Rule Induction & In-Context Learning": 0.6336111111111112, "T11. Dialogue Memory & Long-Horizon Tracking": 0.5666666666666667},
        "language": {"Chinese": 0.5780672262010988, "English": 0.6485381337195074}
    },
    "pass@3": 0.36666666666666664
}
results/GPT-5/thinking_context-272000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.726053089253122,
    "inference_iteration_1_overall_metric": 0.7242860759291603,
    "inference_iteration_2_overall_metric": 0.72436075729001,
    "inference_iteration_3_overall_metric": 0.729512434540192,
    "average_token_length_metric": {"8k": 0.7537078410340138, "16k": 0.7627066310839429, "32k": 0.7434290864816196, "64k": 0.7646193918174649, "128k": 0.6936202889645278, "256k": 0.638235296137159},
    "average_contextual_requirement_metric": {"Full": 0.6915568234658586, "Partial": 0.7699574275278195},
    "average_difficulty_metric": {"Easy": 0.8523326045847652, "Moderate": 0.8231088494697211, "Hard": 0.787367547123676, "Extreme": 0.4836991814871219},
    "average_primary_task_metric": {"T1. Retrieval & Ranking": 0.9032376385150938, "T2. Sequencing & Structure Reconstruction": 0.9075063054229715, "T3. Evidence-Grounded QA": 0.6666666666666666, "T4. Summarization & Synthesis": 0.5256066584699448, "T5. Attribution & Citation Alignment": 0.8116994715897818, "T6. Aggregation & Clustering": 0.6716265654111317, "T7. Consistency & Compliance Checking": 0.631179283519898, "T8. Structured & Numeric Reasoning": 0.7979938271604939, "T9. Version & Code Diff Analysis": 0.818404768269679, "T10. Rule Induction & In-Context Learning": 0.6802314814814814, "T11. Dialogue Memory & Long-Horizon Tracking": 0.6111111111111112},
    "average_language_metric": {"Chinese": 0.7196645097291159, "English": 0.7324416687771269},
    "BoN-1": {
        "overall_metric": 0.7242860759291603,
        "token_length": {"8k": 0.7638228227994025, "16k": 0.7511485364018967, "32k": 0.7397315002658593, "64k": 0.7648062624572959, "128k": 0.6947065191324134, "256k": 0.6315008145180959},
        "contextual_requirement": {"Full": 0.6845638507619599, "Partial": 0.7748416352328712},
        "difficulty": {"Easy": 0.8419121655420269, "Moderate": 0.8140896757444649, "Hard": 0.8018107002313927, "Extreme": 0.4855278214669571},
        "primary_task": {"T1. Retrieval & Ranking": 0.9022908711992736, "T2. Sequencing & Structure Reconstruction": 0.9003492803492802, "T3. Evidence-Grounded QA": 0.6666666666666666, "T4. Summarization & Synthesis": 0.525285592483348, "T5. Attribution & Citation Alignment": 0.8350389199886978, "T6. Aggregation & Clustering": 0.6728116198035761, "T7. Consistency & Compliance Checking": 0.6250527729039961, "T8. Structured & Numeric Reasoning": 0.7824074074074074, "T9. Version & Code Diff Analysis": 0.8228424738103258, "T10. Rule Induction & In-Context Learning": 0.6890277777777778, "T11. Dialogue Memory & Long-Horizon Tracking": 0.5916666666666667},
        "language": {"Chinese": 0.7225808137285838, "English": 0.725991338129738}
    },
    "pass@1": 0.5033333333333333,
    "BoN-2": {
        "overall_metric": 0.773365567880672,
        "token_length": {"8k": 0.7988567725267066, "16k": 0.7953552672252621, "32k": 0.7853032014648265, "64k": 0.8171591510524335, "128k": 0.7387615265550217, "256k": 0.7047574884597809},
        "contextual_requirement": {"Full": 0.740254405395005, "Partial": 0.8155070474078848},
        "difficulty": {"Easy": 0.8943471956938479, "Moderate": 0.8694949682853881, "Hard": 0.8603608124174508, "Extreme": 0.5205560974396651},
        "primary_task": {"T1. Retrieval & Ranking": 0.9235081407527015, "T2. Sequencing & Structure Reconstruction": 0.9270526695526693, "T3. Evidence-Grounded QA": 0.7583333333333333, "T4. Summarization & Synthesis": 0.5388141391367185, "T5. Attribution & Citation Alignment": 0.8662194687189113, "T6. Aggregation & Clustering": 0.724952326567939, "T7. Consistency & Compliance Checking": 0.6769275451403334, "T8. Structured & Numeric Reasoning": 0.837962962962963, "T9. Version & Code Diff Analysis": 0.8518498172294341, "T10. Rule Induction & In-Context Learning": 0.749861111111111, "T11. Dialogue Memory & Long-Horizon Tracking": 0.6916666666666667},
        "language": {"Chinese": 0.7653921632804664, "English": 0.7813389724808776}
    },
    "pass@2": 0.5773333333333334,
    "BoN-3": {
        "overall_metric": 0.7997603117800453,
        "token_length": {"8k": 0.8156058789899132, "16k": 0.8312258319915683, "32k": 0.8146647150412942, "64k": 0.8402343004850696, "128k": 0.7648319163907665, "256k": 0.7319992277816549},
        "contextual_requirement": {"Full": 0.7681553658992594, "Partial": 0.8399847883555894},
        "difficulty": {"Easy": 0.9168344057692764, "Moderate": 0.9117105202934518, "Hard": 0.8867394849893248, "Extreme": 0.5408505285512573},
        "primary_task": {"T1. Retrieval & Ranking": 0.9362853987173203, "T2. Sequencing & Structure Reconstruction": 0.9359547859547858, "T3. Evidence-Grounded QA": 0.7916666666666666, "T4. Summarization & Synthesis": 0.5458038576746401, "T5. Attribution & Citation Alignment": 0.8823540286034711, "T6. Aggregation & Clustering": 0.7446436845926303, "T7. Consistency & Compliance Checking": 0.6987021524631377, "T8. Structured & Numeric Reasoning": 0.8824074074074073, "T9. Version & Code Diff Analysis": 0.8622815151611319, "T10. Rule Induction & In-Context Learning": 0.8040277777777778, "T11. Dialogue Memory & Long-Horizon Tracking": 0.75},
        "language": {"Chinese": 0.7871986587353806, "English": 0.8123219648247083}
    },
    "pass@3": 0.6106666666666667
}
results/GPT-OSS-120B/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.5260760130553013,
    "inference_iteration_1_overall_metric": 0.5251990705311491,
    "inference_iteration_2_overall_metric": 0.5187040802401437,
    "inference_iteration_3_overall_metric": 0.5343248883946079,
    "average_token_length_metric": {"8k": 0.6379995894817992, "16k": 0.6200629617253591, "32k": 0.5668769322787303, "64k": 0.5173492904735919, "128k": 0.4362186866504548, "256k": 0.3779486177218682},
    "average_contextual_requirement_metric": {"Full": 0.49116755127749995, "Partial": 0.5705049644088642},
    "average_difficulty_metric": {"Easy": 0.7406025674065024, "Moderate": 0.506610347347898, "Hard": 0.44966953179643426, "Extreme": 0.3540424932279647},
    "average_primary_task_metric": {"T1. Retrieval & Ranking": 0.7441033370277687, "T2. Sequencing & Structure Reconstruction": 0.7329905896572565, "T3. Evidence-Grounded QA": 0.5333333333333334, "T4. Summarization & Synthesis": 0.5106082800845382, "T5. Attribution & Citation Alignment": 0.46625824816375694, "T6. Aggregation & Clustering": 0.5279484217060981, "T7. Consistency & Compliance Checking": 0.31563292534840204, "T8. Structured & Numeric Reasoning": 0.5515432098765432, "T9. Version & Code Diff Analysis": 0.5119880580465573, "T10. Rule Induction & In-Context Learning": 0.5589814814814815, "T11. Dialogue Memory & Long-Horizon Tracking": 0.425},
    "average_language_metric": {"Chinese": 0.505405756924764, "English": 0.5467462691858365},
    "BoN-1": {
        "overall_metric": 0.5251990705311491,
        "token_length": {"8k": 0.6529081123251393, "16k": 0.6200667957335821, "32k": 0.5763521514454887, "64k": 0.49832867440843903, "128k": 0.4350202675435077, "256k": 0.36851842173074006},
        "contextual_requirement": {"Full": 0.48686804900920944, "Partial": 0.5739840070136185},
        "difficulty": {"Easy": 0.7411278216421984, "Moderate": 0.5064062158524808, "Hard": 0.4567280838491506, "Extreme": 0.34597541625321154},
        "primary_task": {"T1. Retrieval & Ranking": 0.737237169976606, "T2. Sequencing & Structure Reconstruction": 0.7325725663225663, "T3. Evidence-Grounded QA": 0.5166666666666667, "T4. Summarization & Synthesis": 0.5064758252528795, "T5. Attribution & Citation Alignment": 0.4582161191244139, "T6. Aggregation & Clustering": 0.5344712887432478, "T7. Consistency & Compliance Checking": 0.3124319752506027, "T8. Structured & Numeric Reasoning": 0.5666666666666667, "T9. Version & Code Diff Analysis": 0.5286040271943487, "T10. Rule Induction & In-Context Learning": 0.5565277777777777, "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333},
        "language": {"Chinese": 0.5156423632056797, "English": 0.534755777856619}
    },
    "pass@1": 0.2833333333333333,
    "BoN-2": {
        "overall_metric": 0.6024165651661463,
        "token_length": {"8k": 0.7164315027505042, "16k": 0.700257643310589, "32k": 0.6507634459310141, "64k": 0.5868187511846459, "128k": 0.5172068676740627, "256k": 0.4430211801460688},
        "contextual_requirement": {"Full": 0.5670686758499749, "Partial": 0.6474047879321861},
        "difficulty": {"Easy": 0.8314482789657763, "Moderate": 0.5941446742173757, "Hard": 0.5272666327730237, "Extreme": 0.4063157035624808},
        "primary_task": {"T1. Retrieval & Ranking": 0.8187672440796736, "T2. Sequencing & Structure Reconstruction": 0.7801595626595622, "T3. Evidence-Grounded QA": 0.6583333333333333, "T4. Summarization & Synthesis": 0.5244604747568062, "T5. Attribution & Citation Alignment": 0.5425411826774184, "T6. Aggregation & Clustering": 0.5998244613513222, "T7. Consistency & Compliance Checking": 0.38919737648291697, "T8. Structured & Numeric Reasoning": 0.6231481481481482, "T9. Version & Code Diff Analysis": 0.5981069547631331, "T10. Rule Induction & In-Context Learning": 0.6645833333333333, "T11. Dialogue Memory & Long-Horizon Tracking": 0.525},
        "language": {"Chinese": 0.5840400269819745, "English": 0.620793103350321}
    },
    "pass@2": 0.3506666666666667,
    "BoN-3": {
        "overall_metric": 0.6337631142743206,
        "token_length": {"8k": 0.7401560588202145, "16k": 0.7330726977884732, "32k": 0.6780170211387931, "64k": 0.6366671272752144, "128k": 0.543143313800391, "256k": 0.47152246682284277},
        "contextual_requirement": {"Full": 0.5976534402407667, "Partial": 0.6797208812261188},
        "difficulty": {"Easy": 0.858812640816625, "Moderate": 0.6436207584661483, "Hard": 0.5564637457814393, "Extreme": 0.43152853820526416},
        "primary_task": {"T1. Retrieval & Ranking": 0.841381085118233, "T2. Sequencing & Structure Reconstruction": 0.8196503034003031, "T3. Evidence-Grounded QA": 0.6916666666666667, "T4. Summarization & Synthesis": 0.5298447017215417, "T5. Attribution & Citation Alignment": 0.5830791662518957, "T6. Aggregation & Clustering": 0.6214286606830465, "T7. Consistency & Compliance Checking": 0.41851724869569523, "T8. Structured & Numeric Reasoning": 0.6564814814814816, "T9. Version & Code Diff Analysis": 0.626359252313376, "T10. Rule Induction & In-Context Learning": 0.7104166666666667, "T11. Dialogue Memory & Long-Horizon Tracking": 0.575},
        "language": {"Chinese": 0.6166297449202884, "English": 0.6508964836283548}
    },
    "pass@3": 0.382
}
results/GPT-OSS-20B/thinking_context-120000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.4466309565832364,
    "inference_iteration_1_overall_metric": 0.4454625807266656,
    "inference_iteration_2_overall_metric": 0.45246537487177085,
    "inference_iteration_3_overall_metric": 0.44196491415127315,
    "average_token_length_metric": {"8k": 0.5748339290561163, "16k": 0.520513959710621, "32k": 0.4891012266007553, "64k": 0.41584677147603494, "128k": 0.358630149540046, "256k": 0.3208597031158458},
    "average_contextual_requirement_metric": {"Full": 0.415365323316177, "Partial": 0.4864235807413135},
    "average_difficulty_metric": {"Easy": 0.650502297368124, "Moderate": 0.39329906469313236, "Hard": 0.35893228463928295, "Extreme": 0.3159306081507978},
    "average_primary_task_metric": {"T1. Retrieval & Ranking": 0.7015484192056548, "T2. Sequencing & Structure Reconstruction": 0.685767192683859, "T3. Evidence-Grounded QA": 0.45555555555555555, "T4. Summarization & Synthesis": 0.4908914699997827, "T5. Attribution & Citation Alignment": 0.36677196742848295, "T6. Aggregation & Clustering": 0.4730052458390773, "T7. Consistency & Compliance Checking": 0.20816491065985157, "T8. Structured & Numeric Reasoning": 0.41743827160493835, "T9. Version & Code Diff Analysis": 0.447495265816877, "T10. Rule Induction & In-Context Learning": 0.4786111111111111, "T11. Dialogue Memory & Long-Horizon Tracking": 0.3083333333333334},
    "average_language_metric": {"Chinese": 0.4149338600461589, "English": 0.4783280531203148},
    "BoN-1": {
        "overall_metric": 0.4454625807266656,
        "token_length": {"8k": 0.568519720930888, "16k": 0.5262031339471792, "32k": 0.4861844372065094, "64k": 0.41353173269416393, "128k": 0.3591760759962795, "256k": 0.3191603835849737},
        "contextual_requirement": {"Full": 0.4087106311919467, "Partial": 0.49223778922539846},
        "difficulty": {"Easy": 0.6501986428504741, "Moderate": 0.4020783848645907, "Hard": 0.35262004275477665, "Extreme": 0.3106597264865292},
        "primary_task": {"T1. Retrieval & Ranking": 0.6988816931417082, "T2. Sequencing & Structure Reconstruction": 0.6855601343101344, "T3. Evidence-Grounded QA": 0.44166666666666665, "T4. Summarization & Synthesis": 0.48711155580421095, "T5. Attribution & Citation Alignment": 0.3737909056226912, "T6. Aggregation & Clustering": 0.47376675235942955, "T7. Consistency & Compliance Checking": 0.19543673928650457, "T8. Structured & Numeric Reasoning": 0.4592592592592593, "T9. Version & Code Diff Analysis": 0.4319105105134516, "T10. Rule Induction & In-Context Learning": 0.4483333333333333, "T11. Dialogue Memory & Long-Horizon Tracking": 0.30833333333333335},
        "language": {"Chinese": 0.4135726150175988, "English": 0.4773525464357322}
    },
    "pass@1": 0.22066666666666668,
    "BoN-2": {
        "overall_metric": 0.5272106840251494,
        "token_length": {"8k": 0.6646954568347576, "16k": 0.5904928422287453, "32k": 0.5839842384318158, "64k": 0.5063967419452635, "128k": 0.43057993780384596, "256k": 0.38711488690647056},
        "contextual_requirement": {"Full": 0.49180621097136906, "Partial": 0.5722709224572338},
        "difficulty": {"Easy": 0.7548882368495022, "Moderate": 0.4956196438725791, "Hard": 0.4286236931227538, "Extreme": 0.3633035715559389},
        "primary_task": {"T1. Retrieval & Ranking": 0.7734553050177705, "T2. Sequencing & Structure Reconstruction": 0.7497803122803125, "T3. Evidence-Grounded QA": 0.5833333333333334, "T4. Summarization & Synthesis": 0.5045962797540592, "T5. Attribution & Citation Alignment": 0.4402260724918136, "T6. Aggregation & Clustering": 0.5442968593297539, "T7. Consistency & Compliance Checking": 0.271121809138241, "T8. Structured & Numeric Reasoning": 0.5342592592592592, "T9. Version & Code Diff Analysis": 0.5242253558461997, "T10. Rule Induction & In-Context Learning": 0.5733333333333334, "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667},
        "language": {"Chinese": 0.5001287057888837, "English": 0.5542926622614154}
    },
    "pass@2": 0.288,
    "BoN-3": {
        "overall_metric": 0.5630326206114822,
        "token_length": {"8k": 0.696307151429279, "16k": 0.6363828018704142, "32k": 0.6080757988876709, "64k": 0.5447446048393001, "128k": 0.4651876442546681, "256k": 0.4274977223875639},
        "contextual_requirement": {"Full": 0.5231691902067062, "Partial": 0.6137678956721072},
        "difficulty": {"Easy": 0.7981617653401918, "Moderate": 0.5378309814958724, "Hard": 0.4581227829612033, "Extreme": 0.3909189138526289},
        "primary_task": {"T1. Retrieval & Ranking": 0.7979486004255343, "T2. Sequencing & Structure Reconstruction": 0.787096468346468, "T3. Evidence-Grounded QA": 0.6416666666666667, "T4. Summarization & Synthesis": 0.5114673228538231, "T5. Attribution & Citation Alignment": 0.4791973901175429, "T6. Aggregation & Clustering": 0.5794154259483205, "T7. Consistency & Compliance Checking": 0.29817240493883673, "T8. Structured & Numeric Reasoning": 0.5800925925925926, "T9. Version & Code Diff Analysis": 0.5598440073472041, "T10. Rule Induction & In-Context Learning": 0.6325000000000001, "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666665},
        "language": {"Chinese": 0.543096347456943, "English": 0.5829688937660222}
    },
    "pass@3": 0.316
}
results/Gemini-2.5-Flash/nonthinking_context-1000000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.5591861836855936,
    "inference_iteration_1_overall_metric": 0.555222408533961,
    "inference_iteration_2_overall_metric": 0.5555746542742924,
    "inference_iteration_3_overall_metric": 0.5667614882485269,
    "average_token_length_metric": {"8k": 0.5794836437092291, "16k": 0.585038678968723, "32k": 0.5764993408909757, "64k": 0.5298001757287436, "128k": 0.5583690767328653, "256k": 0.5259261860830253},
    "average_contextual_requirement_metric": {"Full": 0.5219144924948039, "Partial": 0.6066228815647814},
    "average_difficulty_metric": {"Easy": 0.6655203056614667, "Moderate": 0.5398880938056573, "Hard": 0.5786822600966999, "Extreme": 0.4425719452767774},
    "average_primary_task_metric": {"T1. Retrieval & Ranking": 0.7771168693642845, "T2. Sequencing & Structure Reconstruction": 0.8032048969548965, "T3. Evidence-Grounded QA": 0.6055555555555554, "T4. Summarization & Synthesis": 0.5340660905787081, "T5. Attribution & Citation Alignment": 0.745788551044203, "T6. Aggregation & Clustering": 0.5023487328603856, "T7. Consistency & Compliance Checking": 0.4407587176859518, "T8. Structured & Numeric Reasoning": 0.2706790123456789, "T9. Version & Code Diff Analysis": 0.7292026752712853, "T10. Rule Induction & In-Context Learning": 0.5269907407407407, "T11. Dialogue Memory & Long-Horizon Tracking": 0.44722222222222224},
    "average_language_metric": {"Chinese": 0.5654483346944664, "English": 0.5529240326767215},
    "BoN-1": {
        "overall_metric": 0.555222408533961,
        "token_length": {"8k": 0.5888731703826585, "16k": 0.5734020363631814, "32k": 0.5716727282141728, "64k": 0.5201046130976303, "128k": 0.5511174700611993, "256k": 0.5261644330849247},
        "contextual_requirement": {"Full": 0.5109299072772209, "Partial": 0.6115946828607216},
        "difficulty": {"Easy": 0.6581097400970598, "Moderate": 0.5327009022057297, "Hard": 0.5733517044448927, "Extreme": 0.4453988971639295},
        "primary_task": {"T1. Retrieval & Ranking": 0.7847565381129994, "T2. Sequencing & Structure Reconstruction": 0.8209740259740258, "T3. Evidence-Grounded QA": 0.5833333333333334, "T4. Summarization & Synthesis": 0.5367973042699341, "T5. Attribution & Citation Alignment": 0.7270779373385174, "T6. Aggregation & Clustering": 0.49367147369310305, "T7. Consistency & Compliance Checking": 0.444842452883972, "T8. Structured & Numeric Reasoning": 0.27037037037037037, "T9. Version & Code Diff Analysis": 0.7099867444467561, "T10. Rule Induction & In-Context Learning": 0.522361111111111, "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666665},
        "language": {"Chinese": 0.5642007220552895, "English": 0.5462440950126328}
    },
    "pass@1": 0.2846666666666667,
    "BoN-2": {
        "overall_metric": 0.600215470701642,
        "token_length": {"8k": 0.6308006025002828, "16k": 0.6218713098115006, "32k": 0.6209629478481974, "64k": 0.5627834642545474, "128k": 0.6034759308063585, "256k": 0.5613985689889686},
        "contextual_requirement": {"Full": 0.5620277169409751, "Partial": 0.6488180663970381},
        "difficulty": {"Easy": 0.7092035834801983, "Moderate": 0.562063871268018, "Hard": 0.6309834472774163, "Extreme": 0.4856736448985854},
        "primary_task": {"T1. Retrieval & Ranking": 0.8017289450670718, "T2. Sequencing & Structure Reconstruction": 0.8388936988936988, "T3. Evidence-Grounded QA": 0.6833333333333333, "T4. Summarization & Synthesis": 0.5495701538780565, "T5. Attribution & Citation Alignment": 0.7927380382693102, "T6. Aggregation & Clustering": 0.5444118952178385, "T7. Consistency & Compliance Checking": 0.5074368060401414, "T8. Structured & Numeric Reasoning": 0.3101851851851852, "T9. Version & Code Diff Analysis": 0.751572829108757, "T10. Rule Induction & In-Context Learning": 0.5584722222222223, "T11. Dialogue Memory & Long-Horizon Tracking": 0.48333333333333334},
        "language": {"Chinese": 0.6052369717987128, "English": 0.5951939696045724}
    },
    "pass@2": 0.32666666666666666,
    "BoN-3": {
        "overall_metric": 0.6297638749448518,
        "token_length": {"8k": 0.6514500993442873, "16k": 0.6579228544698497, "32k": 0.6569374765913272, "64k": 0.5879896058573842, "128k": 0.6298609557348528, "256k": 0.5944222576714168},
        "contextual_requirement": {"Full": 0.5933694722646927, "Partial": 0.6760840238105122},
        "difficulty": {"Easy": 0.7491261498971669, "Moderate": 0.6049484175447234, "Hard": 0.6548738677798389, "Extreme": 0.49881447206374147},
        "primary_task": {"T1. Retrieval & Ranking": 0.8141979383533716, "T2. Sequencing & Structure Reconstruction": 0.8490187590187587, "T3. Evidence-Grounded QA": 0.7083333333333334, "T4. Summarization & Synthesis": 0.5562735797066128, "T5. Attribution & Citation Alignment": 0.8231149616655856, "T6. Aggregation & Clustering": 0.5745805749141115, "T7. Consistency & Compliance Checking": 0.5414909010178753, "T8. Structured & Numeric Reasoning": 0.3379629629629629, "T9. Version & Code Diff Analysis": 0.7854748730572404, "T10. Rule Induction & In-Context Learning": 0.6129166666666667, "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666},
        "language": {"Chinese": 0.6363071790468551, "English": 0.6232205708428518}
    },
    "pass@3": 0.3606666666666667
}
results/Gemini-2.5-Flash/thinking_context-1000000_bon-3_summary.json ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.674056323049449,
    "inference_iteration_1_overall_metric": 0.6759983703704746,
    "inference_iteration_2_overall_metric": 0.6798720821649146,
    "inference_iteration_3_overall_metric": 0.6662985166129568,
    "average_token_length_metric": {"8k": 0.7135890855897297, "16k": 0.6856610849454701, "32k": 0.6818807100418771, "64k": 0.7027829296522448, "128k": 0.6399411623659867, "256k": 0.6204829657013889},
    "average_contextual_requirement_metric": {"Full": 0.63656677094683, "Partial": 0.7217702984527835},
    "average_difficulty_metric": {"Easy": 0.7982292314606038, "Moderate": 0.7239074886730374, "Hard": 0.7218614768527792, "Extreme": 0.47388809993909664},
    "average_primary_task_metric": {"T1. Retrieval & Ranking": 0.8642581912430608, "T2. Sequencing & Structure Reconstruction": 0.877125374625374, "T3. Evidence-Grounded QA": 0.6638888888888889, "T4. Summarization & Synthesis": 0.545007270213042, "T5. Attribution & Citation Alignment": 0.7904062945014397, "T6. Aggregation & Clustering": 0.6528080258554949, "T7. Consistency & Compliance Checking": 0.5102049505643657, "T8. Structured & Numeric Reasoning": 0.6658950617283949, "T9. Version & Code Diff Analysis": 0.8004985540165189, "T10. Rule Induction & In-Context Learning": 0.605046296296296, "T11. Dialogue Memory & Long-Horizon Tracking": 0.5361111111111112},
    "average_language_metric": {"Chinese": 0.6759423962737819, "English": 0.6721702498251175},
    "BoN-1": {
        "overall_metric": 0.6759983703704746,
        "token_length": {"8k": 0.7102710318952188, "16k": 0.6883832260010031, "32k": 0.6744900827350674, "64k": 0.7119047863552107, "128k": 0.6560068019942544, "256k": 0.6149342932420981},
        "contextual_requirement": {"Full": 0.6308759268961498, "Partial": 0.733426934792345},
        "difficulty": {"Easy": 0.8006604241201102, "Moderate": 0.7377405126394252, "Hard": 0.7224536765391667, "Extreme": 0.46837070558456173},
        "primary_task": {"T1. Retrieval & Ranking": 0.8740026641040577, "T2. Sequencing & Structure Reconstruction": 0.8621174196174196, "T3. Evidence-Grounded QA": 0.675, "T4. Summarization & Synthesis": 0.5455538913362303, "T5. Attribution & Citation Alignment": 0.8061189254081019, "T6. Aggregation & Clustering": 0.6554340790288137, "T7. Consistency & Compliance Checking": 0.5169546620879129, "T8. Structured & Numeric Reasoning": 0.6708333333333333, "T9. Version & Code Diff Analysis": 0.7797147286011522, "T10. Rule Induction & In-Context Learning": 0.5926388888888889, "T11. Dialogue Memory & Long-Horizon Tracking": 0.55},
        "language": {"Chinese": 0.6768990117658723, "English": 0.6750977289750792}
    },
    "pass@1": 0.4493333333333333,
    "BoN-2": {
        "overall_metric": 0.7479747877310018,
        "token_length": {"8k": 0.7719367863380663, "16k": 0.7848557527755085, "32k": 0.7435010923393187, "64k": 0.7892992664474747, "128k": 0.7226763130932595, "256k": 0.6755795153923868},
        "contextual_requirement": {"Full": 0.720714483291413, "Partial": 0.782669720654116},
        "difficulty": {"Easy": 0.8637385553396211, "Moderate": 0.8121373016562715, "Hard": 0.8141424576405533, "Extreme": 0.535557607922782},
        "primary_task": {"T1. Retrieval & Ranking": 0.9027833860231022, "T2. Sequencing & Structure Reconstruction": 0.9054959854959853, "T3. Evidence-Grounded QA": 0.7666666666666667, "T4. Summarization & Synthesis": 0.5582670711283242, "T5. Attribution & Citation Alignment": 0.8550801223307993, "T6. Aggregation & Clustering": 0.7273381143811253, "T7. Consistency & Compliance Checking": 0.6357469182183698, "T8. Structured & Numeric Reasoning": 0.7416666666666667, "T9. Version & Code Diff Analysis": 0.8585696216489627, "T10. Rule Induction & In-Context Learning": 0.712361111111111, "T11. Dialogue Memory & Long-Horizon Tracking": 0.6333333333333333},
        "language": {"Chinese": 0.7535034625567544, "English": 0.74244611290525}
    },
    "pass@2": 0.5313333333333333,
    "BoN-3": {
        "overall_metric": 0.7790736538060705,
        "token_length": {"8k": 0.800474143876513, "16k": 0.8203674165938436, "32k": 0.767107753488941, "64k": 0.8124208427670737, "128k": 0.743086283346713, "256k": 0.7309854827633406},
        "contextual_requirement": {"Full": 0.7557801236733843, "Partial": 0.808719964884035},
        "difficulty": {"Easy": 0.8954381880917556, "Moderate": 0.8399060564631339, "Hard": 0.848259677254879, "Extreme": 0.5662031295553962},
        "primary_task": {"T1. Retrieval & Ranking": 0.9242311389541524, "T2. Sequencing & Structure Reconstruction": 0.9315938690938689, "T3. Evidence-Grounded QA": 0.7916666666666666, "T4. Summarization & Synthesis": 0.5695550592103452, "T5. Attribution & Citation Alignment": 0.8809133228802023, "T6. Aggregation & Clustering": 0.7566450728547504, "T7. Consistency & Compliance Checking": 0.662115306165705, "T8. Structured & Numeric Reasoning": 0.7824074074074074, "T9. Version & Code Diff Analysis": 0.8867644916844079, "T10. Rule Induction & In-Context Learning": 0.7519444444444443, "T11. Dialogue Memory & Long-Horizon Tracking": 0.7},
        "language": {"Chinese": 0.7890668786410705, "English": 0.7690804289710705}
    },
    "pass@3": 0.5786666666666667
}
results/Gemini-2.5-Pro/thinking_context-1000000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.7342184707317124,
    "inference_iteration_1_overall_metric": 0.7402405885346022,
    "inference_iteration_2_overall_metric": 0.7288378446496467,
    "inference_iteration_3_overall_metric": 0.7335769790108894,
    "average_token_length_metric": {
        "8k": 0.7449778241967657,
        "16k": 0.7478649041506191,
        "32k": 0.7530566835243759,
        "64k": 0.7417918268320294,
        "128k": 0.6999601003776742,
        "256k": 0.7176594853088111
    },
    "average_contextual_requirement_metric": {
        "Full": 0.7006912685258201,
        "Partial": 0.7768894553573948
    },
    "average_difficulty_metric": {
        "Easy": 0.8440057387459964,
        "Moderate": 0.819848501651939,
        "Hard": 0.8102915033262061,
        "Extreme": 0.5077419967802616
    },
    "average_primary_task_metric": {
        "T1. Retrieval & Ranking": 0.900910721502263,
        "T2. Sequencing & Structure Reconstruction": 0.9242053162886497,
        "T3. Evidence-Grounded QA": 0.6500000000000001,
        "T4. Summarization & Synthesis": 0.5430214244860422,
        "T5. Attribution & Citation Alignment": 0.8428063760922413,
        "T6. Aggregation & Clustering": 0.7039837824498163,
        "T7. Consistency & Compliance Checking": 0.6274987753728497,
        "T8. Structured & Numeric Reasoning": 0.7824074074074073,
        "T9. Version & Code Diff Analysis": 0.873498394228399,
        "T10. Rule Induction & In-Context Learning": 0.683564814814815,
        "T11. Dialogue Memory & Long-Horizon Tracking": 0.5888888888888888
    },
    "average_language_metric": {
        "Chinese": 0.7449054000919034,
        "English": 0.7235315413715225
    },
    "BoN-1": {
        "overall_metric": 0.7402405885346022,
        "token_length": {
            "8k": 0.7531611316917413,
            "16k": 0.7524897292361332,
            "32k": 0.759794274989024,
            "64k": 0.7435484076033682,
            "128k": 0.6968353298720406,
            "256k": 0.7356146578153082
        },
        "contextual_requirement": {
            "Full": 0.6997313831727073,
            "Partial": 0.7917977589951957
        },
        "difficulty": {
            "Easy": 0.8496811955485003,
            "Moderate": 0.8519813060901164,
            "Hard": 0.7987384600115967,
            "Extreme": 0.5085375776002995
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.8998469455079252,
            "T2. Sequencing & Structure Reconstruction": 0.9226606264106262,
            "T3. Evidence-Grounded QA": 0.7083333333333334,
            "T4. Summarization & Synthesis": 0.5451740468187636,
            "T5. Attribution & Citation Alignment": 0.8298635734563725,
            "T6. Aggregation & Clustering": 0.6845419570109973,
            "T7. Consistency & Compliance Checking": 0.6246870318062281,
            "T8. Structured & Numeric Reasoning": 0.7995370370370369,
            "T9. Version & Code Diff Analysis": 0.8931464590407817,
            "T10. Rule Induction & In-Context Learning": 0.6825,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.6083333333333333
        },
        "language": {
            "Chinese": 0.7482902342281601,
            "English": 0.7321909428410444
        }
    },
    "pass@1": 0.5366666666666666,
    "BoN-2": {
        "overall_metric": 0.7920257143166666,
        "token_length": {
            "8k": 0.8033887159307824,
            "16k": 0.7898793857734576,
            "32k": 0.8117863707215878,
            "64k": 0.7964277126734358,
            "128k": 0.7607965077861976,
            "256k": 0.7898755930145346
        },
        "contextual_requirement": {
            "Full": 0.7559933032110803,
            "Partial": 0.8378851466328657
        },
        "difficulty": {
            "Easy": 0.8992430876416225,
            "Moderate": 0.8976673772472249,
            "Hard": 0.8677753697095277,
            "Extreme": 0.5554328394573533
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.9332256652370721,
            "T2. Sequencing & Structure Reconstruction": 0.9412599437599435,
            "T3. Evidence-Grounded QA": 0.775,
            "T4. Summarization & Synthesis": 0.5583020906699263,
            "T5. Attribution & Citation Alignment": 0.8863658130468474,
            "T6. Aggregation & Clustering": 0.7465324819181888,
            "T7. Consistency & Compliance Checking": 0.7073416442539386,
            "T8. Structured & Numeric Reasoning": 0.8560185185185186,
            "T9. Version & Code Diff Analysis": 0.9071622825418993,
            "T10. Rule Induction & In-Context Learning": 0.7341666666666666,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.7
        },
        "language": {
            "Chinese": 0.795156398417639,
            "English": 0.7888950302156931
        }
    },
    "pass@2": 0.6133333333333333,
    "BoN-3": {
        "overall_metric": 0.8133867657039734,
        "token_length": {
            "8k": 0.8126699858808082,
            "16k": 0.8091582494700531,
            "32k": 0.8359194281957039,
            "64k": 0.816367429901851,
            "128k": 0.7927970026749335,
            "256k": 0.8134084981004824
        },
        "contextual_requirement": {
            "Full": 0.7838454845628277,
            "Partial": 0.85098475988361
        },
        "difficulty": {
            "Easy": 0.9143269591461097,
            "Moderate": 0.9126631081599108,
            "Hard": 0.9014414032827288,
            "Extreme": 0.5797689782741141
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.9453427087326437,
            "T2. Sequencing & Structure Reconstruction": 0.9562455137455134,
            "T3. Evidence-Grounded QA": 0.7833333333333333,
            "T4. Summarization & Synthesis": 0.5633383564725131,
            "T5. Attribution & Citation Alignment": 0.9113749401192192,
            "T6. Aggregation & Clustering": 0.7887683935896461,
            "T7. Consistency & Compliance Checking": 0.7334094152440016,
            "T8. Structured & Numeric Reasoning": 0.8671296296296297,
            "T9. Version & Code Diff Analysis": 0.922905227868178,
            "T10. Rule Induction & In-Context Learning": 0.7925,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.7083333333333334
        },
        "language": {
            "Chinese": 0.8203398785387466,
            "English": 0.8064336528691968
        }
    },
    "pass@3": 0.6466666666666666
}
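For context, every `*_summary.json` added in this commit follows the same 164-line schema shown above. The sketch below is a minimal, illustrative way to read one of these files and print its headline numbers; it is not the leaderboard logic in `app.py`, and the file path used is just one of the files from this commit, picked as an example.

```python
import json
from pathlib import Path

# Example path from this commit; any *_summary.json in results/ has the same schema.
summary_path = Path("results/Gemini-2.5-Pro/thinking_context-1000000_bon-3_summary.json")

with summary_path.open(encoding="utf-8") as f:
    summary = json.load(f)

# Headline score averaged over the 3 inference iterations.
print("average_overall_metric:", summary["average_overall_metric"])

# Best-of-N score and pass@N for N = 1..3.
for n in (1, 2, 3):
    bon = summary[f"BoN-{n}"]["overall_metric"]
    pass_at_n = summary[f"pass@{n}"]
    print(f"BoN-{n}: {bon:.4f}  pass@{n}: {pass_at_n:.4f}")

# Per-dimension breakdowns (token length, difficulty, primary task, language)
# are nested dicts and can be read the same way, e.g.:
print("Extreme difficulty (avg):", summary["average_difficulty_metric"]["Extreme"])
```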
results/Gemma-3-12B-It/nonthinking_context-120000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.3215537579100566,
    "inference_iteration_1_overall_metric": 0.32433034575226716,
    "inference_iteration_2_overall_metric": 0.31844270123543494,
    "inference_iteration_3_overall_metric": 0.3218882267424678,
    "average_token_length_metric": {
        "8k": 0.3884654162699951,
        "16k": 0.3447777819230472,
        "32k": 0.34075916239810233,
        "64k": 0.2830133627729955,
        "128k": 0.2903427416083741,
        "256k": 0.2819640824878266
    },
    "average_contextual_requirement_metric": {
        "Full": 0.2967079738599621,
        "Partial": 0.3531756648829045
    },
    "average_difficulty_metric": {
        "Easy": 0.43662085782547294,
        "Moderate": 0.23390254455699586,
        "Hard": 0.30432253509954527,
        "Extreme": 0.26439167130106106
    },
    "average_primary_task_metric": {
        "T1. Retrieval & Ranking": 0.5521425232273246,
        "T2. Sequencing & Structure Reconstruction": 0.6018784456284454,
        "T3. Evidence-Grounded QA": 0.39722222222222225,
        "T4. Summarization & Synthesis": 0.4889337252719512,
        "T5. Attribution & Citation Alignment": 0.17289417311274363,
        "T6. Aggregation & Clustering": 0.31608511775757386,
        "T7. Consistency & Compliance Checking": 0.18810666022578687,
        "T8. Structured & Numeric Reasoning": 0.06064814814814815,
        "T9. Version & Code Diff Analysis": 0.34441506928983295,
        "T10. Rule Induction & In-Context Learning": 0.39800925925925923,
        "T11. Dialogue Memory & Long-Horizon Tracking": 0.21666666666666667
    },
    "average_language_metric": {
        "Chinese": 0.31275985343364654,
        "English": 0.3303476623864672
    },
    "BoN-1": {
        "overall_metric": 0.32433034575226716,
        "token_length": {
            "8k": 0.3906271090949922,
            "16k": 0.3561137033219127,
            "32k": 0.34838703683305916,
            "64k": 0.2733930672728249,
            "128k": 0.295394704679806,
            "256k": 0.2820664533110089
        },
        "contextual_requirement": {
            "Full": 0.299184653750444,
            "Partial": 0.35633395375458804
        },
        "difficulty": {
            "Easy": 0.4375571061543292,
            "Moderate": 0.2456864337252399,
            "Hard": 0.3046160927997635,
            "Extreme": 0.26489900749156403
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.5440288685497278,
            "T2. Sequencing & Structure Reconstruction": 0.595296185296185,
            "T3. Evidence-Grounded QA": 0.4166666666666667,
            "T4. Summarization & Synthesis": 0.4882012721729454,
            "T5. Attribution & Citation Alignment": 0.16495676080926253,
            "T6. Aggregation & Clustering": 0.3232791449049618,
            "T7. Consistency & Compliance Checking": 0.1944817769562575,
            "T8. Structured & Numeric Reasoning": 0.0699074074074074,
            "T9. Version & Code Diff Analysis": 0.345977074505613,
            "T10. Rule Induction & In-Context Learning": 0.4008333333333333,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.21666666666666667
        },
        "language": {
            "Chinese": 0.3144798192828198,
            "English": 0.334180872221715
        }
    },
    "pass@1": 0.11133333333333334,
    "BoN-2": {
        "overall_metric": 0.3467523917671754,
        "token_length": {
            "8k": 0.4142271322547509,
            "16k": 0.377736900046965,
            "32k": 0.37302728004656277,
            "64k": 0.3062912069169447,
            "128k": 0.3073157553107305,
            "256k": 0.3019160760270996
        },
        "contextual_requirement": {
            "Full": 0.32135117390700496,
            "Partial": 0.3790812144983023
        },
        "difficulty": {
            "Easy": 0.46615306988638716,
            "Moderate": 0.26112276416188485,
            "Hard": 0.32737280433310617,
            "Extreme": 0.28492633788743743
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.5650465241334548,
            "T2. Sequencing & Structure Reconstruction": 0.6249731287231288,
            "T3. Evidence-Grounded QA": 0.43333333333333335,
            "T4. Summarization & Synthesis": 0.5032712591907067,
            "T5. Attribution & Citation Alignment": 0.20597827766959523,
            "T6. Aggregation & Clustering": 0.346419862594183,
            "T7. Consistency & Compliance Checking": 0.21497503949007407,
            "T8. Structured & Numeric Reasoning": 0.07962962962962963,
            "T9. Version & Code Diff Analysis": 0.3894322431353122,
            "T10. Rule Induction & In-Context Learning": 0.4174999999999999,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.23333333333333334
        },
        "language": {
            "Chinese": 0.3387201257962417,
            "English": 0.3547846577381098
        }
    },
    "pass@2": 0.122,
    "BoN-3": {
        "overall_metric": 0.36208312298536466,
        "token_length": {
            "8k": 0.4291254746141897,
            "16k": 0.3819529993839447,
            "32k": 0.3886685107360013,
            "64k": 0.3331907556566436,
            "128k": 0.3232918491379485,
            "256k": 0.3162691483834608
        },
        "contextual_requirement": {
            "Full": 0.33738791539283736,
            "Partial": 0.39351338719403606
        },
        "difficulty": {
            "Easy": 0.4781807777959498,
            "Moderate": 0.2766679371680117,
            "Hard": 0.3532458860700918,
            "Extreme": 0.29681012423769865
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.5905198790888515,
            "T2. Sequencing & Structure Reconstruction": 0.6396154771154773,
            "T3. Evidence-Grounded QA": 0.43333333333333335,
            "T4. Summarization & Synthesis": 0.5090709574388708,
            "T5. Attribution & Citation Alignment": 0.21374662530636385,
            "T6. Aggregation & Clustering": 0.3778424215167421,
            "T7. Consistency & Compliance Checking": 0.2211746842442865,
            "T8. Structured & Numeric Reasoning": 0.09351851851851851,
            "T9. Version & Code Diff Analysis": 0.4124771063926183,
            "T10. Rule Induction & In-Context Learning": 0.43847222222222215,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.25
        },
        "language": {
            "Chinese": 0.35324378584530425,
            "English": 0.3709224601254252
        }
    },
    "pass@3": 0.12733333333333333
}
results/Gemma-3-12B-It/thinking_context-120000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.3191893284499226,
    "inference_iteration_1_overall_metric": 0.3198032509438655,
    "inference_iteration_2_overall_metric": 0.3185140430839788,
    "inference_iteration_3_overall_metric": 0.3192506913219244,
    "average_token_length_metric": {
        "8k": 0.39039471003518295,
        "16k": 0.36666548253930176,
        "32k": 0.3454075697967419,
        "64k": 0.2965559300528977,
        "128k": 0.2634390883568726,
        "256k": 0.25267318991854026
    },
    "average_contextual_requirement_metric": {
        "Full": 0.29023833980013936,
        "Partial": 0.3560360412769198
    },
    "average_difficulty_metric": {
        "Easy": 0.45478741131761163,
        "Moderate": 0.2260826308323087,
        "Hard": 0.28024203424188526,
        "Extreme": 0.2573832550303484
    },
    "average_primary_task_metric": {
        "T1. Retrieval & Ranking": 0.5500567032378852,
        "T2. Sequencing & Structure Reconstruction": 0.5750085972908512,
        "T3. Evidence-Grounded QA": 0.275,
        "T4. Summarization & Synthesis": 0.4560070506311603,
        "T5. Attribution & Citation Alignment": 0.1618052577444313,
        "T6. Aggregation & Clustering": 0.3308854760173438,
        "T7. Consistency & Compliance Checking": 0.16988798875914354,
        "T8. Structured & Numeric Reasoning": 0.254783950617284,
        "T9. Version & Code Diff Analysis": 0.3307362069623843,
        "T10. Rule Induction & In-Context Learning": 0.3245833333333333,
        "T11. Dialogue Memory & Long-Horizon Tracking": 0.18333333333333338
    },
    "average_language_metric": {
        "Chinese": 0.29411514963064544,
        "English": 0.3442635072692001
    },
    "BoN-1": {
        "overall_metric": 0.3198032509438655,
        "token_length": {
            "8k": 0.3780018581730685,
            "16k": 0.3712141189933461,
            "32k": 0.32728165209503113,
            "64k": 0.30571589667403015,
            "128k": 0.2747567763160667,
            "256k": 0.2618492034116515
        },
        "contextual_requirement": {
            "Full": 0.29216027329955657,
            "Partial": 0.3549852224911681
        },
        "difficulty": {
            "Easy": 0.45511408988195684,
            "Moderate": 0.23953366795757702,
            "Hard": 0.27643841537644914,
            "Extreme": 0.25278171138445865
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.5667485461728832,
            "T2. Sequencing & Structure Reconstruction": 0.562965737965738,
            "T3. Evidence-Grounded QA": 0.25,
            "T4. Summarization & Synthesis": 0.4542471570812729,
            "T5. Attribution & Citation Alignment": 0.17077813789682644,
            "T6. Aggregation & Clustering": 0.331058456491463,
            "T7. Consistency & Compliance Checking": 0.17425550039116783,
            "T8. Structured & Numeric Reasoning": 0.25555555555555554,
            "T9. Version & Code Diff Analysis": 0.34802456680209815,
            "T10. Rule Induction & In-Context Learning": 0.32847222222222217,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.175
        },
        "language": {
            "Chinese": 0.29038552736401346,
            "English": 0.34922097452371764
        }
    },
    "pass@1": 0.11466666666666667,
    "BoN-2": {
        "overall_metric": 0.38355638163031974,
        "token_length": {
            "8k": 0.4753389925367892,
            "16k": 0.42098100793684207,
            "32k": 0.4193716441650546,
            "64k": 0.35186094157335357,
            "128k": 0.3173777879671563,
            "256k": 0.3164079156027232
        },
        "contextual_requirement": {
            "Full": 0.3475360182865405,
            "Partial": 0.4294004804314932
        },
        "difficulty": {
            "Easy": 0.5523314770092703,
            "Moderate": 0.2771664695259602,
            "Hard": 0.3369385202392175,
            "Extreme": 0.29916922382926153
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.6227581461675343,
            "T2. Sequencing & Structure Reconstruction": 0.6340152902652901,
            "T3. Evidence-Grounded QA": 0.35,
            "T4. Summarization & Synthesis": 0.47044783692403785,
            "T5. Attribution & Citation Alignment": 0.21419149989714506,
            "T6. Aggregation & Clustering": 0.40641717668678456,
            "T7. Consistency & Compliance Checking": 0.22278173964383363,
            "T8. Structured & Numeric Reasoning": 0.31851851851851853,
            "T9. Version & Code Diff Analysis": 0.41479917818461753,
            "T10. Rule Induction & In-Context Learning": 0.4166666666666667,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.25
        },
        "language": {
            "Chinese": 0.35221914805982085,
            "English": 0.41489361520081863
        }
    },
    "pass@2": 0.154,
    "BoN-3": {
        "overall_metric": 0.41783270358538355,
        "token_length": {
            "8k": 0.5097501627706587,
            "16k": 0.4551451198013833,
            "32k": 0.4565584105132001,
            "64k": 0.39691938688026035,
            "128k": 0.34453029473355373,
            "256k": 0.3440928468132463
        },
        "contextual_requirement": {
            "Full": 0.37691399280812576,
            "Partial": 0.46991106275643846
        },
        "difficulty": {
            "Easy": 0.6081488352407031,
            "Moderate": 0.3030799752726802,
            "Hard": 0.35694868313688616,
            "Extreme": 0.32471144207202707
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.6492859989190713,
            "T2. Sequencing & Structure Reconstruction": 0.673747440830774,
            "T3. Evidence-Grounded QA": 0.4083333333333333,
            "T4. Summarization & Synthesis": 0.47953981981487215,
            "T5. Attribution & Citation Alignment": 0.23614408266148504,
            "T6. Aggregation & Clustering": 0.44385472207531035,
            "T7. Consistency & Compliance Checking": 0.24389274305473457,
            "T8. Structured & Numeric Reasoning": 0.3634259259259259,
            "T9. Version & Code Diff Analysis": 0.4490980326738026,
            "T10. Rule Induction & In-Context Learning": 0.4583333333333333,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.2916666666666667
        },
        "language": {
            "Chinese": 0.3946655531503397,
            "English": 0.440999854020427
        }
    },
    "pass@3": 0.174
}
results/Gemma-3-27B-It/nonthinking_context-120000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.3613898319544999,
    "inference_iteration_1_overall_metric": 0.35966281601422384,
    "inference_iteration_2_overall_metric": 0.3610482900444428,
    "inference_iteration_3_overall_metric": 0.3634583898048339,
    "average_token_length_metric": {
        "8k": 0.43644157643949566,
        "16k": 0.3804621509069283,
        "32k": 0.39249485549033103,
        "64k": 0.3508346036478247,
        "128k": 0.30224089028156714,
        "256k": 0.30586491496085444
    },
    "average_contextual_requirement_metric": {
        "Full": 0.33698292855454914,
        "Partial": 0.3924531635544373
    },
    "average_difficulty_metric": {
        "Easy": 0.49956833239118315,
        "Moderate": 0.2504305317277634,
        "Hard": 0.3319623187177785,
        "Extreme": 0.30223017713736006
    },
    "average_primary_task_metric": {
        "T1. Retrieval & Ranking": 0.6186817382099091,
        "T2. Sequencing & Structure Reconstruction": 0.6828640898869156,
        "T3. Evidence-Grounded QA": 0.475,
        "T4. Summarization & Synthesis": 0.4863291170840516,
        "T5. Attribution & Citation Alignment": 0.20718550263597674,
        "T6. Aggregation & Clustering": 0.35878702296933646,
        "T7. Consistency & Compliance Checking": 0.19534470405785112,
        "T8. Structured & Numeric Reasoning": 0.11558641975308641,
        "T9. Version & Code Diff Analysis": 0.3981518981106548,
        "T10. Rule Induction & In-Context Learning": 0.37236111111111114,
        "T11. Dialogue Memory & Long-Horizon Tracking": 0.27222222222222225
    },
    "average_language_metric": {
        "Chinese": 0.3481327108034932,
        "English": 0.3746469531055068
    },
    "BoN-1": {
        "overall_metric": 0.35966281601422384,
        "token_length": {
            "8k": 0.43768168643426314,
            "16k": 0.37280230082410293,
            "32k": 0.39131018375907956,
            "64k": 0.3439379322622389,
            "128k": 0.30914303161909973,
            "256k": 0.30310176118655724
        },
        "contextual_requirement": {
            "Full": 0.33431999039428895,
            "Partial": 0.39191732134868573
        },
        "difficulty": {
            "Easy": 0.49522975795181967,
            "Moderate": 0.24738817424396956,
            "Hard": 0.33414580218141465,
            "Extreme": 0.30165945795823235
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.6149553491444276,
            "T2. Sequencing & Structure Reconstruction": 0.6777184081350744,
            "T3. Evidence-Grounded QA": 0.4666666666666667,
            "T4. Summarization & Synthesis": 0.48496676951441003,
            "T5. Attribution & Citation Alignment": 0.2117639703302691,
            "T6. Aggregation & Clustering": 0.3687808030960292,
            "T7. Consistency & Compliance Checking": 0.19168084686723558,
            "T8. Structured & Numeric Reasoning": 0.1189814814814815,
            "T9. Version & Code Diff Analysis": 0.3861048947753823,
            "T10. Rule Induction & In-Context Learning": 0.35944444444444446,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.275
        },
        "language": {
            "Chinese": 0.34703709850451614,
            "English": 0.3722885335239309
        }
    },
    "pass@1": 0.13066666666666665,
    "BoN-2": {
        "overall_metric": 0.38518154623771866,
        "token_length": {
            "8k": 0.45699608191277485,
            "16k": 0.3970070268223878,
            "32k": 0.4155514974235712,
            "64k": 0.3795777424428599,
            "128k": 0.3342775351649673,
            "256k": 0.32767939365975096
        },
        "contextual_requirement": {
            "Full": 0.36193530889736725,
            "Partial": 0.4147676664890755
        },
        "difficulty": {
            "Easy": 0.5260736620896884,
            "Moderate": 0.2746052566038148,
            "Hard": 0.3586645680646715,
            "Extreme": 0.32088598840944504
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.6391494300203041,
            "T2. Sequencing & Structure Reconstruction": 0.6911707890874557,
            "T3. Evidence-Grounded QA": 0.49166666666666664,
            "T4. Summarization & Synthesis": 0.4979131525726192,
            "T5. Attribution & Citation Alignment": 0.24413521215425465,
            "T6. Aggregation & Clustering": 0.3897620242575365,
            "T7. Consistency & Compliance Checking": 0.2135542070257249,
            "T8. Structured & Numeric Reasoning": 0.14675925925925926,
            "T9. Version & Code Diff Analysis": 0.4478430638786239,
            "T10. Rule Induction & In-Context Learning": 0.39444444444444443,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.2833333333333333
        },
        "language": {
            "Chinese": 0.37305464561658386,
            "English": 0.3973084468588535
        }
    },
    "pass@2": 0.144,
    "BoN-3": {
        "overall_metric": 0.4011866600471999,
        "token_length": {
            "8k": 0.4633867051191647,
            "16k": 0.42281662150014376,
            "32k": 0.44096928608538427,
            "64k": 0.39220811203028316,
            "128k": 0.34412857203690767,
            "256k": 0.34361066351131797
        },
        "contextual_requirement": {
            "Full": 0.37845160215956763,
            "Partial": 0.4301221882678235
        },
        "difficulty": {
            "Easy": 0.5455848419176232,
            "Moderate": 0.2912498230785689,
            "Hard": 0.36825828865136306,
            "Extreme": 0.33684148176489936
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.654875485368947,
            "T2. Sequencing & Structure Reconstruction": 0.7061728240894906,
            "T3. Evidence-Grounded QA": 0.5166666666666667,
            "T4. Summarization & Synthesis": 0.5049097313487289,
            "T5. Attribution & Citation Alignment": 0.2565335955530918,
            "T6. Aggregation & Clustering": 0.4143461915069394,
            "T7. Consistency & Compliance Checking": 0.22258770429716687,
            "T8. Structured & Numeric Reasoning": 0.175462962962963,
            "T9. Version & Code Diff Analysis": 0.4681907705235851,
            "T10. Rule Induction & In-Context Learning": 0.40555555555555556,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.2833333333333333
        },
        "language": {
            "Chinese": 0.3916620761944417,
            "English": 0.410711243899959
        }
    },
    "pass@3": 0.15533333333333332
}
results/Gemma-3-27B-It/thinking_context-120000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.37338415281793874,
    "inference_iteration_1_overall_metric": 0.3756110685938797,
    "inference_iteration_2_overall_metric": 0.3716321861397887,
    "inference_iteration_3_overall_metric": 0.3729092037201496,
    "average_token_length_metric": {
        "8k": 0.44812577930836095,
        "16k": 0.4266217475899872,
        "32k": 0.4074453646105579,
        "64k": 0.35662526806956907,
        "128k": 0.2952141304786102,
        "256k": 0.3062726268505501
    },
    "average_contextual_requirement_metric": {
        "Full": 0.3443713298159222,
        "Partial": 0.4103095639114165
    },
    "average_difficulty_metric": {
        "Easy": 0.5780767692142667,
        "Moderate": 0.24533089723723267,
        "Hard": 0.3056384367420397,
        "Extreme": 0.27775702033096106
    },
    "average_primary_task_metric": {
        "T1. Retrieval & Ranking": 0.6037064651554387,
        "T2. Sequencing & Structure Reconstruction": 0.6278458897510606,
        "T3. Evidence-Grounded QA": 0.3361111111111111,
        "T4. Summarization & Synthesis": 0.45719209902963875,
        "T5. Attribution & Citation Alignment": 0.23234121031762375,
        "T6. Aggregation & Clustering": 0.38387242742350736,
        "T7. Consistency & Compliance Checking": 0.18133975282134737,
        "T8. Structured & Numeric Reasoning": 0.3114197530864198,
        "T9. Version & Code Diff Analysis": 0.44895353115875314,
        "T10. Rule Induction & In-Context Learning": 0.35731481481481475,
        "T11. Dialogue Memory & Long-Horizon Tracking": 0.28888888888888886
    },
    "average_language_metric": {
        "Chinese": 0.3378195347928154,
        "English": 0.40894877084306375
    },
    "BoN-1": {
        "overall_metric": 0.3756110685938797,
        "token_length": {
            "8k": 0.4545820286350837,
            "16k": 0.4352872653228386,
            "32k": 0.3950079365533934,
            "64k": 0.35311614333477187,
            "128k": 0.3114655730243775,
            "256k": 0.3042074646928149
        },
        "contextual_requirement": {
            "Full": 0.35375594987355935,
            "Partial": 0.4034266742379251
        },
        "difficulty": {
            "Easy": 0.5844518111059833,
            "Moderate": 0.23662855463799365,
            "Hard": 0.3096811226362121,
            "Extreme": 0.281425757285206
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.6068670183757786,
            "T2. Sequencing & Structure Reconstruction": 0.6163485749654042,
            "T3. Evidence-Grounded QA": 0.36666666666666664,
            "T4. Summarization & Synthesis": 0.4560619845050425,
            "T5. Attribution & Citation Alignment": 0.22005868934189987,
            "T6. Aggregation & Clustering": 0.39305351710005704,
            "T7. Consistency & Compliance Checking": 0.1715251657008684,
            "T8. Structured & Numeric Reasoning": 0.33888888888888885,
            "T9. Version & Code Diff Analysis": 0.44004517714509755,
            "T10. Rule Induction & In-Context Learning": 0.35888888888888887,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.275
        },
        "language": {
            "Chinese": 0.34160224855312066,
            "English": 0.40961988863463966
        }
    },
    "pass@1": 0.15533333333333332,
    "BoN-2": {
        "overall_metric": 0.4363206399942735,
        "token_length": {
            "8k": 0.5234602086730173,
            "16k": 0.5018004637617909,
            "32k": 0.46794475339715547,
            "64k": 0.4185517955545234,
            "128k": 0.3516627837551589,
            "256k": 0.354503834823997
        },
        "contextual_requirement": {
            "Full": 0.4063870314951086,
            "Partial": 0.4744179599023025
        },
        "difficulty": {
            "Easy": 0.6923421245761895,
            "Moderate": 0.2843297701377906,
            "Hard": 0.35026505438423683,
            "Extreme": 0.31221398104277637
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.6567331296259676,
            "T2. Sequencing & Structure Reconstruction": 0.6726985976985973,
            "T3. Evidence-Grounded QA": 0.43333333333333335,
            "T4. Summarization & Synthesis": 0.4711965663860271,
            "T5. Attribution & Citation Alignment": 0.27489752350046465,
            "T6. Aggregation & Clustering": 0.44484952313131765,
            "T7. Consistency & Compliance Checking": 0.22198381203531814,
            "T8. Structured & Numeric Reasoning": 0.40925925925925927,
            "T9. Version & Code Diff Analysis": 0.5240655133007459,
            "T10. Rule Induction & In-Context Learning": 0.4486111111111111,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.35833333333333334
        },
        "language": {
            "Chinese": 0.3992628958468014,
            "English": 0.47337838414174643
        }
    },
    "pass@2": 0.19933333333333333,
    "BoN-3": {
        "overall_metric": 0.4678951184844386,
        "token_length": {
            "8k": 0.5595081038285018,
            "16k": 0.5288508385707865,
            "32k": 0.5128049001652387,
            "64k": 0.45281870495664023,
            "128k": 0.37498216355710723,
            "256k": 0.3784059998283628
        },
        "contextual_requirement": {
            "Full": 0.4378657173135933,
            "Partial": 0.5061143563382442
        },
        "difficulty": {
            "Easy": 0.7390417273273124,
            "Moderate": 0.31763207699520324,
            "Hard": 0.37625789112044933,
            "Extreme": 0.3297508281124345
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.6917909626535441,
            "T2. Sequencing & Structure Reconstruction": 0.7037670200170199,
            "T3. Evidence-Grounded QA": 0.475,
            "T4. Summarization & Synthesis": 0.4763487195551507,
            "T5. Attribution & Citation Alignment": 0.30687631368745927,
            "T6. Aggregation & Clustering": 0.4772314892310187,
            "T7. Consistency & Compliance Checking": 0.24817243122652502,
            "T8. Structured & Numeric Reasoning": 0.4217592592592592,
            "T9. Version & Code Diff Analysis": 0.5559667511226697,
            "T10. Rule Induction & In-Context Learning": 0.5098611111111111,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333
        },
        "language": {
            "Chinese": 0.4238794242578244,
            "English": 0.511910812711055
        }
    },
    "pass@3": 0.22333333333333333
}
results/Gemma-3-4B-It/nonthinking_context-120000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.2175748220994214,
    "inference_iteration_1_overall_metric": 0.21852109706784154,
    "inference_iteration_2_overall_metric": 0.2163322668515703,
    "inference_iteration_3_overall_metric": 0.21787110237885274,
    "average_token_length_metric": {
        "8k": 0.24656523663590132,
        "16k": 0.2205604877341683,
        "32k": 0.23963284248634728,
        "64k": 0.21111513028758372,
        "128k": 0.19343593120899555,
        "256k": 0.19413930424353257
    },
    "average_contextual_requirement_metric": {
        "Full": 0.21444597727329182,
        "Partial": 0.22155698824176823
    },
    "average_difficulty_metric": {
        "Easy": 0.28179521332096,
        "Moderate": 0.15821172453341914,
        "Hard": 0.20704875252188354,
        "Extreme": 0.19312877831692504
    },
    "average_primary_task_metric": {
        "T1. Retrieval & Ranking": 0.4236351950773965,
        "T2. Sequencing & Structure Reconstruction": 0.45359587924627603,
        "T3. Evidence-Grounded QA": 0.3194444444444444,
        "T4. Summarization & Synthesis": 0.443681734303759,
        "T5. Attribution & Citation Alignment": 0.042786026910229404,
        "T6. Aggregation & Clustering": 0.19637703429305803,
        "T7. Consistency & Compliance Checking": 0.09142599749178396,
        "T8. Structured & Numeric Reasoning": 0.02438271604938272,
        "T9. Version & Code Diff Analysis": 0.1616430041389551,
        "T10. Rule Induction & In-Context Learning": 0.22050925925925927,
        "T11. Dialogue Memory & Long-Horizon Tracking": 0.18611111111111114
    },
    "average_language_metric": {
        "Chinese": 0.20889372279815283,
        "English": 0.22625592140069015
    },
    "BoN-1": {
        "overall_metric": 0.21852109706784154,
        "token_length": {
            "8k": 0.2476866042969573,
            "16k": 0.21620176526480195,
            "32k": 0.24753627633784483,
            "64k": 0.21130221395252485,
            "128k": 0.19685176433002144,
            "256k": 0.19154795822489784
        },
        "contextual_requirement": {
            "Full": 0.21588524494794425,
            "Partial": 0.22187581794771047
        },
        "difficulty": {
            "Easy": 0.28399652990881596,
            "Moderate": 0.1580374343562769,
            "Hard": 0.20857790675358978,
            "Extreme": 0.19305337410218493
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.4309006915611223,
            "T2. Sequencing & Structure Reconstruction": 0.4447710584938352,
            "T3. Evidence-Grounded QA": 0.3333333333333333,
            "T4. Summarization & Synthesis": 0.441308668504869,
            "T5. Attribution & Citation Alignment": 0.03940722221903115,
            "T6. Aggregation & Clustering": 0.20337572553598166,
            "T7. Consistency & Compliance Checking": 0.08406090392051238,
            "T8. Structured & Numeric Reasoning": 0.02361111111111111,
            "T9. Version & Code Diff Analysis": 0.16341557282886326,
            "T10. Rule Induction & In-Context Learning": 0.22013888888888886,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668
        },
        "language": {
            "Chinese": 0.2120533839166827,
            "English": 0.22498881021900005
        }
    },
    "pass@1": 0.06733333333333333,
    "BoN-2": {
        "overall_metric": 0.23271347684478086,
        "token_length": {
            "8k": 0.2631313909667352,
            "16k": 0.2312324535272521,
            "32k": 0.25978977734748787,
            "64k": 0.23161875208541116,
            "128k": 0.20537856042145283,
            "256k": 0.20512992672034427
        },
        "contextual_requirement": {
            "Full": 0.2292600969188946,
            "Partial": 0.23710868765954476
        },
        "difficulty": {
            "Easy": 0.2946012095946107,
            "Moderate": 0.1735404680186527,
            "Hard": 0.22252519869057716,
            "Extreme": 0.21047603309909335
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.4536230189628364,
            "T2. Sequencing & Structure Reconstruction": 0.4791046371928724,
            "T3. Evidence-Grounded QA": 0.3333333333333333,
            "T4. Summarization & Synthesis": 0.4555393638488407,
            "T5. Attribution & Citation Alignment": 0.04458180082240233,
            "T6. Aggregation & Clustering": 0.22301925498410874,
            "T7. Consistency & Compliance Checking": 0.10512521699205389,
            "T8. Structured & Numeric Reasoning": 0.029166666666666667,
            "T9. Version & Code Diff Analysis": 0.18246404287967255,
            "T10. Rule Induction & In-Context Learning": 0.23263888888888887,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668
        },
        "language": {
            "Chinese": 0.22667394776839053,
            "English": 0.2387530059211707
        }
    },
    "pass@2": 0.06866666666666667,
    "BoN-3": {
        "overall_metric": 0.2401600102211412,
        "token_length": {
            "8k": 0.26882845096050667,
            "16k": 0.24012842113745536,
            "32k": 0.2657505147595882,
            "64k": 0.237575165952952,
            "128k": 0.21797743491880378,
            "256k": 0.21070007359754095
        },
        "contextual_requirement": {
            "Full": 0.23644784183701267,
            "Partial": 0.24488458816457748
        },
        "difficulty": {
            "Easy": 0.3019876106242685,
            "Moderate": 0.17780290261890433,
            "Hard": 0.22967603723970634,
            "Extreme": 0.22027403043562677
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.4596998605405389,
            "T2. Sequencing & Structure Reconstruction": 0.5016641610023963,
            "T3. Evidence-Grounded QA": 0.3333333333333333,
            "T4. Summarization & Synthesis": 0.4669275398584054,
            "T5. Attribution & Citation Alignment": 0.051513017753619265,
            "T6. Aggregation & Clustering": 0.23601198455578568,
            "T7. Consistency & Compliance Checking": 0.11322244187918541,
            "T8. Structured & Numeric Reasoning": 0.03148148148148148,
            "T9. Version & Code Diff Analysis": 0.19070502006795878,
            "T10. Rule Induction & In-Context Learning": 0.23541666666666666,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.19166666666666668
        },
        "language": {
            "Chinese": 0.23450699509250633,
            "English": 0.24581302534977606
        }
    },
    "pass@3": 0.07
}
results/Gemma-3-4B-It/thinking_context-120000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.2119885030064203,
    "inference_iteration_1_overall_metric": 0.21437249924782262,
    "inference_iteration_2_overall_metric": 0.21469684223951344,
    "inference_iteration_3_overall_metric": 0.20689616753192464,
    "average_token_length_metric": {
        "8k": 0.24366425705090342,
        "16k": 0.2312288563166909,
        "32k": 0.24934489050979397,
        "64k": 0.17455165550407764,
        "128k": 0.18287496802077124,
        "256k": 0.19026639063628464
    },
    "average_contextual_requirement_metric": {
        "Full": 0.20450724123760522,
        "Partial": 0.221510108894003
    },
    "average_difficulty_metric": {
        "Easy": 0.28661876230483024,
        "Moderate": 0.13848484214026374,
        "Hard": 0.1987191002737751,
        "Extreme": 0.18722857209328425
    },
    "average_primary_task_metric": {
        "T1. Retrieval & Ranking": 0.4122355140839005,
        "T2. Sequencing & Structure Reconstruction": 0.4238909558076955,
        "T3. Evidence-Grounded QA": 0.2361111111111111,
        "T4. Summarization & Synthesis": 0.4241502347645632,
        "T5. Attribution & Citation Alignment": 0.04263401505989564,
        "T6. Aggregation & Clustering": 0.23701628551906095,
        "T7. Consistency & Compliance Checking": 0.07531749025982754,
        "T8. Structured & Numeric Reasoning": 0.10648148148148145,
        "T9. Version & Code Diff Analysis": 0.15631527456623664,
        "T10. Rule Induction & In-Context Learning": 0.20962962962962953,
        "T11. Dialogue Memory & Long-Horizon Tracking": 0.11666666666666664
    },
    "average_language_metric": {
        "Chinese": 0.19120313153416005,
        "English": 0.23277387447868045
    },
    "BoN-1": {
        "overall_metric": 0.21437249924782262,
        "token_length": {
            "8k": 0.260899815427902,
            "16k": 0.2556087799887465,
            "32k": 0.23432924859855547,
            "64k": 0.1786455712537597,
            "128k": 0.1740181124720787,
            "256k": 0.18273346774589402
        },
        "contextual_requirement": {
            "Full": 0.2082323257071287,
            "Partial": 0.22218726557234228
        },
        "difficulty": {
            "Easy": 0.28518994612968396,
            "Moderate": 0.1458906399979874,
            "Hard": 0.19668309266803527,
            "Extreme": 0.19339405931078624
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.44068453349253894,
            "T2. Sequencing & Structure Reconstruction": 0.42484177261436434,
            "T3. Evidence-Grounded QA": 0.25,
            "T4. Summarization & Synthesis": 0.42183521824061226,
            "T5. Attribution & Citation Alignment": 0.03826565166477447,
            "T6. Aggregation & Clustering": 0.25040130008880007,
            "T7. Consistency & Compliance Checking": 0.08179456674127736,
            "T8. Structured & Numeric Reasoning": 0.08703703703703704,
            "T9. Version & Code Diff Analysis": 0.16976304211815604,
            "T10. Rule Induction & In-Context Learning": 0.19708333333333336,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.10833333333333334
        },
        "language": {
            "Chinese": 0.19621205437007752,
            "English": 0.23253294412556783
        }
    },
    "pass@1": 0.06333333333333334,
    "BoN-2": {
        "overall_metric": 0.2636576810412479,
        "token_length": {
            "8k": 0.3079135549113116,
            "16k": 0.28734590159692097,
            "32k": 0.30413774794877135,
            "64k": 0.21523254007725492,
            "128k": 0.22804447155407118,
            "256k": 0.23927187015915638
        },
        "contextual_requirement": {
            "Full": 0.250251500957686,
            "Partial": 0.28072009205669
        },
        "difficulty": {
            "Easy": 0.364020302669296,
            "Moderate": 0.1823512513190994,
            "Hard": 0.23524288009416136,
            "Extreme": 0.22578189022008766
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.5017988282740397,
            "T2. Sequencing & Structure Reconstruction": 0.5025411389881287,
            "T3. Evidence-Grounded QA": 0.275,
            "T4. Summarization & Synthesis": 0.44235758157965793,
            "T5. Attribution & Citation Alignment": 0.07123185213097494,
            "T6. Aggregation & Clustering": 0.3076327414869081,
            "T7. Consistency & Compliance Checking": 0.11226263632567846,
            "T8. Structured & Numeric Reasoning": 0.14583333333333334,
            "T9. Version & Code Diff Analysis": 0.2253096564350266,
            "T10. Rule Induction & In-Context Learning": 0.26222222222222225,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.16666666666666666
        },
        "language": {
            "Chinese": 0.24369127122121384,
            "English": 0.28362409086128176
        }
    },
    "pass@2": 0.086,
    "BoN-3": {
        "overall_metric": 0.29217739130674947,
        "token_length": {
            "8k": 0.3390735712300384,
            "16k": 0.3084943145981662,
            "32k": 0.331141630537573,
            "64k": 0.24932737796255805,
            "128k": 0.25196083809503234,
            "256k": 0.27306661541712973
        },
        "contextual_requirement": {
            "Full": 0.2733927975152606,
            "Partial": 0.3160850561322812
        },
        "difficulty": {
            "Easy": 0.4041733526262008,
            "Moderate": 0.2194670736422517,
            "Hard": 0.2580927963130063,
            "Extreme": 0.23963574676642313
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.5332538419210622,
            "T2. Sequencing & Structure Reconstruction": 0.5533578363062058,
            "T3. Evidence-Grounded QA": 0.3416666666666667,
            "T4. Summarization & Synthesis": 0.4493150256833664,
            "T5. Attribution & Citation Alignment": 0.09079534419446701,
            "T6. Aggregation & Clustering": 0.3434824981209785,
            "T7. Consistency & Compliance Checking": 0.12257916371054065,
            "T8. Structured & Numeric Reasoning": 0.16805555555555557,
            "T9. Version & Code Diff Analysis": 0.24931951714865616,
            "T10. Rule Induction & In-Context Learning": 0.30833333333333335,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.175
        },
        "language": {
            "Chinese": 0.27088663804629004,
            "English": 0.3134681445672092
        }
    },
    "pass@3": 0.102
}
results/Kimi-K2-Instruct-0905/nonthinking_context-224000_bon-3_summary.json
ADDED
|
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 67,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.5009443422920304,
  "inference_iteration_1_overall_metric": 0.5011015308802983,
  "inference_iteration_2_overall_metric": 0.49751406897312744,
  "inference_iteration_3_overall_metric": 0.5042174270226657,
  "average_token_length_metric": {
    "8k": 0.5193469215810047,
    "16k": 0.5532046525085649,
    "32k": 0.5393076869166767,
    "64k": 0.45954315717941974,
    "128k": 0.4753071835553842,
    "256k": 0.4589564520111373
  },
  "average_contextual_requirement_metric": {
    "Full": 0.45716785858755954,
    "Partial": 0.5566598670068132
  },
  "average_difficulty_metric": {
    "Easy": 0.6491770551060967,
    "Moderate": 0.4905460752618018,
    "Hard": 0.43431147544571097,
    "Extreme": 0.3960536599333797
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.7424451818013483,
    "T2. Sequencing & Structure Reconstruction": 0.7428685203685202,
    "T3. Evidence-Grounded QA": 0.5472222222222223,
    "T4. Summarization & Synthesis": 0.5115863734748296,
    "T5. Attribution & Citation Alignment": 0.5310286936858898,
    "T6. Aggregation & Clustering": 0.481867796853936,
    "T7. Consistency & Compliance Checking": 0.36661627375742456,
    "T8. Structured & Numeric Reasoning": 0.24089506172839517,
    "T9. Version & Code Diff Analysis": 0.607908662662019,
    "T10. Rule Induction & In-Context Learning": 0.5085648148148147,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.43611111111111117
  },
  "average_language_metric": {
    "Chinese": 0.502898779221276,
    "English": 0.4989899053627864
  },
  "BoN-1": {
    "overall_metric": 0.5011015308802983,
    "token_length": {
      "8k": 0.5266204875905753,
      "16k": 0.5431095505457598,
      "32k": 0.5392474254502099,
      "64k": 0.46556965866611255,
      "128k": 0.48161604654304363,
      "256k": 0.45044601648609367
    },
    "contextual_requirement": {
      "Full": 0.45533929299371073,
      "Partial": 0.5593443790995934
    },
    "difficulty": {
      "Easy": 0.6458118886638174,
      "Moderate": 0.4926159658473186,
      "Hard": 0.42892012991547307,
      "Extreme": 0.4021536227747844
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.734408014297419,
      "T2. Sequencing & Structure Reconstruction": 0.7401580826580821,
      "T3. Evidence-Grounded QA": 0.5666666666666667,
      "T4. Summarization & Synthesis": 0.5077956681433179,
      "T5. Attribution & Citation Alignment": 0.5196800787679438,
      "T6. Aggregation & Clustering": 0.4941189417411527,
      "T7. Consistency & Compliance Checking": 0.3706991980056276,
      "T8. Structured & Numeric Reasoning": 0.22268518518518515,
      "T9. Version & Code Diff Analysis": 0.6228334158501364,
      "T10. Rule Induction & In-Context Learning": 0.5243055555555556,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667
    },
    "language": {
      "Chinese": 0.49773496712013804,
      "English": 0.5044680946404603
    }
  },
  "pass@1": 0.23,
  "BoN-2": {
    "overall_metric": 0.5483962332490746,
    "token_length": {
      "8k": 0.5725846797858735,
      "16k": 0.6070506360109902,
      "32k": 0.5785761030801342,
      "64k": 0.5092135264066221,
      "128k": 0.5131800223023555,
      "256k": 0.5097724319084751
    },
    "contextual_requirement": {
      "Full": 0.49887561173193595,
      "Partial": 0.6114224788163434
    },
    "difficulty": {
      "Easy": 0.7046240276873015,
      "Moderate": 0.5462638174512109,
      "Hard": 0.4707831873116643,
      "Extreme": 0.4364568187575384
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.7623386988134085,
      "T2. Sequencing & Structure Reconstruction": 0.7700949513449513,
      "T3. Evidence-Grounded QA": 0.6,
      "T4. Summarization & Synthesis": 0.5249959487118497,
      "T5. Attribution & Citation Alignment": 0.5953546052259285,
      "T6. Aggregation & Clustering": 0.5412045721601162,
      "T7. Consistency & Compliance Checking": 0.41849900851913713,
      "T8. Structured & Numeric Reasoning": 0.28935185185185186,
      "T9. Version & Code Diff Analysis": 0.6584466738317519,
      "T10. Rule Induction & In-Context Learning": 0.5701388888888889,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.5
    },
    "language": {
      "Chinese": 0.5438314702987085,
      "English": 0.552960996199442
    }
  },
  "pass@2": 0.27666666666666667,
  "BoN-3": {
    "overall_metric": 0.5729921291255787,
    "token_length": {
      "8k": 0.5918227174634976,
      "16k": 0.6198567950677695,
      "32k": 0.6115768945303457,
      "64k": 0.5284625404433138,
      "128k": 0.558149430962295,
      "256k": 0.528084396286257
    },
    "contextual_requirement": {
      "Full": 0.524120078602167,
      "Partial": 0.6351929207008328
    },
    "difficulty": {
      "Easy": 0.728216785648972,
      "Moderate": 0.5755547819370206,
      "Hard": 0.5066701999617829,
      "Extreme": 0.45151519711603216
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.7860047126136753,
      "T2. Sequencing & Structure Reconstruction": 0.7853528878528878,
      "T3. Evidence-Grounded QA": 0.625,
      "T4. Summarization & Synthesis": 0.5322270993764735,
      "T5. Attribution & Citation Alignment": 0.6208079019292253,
      "T6. Aggregation & Clustering": 0.5652450687006129,
      "T7. Consistency & Compliance Checking": 0.44884599567602773,
      "T8. Structured & Numeric Reasoning": 0.31157407407407406,
      "T9. Version & Code Diff Analysis": 0.6765946379547446,
      "T10. Rule Induction & In-Context Learning": 0.5895833333333333,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.5583333333333333
    },
    "language": {
      "Chinese": 0.5755180331103958,
      "English": 0.5704662251407642
    }
  },
  "pass@3": 0.3
}
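Every *_summary.json added in this commit follows the schema shown above, so the leaderboard can be rebuilt from these files alone. The sketch below is illustrative only, not the actual logic in app.py: it assumes the summaries are read with Python's standard json module, and the path and field names are taken from the file just shown; everything else (the loop, the formatting) is a hypothetical example.

import json
from pathlib import Path

# Hypothetical loader sketch: read one summary file and print a few headline numbers.
# The path and key names come from the summary shown above.
path = Path("results/Kimi-K2-Instruct-0905/nonthinking_context-224000_bon-3_summary.json")
with path.open(encoding="utf-8") as f:
    summary = json.load(f)

print("overall :", summary["average_overall_metric"])
print("pass@1  :", summary["pass@1"])

# Per-task averages, sorted from strongest to weakest task.
for task, score in sorted(summary["average_primary_task_metric"].items(),
                          key=lambda kv: kv[1], reverse=True):
    print(f"{task}: {score:.3f}")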
results/Kimi-K2-Instruct-0905/thinking_context-224000_bon-3_summary.json
ADDED
|
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 69,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.5553060678788313,
  "inference_iteration_1_overall_metric": 0.558917739810572,
  "inference_iteration_2_overall_metric": 0.5552262066724464,
  "inference_iteration_3_overall_metric": 0.5517742571534756,
  "average_token_length_metric": {
    "8k": 0.5978532013581613,
    "16k": 0.5816609532803436,
    "32k": 0.5872894997726004,
    "64k": 0.5360933501085343,
    "128k": 0.522886026665569,
    "256k": 0.5060533760877814
  },
  "average_contextual_requirement_metric": {
    "Full": 0.5076499747745465,
    "Partial": 0.6159592772842868
  },
  "average_difficulty_metric": {
    "Easy": 0.7729188134795828,
    "Moderate": 0.5733088402612271,
    "Hard": 0.4375074213815535,
    "Extreme": 0.38246894115073926
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.758377055109974,
    "T2. Sequencing & Structure Reconstruction": 0.7423694091857211,
    "T3. Evidence-Grounded QA": 0.4861111111111112,
    "T4. Summarization & Synthesis": 0.5011656658098056,
    "T5. Attribution & Citation Alignment": 0.6197584764672828,
    "T6. Aggregation & Clustering": 0.5164556923382113,
    "T7. Consistency & Compliance Checking": 0.3547519606397262,
    "T8. Structured & Numeric Reasoning": 0.5962962962962963,
    "T9. Version & Code Diff Analysis": 0.6299270957790389,
    "T10. Rule Induction & In-Context Learning": 0.5606944444444444,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666654
  },
  "average_language_metric": {
    "Chinese": 0.5409680916435047,
    "English": 0.5696440441141594
  },
  "BoN-1": {
    "overall_metric": 0.558917739810572,
    "token_length": {
      "8k": 0.6032704706738693,
      "16k": 0.5808397170448323,
      "32k": 0.5927696772272222,
      "64k": 0.5485389223926213,
      "128k": 0.5293584568340762,
      "256k": 0.49872919469081073
    },
    "contextual_requirement": {
      "Full": 0.5058707162683681,
      "Partial": 0.6264321334097424
    },
    "difficulty": {
      "Easy": 0.7833558783440772,
      "Moderate": 0.5768808845916201,
      "Hard": 0.4381739958885578,
      "Extreme": 0.3805641270346401
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.7528903747933631,
      "T2. Sequencing & Structure Reconstruction": 0.7690795456301782,
      "T3. Evidence-Grounded QA": 0.525,
      "T4. Summarization & Synthesis": 0.5001505956406523,
      "T5. Attribution & Citation Alignment": 0.6016113058386018,
      "T6. Aggregation & Clustering": 0.5066860396628982,
      "T7. Consistency & Compliance Checking": 0.37783095340307493,
      "T8. Structured & Numeric Reasoning": 0.5875000000000001,
      "T9. Version & Code Diff Analysis": 0.6056866583526187,
      "T10. Rule Induction & In-Context Learning": 0.5740277777777777,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.45
    },
    "language": {
      "Chinese": 0.5344377355706056,
      "English": 0.5833977440505398
    }
  },
  "pass@1": 0.30733333333333335,
  "BoN-2": {
    "overall_metric": 0.6315444476656525,
    "token_length": {
      "8k": 0.6765293955798586,
      "16k": 0.6653958948787116,
      "32k": 0.6614274068144226,
      "64k": 0.6099720675525434,
      "128k": 0.5887552831784975,
      "256k": 0.5871866379898839
    },
    "contextual_requirement": {
      "Full": 0.5906087497944075,
      "Partial": 0.6836444267745122
    },
    "difficulty": {
      "Easy": 0.8546630775122501,
      "Moderate": 0.6784971908255039,
      "Hard": 0.5110824387985737,
      "Extreme": 0.4354103526732191
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.7926072573330768,
      "T2. Sequencing & Structure Reconstruction": 0.814456423206423,
      "T3. Evidence-Grounded QA": 0.5666666666666667,
      "T4. Summarization & Synthesis": 0.5156065007848037,
      "T5. Attribution & Citation Alignment": 0.705413217864198,
      "T6. Aggregation & Clustering": 0.6176306197741449,
      "T7. Consistency & Compliance Checking": 0.42951044567324476,
      "T8. Structured & Numeric Reasoning": 0.6861111111111111,
      "T9. Version & Code Diff Analysis": 0.7066217095721875,
      "T10. Rule Induction & In-Context Learning": 0.6680555555555556,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.525
    },
    "language": {
      "Chinese": 0.617638270540212,
      "English": 0.6454506247910954
    }
  },
  "pass@2": 0.37066666666666664,
  "BoN-3": {
    "overall_metric": 0.6596591269299542,
    "token_length": {
      "8k": 0.6994761842119754,
      "16k": 0.6880103332979634,
      "32k": 0.6831628614701334,
      "64k": 0.6414281237947528,
      "128k": 0.6223896794908902,
      "256k": 0.6234875793140215
    },
    "contextual_requirement": {
      "Full": 0.6186943779973499,
      "Partial": 0.7117960801169108
    },
    "difficulty": {
      "Easy": 0.865163305432102,
      "Moderate": 0.7223592192252923,
      "Hard": 0.5480741090923853,
      "Extreme": 0.4666471483928459
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.8175679720885881,
      "T2. Sequencing & Structure Reconstruction": 0.8413347300847297,
      "T3. Evidence-Grounded QA": 0.6,
      "T4. Summarization & Synthesis": 0.5241839616339089,
      "T5. Attribution & Citation Alignment": 0.7286969407202896,
      "T6. Aggregation & Clustering": 0.6482898910584165,
      "T7. Consistency & Compliance Checking": 0.47389920059132556,
      "T8. Structured & Numeric Reasoning": 0.7092592592592593,
      "T9. Version & Code Diff Analysis": 0.7448385112889893,
      "T10. Rule Induction & In-Context Learning": 0.7002777777777779,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.5416666666666666
    },
    "language": {
      "Chinese": 0.646426154128056,
      "English": 0.6728920997318576
    }
  },
  "pass@3": 0.4013333333333333
}
results/Llama-3.1-405B-Instruct/nonthinking_context-120000_bon-3_summary.json
ADDED
|
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 0,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.4006972406362581,
  "inference_iteration_1_overall_metric": 0.4033767470362484,
  "inference_iteration_2_overall_metric": 0.4037400979033875,
  "inference_iteration_3_overall_metric": 0.3949748769691391,
  "average_token_length_metric": {
    "8k": 0.495611810737427,
    "16k": 0.47999448108480186,
    "32k": 0.4902670612376324,
    "64k": 0.40596651011726403,
    "128k": 0.2929513776114342,
    "256k": 0.23939220302898997
  },
  "average_contextual_requirement_metric": {
    "Full": 0.3708303649326616,
    "Partial": 0.4387096278953802
  },
  "average_difficulty_metric": {
    "Easy": 0.5544944910188809,
    "Moderate": 0.2906684222492303,
    "Hard": 0.35510805093665054,
    "Extreme": 0.33443208075583397
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.7010722319989281,
    "T2. Sequencing & Structure Reconstruction": 0.6852361434861434,
    "T3. Evidence-Grounded QA": 0.39722222222222225,
    "T4. Summarization & Synthesis": 0.49860889459519425,
    "T5. Attribution & Citation Alignment": 0.33280185377001925,
    "T6. Aggregation & Clustering": 0.4288546372402087,
    "T7. Consistency & Compliance Checking": 0.22662521432331084,
    "T8. Structured & Numeric Reasoning": 0.1651234567901234,
    "T9. Version & Code Diff Analysis": 0.4037951252761809,
    "T10. Rule Induction & In-Context Learning": 0.43962962962962965,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.3194444444444445
  },
  "average_language_metric": {
    "Chinese": 0.38105191405127925,
    "English": 0.42034256722123686
  },
  "BoN-1": {
    "overall_metric": 0.4033767470362484,
    "token_length": {
      "8k": 0.5193261381744707,
      "16k": 0.47458776670121805,
      "32k": 0.4893823156329057,
      "64k": 0.41443272045065593,
      "128k": 0.28371722198812693,
      "256k": 0.2388143192701119
    },
    "contextual_requirement": {
      "Full": 0.3743223052410306,
      "Partial": 0.44035512750288913
    },
    "difficulty": {
      "Easy": 0.5734527544487493,
      "Moderate": 0.2799762087674356,
      "Hard": 0.35650549807929216,
      "Extreme": 0.3289038173440243
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.6946832479137959,
      "T2. Sequencing & Structure Reconstruction": 0.6867736892736891,
      "T3. Evidence-Grounded QA": 0.4083333333333333,
      "T4. Summarization & Synthesis": 0.5005294746426308,
      "T5. Attribution & Citation Alignment": 0.3263529995384955,
      "T6. Aggregation & Clustering": 0.4192752041935995,
      "T7. Consistency & Compliance Checking": 0.23609519970144652,
      "T8. Structured & Numeric Reasoning": 0.18101851851851852,
      "T9. Version & Code Diff Analysis": 0.391786542964143,
      "T10. Rule Induction & In-Context Learning": 0.47083333333333327,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.30833333333333335
    },
    "language": {
      "Chinese": 0.3889859691853749,
      "English": 0.41776752488712193
    }
  },
  "pass@1": 0.16266666666666665,
  "BoN-2": {
    "overall_metric": 0.44522764896517386,
    "token_length": {
      "8k": 0.558048061407528,
      "16k": 0.5158119021911939,
      "32k": 0.530639639804942,
      "64k": 0.45350021661205264,
      "128k": 0.3440588360232635,
      "256k": 0.26930723775206405
    },
    "contextual_requirement": {
      "Full": 0.40918538567791846,
      "Partial": 0.49109962042168165
    },
    "difficulty": {
      "Easy": 0.6186902100237465,
      "Moderate": 0.32736405593203205,
      "Hard": 0.39034570869637836,
      "Extreme": 0.36866970508796515
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.7336012615268711,
      "T2. Sequencing & Structure Reconstruction": 0.737358012358012,
      "T3. Evidence-Grounded QA": 0.45,
      "T4. Summarization & Synthesis": 0.5147616333746151,
      "T5. Attribution & Citation Alignment": 0.3674427224531845,
      "T6. Aggregation & Clustering": 0.4743354751511784,
      "T7. Consistency & Compliance Checking": 0.26985986799607303,
      "T8. Structured & Numeric Reasoning": 0.20462962962962963,
      "T9. Version & Code Diff Analysis": 0.44727785652000346,
      "T10. Rule Induction & In-Context Learning": 0.5249999999999999,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.36666666666666664
    },
    "language": {
      "Chinese": 0.433626244820898,
      "English": 0.4568290531094507
    }
  },
  "pass@2": 0.19,
  "BoN-3": {
    "overall_metric": 0.4623473432019363,
    "token_length": {
      "8k": 0.5703117586774046,
      "16k": 0.531567048206503,
      "32k": 0.5504750677715584,
      "64k": 0.47066633396240054,
      "128k": 0.3715170610120738,
      "256k": 0.27954678958167684
    },
    "contextual_requirement": {
      "Full": 0.42433070289568703,
      "Partial": 0.510732158137163
    },
    "difficulty": {
      "Easy": 0.6297066688011407,
      "Moderate": 0.34570653296360615,
      "Hard": 0.4161126844074583,
      "Extreme": 0.3859923786829926
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.7453735974615648,
      "T2. Sequencing & Structure Reconstruction": 0.7443156843156841,
      "T3. Evidence-Grounded QA": 0.45,
      "T4. Summarization & Synthesis": 0.52546578139621,
      "T5. Attribution & Citation Alignment": 0.4112102181587475,
      "T6. Aggregation & Clustering": 0.5005878653860251,
      "T7. Consistency & Compliance Checking": 0.2880921279803352,
      "T8. Structured & Numeric Reasoning": 0.22685185185185183,
      "T9. Version & Code Diff Analysis": 0.4880120741980102,
      "T10. Rule Induction & In-Context Learning": 0.5249999999999999,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.36666666666666664
    },
    "language": {
      "Chinese": 0.4464982468640505,
      "English": 0.47819643953982244
    }
  },
  "pass@3": 0.20266666666666666
}
results/Llama-3.1-405B-Instruct/thinking_context-120000_bon-3_summary.json
ADDED
|
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 0,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.40659333298471173,
  "inference_iteration_1_overall_metric": 0.3990307184980547,
  "inference_iteration_2_overall_metric": 0.40746318832781453,
  "inference_iteration_3_overall_metric": 0.4132860921282645,
  "average_token_length_metric": {
    "8k": 0.5237859741060642,
    "16k": 0.517961066700275,
    "32k": 0.4641044292483723,
    "64k": 0.4182090511525332,
    "128k": 0.260073356838944,
    "256k": 0.25542611986208036
  },
  "average_contextual_requirement_metric": {
    "Full": 0.3729971231698154,
    "Partial": 0.44935214547639707
  },
  "average_difficulty_metric": {
    "Easy": 0.613595833316379,
    "Moderate": 0.2921863382975652,
    "Hard": 0.3409469495726698,
    "Extreme": 0.29809383550926016
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.6642543814799384,
    "T2. Sequencing & Structure Reconstruction": 0.660868776285443,
    "T3. Evidence-Grounded QA": 0.3,
    "T4. Summarization & Synthesis": 0.4699204609681677,
    "T5. Attribution & Citation Alignment": 0.35415278369391223,
    "T6. Aggregation & Clustering": 0.44523374235335905,
    "T7. Consistency & Compliance Checking": 0.2145859605426568,
    "T8. Structured & Numeric Reasoning": 0.36867283950617286,
    "T9. Version & Code Diff Analysis": 0.43511107590777826,
    "T10. Rule Induction & In-Context Learning": 0.37759259259259265,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.2777777777777778
  },
  "average_language_metric": {
    "Chinese": 0.36859109108368215,
    "English": 0.4445955748857405
  },
  "BoN-1": {
    "overall_metric": 0.3990307184980547,
    "token_length": {
      "8k": 0.5319602144193065,
      "16k": 0.495294502488435,
      "32k": 0.44900241142401065,
      "64k": 0.4104061454120568,
      "128k": 0.24974039710926457,
      "256k": 0.2577806401352564
    },
    "contextual_requirement": {
      "Full": 0.3663953815207406,
      "Partial": 0.4405666019237288
    },
    "difficulty": {
      "Easy": 0.6016370051267634,
      "Moderate": 0.26872089549030526,
      "Hard": 0.34181177303603094,
      "Extreme": 0.3002570456178896
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.6857190555998647,
      "T2. Sequencing & Structure Reconstruction": 0.6476817164317161,
      "T3. Evidence-Grounded QA": 0.2916666666666667,
      "T4. Summarization & Synthesis": 0.4722890782260215,
      "T5. Attribution & Citation Alignment": 0.3720362720390496,
      "T6. Aggregation & Clustering": 0.4527252876757789,
      "T7. Consistency & Compliance Checking": 0.19573002717715302,
      "T8. Structured & Numeric Reasoning": 0.3111111111111111,
      "T9. Version & Code Diff Analysis": 0.4434471088718592,
      "T10. Rule Induction & In-Context Learning": 0.39402777777777775,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.24166666666666667
    },
    "language": {
      "Chinese": 0.3646393289213864,
      "English": 0.43342210807472437
    }
  },
  "pass@1": 0.17866666666666667,
  "BoN-2": {
    "overall_metric": 0.47482444558969183,
    "token_length": {
      "8k": 0.6017952145070679,
      "16k": 0.591088648529682,
      "32k": 0.5375509398549342,
      "64k": 0.4892562129103632,
      "128k": 0.3164459575452793,
      "256k": 0.3128097001908253
    },
    "contextual_requirement": {
      "Full": 0.43173242380596516,
      "Partial": 0.5296688369507991
    },
    "difficulty": {
      "Easy": 0.7232714101187123,
      "Moderate": 0.3321291637725583,
      "Hard": 0.3952399665178113,
      "Extreme": 0.3486594773940952
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.7289955546187882,
      "T2. Sequencing & Structure Reconstruction": 0.7412526825026818,
      "T3. Evidence-Grounded QA": 0.4166666666666667,
      "T4. Summarization & Synthesis": 0.49118408476251574,
      "T5. Attribution & Citation Alignment": 0.40752656877934634,
      "T6. Aggregation & Clustering": 0.5058127366024506,
      "T7. Consistency & Compliance Checking": 0.2513867378923854,
      "T8. Structured & Numeric Reasoning": 0.4481481481481482,
      "T9. Version & Code Diff Analysis": 0.5031863563544513,
      "T10. Rule Induction & In-Context Learning": 0.48847222222222225,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.35
    },
    "language": {
      "Chinese": 0.43768030437389926,
      "English": 0.511968586805485
    }
  },
  "pass@2": 0.24,
  "BoN-3": {
    "overall_metric": 0.5112966568518927,
    "token_length": {
      "8k": 0.638281295176006,
      "16k": 0.6423408107009556,
      "32k": 0.5654314097480697,
      "64k": 0.5253595874826821,
      "128k": 0.3530891462917609,
      "256k": 0.3432776917118821
    },
    "contextual_requirement": {
      "Full": 0.4709592628636537,
      "Partial": 0.5626351582914695
    },
    "difficulty": {
      "Easy": 0.7672241772316003,
      "Moderate": 0.375515223666526,
      "Hard": 0.43497012537117896,
      "Extreme": 0.3702560590461619
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.7536336494527094,
      "T2. Sequencing & Structure Reconstruction": 0.7626347726347721,
      "T3. Evidence-Grounded QA": 0.43333333333333335,
      "T4. Summarization & Synthesis": 0.5004237684828063,
      "T5. Attribution & Citation Alignment": 0.44213397827806017,
      "T6. Aggregation & Clustering": 0.5445203343972627,
      "T7. Consistency & Compliance Checking": 0.2924260189234064,
      "T8. Structured & Numeric Reasoning": 0.5148148148148148,
      "T9. Version & Code Diff Analysis": 0.5454347340415279,
      "T10. Rule Induction & In-Context Learning": 0.5093055555555556,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.4166666666666667
    },
    "language": {
      "Chinese": 0.47723166126529054,
      "English": 0.545361652438495
    }
  },
  "pass@3": 0.27466666666666667
}
results/Llama-3.1-70B-Instruct/nonthinking_context-120000_bon-3_summary.json
ADDED
|
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 0,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.31531891526483563,
  "inference_iteration_1_overall_metric": 0.3187840899451787,
  "inference_iteration_2_overall_metric": 0.3181153572604232,
  "inference_iteration_3_overall_metric": 0.3090572985889053,
  "average_token_length_metric": {
    "8k": 0.44072420920233435,
    "16k": 0.4154170524608382,
    "32k": 0.39938052404397517,
    "64k": 0.3038357678876172,
    "128k": 0.16668959617065898,
    "256k": 0.16586634182358947
  },
  "average_contextual_requirement_metric": {
    "Full": 0.28629550819966637,
    "Partial": 0.3522577969841428
  },
  "average_difficulty_metric": {
    "Easy": 0.4401547192469389,
    "Moderate": 0.21463302825178993,
    "Hard": 0.2886301431775311,
    "Extreme": 0.26222895835717064
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.5528929885667343,
    "T2. Sequencing & Structure Reconstruction": 0.5351228410297036,
    "T3. Evidence-Grounded QA": 0.34722222222222227,
    "T4. Summarization & Synthesis": 0.48251270202602564,
    "T5. Attribution & Citation Alignment": 0.2383585369805011,
    "T6. Aggregation & Clustering": 0.3094103218555832,
    "T7. Consistency & Compliance Checking": 0.18317006381802675,
    "T8. Structured & Numeric Reasoning": 0.11111111111111113,
    "T9. Version & Code Diff Analysis": 0.35298805295632424,
    "T10. Rule Induction & In-Context Learning": 0.3324074074074074,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.1944444444444444
  },
  "average_language_metric": {
    "Chinese": 0.27963370499725654,
    "English": 0.3510041255324155
  },
  "BoN-1": {
    "overall_metric": 0.3187840899451787,
    "token_length": {
      "8k": 0.45086107163290673,
      "16k": 0.42082616936389317,
      "32k": 0.4024958091173256,
      "64k": 0.31412697939763573,
      "128k": 0.16271506723441695,
      "256k": 0.16167944292489206
    },
    "contextual_requirement": {
      "Full": 0.28863047320759827,
      "Partial": 0.35716142033846177
    },
    "difficulty": {
      "Easy": 0.4450445268436371,
      "Moderate": 0.23675467668036354,
      "Hard": 0.2806267700317323,
      "Extreme": 0.25941235199849716
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.5578290894275468,
      "T2. Sequencing & Structure Reconstruction": 0.5340221352721353,
      "T3. Evidence-Grounded QA": 0.36666666666666664,
      "T4. Summarization & Synthesis": 0.4824250532281757,
      "T5. Attribution & Citation Alignment": 0.22435495508076153,
      "T6. Aggregation & Clustering": 0.3143695347862014,
      "T7. Consistency & Compliance Checking": 0.1819753578820025,
      "T8. Structured & Numeric Reasoning": 0.11342592592592592,
      "T9. Version & Code Diff Analysis": 0.33790255230380384,
      "T10. Rule Induction & In-Context Learning": 0.3586111111111111,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.20833333333333334
    },
    "language": {
      "Chinese": 0.29265028040862706,
      "English": 0.34491789948172935
    }
  },
  "pass@1": 0.116,
  "BoN-2": {
    "overall_metric": 0.3704962078912661,
    "token_length": {
      "8k": 0.5007140188951568,
      "16k": 0.471793141596409,
      "32k": 0.4656246883405684,
      "64k": 0.3555228262932919,
      "128k": 0.22278285337549444,
      "256k": 0.20653971884667408
    },
    "contextual_requirement": {
      "Full": 0.33766430263043345,
      "Partial": 0.41228226913232546
    },
    "difficulty": {
      "Easy": 0.505817704967975,
      "Moderate": 0.26539475589230027,
      "Hard": 0.33902165222515473,
      "Extreme": 0.3119632638554326
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.6325785537327107,
      "T2. Sequencing & Structure Reconstruction": 0.6024494976700858,
      "T3. Evidence-Grounded QA": 0.4083333333333333,
      "T4. Summarization & Synthesis": 0.5068628671369674,
      "T5. Attribution & Citation Alignment": 0.27736426767676775,
      "T6. Aggregation & Clustering": 0.39468559218559207,
      "T7. Consistency & Compliance Checking": 0.23698781644898675,
      "T8. Structured & Numeric Reasoning": 0.1412037037037037,
      "T9. Version & Code Diff Analysis": 0.42207618836131144,
      "T10. Rule Induction & In-Context Learning": 0.3888888888888889,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.23333333333333334
    },
    "language": {
      "Chinese": 0.3386454170505693,
      "English": 0.4023469987319626
    }
  },
  "pass@2": 0.144,
  "BoN-3": {
    "overall_metric": 0.39272719549993146,
    "token_length": {
      "8k": 0.5352185196666011,
      "16k": 0.48774731079355566,
      "32k": 0.4914359262892438,
      "64k": 0.38089490889752375,
      "128k": 0.23106162164056493,
      "256k": 0.23000488571210054
    },
    "contextual_requirement": {
      "Full": 0.3601448943540025,
      "Partial": 0.434195578776569
    },
    "difficulty": {
      "Easy": 0.5296491139243411,
      "Moderate": 0.28398341296374197,
      "Hard": 0.3678982241467773,
      "Extreme": 0.3304684709396022
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.6534337905352087,
      "T2. Sequencing & Structure Reconstruction": 0.6269316794316794,
      "T3. Evidence-Grounded QA": 0.43333333333333335,
      "T4. Summarization & Synthesis": 0.5138761303847846,
      "T5. Attribution & Citation Alignment": 0.30993542257550877,
      "T6. Aggregation & Clustering": 0.4248105135605135,
      "T7. Consistency & Compliance Checking": 0.25284892696838046,
      "T8. Structured & Numeric Reasoning": 0.1523148148148148,
      "T9. Version & Code Diff Analysis": 0.4435626489175109,
      "T10. Rule Induction & In-Context Learning": 0.4330555555555556,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.25
    },
    "language": {
      "Chinese": 0.3526743750393581,
      "English": 0.4327800159605054
    }
  },
  "pass@3": 0.16266666666666665
}
results/Llama-3.1-70B-Instruct/thinking_context-120000_bon-3_summary.json
ADDED
|
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 0,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.3212355454655496,
  "inference_iteration_1_overall_metric": 0.3168848382877898,
  "inference_iteration_2_overall_metric": 0.3235694833261471,
  "inference_iteration_3_overall_metric": 0.32325231478271244,
  "average_token_length_metric": {
    "8k": 0.44607201777886324,
    "16k": 0.43551597572008266,
    "32k": 0.40532179664339874,
    "64k": 0.3372735574136524,
    "128k": 0.14963906519016765,
    "256k": 0.15359086004713354
  },
  "average_contextual_requirement_metric": {
    "Full": 0.2920710323592253,
    "Partial": 0.3583540166917813
  },
  "average_difficulty_metric": {
    "Easy": 0.4845503461095368,
    "Moderate": 0.21437000317705404,
    "Hard": 0.28040007010006374,
    "Extreme": 0.2393443186282747
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.5203183043430563,
    "T2. Sequencing & Structure Reconstruction": 0.5048829192634423,
    "T3. Evidence-Grounded QA": 0.2972222222222221,
    "T4. Summarization & Synthesis": 0.46869982746667466,
    "T5. Attribution & Citation Alignment": 0.24051072181402314,
    "T6. Aggregation & Clustering": 0.3352447566072413,
    "T7. Consistency & Compliance Checking": 0.16831334519236535,
    "T8. Structured & Numeric Reasoning": 0.24182098765432103,
    "T9. Version & Code Diff Analysis": 0.3504639112512836,
    "T10. Rule Induction & In-Context Learning": 0.3041666666666666,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.21111111111111108
  },
  "average_language_metric": {
    "Chinese": 0.27396983223922683,
    "English": 0.368501258691873
  },
  "BoN-1": {
    "overall_metric": 0.3168848382877898,
    "token_length": {
      "8k": 0.4288512287020653,
      "16k": 0.431350020549353,
      "32k": 0.41805229286031653,
      "64k": 0.3101016124828498,
      "128k": 0.16200619520020057,
      "256k": 0.1509476799319523
    },
    "contextual_requirement": {
      "Full": 0.28780546371048366,
      "Partial": 0.3538949513861792
    },
    "difficulty": {
      "Easy": 0.48546407013873055,
      "Moderate": 0.1945019101312392,
      "Hard": 0.2709484383728626,
      "Extreme": 0.24276914751620637
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.48095324443543325,
      "T2. Sequencing & Structure Reconstruction": 0.4873936033102699,
      "T3. Evidence-Grounded QA": 0.275,
      "T4. Summarization & Synthesis": 0.46817349027530025,
      "T5. Attribution & Citation Alignment": 0.21679364691461467,
      "T6. Aggregation & Clustering": 0.3425653712663516,
      "T7. Consistency & Compliance Checking": 0.17647214053474497,
      "T8. Structured & Numeric Reasoning": 0.25462962962962965,
      "T9. Version & Code Diff Analysis": 0.347662448182329,
      "T10. Rule Induction & In-Context Learning": 0.31625,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.20833333333333334
    },
    "language": {
      "Chinese": 0.2661805594268294,
      "English": 0.36758911714875
    }
  },
  "pass@1": 0.13733333333333334,
  "BoN-2": {
    "overall_metric": 0.39714621701263747,
    "token_length": {
      "8k": 0.5101215226314286,
      "16k": 0.5452446091546882,
      "32k": 0.5033835517180962,
      "64k": 0.40629425415229514,
      "128k": 0.20367087402793838,
      "256k": 0.21416249039138233
    },
    "contextual_requirement": {
      "Full": 0.3628015637327161,
      "Partial": 0.4408575939143573
    },
    "difficulty": {
      "Easy": 0.6103509331261675,
      "Moderate": 0.2739929015343184,
      "Hard": 0.33264890818640663,
      "Extreme": 0.2868424835064885
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.6190525866570429,
      "T2. Sequencing & Structure Reconstruction": 0.5863848211097236,
      "T3. Evidence-Grounded QA": 0.4,
      "T4. Summarization & Synthesis": 0.48619725203488445,
      "T5. Attribution & Citation Alignment": 0.290019754922306,
      "T6. Aggregation & Clustering": 0.4354357416367219,
      "T7. Consistency & Compliance Checking": 0.21250441110110105,
      "T8. Structured & Numeric Reasoning": 0.3388888888888889,
      "T9. Version & Code Diff Analysis": 0.4174297354939513,
      "T10. Rule Induction & In-Context Learning": 0.42666666666666675,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.25833333333333336
    },
    "language": {
      "Chinese": 0.3417787747514596,
      "English": 0.45251365927381704
    }
  },
  "pass@2": 0.19133333333333333,
  "BoN-3": {
    "overall_metric": 0.43806378317359107,
    "token_length": {
      "8k": 0.5718190971548373,
      "16k": 0.5932030702540269,
      "32k": 0.5401634596906353,
      "64k": 0.45712267932316264,
      "128k": 0.2296143794444798,
      "256k": 0.23646001317440798
    },
    "contextual_requirement": {
      "Full": 0.39971488113838277,
      "Partial": 0.48687147667294856
    },
    "difficulty": {
      "Easy": 0.6591788734633496,
      "Moderate": 0.3103483415629278,
      "Hard": 0.38512512860737047,
      "Extreme": 0.3144991474359921
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.6739172902122323,
      "T2. Sequencing & Structure Reconstruction": 0.6553062801836215,
      "T3. Evidence-Grounded QA": 0.475,
      "T4. Summarization & Synthesis": 0.5017174053624821,
      "T5. Attribution & Citation Alignment": 0.31632987998243095,
      "T6. Aggregation & Clustering": 0.4726401785278596,
      "T7. Consistency & Compliance Checking": 0.24365040952524628,
      "T8. Structured & Numeric Reasoning": 0.36944444444444446,
      "T9. Version & Code Diff Analysis": 0.44797944073835727,
      "T10. Rule Induction & In-Context Learning": 0.46027777777777784,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665
    },
    "language": {
      "Chinese": 0.386238080047615,
      "English": 0.48988948629956824
    }
  },
  "pass@3": 0.21933333333333332
}
results/Llama-3.1-8B-Instruct/nonthinking_context-120000_bon-3_summary.json
ADDED
|
@@ -0,0 +1,164 @@
| 1 |
+
{
|
| 2 |
+
"date": "2025-12-08",
|
| 3 |
+
"total_questions_num": 1500,
|
| 4 |
+
"inference_iterations": 3,
|
| 5 |
+
"total_samples_num": 4500,
|
| 6 |
+
"fail_samples_num": 0,
|
| 7 |
+
"inference_inconsistent_samples_num": 0,
|
| 8 |
+
"average_overall_metric": 0.21094590782574696,
|
| 9 |
+
"inference_iteration_1_overall_metric": 0.20814425242445228,
|
| 10 |
+
"inference_iteration_2_overall_metric": 0.213015185500322,
|
| 11 |
+
"inference_iteration_3_overall_metric": 0.21167828555246626,
|
| 12 |
+
"average_token_length_metric": {
|
| 13 |
+
"8k": 0.24549737122739362,
|
| 14 |
+
"16k": 0.2608710428868677,
|
| 15 |
+
"32k": 0.2249354240045269,
|
| 16 |
+
"64k": 0.18691854981764278,
|
| 17 |
+
"128k": 0.18010527298228765,
|
| 18 |
+
"256k": 0.16734778603576234
|
| 19 |
+
},
|
| 20 |
+
"average_contextual_requirement_metric": {
|
| 21 |
+
"Full": 0.1813007104244737,
|
| 22 |
+
"Partial": 0.24867615906373072
|
| 23 |
+
},
|
| 24 |
+
"average_difficulty_metric": {
|
| 25 |
+
"Easy": 0.254652823450579,
|
| 26 |
+
"Moderate": 0.13823151671179162,
|
| 27 |
+
"Hard": 0.21215047305696197,
|
| 28 |
+
"Extreme": 0.21003592225516218
|
| 29 |
+
},
|
| 30 |
+
"average_primary_task_metric": {
|
| 31 |
+
"T1. Retrieval & Ranking": 0.4667231034949644,
|
| 32 |
+
"T2. Sequencing & Structure Reconstruction": 0.4213756913607651,
|
| 33 |
+
"T3. Evidence-Grounded QA": 0.15000000000000002,
|
| 34 |
+
"T4. Summarization & Synthesis": 0.47465443881044483,
|
| 35 |
+
"T5. Attribution & Citation Alignment": 0.08095709533952888,
|
| 36 |
+
"T6. Aggregation & Clustering": 0.1895252817222955,
|
| 37 |
+
"T7. Consistency & Compliance Checking": 0.12098051997071714,
|
| 38 |
+
"T8. Structured & Numeric Reasoning": 0.04969135802469136,
|
| 39 |
+
"T9. Version & Code Diff Analysis": 0.21578074220253873,
|
| 40 |
+
"T10. Rule Induction & In-Context Learning": 0.16759259259259265,
|
| 41 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.11944444444444445
|
| 42 |
+
},
|
| 43 |
+
"average_language_metric": {
|
| 44 |
+
"Chinese": 0.17905397547001678,
|
| 45 |
+
"English": 0.24283784018147686
|
| 46 |
+
},
|
| 47 |
+
"BoN-1": {
|
| 48 |
+
"overall_metric": 0.20814425242445228,
|
| 49 |
+
"token_length": {
|
| 50 |
+
"8k": 0.22656456269661684,
|
| 51 |
+
"16k": 0.25006742264480875,
|
| 52 |
+
"32k": 0.22983820975916858,
|
| 53 |
+
"64k": 0.17867648708372652,
|
| 54 |
+
"128k": 0.1893868580863639,
|
| 55 |
+
"256k": 0.17433197427602828
|
| 56 |
+
},
|
| 57 |
+
"contextual_requirement": {
|
| 58 |
+
"Full": 0.1770456924597648,
|
| 59 |
+
"Partial": 0.24772423783405423
|
| 60 |
+
},
|
| 61 |
+
"difficulty": {
|
| 62 |
+
"Easy": 0.2566570988180275,
|
| 63 |
+
"Moderate": 0.1324196374719661,
|
| 64 |
+
"Hard": 0.20899042479807298,
|
| 65 |
+
"Extreme": 0.2041821568416995
|
| 66 |
+
},
|
| 67 |
+
"primary_task": {
|
| 68 |
+
"T1. Retrieval & Ranking": 0.4630702505342932,
|
| 69 |
+
"T2. Sequencing & Structure Reconstruction": 0.4120432796938637,
|
| 70 |
+
"T3. Evidence-Grounded QA": 0.13333333333333333,
|
| 71 |
+
"T4. Summarization & Synthesis": 0.47400084179308405,
|
| 72 |
+
"T5. Attribution & Citation Alignment": 0.0690637373143065,
|
| 73 |
+
"T6. Aggregation & Clustering": 0.19812149190741143,
|
| 74 |
+
"T7. Consistency & Compliance Checking": 0.11267079352137302,
|
| 75 |
+
"T8. Structured & Numeric Reasoning": 0.04490740740740741,
|
| 76 |
+
"T9. Version & Code Diff Analysis": 0.19979772893803915,
|
| 77 |
+
"T10. Rule Induction & In-Context Learning": 0.18361111111111109,
|
| 78 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.13333333333333333
|
| 79 |
+
},
|
| 80 |
+
"language": {
|
| 81 |
+
"Chinese": 0.17869850347586846,
|
| 82 |
+
"English": 0.2375900013730359
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"pass@1": 0.052,
|
| 86 |
+
"BoN-2": {
|
| 87 |
+
"overall_metric": 0.24847856382430364,
|
| 88 |
+
"token_length": {
|
| 89 |
+
"8k": 0.28288621820265775,
|
| 90 |
+
"16k": 0.30147254873111595,
|
| 91 |
+
"32k": 0.2728297990630907,
|
| 92 |
+
"64k": 0.21066076283394117,
|
| 93 |
+
"128k": 0.21820825855623124,
|
| 94 |
+
"256k": 0.2048137955587849
|
| 95 |
+
},
|
| 96 |
+
"contextual_requirement": {
|
| 97 |
+
"Full": 0.2142227788917067,
|
| 98 |
+
"Partial": 0.29207683555669967
|
| 99 |
+
},
|
| 100 |
+
"difficulty": {
|
| 101 |
+
"Easy": 0.3015279708931293,
|
| 102 |
+
"Moderate": 0.1575465527579005,
|
| 103 |
+
"Hard": 0.2601383702201079,
|
| 104 |
+
"Extreme": 0.24243277505755167
|
| 105 |
+
},
|
| 106 |
+
"primary_task": {
|
| 107 |
+
"T1. Retrieval & Ranking": 0.5278328421453239,
|
| 108 |
+
"T2. Sequencing & Structure Reconstruction": 0.4858897020853544,
|
| 109 |
+
"T3. Evidence-Grounded QA": 0.18333333333333332,
|
| 110 |
+
"T4. Summarization & Synthesis": 0.4923937781072418,
|
| 111 |
+
"T5. Attribution & Citation Alignment": 0.10321693015669395,
|
| 112 |
+
"T6. Aggregation & Clustering": 0.24202529361366723,
|
| 113 |
+
"T7. Consistency & Compliance Checking": 0.1496372306528247,
|
| 114 |
+
"T8. Structured & Numeric Reasoning": 0.06435185185185185,
|
| 115 |
+
"T9. Version & Code Diff Analysis": 0.2640161200205543,
|
| 116 |
+
"T10. Rule Induction & In-Context Learning": 0.22361111111111112,
|
| 117 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666
|
| 118 |
+
},
|
| 119 |
+
"language": {
|
| 120 |
+
"Chinese": 0.21521719416329613,
|
| 121 |
+
"English": 0.2817399334853111
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"pass@2": 0.068,
|
| 125 |
+
"BoN-3": {
|
| 126 |
+
"overall_metric": 0.27478587280873845,
|
| 127 |
+
"token_length": {
|
| 128 |
+
"8k": 0.32764512992755657,
|
| 129 |
+
"16k": 0.3226696445637056,
|
| 130 |
+
"32k": 0.29882237152612906,
|
| 131 |
+
"64k": 0.24198009489674097,
|
| 132 |
+
"128k": 0.23991327950411545,
|
| 133 |
+
"256k": 0.21768471643418316
|
| 134 |
+
},
|
| 135 |
+
"contextual_requirement": {
|
| 136 |
+
"Full": 0.2383436635644805,
|
| 137 |
+
"Partial": 0.32116686639233955
|
| 138 |
+
},
|
| 139 |
+
"difficulty": {
|
| 140 |
+
"Easy": 0.33550759893050514,
|
| 141 |
+
"Moderate": 0.18287651202405897,
|
| 142 |
+
"Hard": 0.28514599318022166,
|
| 143 |
+
"Extreme": 0.26183100573765283
|
| 144 |
+
},
|
| 145 |
+
"primary_task": {
|
| 146 |
+
"T1. Retrieval & Ranking": 0.5538072480868509,
|
| 147 |
+
"T2. Sequencing & Structure Reconstruction": 0.5290554353054353,
|
| 148 |
+
"T3. Evidence-Grounded QA": 0.2,
|
| 149 |
+
"T4. Summarization & Synthesis": 0.5032056223860172,
|
| 150 |
+
"T5. Attribution & Citation Alignment": 0.1333833560986498,
|
| 151 |
+
"T6. Aggregation & Clustering": 0.2737365591929987,
|
| 152 |
+
"T7. Consistency & Compliance Checking": 0.17502387399050007,
|
| 153 |
+
"T8. Structured & Numeric Reasoning": 0.08750000000000001,
|
| 154 |
+
"T9. Version & Code Diff Analysis": 0.3068144317903629,
|
| 155 |
+
"T10. Rule Induction & In-Context Learning": 0.2625,
|
| 156 |
+
"T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666
|
| 157 |
+
},
|
| 158 |
+
"language": {
|
| 159 |
+
"Chinese": 0.23247692481372026,
|
| 160 |
+
"English": 0.3170948208037568
|
| 161 |
+
}
|
| 162 |
+
},
|
| 163 |
+
"pass@3": 0.078
|
| 164 |
+
}
|
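Each of the *_summary.json files added in this commit follows the same schema: run metadata, an average_overall_metric with the three per-iteration metrics it averages, breakdowns by token length, contextual requirement, difficulty, primary task and language, and BoN-k / pass@k fields. A minimal sketch of reading one of them, assuming Python 3 and only the standard-library json module (the path below is simply one of the files listed in this commit):

import json
from pathlib import Path

# Illustrative path; every *_summary.json added here has the same layout.
path = Path("results/Llama-3.1-8B-Instruct/thinking_context-120000_bon-3_summary.json")
summary = json.loads(path.read_text())

print(summary["average_overall_metric"])   # headline score, averaged over the 3 inference iterations
print(summary["BoN-3"]["token_length"])    # best-of-3 score broken down by context length (8k ... 256k)
print(summary["pass@3"])                   # pass@3 over the three inference iterations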
results/Llama-3.1-8B-Instruct/thinking_context-120000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 0,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.20055372622856252,
  "inference_iteration_1_overall_metric": 0.20930536348826667,
  "inference_iteration_2_overall_metric": 0.19814963328771615,
  "inference_iteration_3_overall_metric": 0.19420618190970523,
  "average_token_length_metric": {
    "8k": 0.25813666494087695,
    "16k": 0.2584728735658432,
    "32k": 0.22849707778354275,
    "64k": 0.18730191383793596,
    "128k": 0.12903377597359986,
    "256k": 0.14188005126957745
  },
  "average_contextual_requirement_metric": {
    "Full": 0.1871659292157323,
    "Partial": 0.21759274060852876
  },
  "average_difficulty_metric": {
    "Easy": 0.2628031197661334,
    "Moderate": 0.12316795604778738,
    "Hard": 0.17987117557385604,
    "Extreme": 0.19677540131116525
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.4057709287198289,
    "T2. Sequencing & Structure Reconstruction": 0.34219177348181007,
    "T3. Evidence-Grounded QA": 0.12222222222222225,
    "T4. Summarization & Synthesis": 0.4554008546339488,
    "T5. Attribution & Citation Alignment": 0.09150573083223558,
    "T6. Aggregation & Clustering": 0.2205869310535933,
    "T7. Consistency & Compliance Checking": 0.10384155329285091,
    "T8. Structured & Numeric Reasoning": 0.10123456790123456,
    "T9. Version & Code Diff Analysis": 0.17017808218806194,
    "T10. Rule Induction & In-Context Learning": 0.1756018518518519,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.10555555555555557
  },
  "average_language_metric": {
    "Chinese": 0.14712355199329824,
    "English": 0.2539839004638273
  },
  "BoN-1": {
    "overall_metric": 0.20930536348826667,
    "token_length": {
      "8k": 0.2793229214860049,
      "16k": 0.271683549702249,
      "32k": 0.23348902534053415,
      "64k": 0.19344554869301367,
      "128k": 0.12028061678065591,
      "256k": 0.15761051892714206
    },
    "contextual_requirement": {
      "Full": 0.18602878507502096,
      "Partial": 0.2389300996505795
    },
    "difficulty": {
      "Easy": 0.28424670669624624,
      "Moderate": 0.1253791912220988,
      "Hard": 0.17541923812572474,
      "Extreme": 0.20459178755292767
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.4385523704155171,
      "T2. Sequencing & Structure Reconstruction": 0.33931459481923876,
      "T3. Evidence-Grounded QA": 0.11666666666666667,
      "T4. Summarization & Synthesis": 0.45953789724020144,
      "T5. Attribution & Citation Alignment": 0.0722496761788766,
      "T6. Aggregation & Clustering": 0.21044140806468803,
      "T7. Consistency & Compliance Checking": 0.1283895768466609,
      "T8. Structured & Numeric Reasoning": 0.1125,
      "T9. Version & Code Diff Analysis": 0.2038326942491424,
      "T10. Rule Induction & In-Context Learning": 0.19249999999999998,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.11666666666666667
    },
    "language": {
      "Chinese": 0.15245924739914177,
      "English": 0.26615147957739155
    }
  },
  "pass@1": 0.06,
  "BoN-2": {
    "overall_metric": 0.2668017232451993,
    "token_length": {
      "8k": 0.3461725655993255,
      "16k": 0.34802165706779187,
      "32k": 0.31037609052110204,
      "64k": 0.2389649239073623,
      "128k": 0.1748998439051164,
      "256k": 0.18237525847049635
    },
    "contextual_requirement": {
      "Full": 0.2499435538255567,
      "Partial": 0.28825757523383516
    },
    "difficulty": {
      "Easy": 0.3588173402056394,
      "Moderate": 0.16793760263800153,
      "Hard": 0.24236535579298768,
      "Extreme": 0.246988799777556
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.5197176048343898,
      "T2. Sequencing & Structure Reconstruction": 0.48746135503643234,
      "T3. Evidence-Grounded QA": 0.16666666666666666,
      "T4. Summarization & Synthesis": 0.4794299302901568,
      "T5. Attribution & Citation Alignment": 0.15065800813584843,
      "T6. Aggregation & Clustering": 0.3052547184784026,
      "T7. Consistency & Compliance Checking": 0.15302354691124467,
      "T8. Structured & Numeric Reasoning": 0.14444444444444443,
      "T9. Version & Code Diff Analysis": 0.2507539108503574,
      "T10. Rule Induction & In-Context Learning": 0.23458333333333334,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666
    },
    "language": {
      "Chinese": 0.19465748810477185,
      "English": 0.33894595838562674
    }
  },
  "pass@2": 0.082,
  "BoN-3": {
    "overall_metric": 0.29957877493017654,
    "token_length": {
      "8k": 0.3888483252966101,
      "16k": 0.39060272417227027,
      "32k": 0.34430797611932984,
      "64k": 0.2620211212858359,
      "128k": 0.19544431047747687,
      "256k": 0.21624819222953548
    },
    "contextual_requirement": {
      "Full": 0.2794795135612338,
      "Partial": 0.3251596530361036
    },
    "difficulty": {
      "Easy": 0.4075586546242195,
      "Moderate": 0.19423764623204026,
      "Hard": 0.27460228265341846,
      "Extreme": 0.26688670776930307
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.5783680297471054,
      "T2. Sequencing & Structure Reconstruction": 0.5321006964107361,
      "T3. Evidence-Grounded QA": 0.20833333333333334,
      "T4. Summarization & Synthesis": 0.4907425557642246,
      "T5. Attribution & Citation Alignment": 0.1724297780962502,
      "T6. Aggregation & Clustering": 0.3362168666306595,
      "T7. Consistency & Compliance Checking": 0.16743722857143514,
      "T8. Structured & Numeric Reasoning": 0.1921296296296296,
      "T9. Version & Code Diff Analysis": 0.2933902615835246,
      "T10. Rule Induction & In-Context Learning": 0.2673611111111111,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.15833333333333333
    },
    "language": {
      "Chinese": 0.22893100795326488,
      "English": 0.37022654190708804
    }
  },
  "pass@3": 0.09866666666666667
}
results/Llama-3.2-3B-Instruct/nonthinking_context-120000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 0,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.15708345836639478,
  "inference_iteration_1_overall_metric": 0.1522499746285183,
  "inference_iteration_2_overall_metric": 0.16292071303675215,
  "inference_iteration_3_overall_metric": 0.15607968743391326,
  "average_token_length_metric": {
    "8k": 0.19074751544937305,
    "16k": 0.18311448739692587,
    "32k": 0.15849934191444578,
    "64k": 0.13711682904337855,
    "128k": 0.13633492271322553,
    "256k": 0.13668765368101904
  },
  "average_contextual_requirement_metric": {
    "Full": 0.15215903834853456,
    "Partial": 0.16335090202548921
  },
  "average_difficulty_metric": {
    "Easy": 0.18487754128195516,
    "Moderate": 0.10373568516116806,
    "Hard": 0.15009434290092064,
    "Extreme": 0.16626666941305895
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.3024747670482544,
    "T2. Sequencing & Structure Reconstruction": 0.30977286822552064,
    "T3. Evidence-Grounded QA": 0.11388888888888889,
    "T4. Summarization & Synthesis": 0.4341408375280332,
    "T5. Attribution & Citation Alignment": 0.0344779091920434,
    "T6. Aggregation & Clustering": 0.15408667402382106,
    "T7. Consistency & Compliance Checking": 0.07795789080091817,
    "T8. Structured & Numeric Reasoning": 0.04398148148148148,
    "T9. Version & Code Diff Analysis": 0.12905444479341674,
    "T10. Rule Induction & In-Context Learning": 0.1479166666666667,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.07777777777777778
  },
  "average_language_metric": {
    "Chinese": 0.10513615313151659,
    "English": 0.20903076360127287
  },
  "BoN-1": {
    "overall_metric": 0.1522499746285183,
    "token_length": {
      "8k": 0.18381886404580258,
      "16k": 0.1763640779318065,
      "32k": 0.1530403977981516,
      "64k": 0.1336449793468121,
      "128k": 0.13665833409911293,
      "256k": 0.1299731945494239
    },
    "contextual_requirement": {
      "Full": 0.14616662224309246,
      "Partial": 0.15999242311906023
    },
    "difficulty": {
      "Easy": 0.17269103520136797,
      "Moderate": 0.10010586621394478,
      "Hard": 0.1522959770717054,
      "Extreme": 0.16407670515037534
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.31616711715222934,
      "T2. Sequencing & Structure Reconstruction": 0.30997657033436127,
      "T3. Evidence-Grounded QA": 0.075,
      "T4. Summarization & Synthesis": 0.433290104470351,
      "T5. Attribution & Citation Alignment": 0.040719448989792455,
      "T6. Aggregation & Clustering": 0.1420105345821374,
      "T7. Consistency & Compliance Checking": 0.0818116476937141,
      "T8. Structured & Numeric Reasoning": 0.04583333333333333,
      "T9. Version & Code Diff Analysis": 0.12251594627374464,
      "T10. Rule Induction & In-Context Learning": 0.12597222222222224,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.075
    },
    "language": {
      "Chinese": 0.10313580635298143,
      "English": 0.2013641429040551
    }
  },
  "pass@1": 0.03333333333333333,
  "BoN-2": {
    "overall_metric": 0.1921634438744948,
    "token_length": {
      "8k": 0.21983364549684287,
      "16k": 0.21955667723795985,
      "32k": 0.18995358273392637,
      "64k": 0.16240795470998307,
      "128k": 0.17591239432067907,
      "256k": 0.18531640874757638
    },
    "contextual_requirement": {
      "Full": 0.183991826715688,
      "Partial": 0.20256368389479382
    },
    "difficulty": {
      "Easy": 0.23050124787481288,
      "Moderate": 0.13024657792029828,
      "Hard": 0.18670800635922485,
      "Extreme": 0.19441747608931817
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.3586662331587295,
      "T2. Sequencing & Structure Reconstruction": 0.3847515389630534,
      "T3. Evidence-Grounded QA": 0.16666666666666666,
      "T4. Summarization & Synthesis": 0.4623366120534231,
      "T5. Attribution & Citation Alignment": 0.04912393153232278,
      "T6. Aggregation & Clustering": 0.19169925535926752,
      "T7. Consistency & Compliance Checking": 0.09784140674649551,
      "T8. Structured & Numeric Reasoning": 0.06805555555555555,
      "T9. Version & Code Diff Analysis": 0.17896485067612064,
      "T10. Rule Induction & In-Context Learning": 0.17347222222222225,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.09166666666666666
    },
    "language": {
      "Chinese": 0.1340947957448097,
      "English": 0.25023209200417945
    }
  },
  "pass@2": 0.052,
  "BoN-3": {
    "overall_metric": 0.2126068902708674,
    "token_length": {
      "8k": 0.24600389478810633,
      "16k": 0.23558970928733117,
      "32k": 0.21547322231985894,
      "64k": 0.18904442314365716,
      "128k": 0.19236619778234643,
      "256k": 0.19716389430390446
    },
    "contextual_requirement": {
      "Full": 0.20471895219678654,
      "Partial": 0.22264608418333381
    },
    "difficulty": {
      "Easy": 0.2591916862859481,
      "Moderate": 0.1495495342063559,
      "Hard": 0.20304294133612527,
      "Extreme": 0.20927435723794804
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.3790383678432267,
      "T2. Sequencing & Structure Reconstruction": 0.41972832869160254,
      "T3. Evidence-Grounded QA": 0.19166666666666668,
      "T4. Summarization & Synthesis": 0.47495636922941775,
      "T5. Attribution & Citation Alignment": 0.058968622858899346,
      "T6. Aggregation & Clustering": 0.22320851430069694,
      "T7. Consistency & Compliance Checking": 0.10913808800759871,
      "T8. Structured & Numeric Reasoning": 0.08194444444444444,
      "T9. Version & Code Diff Analysis": 0.19665231407803055,
      "T10. Rule Induction & In-Context Learning": 0.21513888888888888,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.1
    },
    "language": {
      "Chinese": 0.1489641004694211,
      "English": 0.27624968007231393
    }
  },
  "pass@3": 0.059333333333333335
}
results/Llama-3.2-3B-Instruct/thinking_context-120000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 0,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.12579532534277046,
  "inference_iteration_1_overall_metric": 0.12189144574406009,
  "inference_iteration_2_overall_metric": 0.1293014670146224,
  "inference_iteration_3_overall_metric": 0.12619306326962915,
  "average_token_length_metric": {
    "8k": 0.15520842809468816,
    "16k": 0.1484137730398096,
    "32k": 0.13566213672791996,
    "64k": 0.10339908100452032,
    "128k": 0.11523413736015264,
    "256k": 0.09685439582953186
  },
  "average_contextual_requirement_metric": {
    "Full": 0.11497129858976732,
    "Partial": 0.13957135939204723
  },
  "average_difficulty_metric": {
    "Easy": 0.14352977294197386,
    "Moderate": 0.07165328990150331,
    "Hard": 0.10476288088788434,
    "Extreme": 0.15574400931361643
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.18027953901797078,
    "T2. Sequencing & Structure Reconstruction": 0.2032463362205395,
    "T3. Evidence-Grounded QA": 0.11944444444444445,
    "T4. Summarization & Synthesis": 0.42455853122133613,
    "T5. Attribution & Citation Alignment": 0.03812352880578544,
    "T6. Aggregation & Clustering": 0.12499067187507265,
    "T7. Consistency & Compliance Checking": 0.06067661860338902,
    "T8. Structured & Numeric Reasoning": 0.029629629629629627,
    "T9. Version & Code Diff Analysis": 0.09370491802352847,
    "T10. Rule Induction & In-Context Learning": 0.09847222222222225,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.09166666666666666
  },
  "average_language_metric": {
    "Chinese": 0.08710157372538106,
    "English": 0.16448907696016007
  },
  "BoN-1": {
    "overall_metric": 0.12189144574406009,
    "token_length": {
      "8k": 0.14926024363329887,
      "16k": 0.14140607600695068,
      "32k": 0.12814754734467546,
      "64k": 0.09111044280283606,
      "128k": 0.1221228193452808,
      "256k": 0.09930154533131832
    },
    "contextual_requirement": {
      "Full": 0.11202506506940242,
      "Partial": 0.13444865751180604
    },
    "difficulty": {
      "Easy": 0.14039643942766825,
      "Moderate": 0.056507474186827354,
      "Hard": 0.10349493747731114,
      "Extreme": 0.15664860872958616
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.16389165825733903,
      "T2. Sequencing & Structure Reconstruction": 0.1787449973585681,
      "T3. Evidence-Grounded QA": 0.09166666666666666,
      "T4. Summarization & Synthesis": 0.42251209889763974,
      "T5. Attribution & Citation Alignment": 0.029073247426826414,
      "T6. Aggregation & Clustering": 0.13197902480468204,
      "T7. Consistency & Compliance Checking": 0.06122238755841022,
      "T8. Structured & Numeric Reasoning": 0.044444444444444446,
      "T9. Version & Code Diff Analysis": 0.08781339576018317,
      "T10. Rule Induction & In-Context Learning": 0.10180555555555555,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.09166666666666666
    },
    "language": {
      "Chinese": 0.07947872192042414,
      "English": 0.16430416956769595
    }
  },
  "pass@1": 0.03333333333333333,
  "BoN-2": {
    "overall_metric": 0.16673174742075972,
    "token_length": {
      "8k": 0.21235596929835865,
      "16k": 0.19139324741892721,
      "32k": 0.17847356237746767,
      "64k": 0.1323473405869264,
      "128k": 0.15654182190882684,
      "256k": 0.12927854293405058
    },
    "contextual_requirement": {
      "Full": 0.15641485621192497,
      "Partial": 0.1798623362320038
    },
    "difficulty": {
      "Easy": 0.20556729410789223,
      "Moderate": 0.09843476269293053,
      "Hard": 0.13991984386404396,
      "Extreme": 0.1866584183549311
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.23716632921131517,
      "T2. Sequencing & Structure Reconstruction": 0.283700678823595,
      "T3. Evidence-Grounded QA": 0.18333333333333332,
      "T4. Summarization & Synthesis": 0.44817949561782316,
      "T5. Attribution & Citation Alignment": 0.07044117047262258,
      "T6. Aggregation & Clustering": 0.1774625349060259,
      "T7. Consistency & Compliance Checking": 0.0801161795021524,
      "T8. Structured & Numeric Reasoning": 0.05,
      "T9. Version & Code Diff Analysis": 0.14648554146631595,
      "T10. Rule Induction & In-Context Learning": 0.14513888888888887,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.10833333333333334
    },
    "language": {
      "Chinese": 0.11086445603583714,
      "English": 0.22259903880568205
    }
  },
  "pass@2": 0.048,
  "BoN-3": {
    "overall_metric": 0.19563257554771482,
    "token_length": {
      "8k": 0.2530151029327433,
      "16k": 0.2145335526961687,
      "32k": 0.21507949798738746,
      "64k": 0.15928462128906565,
      "128k": 0.18536894288047534,
      "256k": 0.14651373550044794
    },
    "contextual_requirement": {
      "Full": 0.18678441171314086,
      "Partial": 0.20689387497353642
    },
    "difficulty": {
      "Easy": 0.24756743178790308,
      "Moderate": 0.1259095390094924,
      "Hard": 0.1628721973860393,
      "Extreme": 0.20605327132157794
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.3006781841143776,
      "T2. Sequencing & Structure Reconstruction": 0.34291202176503843,
      "T3. Evidence-Grounded QA": 0.20833333333333334,
      "T4. Summarization & Synthesis": 0.46018159312540347,
      "T5. Attribution & Citation Alignment": 0.0815602367002352,
      "T6. Aggregation & Clustering": 0.20248617019109905,
      "T7. Consistency & Compliance Checking": 0.08636604532829192,
      "T8. Structured & Numeric Reasoning": 0.07777777777777778,
      "T9. Version & Code Diff Analysis": 0.17054683536229345,
      "T10. Rule Induction & In-Context Learning": 0.18958333333333333,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.14166666666666666
    },
    "language": {
      "Chinese": 0.13358536850999,
      "English": 0.25767978258543983
    }
  },
  "pass@3": 0.06133333333333333
}
results/Llama-3.3-70B-Instruct/nonthinking_context-120000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 0,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.3189017909858673,
  "inference_iteration_1_overall_metric": 0.3156125985423413,
  "inference_iteration_2_overall_metric": 0.32123074093436993,
  "inference_iteration_3_overall_metric": 0.31986203348089143,
  "average_token_length_metric": {
    "8k": 0.4593043661575621,
    "16k": 0.4357005279195819,
    "32k": 0.4042163423898818,
    "64k": 0.33700892278371447,
    "128k": 0.1425979596199625,
    "256k": 0.13458262704450225
  },
  "average_contextual_requirement_metric": {
    "Full": 0.291318303255093,
    "Partial": 0.35400804809776254
  },
  "average_difficulty_metric": {
    "Easy": 0.440396417724264,
    "Moderate": 0.22606271262921054,
    "Hard": 0.29071041097108385,
    "Extreme": 0.2653049554891388
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.5698993177933688,
    "T2. Sequencing & Structure Reconstruction": 0.5207791047484032,
    "T3. Evidence-Grounded QA": 0.33055555555555555,
    "T4. Summarization & Synthesis": 0.4889794661436827,
    "T5. Attribution & Citation Alignment": 0.24622344831110268,
    "T6. Aggregation & Clustering": 0.31517757482240366,
    "T7. Consistency & Compliance Checking": 0.19554447151545906,
    "T8. Structured & Numeric Reasoning": 0.11126543209876544,
    "T9. Version & Code Diff Analysis": 0.3449561289681397,
    "T10. Rule Induction & In-Context Learning": 0.3185648148148148,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.23333333333333334
  },
  "average_language_metric": {
    "Chinese": 0.28656733073968105,
    "English": 0.35123625123205404
  },
  "BoN-1": {
    "overall_metric": 0.3156125985423413,
    "token_length": {
      "8k": 0.4598857003901339,
      "16k": 0.43280589089049554,
      "32k": 0.4031552748630893,
      "64k": 0.32858688094254546,
      "128k": 0.13499648051535806,
      "256k": 0.1342453636524253
    },
    "contextual_requirement": {
      "Full": 0.28665010278500924,
      "Partial": 0.3524739567789461
    },
    "difficulty": {
      "Easy": 0.4365832341037067,
      "Moderate": 0.2165303315022285,
      "Hard": 0.2928446211124342,
      "Extreme": 0.26312822197701774
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.5433299618590155,
      "T2. Sequencing & Structure Reconstruction": 0.526070226070226,
      "T3. Evidence-Grounded QA": 0.325,
      "T4. Summarization & Synthesis": 0.49127697260080533,
      "T5. Attribution & Citation Alignment": 0.24102740431192388,
      "T6. Aggregation & Clustering": 0.31258403804363555,
      "T7. Consistency & Compliance Checking": 0.20132235403207502,
      "T8. Structured & Numeric Reasoning": 0.11481481481481483,
      "T9. Version & Code Diff Analysis": 0.34245443993484,
      "T10. Rule Induction & In-Context Learning": 0.30791666666666667,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.225
    },
    "language": {
      "Chinese": 0.28075822912737913,
      "English": 0.3504669679573037
    }
  },
  "pass@1": 0.11266666666666666,
  "BoN-2": {
    "overall_metric": 0.34649569529069124,
    "token_length": {
      "8k": 0.47647116314268717,
      "16k": 0.4687523820747641,
      "32k": 0.4305852114495849,
      "64k": 0.3618707734054544,
      "128k": 0.17753523030724935,
      "256k": 0.16375941136440833
    },
    "contextual_requirement": {
      "Full": 0.3168861362630559,
      "Partial": 0.3841805885985913
    },
    "difficulty": {
      "Easy": 0.4708835305116446,
      "Moderate": 0.2536880874829281,
      "Hard": 0.31598267575873745,
      "Extreme": 0.29123370602859766
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.613795052914502,
      "T2. Sequencing & Structure Reconstruction": 0.572449494949495,
      "T3. Evidence-Grounded QA": 0.35833333333333334,
      "T4. Summarization & Synthesis": 0.5065243852070945,
      "T5. Attribution & Citation Alignment": 0.26010872017700837,
      "T6. Aggregation & Clustering": 0.3543572676296878,
      "T7. Consistency & Compliance Checking": 0.2180957885923682,
      "T8. Structured & Numeric Reasoning": 0.1287037037037037,
      "T9. Version & Code Diff Analysis": 0.3871389535524576,
      "T10. Rule Induction & In-Context Learning": 0.3311111111111111,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.25
    },
    "language": {
      "Chinese": 0.3170227577667401,
      "English": 0.3759686328146428
    }
  },
  "pass@2": 0.126,
  "BoN-3": {
    "overall_metric": 0.358435386899099,
    "token_length": {
      "8k": 0.48710556132647576,
      "16k": 0.4795308481526823,
      "32k": 0.4449722178774269,
      "64k": 0.37149023172449164,
      "128k": 0.19457817433016786,
      "256k": 0.17293528798335014
    },
    "contextual_requirement": {
      "Full": 0.33005566769629513,
      "Partial": 0.39455502952084975
    },
    "difficulty": {
      "Easy": 0.48736321739782795,
      "Moderate": 0.2624427701465103,
      "Hard": 0.3272030727162918,
      "Extreme": 0.30076445676260405
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.6366814621432787,
      "T2. Sequencing & Structure Reconstruction": 0.5821049783549782,
      "T3. Evidence-Grounded QA": 0.36666666666666664,
      "T4. Summarization & Synthesis": 0.5152102198399457,
      "T5. Attribution & Citation Alignment": 0.27428748176856066,
      "T6. Aggregation & Clustering": 0.36977067074988085,
      "T7. Consistency & Compliance Checking": 0.22440889882489728,
      "T8. Structured & Numeric Reasoning": 0.1300925925925926,
      "T9. Version & Code Diff Analysis": 0.39630550643647333,
      "T10. Rule Induction & In-Context Learning": 0.37277777777777776,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.25
    },
    "language": {
      "Chinese": 0.33234090111179226,
      "English": 0.3845298726864061
    }
  },
  "pass@3": 0.13466666666666666
}
results/Llama-3.3-70B-Instruct/thinking_context-120000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 0,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.3368788983987977,
  "inference_iteration_1_overall_metric": 0.3346445205602255,
  "inference_iteration_2_overall_metric": 0.34105124981338825,
  "inference_iteration_3_overall_metric": 0.3349409248227798,
  "average_token_length_metric": {
    "8k": 0.48257436937624887,
    "16k": 0.4570891611420083,
    "32k": 0.43164967032208246,
    "64k": 0.37974005621997625,
    "128k": 0.14494662029982153,
    "256k": 0.1252735130326494
  },
  "average_contextual_requirement_metric": {
    "Full": 0.3148605467957358,
    "Partial": 0.3649022549845135
  },
  "average_difficulty_metric": {
    "Easy": 0.5193942390521589,
    "Moderate": 0.22606077851307846,
    "Hard": 0.2859477654450278,
    "Extreme": 0.2431814890253722
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.5716856431399633,
    "T2. Sequencing & Structure Reconstruction": 0.5180708023900146,
    "T3. Evidence-Grounded QA": 0.3222222222222222,
    "T4. Summarization & Synthesis": 0.4675755720039648,
    "T5. Attribution & Citation Alignment": 0.2472281333613823,
    "T6. Aggregation & Clustering": 0.36222233277682875,
    "T7. Consistency & Compliance Checking": 0.16955304749357702,
    "T8. Structured & Numeric Reasoning": 0.2820987654320987,
    "T9. Version & Code Diff Analysis": 0.34705004572107623,
    "T10. Rule Induction & In-Context Learning": 0.31356481481481485,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.2027777777777778
  },
  "average_language_metric": {
    "Chinese": 0.28230462501190684,
    "English": 0.39145317178568884
  },
  "BoN-1": {
    "overall_metric": 0.3346445205602255,
    "token_length": {
      "8k": 0.4542953722813034,
      "16k": 0.44790102117035924,
      "32k": 0.4589206338089406,
      "64k": 0.3903508423882444,
      "128k": 0.1390327640597696,
      "256k": 0.11736648965273745
    },
    "contextual_requirement": {
      "Full": 0.3229561312874356,
      "Partial": 0.34952065236195895
    },
    "difficulty": {
      "Easy": 0.5259881421298087,
      "Moderate": 0.21937379810223762,
      "Hard": 0.272924266331999,
      "Extreme": 0.24128717207335554
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.5894507286480519,
      "T2. Sequencing & Structure Reconstruction": 0.49666058774955946,
      "T3. Evidence-Grounded QA": 0.31666666666666665,
      "T4. Summarization & Synthesis": 0.4602747124060712,
      "T5. Attribution & Citation Alignment": 0.2717574105274334,
      "T6. Aggregation & Clustering": 0.34752011368589375,
      "T7. Consistency & Compliance Checking": 0.15669004787219404,
      "T8. Structured & Numeric Reasoning": 0.3101851851851852,
      "T9. Version & Code Diff Analysis": 0.3494311586679074,
      "T10. Rule Induction & In-Context Learning": 0.2688888888888889,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.20833333333333334
    },
    "language": {
      "Chinese": 0.28273488892882487,
      "English": 0.3865541521916267
    }
  },
  "pass@1": 0.14066666666666666,
  "BoN-2": {
    "overall_metric": 0.3958348563743987,
    "token_length": {
      "8k": 0.5375237165063974,
      "16k": 0.5279796876480686,
      "32k": 0.5128361604334021,
      "64k": 0.44897938061236586,
      "128k": 0.18481857856663617,
      "256k": 0.16287161447952125
    },
    "contextual_requirement": {
      "Full": 0.37597657573399246,
      "Partial": 0.4211090317349159
    },
    "difficulty": {
      "Easy": 0.6024719774255488,
      "Moderate": 0.2783382985783699,
      "Hard": 0.3365596754926361,
      "Extreme": 0.28558017485446086
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.6560921957788185,
      "T2. Sequencing & Structure Reconstruction": 0.590698368130755,
      "T3. Evidence-Grounded QA": 0.38333333333333336,
      "T4. Summarization & Synthesis": 0.4863977172263582,
      "T5. Attribution & Citation Alignment": 0.29159793863898964,
      "T6. Aggregation & Clustering": 0.424325611415665,
      "T7. Consistency & Compliance Checking": 0.20595551459420022,
      "T8. Structured & Numeric Reasoning": 0.3527777777777778,
      "T9. Version & Code Diff Analysis": 0.3998111292235962,
      "T10. Rule Induction & In-Context Learning": 0.42375,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.24166666666666667
    },
    "language": {
      "Chinese": 0.3369511703916512,
      "English": 0.4547185423571466
    }
  },
  "pass@2": 0.18533333333333332,
  "BoN-3": {
    "overall_metric": 0.4231932102533526,
    "token_length": {
      "8k": 0.5825111026223746,
      "16k": 0.5470777230462098,
      "32k": 0.5441468137547846,
      "64k": 0.476981081390983,
      "128k": 0.19851254776601635,
      "256k": 0.189929992939747
    },
    "contextual_requirement": {
      "Full": 0.4008399164475989,
      "Partial": 0.4516428569152216
    },
    "difficulty": {
      "Easy": 0.6383633989735824,
      "Moderate": 0.29622937081220263,
      "Hard": 0.3631324161592812,
      "Extreme": 0.31032522872728074
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.6823167917941978,
      "T2. Sequencing & Structure Reconstruction": 0.6268177888335086,
      "T3. Evidence-Grounded QA": 0.38333333333333336,
      "T4. Summarization & Synthesis": 0.49909323423974894,
      "T5. Attribution & Citation Alignment": 0.30308485751721276,
      "T6. Aggregation & Clustering": 0.4702913178424183,
      "T7. Consistency & Compliance Checking": 0.22495072081956943,
      "T8. Structured & Numeric Reasoning": 0.38055555555555554,
      "T9. Version & Code Diff Analysis": 0.4307393977892574,
      "T10. Rule Induction & In-Context Learning": 0.47583333333333333,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.275
    },
    "language": {
      "Chinese": 0.36349427675262524,
      "English": 0.48289214375408057
    }
  },
  "pass@3": 0.20866666666666667
}
results/Magistral-Small-2509/thinking_context-120000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 0,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.38398116897357343,
  "inference_iteration_1_overall_metric": 0.3775125226933459,
  "inference_iteration_2_overall_metric": 0.3891788972494985,
  "inference_iteration_3_overall_metric": 0.38525208697787594,
  "average_token_length_metric": {
    "8k": 0.5449912961150511,
    "16k": 0.47444243155128724,
    "32k": 0.4264163670766384,
    "64k": 0.3029575907045873,
    "128k": 0.302750216644183,
    "256k": 0.2523291117496938
  },
  "average_contextual_requirement_metric": {
    "Full": 0.3475865533347475,
    "Partial": 0.4303015888775337
  },
  "average_difficulty_metric": {
    "Easy": 0.5425168562606535,
    "Moderate": 0.2943739375717643,
    "Hard": 0.3291735952793175,
    "Extreme": 0.30516679942900543
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.6519835369865774,
    "T2. Sequencing & Structure Reconstruction": 0.6259667241346109,
    "T3. Evidence-Grounded QA": 0.438888888888889,
    "T4. Summarization & Synthesis": 0.5181402784690045,
    "T5. Attribution & Citation Alignment": 0.2964646210831104,
    "T6. Aggregation & Clustering": 0.3430731095683128,
    "T7. Consistency & Compliance Checking": 0.1940507194662843,
    "T8. Structured & Numeric Reasoning": 0.23070987654320987,
    "T9. Version & Code Diff Analysis": 0.40948667090743257,
    "T10. Rule Induction & In-Context Learning": 0.4654166666666667,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.2416666666666666
  },
  "average_language_metric": {
    "Chinese": 0.3639911930284364,
    "English": 0.40397114491871056
  },
  "BoN-1": {
    "overall_metric": 0.3775125226933459,
    "token_length": {
      "8k": 0.5435818039060978,
      "16k": 0.4750140116790471,
      "32k": 0.40129268121677775,
      "64k": 0.28670320465634697,
      "128k": 0.3074867892397562,
      "256k": 0.25099664546205136
    },
    "contextual_requirement": {
      "Full": 0.33866755109330426,
      "Partial": 0.42695157745703605
    },
    "difficulty": {
      "Easy": 0.5415570936201182,
      "Moderate": 0.2821803430686346,
      "Hard": 0.32058466511584804,
      "Extreme": 0.29781631261319746
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.6532513121925126,
      "T2. Sequencing & Structure Reconstruction": 0.6111094803277807,
      "T3. Evidence-Grounded QA": 0.425,
      "T4. Summarization & Synthesis": 0.5176378460686947,
      "T5. Attribution & Citation Alignment": 0.2913492653600891,
      "T6. Aggregation & Clustering": 0.33400917389507784,
      "T7. Consistency & Compliance Checking": 0.1887200677483396,
      "T8. Structured & Numeric Reasoning": 0.22129629629629627,
      "T9. Version & Code Diff Analysis": 0.41785365614151276,
      "T10. Rule Induction & In-Context Learning": 0.445,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.24166666666666667
    },
    "language": {
      "Chinese": 0.34935070830064097,
      "English": 0.4056743370860514
    }
  },
  "pass@1": 0.15666666666666668,
  "BoN-2": {
    "overall_metric": 0.4410818050309274,
    "token_length": {
      "8k": 0.6043547908072043,
      "16k": 0.5353872646463821,
      "32k": 0.50503501278225,
      "64k": 0.35415051008140713,
      "128k": 0.3590897028976117,
      "256k": 0.28847354897071054
    },
    "contextual_requirement": {
      "Full": 0.4052447124894285,
      "Partial": 0.48669265008374507
    },
    "difficulty": {
      "Easy": 0.6224120794085322,
      "Moderate": 0.35760055550726977,
      "Hard": 0.3739866973836827,
      "Extreme": 0.3413440208772583
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.7183527826830406,
      "T2. Sequencing & Structure Reconstruction": 0.6865933482781307,
      "T3. Evidence-Grounded QA": 0.5333333333333333,
      "T4. Summarization & Synthesis": 0.5317396047677334,
      "T5. Attribution & Citation Alignment": 0.34712734561624925,
      "T6. Aggregation & Clustering": 0.4084056110628732,
      "T7. Consistency & Compliance Checking": 0.2396263809618489,
      "T8. Structured & Numeric Reasoning": 0.28148148148148144,
      "T9. Version & Code Diff Analysis": 0.4526614935043571,
      "T10. Rule Induction & In-Context Learning": 0.5494444444444445,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.3
    },
    "language": {
      "Chinese": 0.4165337844683723,
      "English": 0.4656298255934835
    }
  },
  "pass@2": 0.202,
  "BoN-3": {
    "overall_metric": 0.47447373045805247,
    "token_length": {
      "8k": 0.6472581172783606,
      "16k": 0.572013026736313,
      "32k": 0.5323134358536711,
      "64k": 0.38848449233451177,
      "128k": 0.38802278615487296,
      "256k": 0.31875052439058804
    },
    "contextual_requirement": {
      "Full": 0.43884732486807027,
      "Partial": 0.5198164284816681
    },
    "difficulty": {
      "Easy": 0.6584366917924486,
      "Moderate": 0.38635877266139323,
      "Hard": 0.40375289537189885,
      "Extreme": 0.3772769049579495
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.7364126543991428,
      "T2. Sequencing & Structure Reconstruction": 0.718547042731825,
      "T3. Evidence-Grounded QA": 0.5666666666666667,
      "T4. Summarization & Synthesis": 0.5424760798885467,
      "T5. Attribution & Citation Alignment": 0.3887278739210803,
      "T6. Aggregation & Clustering": 0.45972240993615016,
      "T7. Consistency & Compliance Checking": 0.258469847158553,
      "T8. Structured & Numeric Reasoning": 0.32592592592592595,
      "T9. Version & Code Diff Analysis": 0.48746959414301055,
      "T10. Rule Induction & In-Context Learning": 0.5911111111111111,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.3333333333333333
    },
    "language": {
      "Chinese": 0.45726250895870557,
      "English": 0.4916849519574011
    }
  },
  "pass@3": 0.22866666666666666
}
results/MiniMax-M2/thinking_context-1000000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 0,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.5320685707653132,
  "inference_iteration_1_overall_metric": 0.535180398833494,
  "inference_iteration_2_overall_metric": 0.5311849506804371,
  "inference_iteration_3_overall_metric": 0.5298403627820072,
  "average_token_length_metric": {
    "8k": 0.654795970947119,
    "16k": 0.5832041701523042,
    "32k": 0.5830505446766833,
    "64k": 0.5201561955794758,
    "128k": 0.5060838591020447,
    "256k": 0.3451206841342513
  },
  "average_contextual_requirement_metric": {
    "Full": 0.4938467607068266,
    "Partial": 0.5807145108397509
  },
  "average_difficulty_metric": {
    "Easy": 0.7219874781362817,
    "Moderate": 0.599199335465557,
    "Hard": 0.4257653962693645,
    "Extreme": 0.34975019139747615
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.767571983047427,
    "T2. Sequencing & Structure Reconstruction": 0.7186696477094124,
    "T3. Evidence-Grounded QA": 0.4972222222222222,
    "T4. Summarization & Synthesis": 0.4696599254603241,
    "T5. Attribution & Citation Alignment": 0.54344042963745,
    "T6. Aggregation & Clustering": 0.5123089198769455,
    "T7. Consistency & Compliance Checking": 0.31381086481875964,
    "T8. Structured & Numeric Reasoning": 0.6038580246913581,
    "T9. Version & Code Diff Analysis": 0.5619188050015754,
    "T10. Rule Induction & In-Context Learning": 0.5529629629629632,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.39444444444444454
  },
  "average_language_metric": {
    "Chinese": 0.5354797348772966,
    "English": 0.5286574066533296
  },
  "BoN-1": {
    "overall_metric": 0.535180398833494,
    "token_length": {
      "8k": 0.6571274187960493,
      "16k": 0.5855452098864022,
      "32k": 0.6094638772285274,
      "64k": 0.5094373867375244,
      "128k": 0.5028727484199556,
      "256k": 0.34663575193250185
    },
    "contextual_requirement": {
      "Full": 0.4983496461876354,
      "Partial": 0.5820559022009494
    },
    "difficulty": {
      "Easy": 0.7348029222520711,
      "Moderate": 0.6076522249303262,
      "Hard": 0.4165082385065274,
      "Extreme": 0.3468482177079349
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.7483964432006224,
      "T2. Sequencing & Structure Reconstruction": 0.6994320017261199,
      "T3. Evidence-Grounded QA": 0.5,
      "T4. Summarization & Synthesis": 0.46659438196842223,
      "T5. Attribution & Citation Alignment": 0.5466093432829364,
      "T6. Aggregation & Clustering": 0.5244645023077399,
      "T7. Consistency & Compliance Checking": 0.32026132009110975,
      "T8. Structured & Numeric Reasoning": 0.6097222222222223,
      "T9. Version & Code Diff Analysis": 0.5581618594200692,
      "T10. Rule Induction & In-Context Learning": 0.5638888888888888,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.425
    },
    "language": {
      "Chinese": 0.5363273796225,
      "English": 0.5340334180444871
    }
  },
  "pass@1": 0.30133333333333334,
  "BoN-2": {
    "overall_metric": 0.631079137318095,
    "token_length": {
      "8k": 0.7607802022833232,
      "16k": 0.6922523863155777,
      "32k": 0.696226877019834,
      "64k": 0.6061494664574102,
      "128k": 0.5803070617163435,
      "256k": 0.45075883011608214
    },
    "contextual_requirement": {
      "Full": 0.5863061857618213,
      "Partial": 0.6880628938442622
    },
    "difficulty": {
      "Easy": 0.8429412379045521,
      "Moderate": 0.7260325873797594,
      "Hard": 0.5037591992235261,
      "Extreme": 0.42025273404272534
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.854967268320159,
      "T2. Sequencing & Structure Reconstruction": 0.7950413527388575,
      "T3. Evidence-Grounded QA": 0.6333333333333333,
      "T4. Summarization & Synthesis": 0.48790929716965253,
      "T5. Attribution & Citation Alignment": 0.6628232709674524,
      "T6. Aggregation & Clustering": 0.6070531962911038,
      "T7. Consistency & Compliance Checking": 0.4273910542891768,
      "T8. Structured & Numeric Reasoning": 0.6976851851851852,
      "T9. Version & Code Diff Analysis": 0.6589983180763119,
      "T10. Rule Induction & In-Context Learning": 0.663888888888889,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333333
    },
    "language": {
      "Chinese": 0.6243713115466586,
      "English": 0.6377869630895318
    }
  },
  "pass@2": 0.38133333333333336,
  "BoN-3": {
    "overall_metric": 0.6838483190204042,
    "token_length": {
      "8k": 0.8056053063071357,
      "16k": 0.754121676530954,
      "32k": 0.7309373525434467,
      "64k": 0.6802921620132278,
      "128k": 0.6428076963616377,
      "256k": 0.48932572036602695
    },
    "contextual_requirement": {
      "Full": 0.6418563038987366,
      "Partial": 0.7372927019025284
    },
    "difficulty": {
      "Easy": 0.8932565923719491,
      "Moderate": 0.7958991372015439,
      "Hard": 0.5571346872299623,
      "Extreme": 0.46408187669686923
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.8856810957992655,
      "T2. Sequencing & Structure Reconstruction": 0.8257039309014353,
      "T3. Evidence-Grounded QA": 0.7083333333333334,
      "T4. Summarization & Synthesis": 0.5020042367803014,
      "T5. Attribution & Citation Alignment": 0.729994986816228,
      "T6. Aggregation & Clustering": 0.6651081533166491,
      "T7. Consistency & Compliance Checking": 0.49051401515979876,
      "T8. Structured & Numeric Reasoning": 0.7680555555555556,
      "T9. Version & Code Diff Analysis": 0.7036475958542688,
      "T10. Rule Induction & In-Context Learning": 0.7072222222222223,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.6
+
},
|
| 158 |
+
"language": {
|
| 159 |
+
"Chinese": 0.6727473750313149,
|
| 160 |
+
"English": 0.694949263009495
|
| 161 |
+
}
|
| 162 |
+
},
|
| 163 |
+
"pass@3": 0.444
|
| 164 |
+
}
|
results/MiniMax-Text-01/nonthinking_context-1000000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.4113523778378889,
    "inference_iteration_1_overall_metric": 0.4026546189679395,
    "inference_iteration_2_overall_metric": 0.41422198000018023,
    "inference_iteration_3_overall_metric": 0.41718053454554826,
    "average_token_length_metric": {
        "8k": 0.45750122785552744,
        "16k": 0.40648581074103435,
        "32k": 0.41953181726499883,
        "64k": 0.3963813527019971,
        "128k": 0.41323756281622565,
        "256k": 0.3749764956475515
    },
    "average_contextual_requirement_metric": {
        "Full": 0.37732447212646125,
        "Partial": 0.45466062147061553
    },
    "average_difficulty_metric": {
        "Easy": 0.5125950929989945,
        "Moderate": 0.38228847113922254,
        "Hard": 0.3867421547849868,
        "Extreme": 0.33569972963459577
    },
    "average_primary_task_metric": {
        "T1. Retrieval & Ranking": 0.7019344870456294,
        "T2. Sequencing & Structure Reconstruction": 0.6935557265385518,
        "T3. Evidence-Grounded QA": 0.5000000000000001,
        "T4. Summarization & Synthesis": 0.525289467915154,
        "T5. Attribution & Citation Alignment": 0.40960389859884994,
        "T6. Aggregation & Clustering": 0.3855189408594916,
        "T7. Consistency & Compliance Checking": 0.2570183735053335,
        "T8. Structured & Numeric Reasoning": 0.16126543209876543,
        "T9. Version & Code Diff Analysis": 0.3763262824393013,
        "T10. Rule Induction & In-Context Learning": 0.3850462962962962,
        "T11. Dialogue Memory & Long-Horizon Tracking": 0.34444444444444444
    },
    "average_language_metric": {
        "Chinese": 0.4206201869029405,
        "English": 0.40208456877283766
    },
    "BoN-1": {
        "overall_metric": 0.4026546189679395,
        "token_length": {
            "8k": 0.4484815946744958,
            "16k": 0.40023341947584756,
            "32k": 0.39365195091822286,
            "64k": 0.4050265329266902,
            "128k": 0.40626760527764794,
            "256k": 0.3622666105347326
        },
        "contextual_requirement": {
            "Full": 0.3618050193216267,
            "Partial": 0.45464501851779227
        },
        "difficulty": {
            "Easy": 0.49189201078601713,
            "Moderate": 0.3847984308236515,
            "Hard": 0.39113612973801154,
            "Extreme": 0.32419293466074633
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.7071317552591362,
            "T2. Sequencing & Structure Reconstruction": 0.692217342415818,
            "T3. Evidence-Grounded QA": 0.475,
            "T4. Summarization & Synthesis": 0.5252872492452957,
            "T5. Attribution & Citation Alignment": 0.3965042482839467,
            "T6. Aggregation & Clustering": 0.38900319686695384,
            "T7. Consistency & Compliance Checking": 0.24881818692821855,
            "T8. Structured & Numeric Reasoning": 0.1462962962962963,
            "T9. Version & Code Diff Analysis": 0.35572673286895423,
            "T10. Rule Induction & In-Context Learning": 0.36347222222222214,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667
        },
        "language": {
            "Chinese": 0.4101250015901318,
            "English": 0.3951842363457473
        }
    },
    "pass@1": 0.15533333333333332,
    "BoN-2": {
        "overall_metric": 0.4807038949944852,
        "token_length": {
            "8k": 0.5303281501019884,
            "16k": 0.4819497908714715,
            "32k": 0.47954691765928337,
            "64k": 0.48083012165065453,
            "128k": 0.465293133114307,
            "256k": 0.44627525656921
        },
        "contextual_requirement": {
            "Full": 0.44656912126099607,
            "Partial": 0.5241481524734732
        },
        "difficulty": {
            "Easy": 0.596920990471646,
            "Moderate": 0.4603818463137054,
            "Hard": 0.4590470067460482,
            "Extreme": 0.3809658785230146
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.7449388025193358,
            "T2. Sequencing & Structure Reconstruction": 0.7399444536944532,
            "T3. Evidence-Grounded QA": 0.6166666666666667,
            "T4. Summarization & Synthesis": 0.54417036696111,
            "T5. Attribution & Citation Alignment": 0.5088222013004289,
            "T6. Aggregation & Clustering": 0.4705462063266301,
            "T7. Consistency & Compliance Checking": 0.3211976903039678,
            "T8. Structured & Numeric Reasoning": 0.2083333333333333,
            "T9. Version & Code Diff Analysis": 0.4528903513431796,
            "T10. Rule Induction & In-Context Learning": 0.45958333333333334,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666665
        },
        "language": {
            "Chinese": 0.48759226495133245,
            "English": 0.47381552503763963
        }
    },
    "pass@2": 0.206,
    "BoN-3": {
        "overall_metric": 0.5286875532565248,
        "token_length": {
            "8k": 0.5760868208214227,
            "16k": 0.5315447995369911,
            "32k": 0.5297979513353553,
            "64k": 0.5139951126923608,
            "128k": 0.5285145377275431,
            "256k": 0.49218609742548064
        },
        "contextual_requirement": {
            "Full": 0.4922563052647642,
            "Partial": 0.5750545961551319
        },
        "difficulty": {
            "Easy": 0.6571166886876132,
            "Moderate": 0.5008690293131257,
            "Hard": 0.5070400734318661,
            "Extreme": 0.42048944373649116
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.7851733453547192,
            "T2. Sequencing & Structure Reconstruction": 0.7758457283457282,
            "T3. Evidence-Grounded QA": 0.7083333333333334,
            "T4. Summarization & Synthesis": 0.5534698517064113,
            "T5. Attribution & Citation Alignment": 0.5639720868179612,
            "T6. Aggregation & Clustering": 0.503918026189678,
            "T7. Consistency & Compliance Checking": 0.34945026972752397,
            "T8. Structured & Numeric Reasoning": 0.25277777777777777,
            "T9. Version & Code Diff Analysis": 0.5104976262726122,
            "T10. Rule Induction & In-Context Learning": 0.5270833333333333,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.525
        },
        "language": {
            "Chinese": 0.5330279709661675,
            "English": 0.5243471355468845
        }
    },
    "pass@3": 0.24266666666666667
}
results/MiniMax-Text-01/thinking_context-1000000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.4499528005964066,
    "inference_iteration_1_overall_metric": 0.4519835462001885,
    "inference_iteration_2_overall_metric": 0.4481755772504262,
    "inference_iteration_3_overall_metric": 0.4496992783386054,
    "average_token_length_metric": {
        "8k": 0.485225729559654,
        "16k": 0.4524723240855649,
        "32k": 0.46920448352940436,
        "64k": 0.44046374240515457,
        "128k": 0.4133092627171987,
        "256k": 0.43904126128146514
    },
    "average_contextual_requirement_metric": {
        "Full": 0.4116545212336913,
        "Partial": 0.49869606523986354
    },
    "average_difficulty_metric": {
        "Easy": 0.6191934548978654,
        "Moderate": 0.4082147550465631,
        "Hard": 0.3801988071084879,
        "Extreme": 0.33778735493415807
    },
    "average_primary_task_metric": {
        "T1. Retrieval & Ranking": 0.6830330861399296,
        "T2. Sequencing & Structure Reconstruction": 0.6403219944448011,
        "T3. Evidence-Grounded QA": 0.4833333333333333,
        "T4. Summarization & Synthesis": 0.5086176566073063,
        "T5. Attribution & Citation Alignment": 0.416914270509611,
        "T6. Aggregation & Clustering": 0.4334853794839026,
        "T7. Consistency & Compliance Checking": 0.27119391146489646,
        "T8. Structured & Numeric Reasoning": 0.38966049382716056,
        "T9. Version & Code Diff Analysis": 0.4348929522191275,
        "T10. Rule Induction & In-Context Learning": 0.41300925925925924,
        "T11. Dialogue Memory & Long-Horizon Tracking": 0.4027777777777778
    },
    "average_language_metric": {
        "Chinese": 0.45819903421860664,
        "English": 0.4417065669742075
    },
    "BoN-1": {
        "overall_metric": 0.4519835462001885,
        "token_length": {
            "8k": 0.4879779124929164,
            "16k": 0.4554840853531918,
            "32k": 0.4648286187996774,
            "64k": 0.42985632449506034,
            "128k": 0.4307020670264534,
            "256k": 0.443052269033835
        },
        "contextual_requirement": {
            "Full": 0.41228070711895354,
            "Partial": 0.5025144323035801
        },
        "difficulty": {
            "Easy": 0.6285595261886431,
            "Moderate": 0.4057015689049336,
            "Hard": 0.37791019658117175,
            "Extreme": 0.33760415329971205
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.6904671153390595,
            "T2. Sequencing & Structure Reconstruction": 0.6319390331890332,
            "T3. Evidence-Grounded QA": 0.44166666666666665,
            "T4. Summarization & Synthesis": 0.5079368349605524,
            "T5. Attribution & Citation Alignment": 0.3963567606333699,
            "T6. Aggregation & Clustering": 0.4315669444489273,
            "T7. Consistency & Compliance Checking": 0.26717481095169254,
            "T8. Structured & Numeric Reasoning": 0.40648148148148144,
            "T9. Version & Code Diff Analysis": 0.4533152836127507,
            "T10. Rule Induction & In-Context Learning": 0.4119444444444444,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.4583333333333333
        },
        "language": {
            "Chinese": 0.47043321599568005,
            "English": 0.43353387640469826
        }
    },
    "pass@1": 0.21,
    "BoN-2": {
        "overall_metric": 0.5523435379453717,
        "token_length": {
            "8k": 0.6041368153338821,
            "16k": 0.553143416205592,
            "32k": 0.5547357356840433,
            "64k": 0.5474714891955119,
            "128k": 0.5080001305944092,
            "256k": 0.5465736406587951
        },
        "contextual_requirement": {
            "Full": 0.5094377873914124,
            "Partial": 0.6069508568322307
        },
        "difficulty": {
            "Easy": 0.7582302423860908,
            "Moderate": 0.5069058318579235,
            "Hard": 0.47636905813527697,
            "Extreme": 0.40654974290892704
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.7753422134076285,
            "T2. Sequencing & Structure Reconstruction": 0.7188864376364378,
            "T3. Evidence-Grounded QA": 0.6083333333333333,
            "T4. Summarization & Synthesis": 0.5276281423571613,
            "T5. Attribution & Citation Alignment": 0.573374443874177,
            "T6. Aggregation & Clustering": 0.5278895685136558,
            "T7. Consistency & Compliance Checking": 0.35338346649949204,
            "T8. Structured & Numeric Reasoning": 0.5027777777777779,
            "T9. Version & Code Diff Analysis": 0.552570101188694,
            "T10. Rule Induction & In-Context Learning": 0.5220833333333333,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.55
        },
        "language": {
            "Chinese": 0.5649919855932303,
            "English": 0.5396950902975145
        }
    },
    "pass@2": 0.2753333333333333,
    "BoN-3": {
        "overall_metric": 0.5997056103547938,
        "token_length": {
            "8k": 0.6457585156659336,
            "16k": 0.6123141997231359,
            "32k": 0.6242961953070552,
            "64k": 0.5876928890236057,
            "128k": 0.5497742714361217,
            "256k": 0.5783975909729129
        },
        "contextual_requirement": {
            "Full": 0.5540426758661396,
            "Partial": 0.6578220724312633
        },
        "difficulty": {
            "Easy": 0.8056166900447767,
            "Moderate": 0.5614066990728871,
            "Hard": 0.5178805893116146,
            "Extreme": 0.45303896497156343
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.8083334868935526,
            "T2. Sequencing & Structure Reconstruction": 0.7593085155585156,
            "T3. Evidence-Grounded QA": 0.6833333333333333,
            "T4. Summarization & Synthesis": 0.5344316475303361,
            "T5. Attribution & Citation Alignment": 0.6383957562170883,
            "T6. Aggregation & Clustering": 0.5743997782942697,
            "T7. Consistency & Compliance Checking": 0.39861351698347697,
            "T8. Structured & Numeric Reasoning": 0.5527777777777778,
            "T9. Version & Code Diff Analysis": 0.5853585580965909,
            "T10. Rule Induction & In-Context Learning": 0.5984722222222222,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.6
        },
        "language": {
            "Chinese": 0.6144058958264887,
            "English": 0.5850053248830991
        }
    },
    "pass@3": 0.31933333333333336
}
results/Ministral-3-14B-Instruct-2512/nonthinking_context-224000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.40137741846169367,
    "inference_iteration_1_overall_metric": 0.40238223426918757,
    "inference_iteration_2_overall_metric": 0.4002952301959775,
    "inference_iteration_3_overall_metric": 0.4014547909199151,
    "average_token_length_metric": {
        "8k": 0.43375517745982745,
        "16k": 0.4624624276537502,
        "32k": 0.4225675952668474,
        "64k": 0.39702430034744873,
        "128k": 0.3821065286583368,
        "256k": 0.3103484813839513
    },
    "average_contextual_requirement_metric": {
        "Full": 0.3639638275660222,
        "Partial": 0.4489947159652748
    },
    "average_difficulty_metric": {
        "Easy": 0.5260224045368944,
        "Moderate": 0.3401707697729334,
        "Hard": 0.35036961025265867,
        "Extreme": 0.33853899745082133
    },
    "average_primary_task_metric": {
        "T1. Retrieval & Ranking": 0.7189334347722878,
        "T2. Sequencing & Structure Reconstruction": 0.6668259209925876,
        "T3. Evidence-Grounded QA": 0.4583333333333333,
        "T4. Summarization & Synthesis": 0.5221962563770883,
        "T5. Attribution & Citation Alignment": 0.3136369167899536,
        "T6. Aggregation & Clustering": 0.39798286937745314,
        "T7. Consistency & Compliance Checking": 0.21946000799150195,
        "T8. Structured & Numeric Reasoning": 0.14151234567901233,
        "T9. Version & Code Diff Analysis": 0.47709977467470854,
        "T10. Rule Induction & In-Context Learning": 0.4217592592592592,
        "T11. Dialogue Memory & Long-Horizon Tracking": 0.30000000000000004
    },
    "average_language_metric": {
        "Chinese": 0.40560935669062276,
        "English": 0.397145480232764
    },
    "BoN-1": {
        "overall_metric": 0.40238223426918757,
        "token_length": {
            "8k": 0.433884589163934,
            "16k": 0.46516909415802493,
            "32k": 0.42676245820656594,
            "64k": 0.3945954502711027,
            "128k": 0.38087905781855047,
            "256k": 0.31300275599695
        },
        "contextual_requirement": {
            "Full": 0.36618501990567515,
            "Partial": 0.44845141618638706
        },
        "difficulty": {
            "Easy": 0.5313007708587812,
            "Moderate": 0.33905574344991224,
            "Hard": 0.35279460774167126,
            "Extreme": 0.33532188262609514
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.7231148921645373,
            "T2. Sequencing & Structure Reconstruction": 0.6661998186998181,
            "T3. Evidence-Grounded QA": 0.4666666666666667,
            "T4. Summarization & Synthesis": 0.5230622050479962,
            "T5. Attribution & Citation Alignment": 0.30176586689960616,
            "T6. Aggregation & Clustering": 0.3921100544588917,
            "T7. Consistency & Compliance Checking": 0.219946553696774,
            "T8. Structured & Numeric Reasoning": 0.14675925925925926,
            "T9. Version & Code Diff Analysis": 0.4747724555416158,
            "T10. Rule Induction & In-Context Learning": 0.4193055555555556,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665
        },
        "language": {
            "Chinese": 0.40159850236109407,
            "English": 0.4031659661772829
        }
    },
    "pass@1": 0.158,
    "BoN-2": {
        "overall_metric": 0.4167918691419028,
        "token_length": {
            "8k": 0.4448864222174602,
            "16k": 0.481483520242527,
            "32k": 0.43929332248789354,
            "64k": 0.4125546400739453,
            "128k": 0.39499054247940135,
            "256k": 0.3275427673501904
        },
        "contextual_requirement": {
            "Full": 0.37697160991132156,
            "Partial": 0.4674721990717337
        },
        "difficulty": {
            "Easy": 0.543547780022054,
            "Moderate": 0.3573487107807427,
            "Hard": 0.3645996932831712,
            "Extreme": 0.3512606476539851
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.7338149078373205,
            "T2. Sequencing & Structure Reconstruction": 0.6898572261072259,
            "T3. Evidence-Grounded QA": 0.475,
            "T4. Summarization & Synthesis": 0.5324649457390374,
            "T5. Attribution & Citation Alignment": 0.3305062690008576,
            "T6. Aggregation & Clustering": 0.41269190069190054,
            "T7. Consistency & Compliance Checking": 0.24063695524418569,
            "T8. Structured & Numeric Reasoning": 0.14953703703703705,
            "T9. Version & Code Diff Analysis": 0.4968728427963255,
            "T10. Rule Induction & In-Context Learning": 0.43041666666666667,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.31666666666666665
        },
        "language": {
            "Chinese": 0.42183857405387454,
            "English": 0.4117451642299312
        }
    },
    "pass@2": 0.164,
    "BoN-3": {
        "overall_metric": 0.42795282912937055,
        "token_length": {
            "8k": 0.45584907332800734,
            "16k": 0.4930808335792047,
            "32k": 0.45565921785915897,
            "64k": 0.41883977659018873,
            "128k": 0.4041598036532658,
            "256k": 0.34012826976639843
        },
        "contextual_requirement": {
            "Full": 0.3885912574254956,
            "Partial": 0.47804937493430194
        },
        "difficulty": {
            "Easy": 0.554784793655902,
            "Moderate": 0.37124888788597155,
            "Hard": 0.37318051863597795,
            "Extreme": 0.36223380606151817
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.7412124373820532,
            "T2. Sequencing & Structure Reconstruction": 0.6988453213453213,
            "T3. Evidence-Grounded QA": 0.48333333333333334,
            "T4. Summarization & Synthesis": 0.538030953852775,
            "T5. Attribution & Citation Alignment": 0.34888079906957237,
            "T6. Aggregation & Clustering": 0.4339707150850415,
            "T7. Consistency & Compliance Checking": 0.24529018187024293,
            "T8. Structured & Numeric Reasoning": 0.15046296296296297,
            "T9. Version & Code Diff Analysis": 0.5141050625900392,
            "T10. Rule Induction & In-Context Learning": 0.44708333333333333,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.3333333333333333
        },
        "language": {
            "Chinese": 0.43352032726356865,
            "English": 0.42238533099517206
        }
    },
    "pass@3": 0.16933333333333334
}
results/Ministral-3-14B-Instruct-2512/thinking_context-224000_bon-3_summary.json
ADDED
@@ -0,0 +1,164 @@
{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.45799174186842606,
    "inference_iteration_1_overall_metric": 0.45848259675544495,
    "inference_iteration_2_overall_metric": 0.46152550901732137,
    "inference_iteration_3_overall_metric": 0.453967119832514,
    "average_token_length_metric": {
        "8k": 0.5187827600069922,
        "16k": 0.48518599796025474,
        "32k": 0.48745678732020276,
        "64k": 0.4570416898883375,
        "128k": 0.42361656559174016,
        "256k": 0.3758666504430352
    },
    "average_contextual_requirement_metric": {
        "Full": 0.4248803053301027,
        "Partial": 0.500133570189931
    },
    "average_difficulty_metric": {
        "Easy": 0.6756101555622144,
        "Moderate": 0.39349242818800056,
        "Hard": 0.37479147914388156,
        "Extreme": 0.31661242864258926
    },
    "average_primary_task_metric": {
        "T1. Retrieval & Ranking": 0.7608265794743194,
        "T2. Sequencing & Structure Reconstruction": 0.6976467766457445,
        "T3. Evidence-Grounded QA": 0.4138888888888888,
        "T4. Summarization & Synthesis": 0.49943611989270814,
        "T5. Attribution & Citation Alignment": 0.3460922902510361,
        "T6. Aggregation & Clustering": 0.45809776319010986,
        "T7. Consistency & Compliance Checking": 0.2280021697407482,
        "T8. Structured & Numeric Reasoning": 0.41836419753086435,
        "T9. Version & Code Diff Analysis": 0.5354951076952419,
        "T10. Rule Induction & In-Context Learning": 0.462037037037037,
        "T11. Dialogue Memory & Long-Horizon Tracking": 0.35277777777777775
    },
    "average_language_metric": {
        "Chinese": 0.43851283791556633,
        "English": 0.4774706458212879
    },
    "BoN-1": {
        "overall_metric": 0.45848259675544495,
        "token_length": {
            "8k": 0.538436412123529,
            "16k": 0.47816429220213824,
            "32k": 0.4805712657434,
            "64k": 0.44911476875649875,
            "128k": 0.4390480988508506,
            "256k": 0.3655607428562557
        },
        "contextual_requirement": {
            "Full": 0.42758275308934174,
            "Partial": 0.49780967051230435
        },
        "difficulty": {
            "Easy": 0.6788797898422243,
            "Moderate": 0.39582478863697335,
            "Hard": 0.37581391782555706,
            "Extreme": 0.3125005687762375
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.7602730263789274,
            "T2. Sequencing & Structure Reconstruction": 0.7038347568610727,
            "T3. Evidence-Grounded QA": 0.4583333333333333,
            "T4. Summarization & Synthesis": 0.4990357866977353,
            "T5. Attribution & Citation Alignment": 0.35405443033443207,
            "T6. Aggregation & Clustering": 0.43771712805012347,
            "T7. Consistency & Compliance Checking": 0.21275716732831734,
            "T8. Structured & Numeric Reasoning": 0.42546296296296293,
            "T9. Version & Code Diff Analysis": 0.5013174605476809,
            "T10. Rule Induction & In-Context Learning": 0.48194444444444445,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.35833333333333334
        },
        "language": {
            "Chinese": 0.43269825076497226,
            "English": 0.48426694274591864
        }
    },
    "pass@1": 0.22733333333333333,
    "BoN-2": {
        "overall_metric": 0.5227530650530186,
        "token_length": {
            "8k": 0.5815903903274882,
            "16k": 0.5495634888346752,
            "32k": 0.5446963722215565,
            "64k": 0.5299811967182008,
            "128k": 0.4888075896458958,
            "256k": 0.4418793525702981
        },
        "contextual_requirement": {
            "Full": 0.489360650067773,
            "Partial": 0.565252502306969
        },
        "difficulty": {
            "Easy": 0.7604456302850787,
            "Moderate": 0.463797787486306,
            "Hard": 0.43728803588921916,
            "Extreme": 0.35722954733316814
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.8036702370810763,
            "T2. Sequencing & Structure Reconstruction": 0.7488194240400123,
            "T3. Evidence-Grounded QA": 0.5083333333333333,
            "T4. Summarization & Synthesis": 0.5120697680827194,
            "T5. Attribution & Citation Alignment": 0.4029499793922618,
            "T6. Aggregation & Clustering": 0.5304624812643679,
            "T7. Consistency & Compliance Checking": 0.27354479208047294,
            "T8. Structured & Numeric Reasoning": 0.4962962962962963,
            "T9. Version & Code Diff Analysis": 0.6272818834382945,
            "T10. Rule Induction & In-Context Learning": 0.5725,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.4083333333333333
        },
        "language": {
            "Chinese": 0.5068736065410564,
            "English": 0.5386325235649823
        }
    },
    "pass@2": 0.2833333333333333,
    "BoN-3": {
        "overall_metric": 0.5542260073321058,
        "token_length": {
            "8k": 0.6154421507239413,
            "16k": 0.5842317006106588,
            "32k": 0.5699342902198496,
            "64k": 0.5576866952391514,
            "128k": 0.5184072720538686,
            "256k": 0.4796539351451706
        },
        "contextual_requirement": {
            "Full": 0.5204393104129562,
            "Partial": 0.5972272579564804
        },
        "difficulty": {
            "Easy": 0.7905996585472251,
            "Moderate": 0.5012307381116559,
            "Hard": 0.46914010908714937,
            "Extreme": 0.3859836380407792
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.8214474184096282,
            "T2. Sequencing & Structure Reconstruction": 0.7666511293717175,
            "T3. Evidence-Grounded QA": 0.525,
            "T4. Summarization & Synthesis": 0.5197606048087388,
            "T5. Attribution & Citation Alignment": 0.4423027060091871,
            "T6. Aggregation & Clustering": 0.5772430200792206,
            "T7. Consistency & Compliance Checking": 0.3040682257365206,
            "T8. Structured & Numeric Reasoning": 0.5402777777777777,
            "T9. Version & Code Diff Analysis": 0.661946364328448,
            "T10. Rule Induction & In-Context Learning": 0.5916666666666668,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.4666666666666667
        },
        "language": {
            "Chinese": 0.5366930860176997,
            "English": 0.5717589286465138
        }
    },
    "pass@3": 0.31133333333333335
}
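Note: every *_summary.json file added in this commit appears to share the same 164-line schema (overall metrics per inference iteration, plus BoN-1/2/3 and pass@k breakdowns by token length, contextual requirement, difficulty, primary task, and language). The snippet below is only an illustrative sketch of how such a file could be read and sanity-checked; it is not taken from app.py, and the load_summary helper and the chosen file path are assumptions for the example.

import json
from pathlib import Path
from statistics import mean

def load_summary(path: Path) -> dict:
    """Read one *_summary.json results file (illustrative helper, not part of app.py)."""
    with path.open(encoding="utf-8") as fh:
        return json.load(fh)

# Any of the files added in this commit could be substituted here.
summary = load_summary(Path("results/MiniMax-Text-01/thinking_context-1000000_bon-3_summary.json"))

# For these files, the top-level average matches the mean of the three iteration metrics.
iteration_scores = [summary[f"inference_iteration_{i}_overall_metric"] for i in (1, 2, 3)]
print("average_overall_metric:", summary["average_overall_metric"])
print("mean of iterations:    ", mean(iteration_scores))
print("pass@1:", summary["pass@1"], "| BoN-3 overall:", summary["BoN-3"]["overall_metric"])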