RedHatAI
/

Llama-3.3-70B-Instruct-speculator.eagle3

@@ -40,7 +40,7 @@ This model should be used with the [meta-llama/Llama-3.3-70B-Instruct](https://h
 ```bash
 vllm serve meta-llama/Llama-3.3-70B-Instruct \
-  -tp 2 \
   --speculative-config '{
     "model": "RedHatAI/Llama-3.3-70B-Instruct-speculator.eagle3",
     "num_speculative_tokens": 3,
@@ -50,10 +50,119 @@ vllm serve meta-llama/Llama-3.3-70B-Instruct \
 ## Evaluations
-Subset of GSM8k (math reasoning):
-* acceptance_rate = [80.1, 63.7, 46.4]
-* conditional_acceptance_rate = [80.1, 79.5, 72.9]
-Subset of MTBench:
-* acceptance_rate = [73.3, 53.7, 38.4]
-* conditional_acceptance_rate = [73.3, 73.3, 71.5]

 ```bash
 vllm serve meta-llama/Llama-3.3-70B-Instruct \
+  -tp 4 \
   --speculative-config '{
     "model": "RedHatAI/Llama-3.3-70B-Instruct-speculator.eagle3",
     "num_speculative_tokens": 3,
 ## Evaluations
+<h3>Use cases</h3>
+<table>
+  <thead>
+    <tr>
+      <th>Use Case</th>
+      <th>Dataset</th>
+      <th>Number of Samples</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Coding</td>
+      <td>HumanEval</td>
+      <td>168</td>
+    </tr>
+    <tr>
+      <td>Math Reasoning</td>
+      <td>gsm8k</td>
+      <td>80</td>
+    </tr>
+    <tr>
+      <td>Text Summarization</td>
+      <td>CNN/Daily Mail</td>
+      <td>80</td>
+    </tr>
+  </tbody>
+</table>
+<h3>Acceptance lengths</h3>
+<table>
+  <thead>
+    <tr>
+      <th>Use Case</th>
+      <th>k=1</th>
+      <th>k=2</th>
+      <th>k=3</th>
+      <th>k=4</th>
+      <th>k=5</th>
+      <th>k=6</th>
+      <th>k=7</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Coding</td>
+      <td></td>
+      <td></td>
+      <td></td>
+      <td></td>
+      <td></td>
+      <td></td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>Math Reasoning</td>
+      <td>1.80</td>
+      <td>2.44</td>
+      <td>2.89</td>
+      <td>3.15</td>
+      <td>3.33</td>
+      <td>3.44</td>
+      <td>3.52</td>
+    </tr>
+    <tr>
+      <td>Text Summarization</td>
+      <td>1.72</td>
+      <td>2.21</td>
+      <td>2.53</td>
+      <td>2.74</td>
+      <td>2.86</td>
+      <td>2.93</td>
+      <td>2.98</td>
+    </tr>
+  </tbody>
+</table>
+<h3>Performance benchmarking (4xA100)</h3>
+<div style="display: flex; justify-content: center; gap: 20px;">
+  <figure style="text-align: center;">
+    <img src="assets/Llama-3.3-70B-Instruct-HumanEval.png" alt="Coding" width="100%">
+    <figcaption><b>(a)</b> Acceptance lengths — Coding</figcaption>
+  </figure>
+  <figure style="text-align: center;">
+    <img src="assets/Llama-3.3-70B-Instruct-math_reasoning.png" alt="Math Reasoning" width="100%">
+    <figcaption><b>(b)</b> Acceptance lengths — Math Reasoning</figcaption>
+  </figure>
+  <figure style="text-align: center;">
+    <img src="assets/Llama-3.3-70B-Instruct-summarization.png" alt="Text Summarization" width="100%">
+    <figcaption><b>(c)</b> Acceptance lengths — Text Summarization</figcaption>
+  </figure>
+</div>
+<details> <summary>Details</summary>
+<strong>Configuration</strong>
+- temperature: 0
+- repetitions: 5
+- time per experiment: 4min
+- hardware: 4xA100
+- vLLM version: 0.11.0
+- GuideLLM version: 0.3.0
+<strong>Command</strong>
+```bash
+GUIDELLM__PREFERRED_ROUTE="chat_completions" \
+guidellm benchmark \
+  --target "http://localhost:8000/v1" \
+  --data "RedHatAI/SpeculativeDecoding" \
+  --rate-type sweep \
+  --max-seconds 240 \
+  --output-path "Llama-3.3-70B-Instruct-HumanEval.json" \
+  --backend-args '{"extra_body": {"chat_completions": {"temperature": 0.0}}}'
+```
+</details>