"""HYPERNET N1 - OFFICIAL HUMANEVAL WITH CODE EXECUTION"""

import json
import subprocess
import sys
from datetime import datetime

import requests

HYPERNET_URL = "http://localhost:5000"
AUTH_TOKEN = "cpn-steve-kawa-hypernet-alpha"
LANES = ["lola", "claude", "grok", "deep"]

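# call_lane() posts {"query": ..., "lane": ...} to the Hypernet /api/v1/run endpoint
# with the bearer token above. A successful call returns the endpoint's JSON body,
# which is expected to carry the model's answer under "response_text" (see
# run_benchmark below); anything else is reported as {"error": "failed"}.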
def call_lane(query, lane):
    try:
        r = requests.post(
            f"{HYPERNET_URL}/api/v1/run",
            headers={"Authorization": f"Bearer {AUTH_TOKEN}"},
            json={"query": query, "lane": lane},
            timeout=120,
        )
        if r.status_code == 200:
            return r.json()
    except (requests.RequestException, ValueError):
        # Network errors and malformed JSON both count as a failed call.
        pass
    return {"error": "failed"}

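# Lane responses usually arrive wrapped in a Markdown code fence; strip the
# ```python ... ``` (or bare ```) wrapper so only raw Python source is executed.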
def extract_code(response):
    code = response
    if "```python" in code:
        code = code.split("```python")[1].split("```")[0]
    elif "```" in code:
        code = code.split("```")[1].split("```")[0]
    return code.strip()

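# Standard HumanEval functional-correctness check: concatenate the official prompt,
# the model's completion, the official unit tests, and a final check(entry_point)
# call, then run the result in a fresh interpreter with a 10-second timeout.
# NOTE: model-generated code is executed with no sandboxing beyond the subprocess
# boundary, so run this benchmark in an isolated environment.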
def test_solution(problem, solution):
    code = extract_code(solution)
    test_code = f'''{problem["prompt"]}
{code}

{problem["test"]}
check({problem["entry_point"]})
print("PASS")
'''
    try:
        result = subprocess.run(
            [sys.executable, "-c", test_code],
            capture_output=True, text=True, timeout=10,
        )
        return result.returncode == 0 and "PASS" in result.stdout
    except (subprocess.TimeoutExpired, OSError):
        # Hung or unrunnable solutions count as failures.
        return False

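# Scoring is pass@1 with a single completion per problem per lane: each lane gets
# exactly one attempt, and its pass rate is passes / problems attempted.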
def run_benchmark(problems, limit=10):
    results = {"lanes": {lane: {"pass": 0, "fail": 0} for lane in LANES}, "problems": []}

    print(f"\n{'='*60}")
    print(f"OFFICIAL HUMANEVAL - {limit} PROBLEMS - CODE EXECUTION")
    print(f"{'='*60}\n")

    for i, p in enumerate(problems[:limit]):
        print(f"[{i+1}/{limit}] {p['task_id']}")
        prob_result = {"task_id": p["task_id"], "lanes": {}}

        for lane in LANES:
            prompt = f"Solve this Python function. Return ONLY the implementation, no explanation.\n\n{p['prompt']}"
            resp = call_lane(prompt, lane)

            if resp.get("response_text"):
                passed = test_solution(p, resp["response_text"])
                prob_result["lanes"][lane] = passed
                results["lanes"][lane]["pass" if passed else "fail"] += 1
                print(f" {lane}: {'PASS' if passed else 'FAIL'}")
            else:
                prob_result["lanes"][lane] = False
                results["lanes"][lane]["fail"] += 1
                print(f" {lane}: ERROR")

        results["problems"].append(prob_result)
        print()

    print(f"{'='*60}")
    print("RESULTS (pass@1)")
    print(f"{'='*60}")
    for lane, stats in results["lanes"].items():
        total = stats["pass"] + stats["fail"]
        pct = (stats["pass"] / total * 100) if total > 0 else 0
        print(f" {lane:10s}: {stats['pass']:3d}/{total:3d} ({pct:.1f}%)")

    return results

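# Requires the Hugging Face `datasets` package (pip install datasets) to pull the
# official openai/openai_humaneval test split (164 problems).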
if __name__ == "__main__":
    from datasets import load_dataset

    print("Loading official HumanEval...")
    ds = load_dataset("openai/openai_humaneval")
    problems = [dict(item) for item in ds["test"]]
    print(f"Loaded {len(problems)} problems\n")

    print("Options:")
    print(" 1. Run 10 problems (test)")
    print(" 2. Run 50 problems")
    print(" 3. Run ALL 164 problems")
    choice = input("Choice (1/2/3): ").strip()

    # Map the menu choice to a problem count; anything else falls back to 10.
    limit = {"1": 10, "2": 50, "3": 164}.get(choice, 10)
    results = run_benchmark(problems, limit)

    out_path = f"humaneval_results_{datetime.now().strftime('%H%M%S')}.json"
    with open(out_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {out_path}")