hypernet-n1-sdc / run_full_benchmark.py
#!/usr/bin/env python3
"""HYPERNET N1 - OFFICIAL HUMANEVAL WITH CODE EXECUTION"""
import sys, json, requests, subprocess
from datetime import datetime
HYPERNET_URL = "http://localhost:5000"
AUTH_TOKEN = "cpn-steve-kawa-hypernet-alpha"
LANES = ["lola", "claude", "grok", "deep"]
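# Gateway request/response shape this script assumes (inferred from the calls
# below, not an official API reference):
#   POST {HYPERNET_URL}/api/v1/run
#   headers: {"Authorization": "Bearer <AUTH_TOKEN>"}
#   body:    {"query": "<prompt>", "lane": "<one of LANES>"}
#   reply:   JSON containing a "response_text" field with the model's completion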
def call_lane(query, lane):
    """Send a query to one HyperNet lane and return the parsed JSON reply."""
    try:
        r = requests.post(f"{HYPERNET_URL}/api/v1/run",
                          headers={"Authorization": f"Bearer {AUTH_TOKEN}"},
                          json={"query": query, "lane": lane}, timeout=120)
        if r.status_code == 200:
            return r.json()
    except (requests.exceptions.RequestException, ValueError):
        pass
    return {"error": "failed"}
def extract_code(response):
    """Strip a Markdown code fence from the model response, if present."""
    code = response
    if "```python" in code:
        code = code.split("```python")[1].split("```")[0]
    elif "```" in code:
        code = code.split("```")[1].split("```")[0]
    return code.strip()
def test_solution(problem, solution):
    """Assemble prompt + completion + official tests and execute them in a subprocess."""
    code = extract_code(solution)
    test_code = f'''{problem["prompt"]}
{code}
{problem["test"]}
check({problem["entry_point"]})
print("PASS")
'''
    try:
        result = subprocess.run([sys.executable, "-c", test_code],
                                capture_output=True, text=True, timeout=10)
        return result.returncode == 0 and "PASS" in result.stdout
    except Exception:  # any crash, timeout, or OS error counts as a failure
        return False
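# For illustration (hypothetical problem, contents abbreviated), the program handed
# to the subprocess is laid out like:
#
#   def has_close_elements(...):    # problem["prompt"]: signature + docstring
#   <extracted model completion>    # code
#   def check(candidate): ...       # problem["test"]: the official unit tests
#   check(has_close_elements)       # run the tests on problem["entry_point"]
#   print("PASS")                   # only reached if every assert passes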
def run_benchmark(problems, limit=10):
    """Run each problem through every lane; pass@1 is the share of problems whose single sample passes."""
    results = {"lanes": {l: {"pass": 0, "fail": 0} for l in LANES}, "problems": []}
    print(f"\n{'='*60}")
    print(f"OFFICIAL HUMANEVAL - {limit} PROBLEMS - CODE EXECUTION")
    print(f"{'='*60}\n")
    for i, p in enumerate(problems[:limit]):
        print(f"[{i+1}/{limit}] {p['task_id']}")
        prob_result = {"task_id": p["task_id"], "lanes": {}}
        for lane in LANES:
            prompt = f"Solve this Python function. Return ONLY the implementation, no explanation.\n\n{p['prompt']}"
            resp = call_lane(prompt, lane)
            if resp.get("response_text"):
                passed = test_solution(p, resp["response_text"])
                prob_result["lanes"][lane] = passed
                results["lanes"][lane]["pass" if passed else "fail"] += 1
                print(f" {lane}: {'PASS' if passed else 'FAIL'}")
            else:
                prob_result["lanes"][lane] = False
                results["lanes"][lane]["fail"] += 1
                print(f" {lane}: ERROR")
        results["problems"].append(prob_result)
        print()
    # Summary
    print(f"{'='*60}")
    print("RESULTS (pass@1)")
    print(f"{'='*60}")
    for lane, stats in results["lanes"].items():
        total = stats["pass"] + stats["fail"]
        pct = (stats["pass"] / total * 100) if total > 0 else 0
        print(f" {lane:10s}: {stats['pass']:3d}/{total:3d} ({pct:.1f}%)")
    return results
if __name__ == "__main__":
    from datasets import load_dataset
    print("Loading official HumanEval...")
    ds = load_dataset("openai/openai_humaneval")
    problems = [dict(item) for item in ds["test"]]
    print(f"Loaded {len(problems)} problems\n")
    print("Options:")
    print(" 1. Run 10 problems (test)")
    print(" 2. Run 50 problems")
    print(" 3. Run ALL 164 problems")
    choice = input("Choice (1/2/3): ").strip()
    limit = {"1": 10, "2": 50, "3": 164}.get(choice, 10)  # default to 10 on unrecognized input
    results = run_benchmark(problems, limit)
    # Save
    out_file = f"humaneval_results_{datetime.now().strftime('%H%M%S')}.json"
    with open(out_file, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {out_file}")