#!/usr/bin/env python3
"""HYPERNET N1 - OFFICIAL HUMANEVAL WITH CODE EXECUTION"""
import json
import subprocess
import sys
from datetime import datetime

import requests

HYPERNET_URL = "http://localhost:5000"
AUTH_TOKEN = "cpn-steve-kawa-hypernet-alpha"
LANES = ["lola", "claude", "grok", "deep"]


def call_lane(query, lane):
    """Send a query to one HyperNet lane; return the parsed JSON or an error dict."""
    try:
        r = requests.post(
            f"{HYPERNET_URL}/api/v1/run",
            headers={"Authorization": f"Bearer {AUTH_TOKEN}"},
            json={"query": query, "lane": lane},
            timeout=120,
        )
        if r.status_code == 200:
            return r.json()
    except requests.RequestException:
        pass
    return {"error": "failed"}


def extract_code(response):
    """Pull the code body out of a model response, stripping Markdown fences if present."""
    code = response
    if "```python" in code:
        code = code.split("```python")[1].split("```")[0]
    elif "```" in code:
        # [1] is already the content between the first pair of fences
        code = code.split("```")[1]
    return code.strip()


def test_solution(problem, solution):
    """Run the official HumanEval test harness against a candidate in a subprocess."""
    code = extract_code(solution)
    # HumanEval layout: prompt (signature + docstring), candidate body,
    # the dataset's test suite, then check(entry_point) to execute it.
    test_code = f'''{problem["prompt"]}
{code}
{problem["test"]}
check({problem["entry_point"]})
print("PASS")
'''
    try:
        result = subprocess.run(
            [sys.executable, "-c", test_code],
            capture_output=True, text=True, timeout=10,
        )
        return result.returncode == 0 and "PASS" in result.stdout
    except subprocess.TimeoutExpired:
        return False


def run_benchmark(problems, limit=10):
    results = {"lanes": {l: {"pass": 0, "fail": 0} for l in LANES}, "problems": []}
    print(f"\n{'=' * 60}")
    print(f"OFFICIAL HUMANEVAL - {limit} PROBLEMS - CODE EXECUTION")
    print(f"{'=' * 60}\n")
    for i, p in enumerate(problems[:limit]):
        print(f"[{i + 1}/{limit}] {p['task_id']}")
        prob_result = {"task_id": p["task_id"], "lanes": {}}
        for lane in LANES:
            prompt = (
                "Solve this Python function. Return ONLY the implementation, "
                f"no explanation.\n\n{p['prompt']}"
            )
            resp = call_lane(prompt, lane)
            if resp.get("response_text"):
                passed = test_solution(p, resp["response_text"])
                prob_result["lanes"][lane] = passed
                results["lanes"][lane]["pass" if passed else "fail"] += 1
                print(f"  {lane}: {'PASS' if passed else 'FAIL'}")
            else:
                prob_result["lanes"][lane] = False
                results["lanes"][lane]["fail"] += 1
                print(f"  {lane}: ERROR")
        results["problems"].append(prob_result)
        print()

    # Summary
    print(f"{'=' * 60}")
    print("RESULTS (pass@1)")
    print(f"{'=' * 60}")
    for lane, stats in results["lanes"].items():
        total = stats["pass"] + stats["fail"]
        pct = (stats["pass"] / total * 100) if total > 0 else 0
        print(f"  {lane:10s}: {stats['pass']:3d}/{total:3d} ({pct:.1f}%)")
    return results


if __name__ == "__main__":
    from datasets import load_dataset

    print("Loading official HumanEval...")
    ds = load_dataset("openai/openai_humaneval")
    problems = [dict(item) for item in ds["test"]]
    print(f"Loaded {len(problems)} problems\n")

    print("Options:")
    print("  1. Run 10 problems (test)")
    print("  2. Run 50 problems")
    print("  3. Run ALL 164 problems")
    choice = input("Choice (1/2/3): ").strip()
    # String keys avoid a ValueError crash on non-numeric input; default to 10.
    limit = {"1": 10, "2": 50, "3": 164}.get(choice, 10)

    results = run_benchmark(problems, limit)

    # Save
    out_path = f"humaneval_results_{datetime.now().strftime('%H%M%S')}.json"
    with open(out_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {out_path}")
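
# ---------------------------------------------------------------------------
# Example session (a sketch only -- the filename humaneval_n1.py is assumed,
# and actual lane output depends on whatever models sit behind the HyperNet
# gateway at HYPERNET_URL):
#
#   $ pip install requests datasets        # the two third-party deps above
#   $ python humaneval_n1.py
#   Loading official HumanEval...
#   Loaded 164 problems
#
#   Options:
#     1. Run 10 problems (test)
#     2. Run 50 problems
#     3. Run ALL 164 problems
#   Choice (1/2/3): 1
#
#   [1/10] HumanEval/0
#     lola: PASS
#     claude: PASS
#     ...
# ---------------------------------------------------------------------------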