#!/usr/bin/env python3
"""HYPERNET N1 - OFFICIAL HUMANEVAL WITH CODE EXECUTION"""
import json
import subprocess
import sys
from datetime import datetime

import requests

HYPERNET_URL = "http://localhost:5000"
AUTH_TOKEN = "cpn-steve-kawa-hypernet-alpha"
LANES = ["lola", "claude", "grok", "deep"]


def call_lane(query, lane):
    """Send a query to one HyperNet lane; return the parsed JSON or an error dict."""
    try:
        r = requests.post(
            f"{HYPERNET_URL}/api/v1/run",
            headers={"Authorization": f"Bearer {AUTH_TOKEN}"},
            json={"query": query, "lane": lane},
            timeout=120,
        )
        if r.status_code == 200:
            return r.json()
    except requests.RequestException:
        pass
    return {"error": "failed"}


def extract_code(response):
    """Pull the code body out of a model response, stripping Markdown fences if present."""
    code = response
    if "```python" in code:
        code = code.split("```python")[1].split("```")[0]
    elif "```" in code:
        # [1] is already the content between the first pair of fences
        code = code.split("```")[1]
    return code.strip()


def test_solution(problem, solution):
    """Run the official HumanEval test harness against a candidate in a subprocess."""
    code = extract_code(solution)
    # HumanEval layout: prompt (signature + docstring), candidate body,
    # the dataset's test suite, then check(entry_point) to execute it.
    test_code = f'''{problem["prompt"]}
{code}
{problem["test"]}
check({problem["entry_point"]})
print("PASS")
'''
    try:
        result = subprocess.run(
            [sys.executable, "-c", test_code],
            capture_output=True, text=True, timeout=10,
        )
        return result.returncode == 0 and "PASS" in result.stdout
    except subprocess.TimeoutExpired:
        return False


def run_benchmark(problems, limit=10):
    results = {"lanes": {l: {"pass": 0, "fail": 0} for l in LANES}, "problems": []}
    print(f"\n{'=' * 60}")
    print(f"OFFICIAL HUMANEVAL - {limit} PROBLEMS - CODE EXECUTION")
    print(f"{'=' * 60}\n")
    for i, p in enumerate(problems[:limit]):
        print(f"[{i + 1}/{limit}] {p['task_id']}")
        prob_result = {"task_id": p["task_id"], "lanes": {}}
        for lane in LANES:
            prompt = (
                "Solve this Python function. Return ONLY the implementation, "
                f"no explanation.\n\n{p['prompt']}"
            )
            resp = call_lane(prompt, lane)
            if resp.get("response_text"):
                passed = test_solution(p, resp["response_text"])
                prob_result["lanes"][lane] = passed
                results["lanes"][lane]["pass" if passed else "fail"] += 1
                print(f"  {lane}: {'PASS' if passed else 'FAIL'}")
            else:
                prob_result["lanes"][lane] = False
                results["lanes"][lane]["fail"] += 1
                print(f"  {lane}: ERROR")
        results["problems"].append(prob_result)
        print()

    # Summary
    print(f"{'=' * 60}")
    print("RESULTS (pass@1)")
    print(f"{'=' * 60}")
    for lane, stats in results["lanes"].items():
        total = stats["pass"] + stats["fail"]
        pct = (stats["pass"] / total * 100) if total > 0 else 0
        print(f"  {lane:10s}: {stats['pass']:3d}/{total:3d} ({pct:.1f}%)")
    return results


if __name__ == "__main__":
    from datasets import load_dataset

    print("Loading official HumanEval...")
    ds = load_dataset("openai/openai_humaneval")
    problems = [dict(item) for item in ds["test"]]
    print(f"Loaded {len(problems)} problems\n")

    print("Options:")
    print("  1. Run 10 problems (test)")
    print("  2. Run 50 problems")
    print("  3. Run ALL 164 problems")
    choice = input("Choice (1/2/3): ").strip()
    # String keys avoid a ValueError crash on non-numeric input; default to 10.
    limit = {"1": 10, "2": 50, "3": 164}.get(choice, 10)

    results = run_benchmark(problems, limit)

    # Save
    out_path = f"humaneval_results_{datetime.now().strftime('%H%M%S')}.json"
    with open(out_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {out_path}")
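
# ---------------------------------------------------------------------------
# Example session (a sketch only -- the filename humaneval_n1.py is assumed,
# and actual lane output depends on whatever models sit behind the HyperNet
# gateway at HYPERNET_URL):
#
#   $ pip install requests datasets        # the two third-party deps above
#   $ python humaneval_n1.py
#   Loading official HumanEval...
#   Loaded 164 problems
#
#   Options:
#     1. Run 10 problems (test)
#     2. Run 50 problems
#     3. Run ALL 164 problems
#   Choice (1/2/3): 1
#
#   [1/10] HumanEval/0
#     lola: PASS
#     claude: PASS
#     ...
# ---------------------------------------------------------------------------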