import json
import subprocess
import sys
from datetime import datetime

import requests

# Deployment-specific settings: the local HyperNet gateway and its auth token.
HYPERNET_URL = "http://localhost:5000"
AUTH_TOKEN = "cpn-steve-kawa-hypernet-alpha"

# The six model lanes exposed by the gateway.
LANES = ["lola", "claude", "grok", "deep", "gemini", "kimi"]

def call_lane(query, lane):
    """POST a query to one lane of the HyperNet gateway; return the parsed JSON."""
    try:
        r = requests.post(
            f"{HYPERNET_URL}/api/v1/run",
            headers={"Authorization": f"Bearer {AUTH_TOKEN}"},
            json={"query": query, "lane": lane},
            timeout=120,
        )
        if r.status_code == 200:
            return r.json()
    except requests.RequestException:
        pass  # timeouts and connection errors fall through to the error sentinel
    return {"error": "failed"}
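
# A minimal smoke test for one lane (an illustrative example, not part of the
# original script; it assumes the gateway above is running, the token is valid,
# and "claude" is a live lane — the "response_text" key is inferred from how
# responses are consumed in run_benchmark below):
#
#   resp = call_lane("Write a Python function add(a, b) that returns a + b.", "claude")
#   print(resp.get("response_text", resp))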

def extract_code(response):
    code = response
    if "```python" in code:
        code = code.split("```python")[1].split("```")[0]
    elif "```" in code:
        code = code.split("```")[1].split("```")[0]
    return code.strip()
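
# Lightweight self-checks for the fence stripping above (a small sketch, not
# part of the original harness; plain asserts so a regression fails fast):
assert extract_code("```python\ndef f():\n    return 1\n```") == "def f():\n    return 1"
assert extract_code("```\nx = 1\n```") == "x = 1"
assert extract_code("x = 1") == "x = 1"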

def test_solution(problem, solution):
    """Run prompt + extracted candidate + official tests in a fresh subprocess."""
    code = extract_code(solution)
    test_code = f'''{problem["prompt"]}
{code}
{problem["test"]}
check({problem["entry_point"]})
print("PASS")
'''
    try:
        result = subprocess.run([sys.executable, "-c", test_code],
                                capture_output=True, text=True, timeout=10)
        return result.returncode == 0 and "PASS" in result.stdout
    except (subprocess.TimeoutExpired, OSError):
        return False
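
def _limit_child():
    # Optional hardening sketch (an addition, not in the original script, and
    # POSIX-only): cap the address space and CPU time of the subprocess that
    # executes model-generated code. To enable, pass preexec_fn=_limit_child
    # to the subprocess.run call in test_solution.
    import resource
    resource.setrlimit(resource.RLIMIT_AS, (512 * 1024 ** 2, 512 * 1024 ** 2))  # 512 MiB
    resource.setrlimit(resource.RLIMIT_CPU, (10, 10))  # 10 s of CPU time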

def run_benchmark(problems):
    results = {"lanes": {lane: {"pass": 0, "fail": 0} for lane in LANES}, "problems": []}
    total_problems = len(problems)
    print(f"\n{'=' * 60}")
    print(f"FULL 6-LANE BENCHMARK - {total_problems} PROBLEMS")
    print(f"{'=' * 60}\n")
    for i, p in enumerate(problems):
        print(f"[{i + 1}/{total_problems}] {p['task_id']}")
        prob_result = {"task_id": p["task_id"], "lanes": {}}
        # The same prompt goes to every lane, so build it once per problem.
        prompt = f"Solve this Python function. Return ONLY the implementation, no explanation.\n\n{p['prompt']}"
        for lane in LANES:
            resp = call_lane(prompt, lane)
            if resp.get("response_text"):
                passed = test_solution(p, resp["response_text"])
                prob_result["lanes"][lane] = passed
                results["lanes"][lane]["pass" if passed else "fail"] += 1
                print(f"  {lane}: {'PASS' if passed else 'FAIL'}")
            else:
                prob_result["lanes"][lane] = False
                results["lanes"][lane]["fail"] += 1
                print(f"  {lane}: ERROR")
        results["problems"].append(prob_result)
        print()
    print(f"{'=' * 60}")
    print("RESULTS (pass@1) - ALL 6 LANES")
    print(f"{'=' * 60}")
    for lane, stats in results["lanes"].items():
        total = stats["pass"] + stats["fail"]
        pct = stats["pass"] / total * 100 if total > 0 else 0
        print(f"  {lane:10s}: {stats['pass']:3d}/{total:3d} ({pct:.1f}%)")
    return results
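
def summarize(path):
    # Convenience helper (an addition beyond the original script): reload a
    # saved results JSON and re-print the per-lane pass@1 table without
    # re-running the 6-lane benchmark.
    with open(path) as f:
        results = json.load(f)
    for lane, stats in results["lanes"].items():
        total = stats["pass"] + stats["fail"]
        pct = stats["pass"] / total * 100 if total > 0 else 0.0
        print(f"  {lane:10s}: {stats['pass']:3d}/{total:3d} ({pct:.1f}%)")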

if __name__ == "__main__":
    from datasets import load_dataset

    print("Loading official HumanEval...")
    ds = load_dataset("openai/openai_humaneval")
    problems = [dict(item) for item in ds["test"]]
    print(f"Loaded {len(problems)} problems\n")
    results = run_benchmark(problems)
    out_path = f"humaneval_6lane_{datetime.now().strftime('%H%M%S')}.json"
    with open(out_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {out_path}")