import gradio as gr
import pandas as pd
import plotly.express as px
from gradio.themes.utils import colors
from results.parse import parse_agg, read_data
from static.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
from style.css_html_js import custom_css
from utils import filter_bench, filter_bench_all, filter_RTLRepo, handle_special_cases
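
# The project-local imports above (results.parse, static.about, style.css_html_js,
# utils) are defined elsewhere in this repo. From their usage below, read_data() is
# assumed to return a long-format DataFrame with at least the columns Model,
# Model Type, Params, Benchmark, Metric, and Score, together with the benchmark
# list, the metric list, and a default metric.
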
def filter_leaderboard(task, benchmark, model_type, search_query, max_params):
    """Return the leaderboard table filtered by task, benchmark, model type, name, and size."""
    subset = df.copy()
    # When "All" benchmarks is selected, restrict to the benchmarks of the chosen task.
    task_benchmarks = {
        "Spec-to-RTL": s2r_benchs,
        "Code Completion": cc_benchs,
        "Line Completion": lc_benchs,
    }
    if benchmark == "All":
        if task in task_benchmarks:
            subset = subset[subset["Benchmark"].isin(task_benchmarks[task])]
    else:
        subset = df[df["Benchmark"] == benchmark]
    if model_type != "All":
        # Model types are displayed with emoji suffixes; compare on the bare name.
        subset = subset[subset["Model Type"] == model_type.split(" ")[0]]
    if search_query:
        subset = subset[
            subset["Model"].str.contains(search_query, case=False, na=False)
        ]
    max_params = float(max_params)
    subset = subset[subset["Params"] <= max_params]
    if benchmark == "All":
        if task == "Spec-to-RTL":
            return filter_bench_all(subset, df_agg, agg_column="Agg S2R")
        elif task == "Code Completion":
            return filter_bench_all(subset, df_agg, agg_column="Agg MC")
        elif task == "Line Completion":
            return filter_RTLRepo(subset)
    elif benchmark == "RTL-Repo":
        return filter_RTLRepo(subset)
    else:
        agg_columns = {
            "VerilogEval S2R": "Agg VerilogEval S2R",
            "VerilogEval MC": "Agg VerilogEval MC",
            "RTLLM": "Agg RTLLM",
            "VeriGen": "Agg VeriGen",
        }
        return filter_bench(subset, df_agg, agg_columns.get(benchmark))
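
# For example, filter_leaderboard("Spec-to-RTL", "All", "All", "", 700) yields the
# aggregated Spec-to-RTL table that seeds the leaderboard DataFrame further below.
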
def update_benchmarks_by_task(task):
    """Swap the benchmark choices to match the selected task and refresh the table."""
    if task == "Spec-to-RTL":
        new_benchmarks = ["All"] + s2r_benchs
    elif task == "Code Completion":
        new_benchmarks = ["All"] + cc_benchs
    elif task == "Line Completion":
        new_benchmarks = lc_benchs
    else:
        new_benchmarks = ["All"] + benchmarks
    benchmark_value = "All" if "All" in new_benchmarks else new_benchmarks[0]
    # Note: .value reads a component's initial (default) value, not the live UI
    # state, so this refresh assumes the other filters are still at their defaults.
    filtered = filter_leaderboard(
        task,
        benchmark_value,
        model_type_dropdown.value,
        search_box.value,
        params_slider.value,
    )
    return gr.update(value=benchmark_value, choices=new_benchmarks), filtered
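
# Returning gr.update(...) lets a single callback both replace the benchmark Radio's
# choices and set its selected value; the second return value refreshes the table.
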
def generate_scatter_plot(benchmark, metric):
    """Build the params-vs-score bubble chart for one benchmark/metric pair."""
    benchmark, metric = handle_special_cases(benchmark, metric)
    subset = df[df["Benchmark"] == benchmark]
    if benchmark == "RTL-Repo":
        # RTL-Repo reports a single metric: average the EM rows per model.
        subset = subset[subset["Metric"].str.contains("EM", case=False, na=False)]
        detailed_scores = subset.groupby("Model", as_index=False)["Score"].mean()
        detailed_scores.rename(columns={"Score": "Exact Matching (EM)"}, inplace=True)
    else:
        # Pivot the long-format scores into one column per metric.
        detailed_scores = subset.pivot_table(
            index="Model", columns="Metric", values="Score"
        ).reset_index()
    details = df[["Model", "Params", "Model Type"]].drop_duplicates("Model")
    scatter_data = pd.merge(detailed_scores, details, on="Model", how="left").dropna(
        subset=["Params", metric]
    )
    scatter_data["x"] = scatter_data["Params"]
    scatter_data["y"] = scatter_data[metric]
    # Sub-linear scaling keeps bubbles for very large models from dominating the plot.
    scatter_data["size"] = (scatter_data["x"] ** 0.3) * 40
    # Explicit color mapping (note: px.scatter below colors by "Model Type" with its
    # default palette, so this column is currently unused).
    type_colors = {"General": "green", "Coding": "yellow", "RTL-Specific": "blue"}
    scatter_data["color"] = scatter_data["Model Type"].map(type_colors).fillna("gray")
    y_axis_limits = {
        "Functionality (FNC)": [5, 90],
        "Syntax (STX)": [20, 100],
        "Synthesis (SYN)": [5, 90],
        "Power": [0, 50],
        "Performance": [0, 50],
        "Area": [0, 50],
        "Exact Matching (EM)": [0, 50],
    }
    y_range = y_axis_limits.get(metric, [0, 80])
    fig = px.scatter(
        scatter_data,
        x="x",
        y="y",
        log_x=True,
        size="size",
        color="Model Type",
        text="Model",
        hover_data={metric: ":.2f"},
        title=f"Params vs. {metric} for {benchmark}",
        labels={"x": "# Params (Log Scale)", "y": metric},
        template="plotly_white",
        height=600,
        width=1200,
    )
    fig.update_traces(
        textposition="top center",
        textfont_size=10,
        marker=dict(opacity=0.8, line=dict(width=0.5, color="black")),
    )
    fig.update_layout(
        xaxis=dict(
            showgrid=True,
            type="log",
            tickmode="array",
            tickvals=[8, 14, 32, 72, 200, 700],
            ticktext=["8", "14", "32", "72", "200", "700"],
        ),
        showlegend=False,
        yaxis=dict(range=y_range),
        margin=dict(l=50, r=50, t=50, b=50),
        plot_bgcolor="white",
    )
    return fig
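
# A sketch of the pivot above, assuming the long-format schema noted at the top:
#
#   Model    Metric               Score        Model    Syntax (STX)  Functionality (FNC)
#   model-a  Syntax (STX)          95.0   -->  model-a          95.0                 80.0
#   model-a  Functionality (FNC)   80.0
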
| js_func = """ | |
| function refresh() { | |
| const url = new URL(window.location); | |
| if (url.searchParams.get('__theme') !== 'light') { | |
| url.searchParams.set('__theme', 'light'); | |
| window.location.href = url.href; | |
| } | |
| } | |
| """ | |

with gr.Blocks(
    css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=colors.emerald)
) as app:
    # Results are precomputed per simulator; Icarus Verilog is the default view.
    df_icarus, benchmarks, metrics, default_metric = read_data(
        "results/results_icarus.json"
    )
    df_agg_icarus = parse_agg("results/aggregated_scores_icarus.csv")
    df_verilator, _, _, _ = read_data("results/results_verilator.json")
    df_agg_verilator = parse_agg("results/aggregated_scores_verilator.csv")
    df = df_icarus
    df_agg = df_agg_icarus

    tasks = ["Spec-to-RTL", "Code Completion", "Line Completion"]
    s2r_benchs = ["VerilogEval S2R", "RTLLM"]
    cc_benchs = ["VerilogEval MC", "VeriGen"]
    lc_benchs = ["RTL-Repo"]
    non_rtl_metrics = [
        "Syntax (STX)",
        "Functionality (FNC)",
        "Synthesis (SYN)",
        "Power",
        "Performance",
        "Area",
    ]
    rtl_metrics = ["Exact Matching (EM)"]
    model_types = ["All", "General 🟢", "Coding 🔵", "RTL-Specific 🔴"]
    gr.HTML(
        """
        <div align="center">
            <img src='/gradio_api/file=logo.png' alt='TuRTLe Logo' width='220'/>
        </div>
        """
    )
    gr.HTML(
        """
        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
        <script defer src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js"></script>
        <div style="text-align: center; margin-bottom: 0px; margin-top: 0px;">
            <a href="https://github.com/HPAI-BSC/TuRTLe" target="_blank" style="text-decoration: none; margin-right: 10px;">
                <button style="background: #333; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
                    GitHub Repo
                </button>
            </a>
            <a href="http://arxiv.org/abs/2504.01986" target="_blank" style="text-decoration: none; margin-right: 10px;">
                <button style="background: #b31b1b; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
                    arXiv MLCAD 2025
                </button>
            </a>
            <a href="mailto:hpai@bsc.es?subject=TuRTLe%20leaderboard%20new%20entry&body=Link%20to%20HuggingFace%20Model:" style="text-decoration: none;">
                <button style="background: #00674F; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
                    How to submit
                </button>
            </a>
            <p style="margin-top: 15px;">If you have any inquiries or wish to collaborate:
                <a href="mailto:hpai@bsc.es">hpai@bsc.es</a>
            </p>
        </div>
        """
    )
    gr.HTML(
        """
        <div style="margin-top: -10px !important;">
            <p style="margin-bottom: 15px; text-align: start !important;">Welcome to the TuRTLe Model Leaderboard! TuRTLe is a <b>unified evaluation framework designed to systematically assess Large Language Models (LLMs) in RTL (Register-Transfer Level) generation</b> for hardware design.
            Evaluation criteria include <b>syntax correctness, functional accuracy, synthesizability, and post-synthesis quality</b> (PPA: Power, Performance, Area). TuRTLe integrates multiple benchmarks to highlight the strengths and weaknesses of available LLMs.
            Use the filters below to explore different RTL benchmarks, simulators, and models.</p>
            <p style="margin-top: 10px; text-align: start !important;"><span style="font-variant: small-caps; font-weight: bold;">UPDATE (JULY 2025)</span>: Our TuRTLe paper has been accepted at <a href="https://mlcad.org/symposium/2025/" target="_blank"><b>MLCAD 2025</b></a>, which will be held this September in Santa Cruz, California!</p>
            <p style="margin-top: -6px; text-align: start !important;"><span style="font-variant: small-caps; font-weight: bold;">UPDATE (JULY 2025)</span>: Verilator has been added as an additional simulator alongside Icarus Verilog. You can now filter and compare results by simulator.</p>
            <p style="margin-top: -6px; text-align: start !important;"><span style="font-variant: small-caps; font-weight: bold;">UPDATE (JUNE 2025)</span>: We have open-sourced our framework on GitHub and added 7 recent models, for a total of 40 base and instruct models evaluated on 5 RTL benchmarks!</p>
        </div>
        """
    )
    with gr.Tabs():
        with gr.Tab("Leaderboard"):
            with gr.Row(equal_height=True):
                with gr.Column(scale=4):
                    task_radio = gr.Radio(
                        choices=tasks, label="Select Task", value="Spec-to-RTL"
                    )
                with gr.Column(scale=3):
                    benchmark_radio = gr.Radio(
                        choices=["All"] + s2r_benchs,
                        label="Select Benchmark",
                        value="All",
                    )
                with gr.Column(scale=2, min_width=180):
                    simulator_radio = gr.Radio(
                        choices=["Icarus", "Verilator"],
                        value="Icarus",
                        label="Select Simulator",
                        scale=1,
                    )
            with gr.Row(equal_height=True):
                search_box = gr.Textbox(
                    label="Search Model",
                    placeholder="Type model name...",
                    scale=2,
                )
                model_type_dropdown = gr.Radio(
                    choices=model_types,
                    label="Select Model Type",
                    value="All",
                    scale=3,
                )
                params_slider = gr.Slider(
                    minimum=df["Params"].min(),
                    maximum=700,
                    value=700,
                    label="Max Params",
                    step=1,
                    scale=2,
                )
            leaderboard = gr.DataFrame(
                value=filter_leaderboard("Spec-to-RTL", "All", "All", "", 700),
                headers="first row",
                show_row_numbers=True,
                wrap=True,
                datatype=[
                    "markdown",
                    "html",
                ],
                interactive=False,
                column_widths=[
                    "7%",
                    "24%",
                    "17%",
                    "10%",
                    "13%",
                    "10%",
                    "14%",
                ],
                elem_classes="dataframe-leaderboard",
            )
| with gr.Tab("Plot View"): | |
| with gr.Row(equal_height=True): | |
| default_benchmark = s2r_benchs[0] | |
| bubble_benchmark = gr.Dropdown( | |
| choices=benchmarks, | |
| label="Select Benchmark", | |
| value=default_benchmark, | |
| elem_classes="gr-dropdown", | |
| ) | |
| default_metric = non_rtl_metrics[0] | |
| bubble_metric = gr.Dropdown( | |
| choices=non_rtl_metrics, | |
| label="Select Metric", | |
| value=default_metric, | |
| ) | |
| with gr.Row(equal_height=True): | |
| scatter_plot = gr.Plot( | |
| value=generate_scatter_plot(default_benchmark, default_metric), | |
| label="Bubble Chart", | |
| elem_id="full-width-plot", | |
| ) | |
| with gr.Tab("Metrics Information"): | |
| with open("./static/metrics.md", "r") as file: | |
| gr.Markdown( | |
| file.read(), | |
| latex_delimiters=[ | |
| {"left": "$$", "right": "$$", "display": True}, | |
| {"left": "$", "right": "$", "display": False}, | |
| ], | |
| elem_classes="metrics-page", | |
| ) | |
| with gr.Tab("About Us"): | |
| gr.HTML( | |
| """ | |
| <div style="max-width: 800px; margin: auto; padding: 20px; border: 1px solid #ccc; border-radius: 10px;"> | |
| <div style="display: flex; justify-content: center; align-items: center; gap: 5%; margin-bottom: 20px;"> | |
| <img src='/gradio_api/file=hpai_logo_grad.png' alt='HPAI Group Logo' style="width: 45%;"/> | |
| <img src='/gradio_api/file=bsc-logo.png' alt='BSC Logo' style="width: 25%;"/> | |
| </div> | |
| <p style="font-size: 16px; text-align: start;"> | |
| The <b>High-Performance Artificial Intelligence (HPAI)</b> group is part of the | |
| <a href="https://bsc.es/" target="_blank">Barcelona Supercomputing Center (BSC)</a>. | |
| This leaderboard is maintained by HPAI as part of our commitment to <b>open science</b>. | |
| </p> | |
| <ul style="font-size: 16px; margin-bottom: 20px; margin-top: 20px;"> | |
| <li><a href="https://hpai.bsc.es/" target="_blank">HPAI Website</a></li> | |
| <li><a href="https://github.com/HPAI-BSC/" target="_blank">HPAI GitHub Organization Page</a></li> | |
| <li><a href="https://huggingface.co/HPAI-BSC/" target="_blank">HPAI Hugging Face Organization Page</a></li> | |
| </ul> | |
| <p style="font-size: 16px; margin-top: 15px;"> | |
| Feel free to contact us: | |
| </p> | |
| <p style="font-size: 16px;">Email: <a href="mailto:hpai@bsc.es"><b>hpai@bsc.es</b></a></p> | |
| </div> | |
| """ | |
| ) | |
| with gr.Tab("References"): | |
| gr.HTML( | |
| """ | |
| <div style="max-width: 800px; margin: auto; padding: 20px; border: 1px solid #ccc; border-radius: 10px;"> | |
| <ul style="font-size: 16px; margin-bottom: 20px; margin-top: 20px;"> | |
| <li><a href="https://github.com/bigcode-project/bigcode-evaluation-harness" target="_blank">Code Generation LM Evaluation Harness</a></li> | |
| <li>Williams, S. Icarus Verilog [Computer software]. <a href="https://github.com/steveicarus/iverilog" target="_blank">https://github.com/steveicarus/iverilog</a></li> | |
| <li>Snyder, W., Wasson, P., Galbi, D., & et al. Verilator [Computer software]. <a href="https://github.com/verilator/verilator" target="_blank">https://github.com/verilator/verilator</a></li> | |
| <li>RTL-Repo: Allam and M. Shalan, “Rtl-repo: A benchmark for evaluating llms on large-scale rtl design projects,” in 2024 IEEE LLM Aided Design Workshop (LAD). IEEE, 2024, pp. 1–5.</li> | |
| <li>VeriGen: S. Thakur, B. Ahmad, H. Pearce, B. Tan, B. Dolan-Gavitt, R. Karri, and S. Garg, “Verigen: A large language model for verilog code generation,” ACM Transactions on Design Automation of Electronic Systems, vol. 29, no. 3, pp. 1–31, 2024. </li> | |
| <li>VerilogEval (I): M. Liu, N. Pinckney, B. Khailany, and H. Ren, “Verilogeval: Evaluating large language models for verilog code generation,” in 2023 IEEE/ACM International Conference on Computer Aided Design (ICCAD). IEEE, 2023, pp. 1–8.</li> | |
| <li>VerilogEval (II): N. Pinckney, C. Batten, M. Liu, H. Ren, and B. Khailany, “Revisiting VerilogEval: A Year of Improvements in Large-Language Models for Hardware Code Generation,” ACM Trans. Des. Autom. Electron. Syst., feb 2025. https://doi.org/10.1145/3718088</li> | |
| <li>RTLLM: Y. Lu, S. Liu, Q. Zhang, and Z. Xie, “Rtllm: An open-source benchmark for design rtl generation with large language model,” in 2024 29th Asia and South Pacific Design Automation Conference (ASP-DAC). IEEE, 2024, pp. 722–727.</li> | |
| </ul> | |
| </div> | |
| """ | |
| ) | |
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=14,
                elem_id="citation-button",
                show_copy_button=True,
            )
    # Event wiring: every filter control re-runs filter_leaderboard with the full
    # filter state; the table is small enough that recomputing it is the simplest option.
    filter_inputs = [
        task_radio,
        benchmark_radio,
        model_type_dropdown,
        search_box,
        params_slider,
    ]
    task_radio.change(
        fn=update_benchmarks_by_task,
        inputs=[task_radio],
        outputs=[benchmark_radio, leaderboard],
    )
    benchmark_radio.change(
        fn=filter_leaderboard, inputs=filter_inputs, outputs=leaderboard
    )
    model_type_dropdown.change(
        fn=filter_leaderboard, inputs=filter_inputs, outputs=leaderboard
    )
    search_box.change(
        fn=filter_leaderboard, inputs=filter_inputs, outputs=leaderboard
    )
    params_slider.change(
        fn=filter_leaderboard, inputs=filter_inputs, outputs=leaderboard
    )
    def on_benchmark_change(benchmark, _):
        """Keep the metric choices consistent with the selected plot benchmark."""
        if benchmark == "RTL-Repo":
            metric = "Exact Matching (EM)"
            return gr.update(choices=rtl_metrics, value=metric), generate_scatter_plot(
                benchmark, metric
            )
        else:
            # non_rtl_metrics[:-1] drops the last metric ("Area") from the choices.
            metric = non_rtl_metrics[0]
            return gr.update(
                choices=non_rtl_metrics[:-1], value=metric
            ), generate_scatter_plot(benchmark, metric)

    def on_metric_change(benchmark, metric):
        """Re-draw the plot, correcting benchmark/metric pairs that do not exist."""
        benchmark, metric = handle_special_cases(benchmark, metric)
        fig = generate_scatter_plot(benchmark, metric)
        return gr.update(value=benchmark), fig

    def on_simulator_change(
        simulator,
        task,
        benchmark,
        model_type,
        search,
        max_params,
        plot_bench,
        plot_metric,
    ):
        # Swap the module-level frames so that filter_leaderboard and
        # generate_scatter_plot read the selected simulator's results.
        global df, df_agg
        if simulator == "Icarus":
            df, df_agg = df_icarus, df_agg_icarus
        else:
            df, df_agg = df_verilator, df_agg_verilator
        leaderboard_df = filter_leaderboard(
            task, benchmark, model_type, search, max_params
        )
        fig = generate_scatter_plot(plot_bench, plot_metric)
        return leaderboard_df, fig
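
    # Keeping df/df_agg as module-level globals lets the simulator toggle change
    # what every other callback reads without re-wiring their inputs; the trade-off
    # is that the swap is process-wide, i.e. shared across concurrent sessions.
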
    # Preserve the user's scroll position when a plot is re-generated: re-rendering
    # the Plot component would otherwise reset the scroll each time.
    preserve_scroll_js = """
    (benchmark, metric) => {
        let scrollY = window.scrollY;
        const observer = new MutationObserver(() => {
            window.scrollTo(0, scrollY);
            observer.disconnect();
        });
        observer.observe(document.getElementById('full-width-plot'), { childList: true });
        return [benchmark, metric];
    }
    """
    bubble_benchmark.change(
        fn=on_benchmark_change,
        inputs=[bubble_benchmark, bubble_metric],
        outputs=[bubble_metric, scatter_plot],
        js=preserve_scroll_js,
    )
    bubble_metric.change(
        fn=on_metric_change,
        inputs=[bubble_benchmark, bubble_metric],
        outputs=[bubble_benchmark, scatter_plot],
        js=preserve_scroll_js,
    )
    simulator_radio.change(
        fn=on_simulator_change,
        inputs=[
            simulator_radio,
            task_radio,
            benchmark_radio,
            model_type_dropdown,
            search_box,
            params_slider,
            bubble_benchmark,
            bubble_metric,
        ],
        outputs=[leaderboard, scatter_plot],
    )
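
# allowed_paths whitelists the local images referenced above via /gradio_api/file=
# URLs; without it, Gradio refuses to serve files outside its static routes.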
app.launch(
    allowed_paths=[
        "logo.png",
        "hpai_logo_grad.png",
        "bsc-logo.png",
    ]
)