import sys
import gradio as gr
import pandas as pd
import plotly.express as px
from gradio.themes.utils import colors
from results.parse import parse_agg, read_data
from static.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
from style.css_html_js import custom_css
from utils import filter_bench, filter_bench_all, filter_RTLRepo, handle_special_cases
def filter_leaderboard(task, benchmark, model_type, search_query, max_params):
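    """Return the formatted leaderboard for the current filter state.

    Filters the active results DataFrame by task, benchmark, model type,
    a case-insensitive model-name query, and a parameter-count ceiling,
    then delegates to the matching formatting helper.
    """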
subset = df.copy()
    # When 'All' is selected, keep only the benchmarks that belong to the
    # chosen task; otherwise narrow down to the single selected benchmark.
    task_benchmarks = {
        "Spec-to-RTL": s2r_benchs,
        "Code Completion": cc_benchs,
        "Line Completion": lc_benchs,
    }
    if benchmark == "All":
        if task in task_benchmarks:
            subset = subset[subset["Benchmark"].isin(task_benchmarks[task])]
    else:
        subset = subset[subset["Benchmark"] == benchmark]
if model_type != "All":
        # Model-type labels carry an emoji suffix (e.g. "General 🟢");
        # strip it and compare on the text token only.
subset = subset[subset["Model Type"] == model_type.split(" ")[0]]
if search_query:
subset = subset[
subset["Model"].str.contains(search_query, case=False, na=False)
]
max_params = float(max_params)
subset = subset[subset["Params"] <= max_params]
if benchmark == "All":
if task == "Spec-to-RTL":
return filter_bench_all(subset, df_agg, agg_column="Agg S2R")
elif task == "Code Completion":
return filter_bench_all(subset, df_agg, agg_column="Agg MC")
elif task == "Line Completion":
return filter_RTLRepo(subset)
elif benchmark == "RTL-Repo":
return filter_RTLRepo(subset)
else:
agg_column = None
if benchmark == "VerilogEval S2R":
agg_column = "Agg VerilogEval S2R"
elif benchmark == "VerilogEval MC":
agg_column = "Agg VerilogEval MC"
elif benchmark == "RTLLM":
agg_column = "Agg RTLLM"
elif benchmark == "VeriGen":
agg_column = "Agg VeriGen"
return filter_bench(subset, df_agg, agg_column)
def update_benchmarks_by_task(task):
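    """Rebuild the benchmark choices for the selected task and return the
    updated radio state together with a freshly filtered leaderboard."""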
if task == "Spec-to-RTL":
new_benchmarks = ["All"] + s2r_benchs
elif task == "Code Completion":
new_benchmarks = ["All"] + cc_benchs
elif task == "Line Completion":
new_benchmarks = lc_benchs
else:
new_benchmarks = ["All"] + benchmarks
benchmark_value = "All" if "All" in new_benchmarks else new_benchmarks[0]
filtered = filter_leaderboard(
task,
benchmark_value,
model_type_dropdown.value,
search_box.value,
params_slider.value,
)
return gr.update(value=benchmark_value, choices=new_benchmarks), filtered
def generate_scatter_plot(benchmark, metric):
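    """Plot model size (log-scale parameter count) against the selected
    metric for one benchmark as a bubble chart."""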
benchmark, metric = handle_special_cases(benchmark, metric)
subset = df[df["Benchmark"] == benchmark]
if benchmark == "RTL-Repo":
subset = subset[subset["Metric"].str.contains("EM", case=False, na=False)]
detailed_scores = subset.groupby("Model", as_index=False)["Score"].mean()
detailed_scores.rename(columns={"Score": "Exact Matching (EM)"}, inplace=True)
else:
detailed_scores = subset.pivot_table(
index="Model", columns="Metric", values="Score"
).reset_index()
details = df[["Model", "Params", "Model Type"]].drop_duplicates("Model")
scatter_data = pd.merge(detailed_scores, details, on="Model", how="left").dropna(
subset=["Params", metric]
)
scatter_data["x"] = scatter_data["Params"]
scatter_data["y"] = scatter_data[metric]
scatter_data["size"] = (scatter_data["x"] ** 0.3) * 40
type_colors = {"General": "green", "Coding": "yellow", "RTL-Specific": "blue"}
scatter_data["color"] = scatter_data["Model Type"].map(type_colors).fillna("gray")
y_axis_limits = {
"Functionality (FNC)": [5, 90],
"Syntax (STX)": [20, 100],
"Synthesis (SYN)": [5, 90],
"Power": [0, 50],
"Performance": [0, 50],
"Area": [0, 50],
"Exact Matching (EM)": [0, 50],
}
y_range = y_axis_limits.get(metric, [0, 80])
fig = px.scatter(
scatter_data,
x="x",
y="y",
log_x=True,
size="size",
color="Model Type",
text="Model",
hover_data={metric: ":.2f"},
title=f"Params vs. {metric} for {benchmark}",
labels={"x": "# Params (Log Scale)", "y": metric},
template="plotly_white",
height=600,
width=1200,
)
fig.update_traces(
textposition="top center",
textfont_size=10,
marker=dict(opacity=0.8, line=dict(width=0.5, color="black")),
)
fig.update_layout(
xaxis=dict(
showgrid=True,
type="log",
tickmode="array",
tickvals=[8, 14, 32, 72, 200, 700],
ticktext=["8", "14", "32", "72", "200", "700"],
),
showlegend=False,
yaxis=dict(range=y_range),
margin=dict(l=50, r=50, t=50, b=50),
plot_bgcolor="white",
)
return fig
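# JS shim: force Gradio's light theme by rewriting the URL query on load.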
js_func = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'light') {
url.searchParams.set('__theme', 'light');
window.location.href = url.href;
}
}
"""
with gr.Blocks(
css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=colors.emerald)
) as app:
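    # Results are precomputed per simulator; the app starts on Icarus Verilog
    # and on_simulator_change swaps the active DataFrames in place.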
df_icarus, benchmarks, metrics, default_metric = read_data(
"results/results_icarus.json"
)
df_agg_icarus = parse_agg("results/aggregated_scores_icarus.csv")
df_verilator, _, _, _ = read_data("results/results_verilator.json")
df_agg_verilator = parse_agg("results/aggregated_scores_verilator.csv")
df = df_icarus
df_agg = df_agg_icarus
tasks = ["Spec-to-RTL", "Code Completion", "Line Completion"]
s2r_benchs = ["VerilogEval S2R", "RTLLM"]
cc_benchs = ["VerilogEval MC", "VeriGen"]
lc_benchs = ["RTL-Repo"]
non_rtl_metrics = [
"Syntax (STX)",
"Functionality (FNC)",
"Synthesis (SYN)",
"Power",
"Performance",
"Area",
]
rtl_metrics = ["Exact Matching (EM)"]
model_types = ["All", "General 🟢", "Coding 🔵", "RTL-Specific 🔴"]
gr.HTML(
"""
<div align="center">
<img src='/gradio_api/file=logo.png' alt='TuRTLe Logo' width='220'/>
</div>
"""
)
gr.HTML(
"""
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js"></script>
<div style="text-align: center; margin-bottom: 0px; margin-top: 0px;">
<a href="https://github.com/HPAI-BSC/TuRTLe" target="_blank" style="text-decoration: none; margin-right: 10px;">
<button style="background: #333; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
GitHub Repo
</button>
</a>
<a href="http://arxiv.org/abs/2504.01986" target="_blank" style="text-decoration: none; margin-right: 10px;">
<button style="background: #b31b1b; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
arXiv MLCAD 2025
</button>
</a>
<a href="mailto:hpai@bsc.es?subject=TuRTLe%20leaderboard%20new%20entry&body=Link%20to%20HuggingFace%20Model:" style="text-decoration: none;">
<button style="background: #00674F; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
How to submit
</button>
</a>
<p style="margin-top: 15px;">If you have any inquiries or wish to collaborate:
<a href="mailto:hpai@bsc.es">hpai@bsc.es</a>
</p>
</div>
"""
)
gr.HTML(
"""
<div style=" margin-top:-10px !important;">
<p style="margin-bottom: 15px; text-align: start !important;">Welcome to the TuRTLe Model Leaderboard! TuRTLe is a <b>unified evaluation framework designed to systematically assess Large Language Models (LLMs) in RTL (Register-Transfer Level) generation</b> for hardware design.
Evaluation criteria include <b>syntax correctness, functional accuracy, synthesizability, and post-synthesis quality</b> (PPA: Power, Performance, Area). TuRTLe integrates multiple benchmarks to highlight strengths and weaknesses of available LLMs.
Use the filters below to explore different RTL benchmarks, simulators and models.</p>
<p style="margin-top:10px; text-align:start !important;"> <span style="font-variant:small-caps; font-weight:bold;">UPDATE (JULY 2025)</span>: Our TuRTLe paper has been accepted to <a href="https://mlcad.org/symposium/2025/" target="_blank"><b>MLCAD 2025</b></a> which will be held in September in Santa Cruz, California!</p>
<p style="margin-top: -6px; text-align:start !important;"> <span style="font-variant:small-caps; font-weight:bold;">UPDATE (JULY 2025)</span>: Verilator has been added as an additional simulator alongside Icarus Verilog. You can now filter and compare results by simulator</p>
<p style="margin-top: -6px; text-align: start !important; "><span style="font-variant: small-caps; font-weight: bold;">UPDATE (JUNE 2025)</span>: We make our framework open-source on GitHub and we add 7 new recent models! For a total of 40 base and instruct models and 5 RTL benchmarks</p>
</div>
"""
)
with gr.Tabs():
with gr.Tab("Leaderboard"):
with gr.Row(equal_height=True):
with gr.Column(scale=4):
task_radio = gr.Radio(
choices=tasks, label="Select Task", value="Spec-to-RTL"
)
with gr.Column(scale=3):
benchmark_radio = gr.Radio(
choices=["All"] + s2r_benchs,
label="Select Benchmark",
value="All",
)
with gr.Column(scale=2, min_width=180):
simulator_radio = gr.Radio(
choices=["Icarus", "Verilator"],
value="Icarus",
label="Select Simulator",
scale=1,
)
with gr.Row(equal_height=True):
search_box = gr.Textbox(
label="Search Model",
placeholder="Type model name...",
scale=2,
)
model_type_dropdown = gr.Radio(
choices=model_types,
label="Select Model Type",
value="All",
scale=3,
)
params_slider = gr.Slider(
minimum=df["Params"].min(),
maximum=700,
value=700,
label="Max Params",
step=1,
scale=2,
)
leaderboard = gr.DataFrame(
value=filter_leaderboard("Spec-to-RTL", "All", "All", "", 700),
headers="first row",
show_row_numbers=True,
wrap=True,
datatype=[
"markdown",
"html",
],
interactive=False,
column_widths=[
"7%",
"24%",
"17%",
"10%",
"13%",
"10%",
"14%",
],
elem_classes="dataframe-leaderboard",
)
with gr.Tab("Plot View"):
with gr.Row(equal_height=True):
default_benchmark = s2r_benchs[0]
bubble_benchmark = gr.Dropdown(
choices=benchmarks,
label="Select Benchmark",
value=default_benchmark,
elem_classes="gr-dropdown",
)
default_metric = non_rtl_metrics[0]
bubble_metric = gr.Dropdown(
choices=non_rtl_metrics,
label="Select Metric",
value=default_metric,
)
with gr.Row(equal_height=True):
scatter_plot = gr.Plot(
value=generate_scatter_plot(default_benchmark, default_metric),
label="Bubble Chart",
elem_id="full-width-plot",
)
with gr.Tab("Metrics Information"):
with open("./static/metrics.md", "r") as file:
gr.Markdown(
file.read(),
latex_delimiters=[
{"left": "$$", "right": "$$", "display": True},
{"left": "$", "right": "$", "display": False},
],
elem_classes="metrics-page",
)
with gr.Tab("About Us"):
gr.HTML(
"""
<div style="max-width: 800px; margin: auto; padding: 20px; border: 1px solid #ccc; border-radius: 10px;">
<div style="display: flex; justify-content: center; align-items: center; gap: 5%; margin-bottom: 20px;">
<img src='/gradio_api/file=hpai_logo_grad.png' alt='HPAI Group Logo' style="width: 45%;"/>
<img src='/gradio_api/file=bsc-logo.png' alt='BSC Logo' style="width: 25%;"/>
</div>
<p style="font-size: 16px; text-align: start;">
The <b>High-Performance Artificial Intelligence (HPAI)</b> group is part of the
<a href="https://bsc.es/" target="_blank">Barcelona Supercomputing Center (BSC)</a>.
This leaderboard is maintained by HPAI as part of our commitment to <b>open science</b>.
</p>
<ul style="font-size: 16px; margin-bottom: 20px; margin-top: 20px;">
<li><a href="https://hpai.bsc.es/" target="_blank">HPAI Website</a></li>
<li><a href="https://github.com/HPAI-BSC/" target="_blank">HPAI GitHub Organization Page</a></li>
<li><a href="https://huggingface.co/HPAI-BSC/" target="_blank">HPAI Hugging Face Organization Page</a></li>
</ul>
<p style="font-size: 16px; margin-top: 15px;">
Feel free to contact us:
</p>
<p style="font-size: 16px;">Email: <a href="mailto:hpai@bsc.es"><b>hpai@bsc.es</b></a></p>
</div>
"""
)
with gr.Tab("References"):
gr.HTML(
"""
<div style="max-width: 800px; margin: auto; padding: 20px; border: 1px solid #ccc; border-radius: 10px;">
<ul style="font-size: 16px; margin-bottom: 20px; margin-top: 20px;">
<li><a href="https://github.com/bigcode-project/bigcode-evaluation-harness" target="_blank">Code Generation LM Evaluation Harness</a></li>
<li>Williams, S. Icarus Verilog [Computer software]. <a href="https://github.com/steveicarus/iverilog" target="_blank">https://github.com/steveicarus/iverilog</a></li>
<li>Snyder, W., Wasson, P., Galbi, D., & et al. Verilator [Computer software]. <a href="https://github.com/verilator/verilator" target="_blank">https://github.com/verilator/verilator</a></li>
<li>RTL-Repo: Allam and M. Shalan, “Rtl-repo: A benchmark for evaluating llms on large-scale rtl design projects,” in 2024 IEEE LLM Aided Design Workshop (LAD). IEEE, 2024, pp. 1–5.</li>
<li>VeriGen: S. Thakur, B. Ahmad, H. Pearce, B. Tan, B. Dolan-Gavitt, R. Karri, and S. Garg, “Verigen: A large language model for verilog code generation,” ACM Transactions on Design Automation of Electronic Systems, vol. 29, no. 3, pp. 1–31, 2024. </li>
<li>VerilogEval (I): M. Liu, N. Pinckney, B. Khailany, and H. Ren, “Verilogeval: Evaluating large language models for verilog code generation,” in 2023 IEEE/ACM International Conference on Computer Aided Design (ICCAD). IEEE, 2023, pp. 1–8.</li>
<li>VerilogEval (II): N. Pinckney, C. Batten, M. Liu, H. Ren, and B. Khailany, “Revisiting VerilogEval: A Year of Improvements in Large-Language Models for Hardware Code Generation,” ACM Trans. Des. Autom. Electron. Syst., feb 2025. https://doi.org/10.1145/3718088</li>
<li>RTLLM: Y. Lu, S. Liu, Q. Zhang, and Z. Xie, “Rtllm: An open-source benchmark for design rtl generation with large language model,” in 2024 29th Asia and South Pacific Design Automation Conference (ASP-DAC). IEEE, 2024, pp. 722–727.</li>
</ul>
</div>
"""
)
with gr.Row():
with gr.Accordion("📙 Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=14,
elem_id="citation-button",
show_copy_button=True,
)
    # Event wiring: the task radio also rebuilds the benchmark choices, while
    # the remaining filter controls simply re-run filter_leaderboard.
    filter_inputs = [
        task_radio,
        benchmark_radio,
        model_type_dropdown,
        search_box,
        params_slider,
    ]
    task_radio.change(
        fn=update_benchmarks_by_task,
        inputs=[task_radio],
        outputs=[benchmark_radio, leaderboard],
    )
    for control in (benchmark_radio, model_type_dropdown, search_box, params_slider):
        control.change(fn=filter_leaderboard, inputs=filter_inputs, outputs=leaderboard)
def on_benchmark_change(benchmark, _):
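        # RTL-Repo reports only Exact Matching, so selecting it locks the
        # metric choices; any other benchmark restores the standard metrics.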
if benchmark == "RTL-Repo":
metric = "Exact Matching (EM)"
return gr.update(choices=rtl_metrics, value=metric), generate_scatter_plot(
benchmark, metric
)
else:
metric = non_rtl_metrics[0]
return gr.update(
choices=non_rtl_metrics[:-1], value=metric
), generate_scatter_plot(benchmark, metric)
def on_metric_change(benchmark, metric):
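        # Let handle_special_cases adjust incompatible benchmark/metric
        # pairs before the plot is regenerated.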
benchmark, metric = handle_special_cases(benchmark, metric)
fig = generate_scatter_plot(benchmark, metric)
return gr.update(value=benchmark), fig
def on_simulator_change(
simulator,
task,
benchmark,
model_type,
search,
max_params,
plot_bench,
plot_metric,
):
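        # Rebind the module-level DataFrames so every other callback, which
        # reads the globals, now sees the selected simulator's results.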
global df, df_agg
if simulator == "Icarus":
df, df_agg = df_icarus, df_agg_icarus
else:
df, df_agg = df_verilator, df_agg_verilator
leaderboard_df = filter_leaderboard(
task, benchmark, model_type, search, max_params
)
fig = generate_scatter_plot(plot_bench, plot_metric)
return leaderboard_df, fig
bubble_benchmark.change(
fn=on_benchmark_change,
inputs=[bubble_benchmark, bubble_metric],
outputs=[bubble_metric, scatter_plot],
js=""" // this is to avoid resetting user scroll each time a plot is re-generated
(benchmark, metric) => {
let scrollY = window.scrollY;
const observer = new MutationObserver(() => {
window.scrollTo(0, scrollY);
observer.disconnect();
});
observer.observe(document.getElementById('full-width-plot'), { childList: true });
return [benchmark, metric];
}
""",
)
bubble_metric.change(
fn=on_metric_change,
inputs=[bubble_benchmark, bubble_metric],
outputs=[bubble_benchmark, scatter_plot],
js=""" // this is to avoid resetting user scroll each time a plot is re-generated
(benchmark, metric) => {
let scrollY = window.scrollY;
const observer = new MutationObserver(() => {
window.scrollTo(0, scrollY);
observer.disconnect();
});
observer.observe(document.getElementById('full-width-plot'), { childList: true });
return [benchmark, metric];
}
""",
)
simulator_radio.change(
fn=on_simulator_change,
inputs=[
simulator_radio,
task_radio,
benchmark_radio,
model_type_dropdown,
search_box,
params_slider,
bubble_benchmark,
bubble_metric,
],
outputs=[leaderboard, scatter_plot],
)
app.launch(
allowed_paths=[
"logo.png",
"hpai_logo_grad.png",
"bsc-logo.png",
]
)