"""Gradio leaderboard app.

Loads lm-eval-harness style ``results_*.json`` files from a local
``benchmarks/`` directory and renders an interactive score table plus
summary and comparison plots.
"""

import json
import os

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def create_benchmark_plot(df):
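    """Build a grouped matplotlib bar chart of per-benchmark scores.

    Only the ten highest-scoring models (by summed score across the six
    displayed columns) are plotted. Returns a Figure, or None if ``df``
    is empty.
    """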
    if df.empty:
        return None

    df_copy = df.copy()
    score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college',
                     'mmlu_high_school', 'mmlu_other']

    for col in score_columns:
        df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce').fillna(0)

    df_copy['Total_Score'] = df_copy[score_columns].sum(axis=1)
    df_sorted = df_copy.sort_values(by='Total_Score', ascending=False)

    # Keep the chart legible: plot at most the ten best models.
    top_models = df_sorted.head(10)

    benchmarks = score_columns
    models = top_models['Model'].unique()

    x = np.arange(len(benchmarks))
    width = 0.8 / len(models) if len(models) > 0 else 0.8

    fig, ax = plt.subplots(figsize=(30, 10))

    all_scores = []
    for i, model in enumerate(models):
        model_data = top_models[top_models['Model'] == model]
        scores = [model_data[b].values[0] if not model_data[b].empty else 0
                  for b in benchmarks]
        all_scores.extend(scores)
        # Centre each model's group of bars on its benchmark tick.
        offset = width * i - (width * (len(models) - 1) / 2)
        rects = ax.bar(x + offset, scores, width, label=model)
        ax.bar_label(rects, padding=3)

    ax.set_ylabel('Scores')
    ax.set_xticks(x)
    ax.set_xticklabels(benchmarks, rotation=45, ha="right")
    ax.legend(loc='lower right')

    # Leave headroom above the tallest bar so its label is not clipped.
    if all_scores:
        ax.set_ylim(top=max(all_scores) * 1.15)

    plt.tight_layout()
    return fig


def load_leaderboard_data():
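    """Collect benchmark scores from ``benchmarks/results_*.json``.

    Each file is expected to follow the lm-eval-harness layout; a minimal
    sketch of the assumed shape (real files carry many more keys):

        {
          "model_name": "org/model",
          "results": {
            "ifeval": {"prompt_level_strict_acc,none": 0.42},
            "mmlu":   {"acc,none": 0.61},
            "mmlu_college_biology": {"acc,none": 0.70}
          }
        }

    Returns a DataFrame with one row per model, scores rounded to four
    decimals and missing values filled with 0.
    """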
    data = []
    benchmarks_dir = "benchmarks"

    mmlu_categories = {
        "mmlu_professional": [
            "mmlu_professional_accounting", "mmlu_professional_law",
            "mmlu_professional_medicine", "mmlu_professional_psychology"
        ],
        "mmlu_college": [
            "mmlu_college_biology", "mmlu_college_chemistry", "mmlu_college_computer_science",
            "mmlu_college_mathematics", "mmlu_college_medicine", "mmlu_college_physics"
        ],
        "mmlu_high_school": [
            "mmlu_high_school_biology", "mmlu_high_school_chemistry", "mmlu_high_school_computer_science",
            "mmlu_high_school_european_history", "mmlu_high_school_geography",
            "mmlu_high_school_government_and_politics", "mmlu_high_school_macroeconomics",
            "mmlu_high_school_mathematics", "mmlu_high_school_microeconomics",
            "mmlu_high_school_physics", "mmlu_high_school_psychology",
            "mmlu_high_school_statistics", "mmlu_high_school_us_history",
            "mmlu_high_school_world_history"
        ]
    }

    all_mmlu_scores = [
        "mmlu_abstract_algebra", "mmlu_anatomy", "mmlu_astronomy", "mmlu_business_ethics",
        "mmlu_clinical_knowledge", "mmlu_college_biology", "mmlu_college_chemistry",
        "mmlu_college_computer_science", "mmlu_college_mathematics", "mmlu_college_medicine",
        "mmlu_college_physics", "mmlu_computer_security", "mmlu_conceptual_physics",
        "mmlu_econometrics", "mmlu_electrical_engineering", "mmlu_elementary_mathematics",
        "mmlu_formal_logic", "mmlu_global_facts", "mmlu_high_school_biology",
        "mmlu_high_school_chemistry", "mmlu_high_school_computer_science",
        "mmlu_high_school_european_history", "mmlu_high_school_geography",
        "mmlu_high_school_government_and_politics", "mmlu_high_school_macroeconomics",
        "mmlu_high_school_mathematics", "mmlu_high_school_microeconomics",
        "mmlu_high_school_physics", "mmlu_high_school_psychology",
        "mmlu_high_school_statistics", "mmlu_high_school_us_history",
        "mmlu_high_school_world_history", "mmlu_human_aging", "mmlu_human_sexuality",
        "mmlu_humanities", "mmlu_international_law", "mmlu_jurisprudence",
        "mmlu_logical_fallacies", "mmlu_machine_learning", "mmlu_management",
        "mmlu_marketing", "mmlu_medical_genetics", "mmlu_miscellaneous",
        "mmlu_moral_disputes", "mmlu_moral_scenarios", "mmlu_nutrition", "mmlu_other",
        "mmlu_philosophy", "mmlu_prehistory", "mmlu_professional_accounting",
        "mmlu_professional_law", "mmlu_professional_medicine",
        "mmlu_professional_psychology", "mmlu_public_relations", "mmlu_security_studies",
        "mmlu_social_sciences", "mmlu_sociology", "mmlu_stem", "mmlu_us_foreign_policy",
        "mmlu_virology", "mmlu_world_religions"
    ]

    # Every MMLU task not covered by a named category above lands in "mmlu_other".
    categorized = sum(mmlu_categories.values(), [])
    mmlu_categories["mmlu_other"] = [s for s in all_mmlu_scores if s not in categorized]

    for filename in os.listdir(benchmarks_dir):
        if filename.endswith(".json") and filename.startswith("results_"):
            filepath = os.path.join(benchmarks_dir, filename)
            with open(filepath, 'r') as f:
                content = json.load(f)

            # Fall back to the file name when the result file omits the model name.
            model_name = content.get("model_name")
            if not model_name:
                model_name = os.path.splitext(filename)[0]

            # Normalise "org/model/" style names down to the bare model name.
            model_name = os.path.basename(model_name.rstrip('/'))

            results = content.get("results", {})
            ifeval_score = results.get("ifeval", {}).get("prompt_level_strict_acc,none")
            mmlu_score = results.get("mmlu", {}).get("acc,none")

            row = {"Model": model_name, "IFEval": ifeval_score, "MMLU": mmlu_score}

            for score_name in all_mmlu_scores:
                row[score_name] = results.get(score_name, {}).get("acc,none")

            # A category score is the plain mean of its tasks' available scores.
            for category, scores in mmlu_categories.items():
                category_scores = [pd.to_numeric(row.get(s), errors='coerce') for s in scores]
                category_scores = [s for s in category_scores if pd.notna(s)]
                if category_scores:
                    row[category] = sum(category_scores) / len(category_scores)
                else:
                    row[category] = np.nan

            data.append(row)

    df_raw = pd.DataFrame(data)

    numeric_cols = [col for col in df_raw.columns if col != 'Model']
    for col in numeric_cols:
        df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce')

    score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college',
                     'mmlu_high_school', 'mmlu_other']
    for col in score_columns:
        df_raw[col] = df_raw[col].fillna(0)

    df_raw['Total_Score'] = df_raw[score_columns].sum(axis=1)

    # If a model appears in several result files, keep only its best run.
    df_sorted = df_raw.sort_values(by='Total_Score', ascending=False)
    df = df_sorted.drop_duplicates(subset=['Model'], keep='first').copy()
    df = df.drop(columns=['Total_Score'])

    for col in numeric_cols:
        df[col] = df[col].apply(lambda x: round(x, 4) if pd.notna(x) else x)

    df.fillna(0, inplace=True)

    return df


def style_diff(df, all_data_df):
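    """Return a pandas Styler that highlights, per score column, the best
    value (green) and the worst non-zero value (red). ``all_data_df`` is
    accepted but currently unused.
    """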
    def highlight_max(s):
        s_numeric = pd.to_numeric(s, errors='coerce')
        max_val = s_numeric.max()
        return ['background-color: #68a055' if v == max_val else '' for v in s_numeric]

    def highlight_min(s):
        # Zeros stand in for missing scores, so exclude them from "worst".
        s_numeric = pd.to_numeric(s, errors='coerce')
        s_filtered = s_numeric[s_numeric > 0]
        if s_filtered.empty:
            return ['' for _ in s_numeric]
        min_val = s_filtered.min()
        return ['background-color: #d4605b' if v == min_val else '' for v in s_numeric]

    df_styler = df.style
    for col in df.columns:
        if col != 'Model':
            numeric_col = pd.to_numeric(df[col], errors='coerce')
            if not numeric_col.isnull().all():
                df_styler = df_styler.apply(highlight_max, subset=[col], axis=0)
                df_styler = df_styler.apply(highlight_min, subset=[col], axis=0)
    return df_styler


def prepare_plot_data(df, all_cols=False):
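    """Shape leaderboard rows for plotting.

    With ``all_cols=True``, rank by the sum of all six score columns, keep
    the top ten, and add a zero-padded ``Ranked_Model`` label; otherwise
    rank by MMLU + IFEval for the scatter plot.
    """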
    df_plot = df.copy()

    if not df_plot.empty:
        if all_cols:
            score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college',
                             'mmlu_high_school', 'mmlu_other']
            for col in score_columns:
                df_plot[col] = pd.to_numeric(df_plot[col], errors='coerce').fillna(0)
            df_plot['Total_Score'] = df_plot[score_columns].sum(axis=1)
            df_plot = df_plot.sort_values(by='Total_Score', ascending=False).reset_index(drop=True)
            df_plot = df_plot.head(10)
            df_plot['Ranked_Model'] = [f"{i + 1:02d}. {model}" for i, model in enumerate(df_plot['Model'])]
        else:
            df_plot['MMLU_IFEval_Combined'] = df_plot['MMLU'].fillna(0) + df_plot['IFEval'].fillna(0)
            df_plot = df_plot.sort_values(by='MMLU_IFEval_Combined', ascending=False).reset_index(drop=True)

    return df_plot


# Load once at import time; the UI below filters this cached frame.
initial_df = load_leaderboard_data()
display_cols = ['Model', 'IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college',
                'mmlu_high_school', 'mmlu_other']
display_df = initial_df[display_cols].copy()
for col in display_df.columns:
    if col != 'Model':
        display_df[col] = pd.to_numeric(display_df[col], errors='coerce').fillna(0)


with gr.Blocks() as demo:
    gr.Markdown("# Model Leaderboard")

    def update_plots(selected_models):
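        """Refresh all plots and the table for the dropdown selection.

        An empty selection means "show every model".
        """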
        if not selected_models:
            df_to_plot = initial_df
        else:
            df_to_plot = initial_df[initial_df['Model'].isin(selected_models)]

        scatter_plot_df = prepare_plot_data(df_to_plot.copy(), all_cols=False)

        # Pad the axis limits so points never sit on the plot border.
        padding_factor = 0.1
        min_padding = 0.05

        if not scatter_plot_df.empty:
            x_min, x_max = scatter_plot_df['MMLU'].min(), scatter_plot_df['MMLU'].max()
            x_range = x_max - x_min
            x_padding = max(x_range * padding_factor, min_padding) if x_range > 0 else min_padding
            x_lim = [x_min - x_padding, x_max + x_padding]

            y_min, y_max = scatter_plot_df['IFEval'].min(), scatter_plot_df['IFEval'].max()
            y_range = y_max - y_min
            y_padding = max(y_range * padding_factor, min_padding) if y_range > 0 else min_padding
            y_lim = [y_min - y_padding, y_max + y_padding]
        else:
            x_lim = [0, 1]
            y_lim = [0, 1]
            scatter_plot_df = pd.DataFrame(columns=['Model', 'MMLU', 'IFEval', 'MMLU_IFEval_Combined'])

        scatter_plot_update = gr.ScatterPlot(
            value=scatter_plot_df,
            x="MMLU",
            y="IFEval",
            color="Model",
            title="Model Performance",
            x_lim=x_lim,
            y_lim=y_lim,
        )

        bar_plot_df = prepare_plot_data(df_to_plot.copy(), all_cols=True)

        # Wide-to-long melt so each (model, benchmark) pair becomes one bar.
        if not bar_plot_df.empty:
            value_vars = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college',
                          'mmlu_high_school', 'mmlu_other']
            melted_df = bar_plot_df.melt(id_vars='Ranked_Model', value_vars=value_vars,
                                         var_name='Benchmark', value_name='Score')
        else:
            melted_df = pd.DataFrame(columns=['Ranked_Model', 'Benchmark', 'Score'])

        bar_plot_update = gr.BarPlot(
            value=melted_df,
            x="Score",
            y="Ranked_Model",
            color="Benchmark",
            title="MMLU and IFEval Scores by Model",
            x_title="Score",
            y_title="Model",
            color_legend_title="Benchmark",
            vertical=False,
        )

        benchmark_plot_update = create_benchmark_plot(df_to_plot)

        if not selected_models:
            df_to_display = display_df
        else:
            df_to_display = display_df[display_df['Model'].isin(selected_models)]
        styled_df = style_diff(df_to_display, initial_df)

        return scatter_plot_update, bar_plot_update, benchmark_plot_update, styled_df

    with gr.Accordion("Plots", open=True):
        with gr.Tabs():
            with gr.TabItem("Summary Plots"):
                with gr.Row():
                    scatter_plot_df = prepare_plot_data(initial_df.copy(), all_cols=False)

                    # Same padding scheme as update_plots, for the initial render.
                    padding_factor = 0.1
                    min_padding = 0.05

                    x_min, x_max = scatter_plot_df['MMLU'].min(), scatter_plot_df['MMLU'].max()
                    x_range = x_max - x_min
                    x_padding = max(x_range * padding_factor, min_padding)
                    x_lim = [x_min - x_padding, x_max + x_padding]

                    y_min, y_max = scatter_plot_df['IFEval'].min(), scatter_plot_df['IFEval'].max()
                    y_range = y_max - y_min
                    y_padding = max(y_range * padding_factor, min_padding)
                    y_lim = [y_min - y_padding, y_max + y_padding]

                    scatterplot = gr.ScatterPlot(
                        value=scatter_plot_df,
                        x="MMLU",
                        y="IFEval",
                        color="Model",
                        title="Model Performance",
                        x_lim=x_lim,
                        y_lim=y_lim,
                    )

                    bar_plot_df = prepare_plot_data(initial_df.copy(), all_cols=True)
                    value_vars = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college',
                                  'mmlu_high_school', 'mmlu_other']
                    melted_df = bar_plot_df.melt(id_vars='Ranked_Model', value_vars=value_vars,
                                                 var_name='Benchmark', value_name='Score')

                    barplot = gr.BarPlot(
                        value=melted_df,
                        x="Score",
                        y="Ranked_Model",
                        color="Benchmark",
                        title="MMLU and IFEval Scores by Model",
                        x_title="Score",
                        y_title="Model",
                        color_legend_title="Benchmark",
                        vertical=False,
                    )
            with gr.TabItem("Benchmark Comparison"):
                with gr.Row():
                    benchmark_plot = gr.Plot(value=create_benchmark_plot(initial_df))

    model_names = initial_df["Model"].tolist()
    model_selector = gr.Dropdown(
        choices=model_names,
        label="Select Models to Display",
        multiselect=True,
        info="Select one or more models to display on the plots. If none are selected, all models will be shown."
    )

    with gr.Row():
        dataframe = gr.DataFrame(
            value=style_diff(display_df, initial_df),
            type="pandas",
            column_widths=["30%", "10%", "10%", "12%", "10%", "10%", "10%"],
            wrap=True
        )

    model_selector.change(update_plots, inputs=model_selector,
                          outputs=[scatterplot, barplot, benchmark_plot, dataframe])


if __name__ == "__main__":
    # Local launch; e.g. demo.launch(share=True) would expose a temporary public link.
    demo.launch()