"""
Content management for the Tox21 leaderboard frontend.
Contains all text, styling, and data formatting - separated from UI layout.
"""
import pandas as pd
from typing import Dict, List
from config.tasks import TOX21_TASKS, get_task_groups
from config.settings import APP_TITLE, APP_DESCRIPTION
class LeaderboardContent:
"""Content for the leaderboard tab"""
title = "Tox21 Leaderboard 🧪"
subtitle = "Measuring AI progress in Drug Discovery"
@staticmethod
def get_header_html() -> str:
"""Generate header HTML"""
return f"""
"""
@staticmethod
def get_info_html() -> str:
"""Generate info section HTML"""
return """
Avg. AUC: Mean ROC-AUC across all 12 tasks
Avg. ΔAUC-PR: Mean ΔAUC-PR across all 12 tasks
Rank: based on Avg. AUC
Type: 0️⃣ Zero-shot | 1️⃣ Few-shot | ⤵️ Pre-trained | 🔼 Models trained from scratch
"""
class AboutContent:
"""Content for the about tab"""
@staticmethod
def get_markdown_content() -> str:
"""Generate about page markdown content"""
return f"""
# About the Tox21 Leaderboard
{APP_DESCRIPTION}
## Overview
The **Tox21 Leaderboard** provides a standardized and reproducible evaluation framework for molecular toxicity prediction models.
It restores the original evaluation protocol of the 2014–2015 **Tox21 Data Challenge**, ensuring that modern models can be compared under identical conditions using the original test set of 647 compounds.
This leaderboard addresses inconsistencies introduced by later benchmark integrations (e.g., MoleculeNet, TDC, OGB), where the dataset was altered through label imputation, resampling, and metric changes.
By aligning evaluation with the original challenge data, the leaderboard enables a faithful assessment of genuine progress in bioactivity modeling over the past decade.
## How it works
- **Hosted on Hugging Face Spaces:** The leaderboard Space stores the original Tox21 test set and orchestrates evaluation.
- **Model submission:** Participants provide a public Hugging Face Space exposing a `FastAPI` endpoint that accepts SMILES strings and returns predicted probabilities for the 12 toxicity endpoints.
- **Evaluation process:** The leaderboard sends the test compounds to the model's API, receives predictions, computes per-endpoint AUC scores, and appends results to a public results dataset (sketched below).
- **Manual approval:** Each submission is verified by the maintainers for completeness, correctness, and reproducibility before publication.
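Conceptually, the exchange between the leaderboard and a submitted Space looks roughly like this (a minimal sketch: the Space URL and molecules are placeholders; only the `/predict` route and the `smiles` field follow the submission format described below):

```python
import requests

# Placeholder URL; each submission exposes its own public Hugging Face Space.
SPACE_URL = "https://username-model-name.hf.space"

test_smiles = ["CCO", "c1ccccc1"]  # illustrative molecules, not the actual test set

# The leaderboard POSTs the test compounds to the Space ...
response = requests.post(SPACE_URL + "/predict", json=dict(smiles=test_smiles))

# ... and receives one predicted probability per molecule-endpoint pair,
# e.g. predictions["CCO"]["NR-AR"] might be 0.12.
predictions = response.json()
```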
## FastAPI template
To simplify participation, we provide a minimal **FastAPI template** that defines a `/predict` endpoint.
Developers only need to adapt the `predict_fn()` function to include their model's preprocessing and inference logic; a condensed sketch of the endpoint is shown below.
This interface ensures:
- Compatibility with the leaderboard orchestrator,
- Transparent, reproducible evaluation, and
- External accessibility for research or industry partners.
Example implementation: [`ml-jku/tox21_gin_classifier`](https://huggingface.co/spaces/ml-jku/tox21_gin_classifier).
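In outline, such a Space wraps the model behind a single route. The following is a minimal sketch, not the template itself: apart from the `/predict` route and the request/response shapes, all names are illustrative.

```python
from typing import Dict, List

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

class PredictRequest(BaseModel):
    smiles: List[str]

def predict_fn(smiles: List[str]) -> Dict[str, Dict[str, float]]:
    # Model-specific part that participants adapt: preprocessing, featurization,
    # and inference, returning one probability per SMILES and per Tox21 endpoint.
    raise NotImplementedError

@app.post("/predict")
def predict(request: PredictRequest) -> Dict[str, Dict[str, float]]:
    return predict_fn(request.smiles)
```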
## Evaluation protocol
- **Task:** 12 binary toxicity classification endpoints (NR and SR assays).
- **Input:** SMILES strings of the 647 original Tox21 test molecules.
- **Output:** Probabilities in `[0, 1]` for each molecule–endpoint pair.
- **Metric:** ROC-AUC per endpoint, averaged across all 12 tasks (macro AUC); see the sketch below.
- **Integrity:** The original split and label sparsity are preserved; no imputation or data alteration is applied.
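In code, the metric reduces to roughly the following (a sketch assuming `y_true` and `y_pred` are arrays of shape `(n_molecules, 12)`, with `NaN` marking the missing labels that the protocol preserves):

```python
import numpy as np
from sklearn.metrics import roc_auc_score

def macro_auc(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    per_task = []
    for task in range(y_true.shape[1]):
        labelled = ~np.isnan(y_true[:, task])  # score only molecules labelled for this assay
        per_task.append(roc_auc_score(y_true[labelled, task], y_pred[labelled, task]))
    return float(np.mean(per_task))  # reported as Avg. AUC on the leaderboard
```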
## Baselines
The leaderboard includes reference implementations of key model families:
- Descriptor-based models (e.g., DeepTox, SNN)
- Graph neural networks (e.g., GIN, Chemprop)
- Classical machine learning (e.g., RF, XGBoost)
- Foundation models (e.g., TabPFN, GPT-OSS)
These baselines form the foundation for future community submissions and progress tracking.
## Responsible use
This leaderboard is intended for **research benchmarking only**.
Predictions are not suitable for clinical or regulatory decision-making without experimental validation.
## Citation
If you use this leaderboard in your research, please cite:
```bibtex
-
```
"""
class SubmissionContent:
"""Content for the submission tab"""
emoji = "🚀"
title = "Submit Your Model"
form_labels = {
"model_name": "*Model Name",
"hf_space_tag": "*HuggingFace Space Tag",
"model_description": "*Model Description",
"organization": "*Organization",
"model_size": "*Model Size",
"publication_title": "*Publication Title",
"publication_link": "*Publication Link",
"pretrained": "*Pretrained (y/n)",
"zero_shot": "*Zero shot (y/n)",
"n_shot": "*N-shot",
"few_shot": "*Few-shot (y/n)",
"pretraining_data": "*Pretraining Data"
}
form_placeholders = {
"model_name": "e.g., AwesomeTox",
"hf_space_tag": "e.g., username/model-name",
"model_description": "Brief description of your model architecture and approach...",
"organization": "e.g., University of Example",
"model_size": "e.g., 150M",
"publication_title": "Title of associated paper",
"publication_link": "https://arxiv.org/abs/...",
"pretrained": "Yes/No",
"zero_shot": "Yes/No",
"n_shot": "e.g. 5",
"few_shot": "Yes/No",
"pretraining_data": "e.g., ChEMBL 29, ZINC-15"
}
form_info = {
"model_name": "A short, descriptive name for your model",
"hf_space_tag": "Your HuggingFace space in format: username/space-name",
"model_description": "Describe your model, methodology, and key features"
}
@staticmethod
def get_instructions_html() -> str:
"""Generate submission instructions HTML"""
return """
Submit your HuggingFace Space for evaluation on the Tox21 benchmark.
Requirements:
- Your Space must implement two HTTP endpoints: /metadata and /predict.
- The /predict endpoint must be implemented as a predict function in predict.py. It must accept a JSON payload of the form {"smiles": [list_of_smiles]}.
- The /predict endpoint must return a dictionary mapping each SMILES string to a dictionary of task scores: {"smiles": {"task": pred, ...}, ...} (see the example below).
- Your Space must provide a file called train.py which implements the training process of the model used in predict.py.
For a detailed explanation have a look at this Example Space.
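For illustration, a single request/response exchange could look as follows (a sketch only: the probabilities are made up, the task keys follow the 12 Tox21 assay names, and only three tasks are shown per molecule):

```python
# Request body sent to /predict
payload = {"smiles": ["CCO", "c1ccccc1"]}

# Expected response shape: one probability per SMILES string and per task
response = {
    "CCO":      {"NR-AR": 0.12, "NR-AR-LBD": 0.05, "SR-ARE": 0.31},  # ...remaining 9 tasks
    "c1ccccc1": {"NR-AR": 0.44, "NR-AR-LBD": 0.21, "SR-ARE": 0.09},  # ...remaining 9 tasks
}
```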
Evaluation flow:
- After submission, the leaderboard will automatically send evaluation SMILES to your /predict endpoint.
- The returned predictions will be used to compute the benchmark metrics.
- The automatic evaluation after submission can take a few moments. If the submission is valid, a success message is displayed beneath the form; otherwise an error message is shown. Before the scores enter the public leaderboard they are verified manually, so results will not be visible on the leaderboard immediately after a successful submission.
* Required fields
"""