tox21_leaderboard/backend/submission.py
from typing import Any, Dict, List

from sklearn.metrics import average_precision_score, roc_auc_score

from config.tasks import TOX21_TASKS

from .dataset_storage import save_submission_to_dataset
from .evaluator import evaluate_model
from .schema import create_submission_record

async def process_submission(
    model_name: str,
    hf_space_tag: str,
    model_description: str,
    organization: str,
    model_size: int,  # Changed from str to int
    publication_title: str,
    publication_link: str,
    pretrained: bool,
    pretraining_data: str,
    zero_shot: bool,
    few_shot: bool,
    n_shot: str,
    smiles_list: List[str],
    true_labels: Dict[str, Dict[str, float]],
) -> Dict[str, Any]:
    """Process a complete submission from evaluation to metrics computation."""
    # Step 1: Evaluate the model on the benchmark SMILES
    evaluation_result = await evaluate_model(hf_space_tag, smiles_list)

    # Step 2: Compute metrics against the true labels
    metrics = compute_metrics(evaluation_result["results"], true_labels)

    # Step 3: Create the submission record
    record = create_submission_record(
        model_name=model_name,
        hf_space_tag=hf_space_tag,
        model_description=model_description,
        organization=organization,
        model_size=model_size,
        publication_title=publication_title,
        publication_link=publication_link,
        pretrained=pretrained,
        pretraining_data=pretraining_data,
        zero_shot=zero_shot,
        few_shot=few_shot,
        n_shot=n_shot,
        raw_predictions=evaluation_result["results"],
        computed_metrics=metrics,
        status="completed",
        approved=False,
    )

    # Step 4: Save to the HuggingFace dataset
    save_submission_to_dataset(record)
    return record
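
# Illustrative driver sketch (not part of the original module): how a caller,
# for example an async Gradio/FastAPI handler, might invoke process_submission.
# Every concrete value below (space tag, SMILES, task key "NR-AR", etc.) is an
# assumption for illustration only, not the leaderboard's real data.
#
#     import asyncio
#
#     record = asyncio.run(
#         process_submission(
#             model_name="demo-model",
#             hf_space_tag="user/demo-space",
#             model_description="toy example",
#             organization="demo-org",
#             model_size=10_000_000,
#             publication_title="",
#             publication_link="",
#             pretrained=True,
#             pretraining_data="none",
#             zero_shot=True,
#             few_shot=False,
#             n_shot="0",
#             smiles_list=["CCO", "c1ccccc1"],
#             true_labels={"CCO": {"NR-AR": 0.0}, "c1ccccc1": {"NR-AR": 1.0}},
#         )
#     )
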
def compute_metrics(
    predictions: List[Dict[str, Any]], true_labels: Dict[str, Dict[str, float]]
) -> Dict[str, Any]:
    """Compute per-task ROC-AUC and delta-AUPRC, plus their overall averages."""
    task_metrics = {}
    if predictions:
        tasks = [task.key for task in TOX21_TASKS]
        for task in tasks:
            # Arrange labels and predictions for evaluation, matched by SMILES
            y_true = []
            y_pred = []
            for sample in predictions:
                smiles = sample["smiles"]
                label = true_labels.get(smiles, {}).get(task)
                if label is None:
                    continue
                y_true.append(int(label))
                y_pred.append(sample["raw_predictions"][task])

            # Both classes must be present, otherwise the metrics are undefined
            if len(set(y_true)) < 2:
                continue

            # Delta-AUPRC: improvement over a random classifier, whose AUPRC
            # equals the positive-class prevalence
            random_clf_auprc = sum(y_true) / len(y_true)
            auprc_score = average_precision_score(y_true=y_true, y_score=y_pred)
            delta_auprc_score = auprc_score - random_clf_auprc
            roc_auc_score_ = roc_auc_score(y_true=y_true, y_score=y_pred)
            task_metrics[task] = {
                "roc_auc": roc_auc_score_,
                "delta_auprc": delta_auprc_score,
            }

    # Overall score (average of the per-task scores)
    if task_metrics:
        overall_roc_auc_score = sum(
            m["roc_auc"] for m in task_metrics.values()
        ) / len(task_metrics)
        overall_delta_auprc_score = sum(
            m["delta_auprc"] for m in task_metrics.values()
        ) / len(task_metrics)
        task_metrics["overall_score"] = {
            "roc_auc": overall_roc_auc_score,
            "delta_auprc": overall_delta_auprc_score,
        }
    return task_metrics
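
# Illustrative self-check (not part of the original module). Shows the data
# shapes compute_metrics expects: predictions as a list of {"smiles",
# "raw_predictions"} dicts and true_labels keyed by SMILES string. The task
# key "NR-AR" is an assumption; the real keys come from config.tasks.TOX21_TASKS,
# and any task without matching labels or with a single label class is skipped.
# Because of the relative imports above, run this as a module (e.g.
# `python -m backend.submission`, depending on the package layout).
if __name__ == "__main__":
    demo_predictions = [
        {"smiles": "CCO", "raw_predictions": {"NR-AR": 0.1}},
        {"smiles": "c1ccccc1", "raw_predictions": {"NR-AR": 0.9}},
        {"smiles": "CCN", "raw_predictions": {"NR-AR": 0.7}},
    ]
    demo_labels = {
        "CCO": {"NR-AR": 0.0},
        "c1ccccc1": {"NR-AR": 1.0},
        "CCN": {"NR-AR": 1.0},
    }
    print(compute_metrics(demo_predictions, demo_labels))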