File size: 1,595 Bytes
1146e96
 
 
 
 
 
fe1a8cf
a3311e7
 
1146e96
 
fe1a8cf
1146e96
 
 
 
66a9b43
fe1a8cf
68fefb1
 
1146e96
fe1a8cf
1146e96
fe1a8cf
 
1146e96
 
 
 
fe1a8cf
1146e96
 
fe1a8cf
a3311e7
d22d443
68fefb1
 
a3311e7
fe1a8cf
a3311e7
d22d443
 
 
a3311e7
fe1a8cf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
"""
Data loading functionality for the Tox21 leaderboard.
Handles loading and processing results from HuggingFace datasets.
"""

import pandas as pd
from datasets import load_dataset, Dataset
from config.settings import RESULTS_DATASET, TEST_DATASET, HF_TOKEN
from config.tasks import TOX21_TASKS


def load_leaderboard_data() -> Dataset:
    """
    Fetch the leaderboard results and return their "test" split.

    Loads ``RESULTS_DATASET`` via ``load_dataset`` and prints progress
    information along the way (dataset name, whether ``HF_TOKEN`` is set,
    the available keys, the number of entries and the first entry).

    Returns:
        The "test" split of the results dataset.

    Raises:
        ValueError: If the loaded dataset has no "test" split.
    """
    print(f"Loading dataset: {RESULTS_DATASET}")
    print(f"Using HF token: {'Yes' if HF_TOKEN else 'No'}")

    # No token is passed here; per the original note, authentication is
    # expected to be set up globally via login in settings.
    splits = load_dataset(RESULTS_DATASET)
    print(f"Dataset loaded successfully. Keys: {splits.keys()}")

    # Results are expected in the test split; fail early when it is absent.
    if "test" not in splits:
        raise ValueError("Dataset does not contain a 'test' split.")

    test_split = splits["test"]
    entry_count = len(test_split)
    print(f"Test split has {entry_count} entries")

    # Show the first row for debugging, but only when there is one.
    if entry_count > 0:
        first_entry = test_split[0]
        print(f"First entry keys: {first_entry.keys()}")
        print(f"First entry: {first_entry}")

    return test_split


def load_test_dataset() -> tuple[list[str], list[dict[str, float]]]:
    """
    Load the SMILES strings and per-task labels of the test dataset.

    Loads the "test" split of ``TEST_DATASET`` and extracts the ``"smiles"``
    column plus, for every row, the columns named by the ``key`` attribute
    of each entry in ``TOX21_TASKS``.

    Returns:
        A ``(smiles, labels)`` tuple. ``smiles`` is the list of values from
        the ``"smiles"`` column. ``labels`` holds one dict per dataset row,
        in row order, mapping each task key to that row's value.
    """
    # Get test smiles and labels (token already set globally via login in settings)
    dset = load_dataset(TEST_DATASET, split="test")

    tasks = [t.key for t in TOX21_TASKS]
    smiles = list(dset["smiles"])
    # Iterate the dataset directly: copying it into a list first only builds
    # a throwaway copy of every row.
    labels = [{task: sample[task] for task in tasks} for sample in dset]
    print("Loaded test dataset")
    return smiles, labels