hiitsmeme committed on
Commit f986893 · 1 Parent(s): b25d2b6

added grover code, hf api files

Dockerfile ADDED
@@ -0,0 +1,16 @@
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+ # you will also find guides on how best to write your Dockerfile
+
+ FROM python:3.11.4
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user . /app
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
generate_features.py CHANGED
@@ -6,4 +6,5 @@ from src.commands import generate_features
  TRAIN_CSV = "./tox21/tox21_train_clean.csv"
  VAL_CSV = "./tox21/tox21_validation_clean.csv"
 
- generate_features(EXAMPLES_CSV, EXAMPLES_CSV.replace('.csv', '.npz'))
+ generate_features(TRAIN_CSV, TRAIN_CSV.replace('.csv', '.npz'))
+ generate_features(VAL_CSV, VAL_CSV.replace('.csv', '.npz'))
grover DELETED
@@ -1 +0,0 @@
- Subproject commit 3f280d7d3419a781d303b1500c7039e37a1d87a2
grover/data/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from grover.data.molfeaturegenerator import get_available_features_generators, get_features_generator
+ from grover.data.molgraph import BatchMolGraph, get_atom_fdim, get_bond_fdim, mol2graph
+ from grover.data.molgraph import MolGraph, BatchMolGraph, MolCollator
+ from grover.data.moldataset import MoleculeDataset, MoleculeDatapoint
+ from grover.data.scaler import StandardScaler
+
+ # from .utils import load_features, save_features
grover/data/dist_sampler.py ADDED
@@ -0,0 +1,137 @@
+ """
+ The re-implemented distributed sampler for the distributed training of GROVER.
+ """
+ import math
+ import time
+ import torch
+ from torch.utils.data.sampler import Sampler
+ import torch.distributed as dist
+
+
+ class DistributedSampler(Sampler):
+     """Sampler that restricts data loading to a subset of the dataset.
+
+     It is especially useful in conjunction with
+     :class:`torch.nn.parallel.DistributedDataParallel`. In such a case, each
+     process can pass a DistributedSampler instance as a DataLoader sampler,
+     and load a subset of the original dataset that is exclusive to it.
+
+     .. note::
+         Dataset is assumed to be of constant size.
+
+     Arguments:
+         dataset: Dataset used for sampling.
+         num_replicas (optional): Number of processes participating in
+             distributed training.
+         rank (optional): Rank of the current process within num_replicas.
+     """
+
+     def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, sample_per_file=None):
+         if num_replicas is None:
+             if not dist.is_available():
+                 raise RuntimeError("Requires distributed package to be available")
+             num_replicas = dist.get_world_size()
+         if rank is None:
+             if not dist.is_available():
+                 raise RuntimeError("Requires distributed package to be available")
+             rank = dist.get_rank()
+         self.dataset = dataset
+         self.num_replicas = num_replicas
+         self.rank = rank
+         self.epoch = 0
+         self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
+         self.total_size = self.num_samples * self.num_replicas
+         self.sample_per_file = sample_per_file
+         self.shuffle = shuffle
+
+     def get_indices(self):
+
+         indices = list(range(len(self.dataset)))
+
+         if self.sample_per_file is not None:
+             indices = self.sub_indices_of_rank(indices)
+         else:
+             # add extra samples to make it evenly divisible
+             indices += indices[:(self.total_size - len(indices))]
+             assert len(indices) == self.total_size
+             # subsample
+             s = self.rank * self.num_samples
+             e = min((self.rank + 1) * self.num_samples, len(indices))
+
+             # indices = indices[self.rank:self.total_size:self.num_replicas]
+             indices = indices[s:e]
+
+         if self.shuffle:
+             g = torch.Generator()
+             # the seed must be an int; it depends on both the epoch and the rank
+             g.manual_seed(int((self.epoch + 1) * (self.rank + 1) * time.time()))
+             idx = torch.randperm(len(indices), generator=g).tolist()
+             indices = [indices[i] for i in idx]
+
+         # disabled since sub_indices_of_rank may change the per-rank sample count.
+         # assert len(indices) == self.num_samples
+
+         return indices
+
+     def sub_indices_of_rank(self, indices):
+
+         # fix the generator for each epoch
+         g = torch.Generator()
+         # All data should be loaded in each epoch.
+         g.manual_seed((self.epoch + 1) * 2 + 3)
+
+         # the fake file indices to cache
+         f_indices = list(range(int(math.ceil(len(indices) * 1.0 / self.sample_per_file))))
+         idx = torch.randperm(len(f_indices), generator=g).tolist()
+         f_indices = [f_indices[i] for i in idx]
+
+         file_per_rank = int(math.ceil(len(f_indices) * 1.0 / self.num_replicas))
+         # add extra fake files to make it evenly divisible
+         f_indices += f_indices[:(file_per_rank * self.num_replicas - len(f_indices))]
+
+         # divide the file indices by rank
+         rank_s = self.rank * file_per_rank
+         rank_e = min((self.rank + 1) * file_per_rank, len(f_indices))
+
+         # get the file indices for this rank
+         f_indices = f_indices[rank_s:rank_e]
+         # print("f_indices")
+         # print(f_indices)
+         res_indices = []
+         for fi in f_indices:
+             # get the real sample indices for this rank
+             si = fi * self.sample_per_file
+             ei = min((fi + 1) * self.sample_per_file, len(indices))
+             cur_idx = [indices[i] for i in range(si, ei)]
+             res_indices += cur_idx
+
+         self.num_samples = len(res_indices)
+         return res_indices
+
+     def __iter__(self):
+         return iter(self.get_indices())
+
+     def __len__(self):
+         return self.num_samples
+
+     def set_epoch(self, epoch):
+         self.epoch = epoch
+
+
+ if __name__ == "__main__":
+     # dataset = [1] * 9
+     # ds = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True)
+     # print(ds.get_indices())
+     # ds = DistributedSampler(dataset, num_replicas=2, rank=1, shuffle=True)
+     # print(ds.get_indices())
+
+     dataset = [1] * 190001
+     res = []
+     ds = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True, sample_per_file=777)
+     res.extend(ds.get_indices())
+     print(len(ds.get_indices()))
+     ds = DistributedSampler(dataset, num_replicas=2, rank=1, shuffle=True, sample_per_file=777)
+     res.extend(ds.get_indices())
+     print(len(ds.get_indices()))
+     print(len(set(res)))
+     print("hello")
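A minimal usage sketch (not part of this commit): wiring the sampler into a single-rank DataLoader over a toy dataset; the dataset, batch size, and world-size/rank values are placeholders.

# Illustration only: a toy dataset of 1000 integers split across 2 ranks.
from torch.utils.data import DataLoader

dataset = list(range(1000))
sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True)
loader = DataLoader(dataset, batch_size=32, sampler=sampler)
for epoch in range(2):
    sampler.set_epoch(epoch)  # vary the shuffling seed between epochs
    for batch in loader:
        pass  # each rank iterates over its own 500-sample shard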
grover/data/groverdataset.py ADDED
@@ -0,0 +1,247 @@
+ """
+ The dataset used in training GROVER.
+ """
+ import math
+ import os
+ import csv
+ from typing import Union, List
+ import numpy as np
+ import torch
+ from torch.utils.data.dataset import Dataset
+ from rdkit import Chem
+
+ import grover.util.utils as feautils
+ from grover.data import mol2graph
+ from grover.data.moldataset import MoleculeDatapoint
+ from grover.data.task_labels import atom_to_vocab, bond_to_vocab
+
+
+ def get_data(data_path, logger=None):
+     """
+     Load data from the data_path.
+     :param data_path: the data_path.
+     :param logger: the logger.
+     :return:
+     """
+     debug = logger.debug if logger is not None else print
+     summary_path = os.path.join(data_path, "summary.txt")
+     smiles_path = os.path.join(data_path, "graph")
+     feature_path = os.path.join(data_path, "feature")
+
+     fin = open(summary_path)
+     n_files = int(fin.readline().strip().split(":")[-1])
+     n_samples = int(fin.readline().strip().split(":")[-1])
+     sample_per_file = int(fin.readline().strip().split(":")[-1])
+     debug("Loading data:")
+     debug("Number of files: %d" % n_files)
+     debug("Number of samples: %d" % n_samples)
+     debug("Samples/file: %d" % sample_per_file)
+
+     datapoints = []
+     for i in range(n_files):
+         smiles_path_i = os.path.join(smiles_path, str(i) + ".csv")
+         feature_path_i = os.path.join(feature_path, str(i) + ".npz")
+         n_samples_i = sample_per_file if i != (n_files - 1) else n_samples % sample_per_file
+         datapoints.append(BatchDatapoint(smiles_path_i, feature_path_i, n_samples_i))
+     return BatchMolDataset(datapoints), sample_per_file
+
+
+ def split_data(data,
+                split_type='random',
+                sizes=(0.8, 0.1, 0.1),
+                seed=0,
+                logger=None):
+     """
+     Split data with the given train/validation/test ratio.
+     :param data:
+     :param split_type:
+     :param sizes:
+     :param seed:
+     :param logger:
+     :return:
+     """
+     assert len(sizes) == 3 and sum(sizes) == 1
+
+     if split_type == "random":
+         data.shuffle(seed=seed)
+         data = data.data
+
+         train_size = int(sizes[0] * len(data))
+         train_val_size = int((sizes[0] + sizes[1]) * len(data))
+
+         train = data[:train_size]
+         val = data[train_size:train_val_size]
+         test = data[train_val_size:]
+
+         return BatchMolDataset(train), BatchMolDataset(val), BatchMolDataset(test)
+     else:
+         raise NotImplementedError("Do not support %s splits" % split_type)
+
+
+ class BatchDatapoint:
+     def __init__(self,
+                  smiles_file,
+                  feature_file,
+                  n_samples,
+                  ):
+         self.smiles_file = smiles_file
+         self.feature_file = feature_file
+         # deal with the graph count of the last batch.
+         self.n_samples = n_samples
+         self.datapoints = None
+
+     def load_datapoints(self):
+         features = self.load_feature()
+         self.datapoints = []
+
+         with open(self.smiles_file) as f:
+             reader = csv.reader(f)
+             next(reader)
+             for i, line in enumerate(reader):
+                 # line = line[0]
+                 d = MoleculeDatapoint(line=line,
+                                       features=features[i])
+                 self.datapoints.append(d)
+
+         assert len(self.datapoints) == self.n_samples
+
+     def load_feature(self):
+         return feautils.load_features(self.feature_file)
+
+     def shuffle(self):
+         pass
+
+     def clean_cache(self):
+         del self.datapoints
+         self.datapoints = None
+
+     def __len__(self):
+         return self.n_samples
+
+     def __getitem__(self, idx):
+         assert self.datapoints is not None
+         return self.datapoints[idx]
+
+     def is_loaded(self):
+         return self.datapoints is not None
+
+
+ class BatchMolDataset(Dataset):
+     def __init__(self, data: List[BatchDatapoint],
+                  graph_per_file=None):
+         self.data = data
+
+         self.len = 0
+         for d in self.data:
+             self.len += len(d)
+         if graph_per_file is not None:
+             self.sample_per_file = graph_per_file
+         else:
+             self.sample_per_file = len(self.data[0]) if len(self.data) != 0 else None
+
+     def shuffle(self, seed: int = None):
+         pass
+
+     def clean_cache(self):
+         for d in self.data:
+             d.clean_cache()
+
+     def __len__(self) -> int:
+         return self.len
+
+     def __getitem__(self, idx) -> Union[MoleculeDatapoint, List[MoleculeDatapoint]]:
+         # print(idx)
+         dp_idx = int(idx / self.sample_per_file)
+         real_idx = idx % self.sample_per_file
+         return self.data[dp_idx][real_idx]
+
+     def load_data(self, idx):
+         dp_idx = int(idx / self.sample_per_file)
+         if not self.data[dp_idx].is_loaded():
+             self.data[dp_idx].load_datapoints()
+
+     def count_loaded_datapoints(self):
+         res = 0
+         for d in self.data:
+             if d.is_loaded():
+                 res += 1
+         return res
+
+
+ class GroverCollator(object):
+     def __init__(self, shared_dict, atom_vocab, bond_vocab, args):
+         self.args = args
+         self.shared_dict = shared_dict
+         self.atom_vocab = atom_vocab
+         self.bond_vocab = bond_vocab
+
+     def atom_random_mask(self, smiles_batch):
+         """
+         Perform the random mask operation on atoms.
+         :param smiles_batch:
+         :return: The corresponding atom labels.
+         """
+         # There is a zero padding.
+         vocab_label = [0]
+         percent = 0.15
+         for smi in smiles_batch:
+             mol = Chem.MolFromSmiles(smi)
+             mlabel = [0] * mol.GetNumAtoms()
+             n_mask = math.ceil(mol.GetNumAtoms() * percent)
+             perm = np.random.permutation(mol.GetNumAtoms())[:n_mask]
+             for p in perm:
+                 atom = mol.GetAtomWithIdx(int(p))
+                 mlabel[p] = self.atom_vocab.stoi.get(atom_to_vocab(mol, atom), self.atom_vocab.other_index)
+
+             vocab_label.extend(mlabel)
+         return vocab_label
+
+     def bond_random_mask(self, smiles_batch):
+         """
+         Perform the random mask operation on bonds.
+         :param smiles_batch:
+         :return: The corresponding bond labels.
+         """
+         # There is a zero padding.
+         vocab_label = [0]
+         percent = 0.15
+         for smi in smiles_batch:
+             mol = Chem.MolFromSmiles(smi)
+             nm_atoms = mol.GetNumAtoms()
+             nm_bonds = mol.GetNumBonds()
+             mlabel = []
+             n_mask = math.ceil(nm_bonds * percent)
+             perm = np.random.permutation(nm_bonds)[:n_mask]
+             virtual_bond_id = 0
+             for a1 in range(nm_atoms):
+                 for a2 in range(a1 + 1, nm_atoms):
+                     bond = mol.GetBondBetweenAtoms(a1, a2)
+
+                     if bond is None:
+                         continue
+                     if virtual_bond_id in perm:
+                         label = self.bond_vocab.stoi.get(bond_to_vocab(mol, bond), self.bond_vocab.other_index)
+                         mlabel.extend([label])
+                     else:
+                         mlabel.extend([0])
+
+                     virtual_bond_id += 1
+             # todo: might need to consider bond_drop_rate
+             # todo: double check reverse bond
+             vocab_label.extend(mlabel)
+         return vocab_label
+
+     def __call__(self, batch):
+         smiles_batch = [d.smiles for d in batch]
+         batchgraph = mol2graph(smiles_batch, self.shared_dict, self.args).get_components()
+
+         atom_vocab_label = torch.Tensor(self.atom_random_mask(smiles_batch)).long()
+         bond_vocab_label = torch.Tensor(self.bond_random_mask(smiles_batch)).long()
+         fgroup_label = torch.Tensor([d.features for d in batch]).float()
+         # there may be some masking here
+         res = {"graph_input": batchgraph,
+                "targets": {"av_task": atom_vocab_label,
+                            "bv_task": bond_vocab_label,
+                            "fg_task": fgroup_label}
+                }
+         return res
grover/data/moldataset.py ADDED
@@ -0,0 +1,245 @@
+ """
+ The molecule dataset for finetuning.
+ This implementation is adapted from
+ https://github.com/chemprop/chemprop/blob/master/chemprop/data/data.py
+ """
+ import random
+ from argparse import Namespace
+ from typing import Callable, List, Union
+
+ import numpy as np
+ from rdkit import Chem
+ from torch.utils.data.dataset import Dataset
+
+ from grover.data.molfeaturegenerator import get_features_generator
+ from grover.data.scaler import StandardScaler
+
+
+ class MoleculeDatapoint:
+     """A MoleculeDatapoint contains a single molecule and its associated features and targets."""
+
+     def __init__(self,
+                  line: List[str],
+                  args: Namespace = None,
+                  features: np.ndarray = None,
+                  use_compound_names: bool = False):
+         """
+         Initializes a MoleculeDatapoint, which contains a single molecule.
+
+         :param line: A list of strings generated by separating a line in a data CSV file by comma.
+         :param args: Arguments.
+         :param features: A numpy array containing additional features (e.g. a Morgan fingerprint).
+         :param use_compound_names: Whether the data CSV includes the compound name on each line.
+         """
+         self.features_generator = None
+         self.args = None
+         if args is not None:
+             if hasattr(args, "features_generator"):
+                 self.features_generator = args.features_generator
+             self.args = args
+
+         if features is not None and self.features_generator is not None:
+             raise ValueError('Currently cannot provide both loaded features and a features generator.')
+
+         self.features = features
+
+         if use_compound_names:
+             self.compound_name = line[0]  # str
+             line = line[1:]
+         else:
+             self.compound_name = None
+
+         self.smiles = line[0]  # str
+
+         # Generate additional features if given a generator
+         if self.features_generator is not None:
+             self.features = []
+             mol = Chem.MolFromSmiles(self.smiles)
+             for fg in self.features_generator:
+                 features_generator = get_features_generator(fg)
+                 if mol is not None and mol.GetNumHeavyAtoms() > 0:
+                     if fg in ['morgan', 'morgan_count']:
+                         self.features.extend(features_generator(mol, num_bits=args.num_bits))
+                     else:
+                         self.features.extend(features_generator(mol))
+
+             self.features = np.array(self.features)
+
+         # Fix nans in features
+         if self.features is not None:
+             replace_token = 0
+             self.features = np.where(np.isnan(self.features), replace_token, self.features)
+
+         # Create targets
+         self.targets = [float(x) if x != '' else None for x in line[1:]]
+
+     def set_features(self, features: np.ndarray):
+         """
+         Sets the features of the molecule.
+
+         :param features: A 1-D numpy array of features for the molecule.
+         """
+         self.features = features
+
+     def num_tasks(self) -> int:
+         """
+         Returns the number of prediction tasks.
+
+         :return: The number of tasks.
+         """
+         return len(self.targets)
+
+     def set_targets(self, targets: List[float]):
+         """
+         Sets the targets of a molecule.
+
+         :param targets: A list of floats containing the targets.
+         """
+         self.targets = targets
+
+
+ class MoleculeDataset(Dataset):
+     """A MoleculeDataset contains a list of molecules and their associated features and targets."""
+
+     def __init__(self, data: List[MoleculeDatapoint]):
+         """
+         Initializes a MoleculeDataset, which contains a list of MoleculeDatapoints (i.e. a list of molecules).
+
+         :param data: A list of MoleculeDatapoints.
+         """
+         self.data = data
+         self.args = self.data[0].args if len(self.data) > 0 else None
+         self.scaler = None
+
+     def compound_names(self) -> List[str]:
+         """
+         Returns the compound names associated with the molecules (if they exist).
+
+         :return: A list of compound names or None if the dataset does not contain compound names.
+         """
+         if len(self.data) == 0 or self.data[0].compound_name is None:
+             return None
+
+         return [d.compound_name for d in self.data]
+
+     def smiles(self) -> List[str]:
+         """
+         Returns the SMILES strings associated with the molecules.
+
+         :return: A list of SMILES strings.
+         """
+         return [d.smiles for d in self.data]
+
+     def features(self) -> List[np.ndarray]:
+         """
+         Returns the features associated with each molecule (if they exist).
+
+         :return: A list of 1D numpy arrays containing the features for each molecule or None if there are no features.
+         """
+         if len(self.data) == 0 or self.data[0].features is None:
+             return None
+
+         return [d.features for d in self.data]
+
+     def targets(self) -> List[List[float]]:
+         """
+         Returns the targets associated with each molecule.
+
+         :return: A list of lists of floats containing the targets.
+         """
+         return [d.targets for d in self.data]
+
+     def num_tasks(self) -> int:
+         """
+         Returns the number of prediction tasks.
+
+         :return: The number of tasks.
+         """
+         if self.args.dataset_type == 'multiclass':
+             return int(max([i[0] for i in self.targets()])) + 1
+         else:
+             return self.data[0].num_tasks() if len(self.data) > 0 else None
+
+     def features_size(self) -> int:
+         """
+         Returns the size of the features array associated with each molecule.
+
+         :return: The size of the features.
+         """
+         return len(self.data[0].features) if len(self.data) > 0 and self.data[0].features is not None else None
+
+     def shuffle(self, seed: int = None):
+         """
+         Shuffles the dataset.
+
+         :param seed: Optional random seed.
+         """
+         if seed is not None:
+             random.seed(seed)
+         random.shuffle(self.data)
+
+     def normalize_features(self, scaler: StandardScaler = None, replace_nan_token: int = 0) -> StandardScaler:
+         """
+         Normalizes the features of the dataset using a StandardScaler (subtract mean, divide by standard deviation).
+
+         If a scaler is provided, uses that scaler to perform the normalization. Otherwise fits a scaler to the
+         features in the dataset and then performs the normalization.
+
+         :param scaler: A fitted StandardScaler. Used if provided. Otherwise a StandardScaler is fit on
+                        this dataset and is then used.
+         :param replace_nan_token: What to replace nans with.
+         :return: A fitted StandardScaler. If a scaler is provided, this is the same scaler. Otherwise, this is
+                  a scaler fit on this dataset.
+         """
+         if len(self.data) == 0 or self.data[0].features is None:
+             return None
+
+         if scaler is not None:
+             self.scaler = scaler
+
+         elif self.scaler is None:
+             features = np.vstack([d.features for d in self.data])
+             self.scaler = StandardScaler(replace_nan_token=replace_nan_token)
+             self.scaler.fit(features)
+
+         for d in self.data:
+             d.set_features(self.scaler.transform(d.features.reshape(1, -1))[0])
+
+         return self.scaler
+
+     def set_targets(self, targets: List[List[float]]):
+         """
+         Sets the targets for each molecule in the dataset. Assumes the targets are aligned with the datapoints.
+
+         :param targets: A list of lists of floats containing targets for each molecule. This must be the
+                         same length as the underlying dataset.
+         """
+         assert len(self.data) == len(targets)
+         for i in range(len(self.data)):
+             self.data[i].set_targets(targets[i])
+
+     def sort(self, key: Callable):
+         """
+         Sorts the dataset using the provided key.
+
+         :param key: A function on a MoleculeDatapoint to determine the sorting order.
+         """
+         self.data.sort(key=key)
+
+     def __len__(self) -> int:
+         """
+         Returns the length of the dataset (i.e. the number of molecules).
+
+         :return: The length of the dataset.
+         """
+         return len(self.data)
+
+     def __getitem__(self, idx) -> Union[MoleculeDatapoint, List[MoleculeDatapoint]]:
+         """
+         Gets one or more MoleculeDatapoints via an index or slice.
+
+         :param idx: An index (int) or a slice object.
+         :return: A MoleculeDatapoint if an int is provided or a list of MoleculeDatapoints if a slice is provided.
+         """
+         return self.data[idx]
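A minimal sketch (not part of this commit) of how these classes fit together; the CSV-style rows below are made-up examples in the (smiles, target, ...) layout the class expects.

# Hypothetical illustration with two made-up rows.
rows = [["CCO", "0", "1"], ["c1ccccc1", "1", ""]]
datapoints = [MoleculeDatapoint(line=row) for row in rows]
dataset = MoleculeDataset(datapoints)
print(dataset.smiles())        # ['CCO', 'c1ccccc1']
print(dataset.targets())       # [[0.0, 1.0], [1.0, None]]  (empty target cells become None)
print(dataset[0].num_tasks())  # 2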
grover/data/molfeaturegenerator.py ADDED
@@ -0,0 +1,146 @@
+ """
+ The registered feature generator for molecules.
+ This implementation is adapted from
+ https://github.com/chemprop/chemprop/blob/master/chemprop/features/features_generators.py
+ """
+
+ from typing import Callable, List, Union
+
+ import numpy as np
+ from rdkit import Chem, DataStructs
+ from rdkit.Chem import AllChem
+
+ Molecule = Union[str, Chem.Mol]
+ FeaturesGenerator = Callable[[Molecule], np.ndarray]
+ FEATURES_GENERATOR_REGISTRY = {}
+
+
+ def register_features_generator(features_generator_name: str) -> Callable[[FeaturesGenerator], FeaturesGenerator]:
+     """
+     Registers a features generator.
+
+     :param features_generator_name: The name to call the FeaturesGenerator.
+     :return: A decorator which will add a FeaturesGenerator to the registry using the specified name.
+     """
+     def decorator(features_generator: FeaturesGenerator) -> FeaturesGenerator:
+         FEATURES_GENERATOR_REGISTRY[features_generator_name] = features_generator
+         return features_generator
+
+     return decorator
+
+
+ def get_features_generator(features_generator_name: str) -> FeaturesGenerator:
+     """
+     Gets a registered FeaturesGenerator by name.
+
+     :param features_generator_name: The name of the FeaturesGenerator.
+     :return: The desired FeaturesGenerator.
+     """
+     if features_generator_name not in FEATURES_GENERATOR_REGISTRY:
+         raise ValueError(f'Features generator "{features_generator_name}" could not be found. '
+                          f'If this generator relies on rdkit features, you may need to install descriptastorus.')
+
+     return FEATURES_GENERATOR_REGISTRY[features_generator_name]
+
+
+ def get_available_features_generators() -> List[str]:
+     """Returns the names of available features generators."""
+     return list(FEATURES_GENERATOR_REGISTRY.keys())
+
+
+ MORGAN_RADIUS = 2
+ MORGAN_NUM_BITS = 2048
+
+
+ @register_features_generator('morgan')
+ def morgan_binary_features_generator(mol: Molecule,
+                                      radius: int = MORGAN_RADIUS,
+                                      num_bits: int = MORGAN_NUM_BITS) -> np.ndarray:
+     """
+     Generates a binary Morgan fingerprint for a molecule.
+
+     :param mol: A molecule (i.e. either a SMILES string or an RDKit molecule).
+     :param radius: Morgan fingerprint radius.
+     :param num_bits: Number of bits in Morgan fingerprint.
+     :return: A 1-D numpy array containing the binary Morgan fingerprint.
+     """
+     mol = Chem.MolFromSmiles(mol) if type(mol) == str else mol
+     features_vec = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)
+     features = np.zeros((1,))
+     DataStructs.ConvertToNumpyArray(features_vec, features)
+
+     return features
+
+
+ @register_features_generator('morgan_count')
+ def morgan_counts_features_generator(mol: Molecule,
+                                      radius: int = MORGAN_RADIUS,
+                                      num_bits: int = MORGAN_NUM_BITS) -> np.ndarray:
+     """
+     Generates a counts-based Morgan fingerprint for a molecule.
+
+     :param mol: A molecule (i.e. either a SMILES string or an RDKit molecule).
+     :param radius: Morgan fingerprint radius.
+     :param num_bits: Number of bits in Morgan fingerprint.
+     :return: A 1D numpy array containing the counts-based Morgan fingerprint.
+     """
+     mol = Chem.MolFromSmiles(mol) if type(mol) == str else mol
+     features_vec = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=num_bits)
+     features = np.zeros((1,))
+     DataStructs.ConvertToNumpyArray(features_vec, features)
+
+     return features
+
+
+ try:
+     from descriptastorus.descriptors import rdDescriptors, rdNormalizedDescriptors
+
+     @register_features_generator('rdkit_2d')
+     def rdkit_2d_features_generator(mol: Molecule) -> np.ndarray:
+         """
+         Generates RDKit 2D features for a molecule.
+
+         :param mol: A molecule (i.e. either a SMILES string or an RDKit molecule).
+         :return: A 1D numpy array containing the RDKit 2D features.
+         """
+         smiles = Chem.MolToSmiles(mol, isomericSmiles=True) if type(mol) != str else mol
+         generator = rdDescriptors.RDKit2D()
+         features = generator.process(smiles)[1:]
+
+         return features
+
+     @register_features_generator('rdkit_2d_normalized')
+     def rdkit_2d_features_normalized_generator(mol: Molecule) -> np.ndarray:
+         """
+         Generates RDKit 2D normalized features for a molecule.
+
+         :param mol: A molecule (i.e. either a SMILES string or an RDKit molecule).
+         :return: A 1D numpy array containing the RDKit 2D normalized features.
+         """
+         smiles = Chem.MolToSmiles(mol, isomericSmiles=True) if type(mol) != str else mol
+         generator = rdNormalizedDescriptors.RDKit2DNormalized()
+         features = generator.process(smiles)[1:]
+         return features
+ except ImportError:
+     pass
+
+ """
+ Custom features generator template.
+
+ Note: The name you use to register the features generator is the name
+ you will specify on the command line when using the --features_generator <name> flag.
+ Ex. python train.py ... --features_generator custom ...
+
+ @register_features_generator('custom')
+ def custom_features_generator(mol: Molecule) -> np.ndarray:
+     # If you want to use the SMILES string
+     smiles = Chem.MolToSmiles(mol, isomericSmiles=True) if type(mol) != str else mol
+
+     # If you want to use the RDKit molecule
+     mol = Chem.MolFromSmiles(mol) if type(mol) == str else mol
+
+     # Replace this with code which generates features from the molecule
+     features = np.array([0, 0, 1])
+
+     return features
+ """
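A short sketch (not part of this commit) of looking up a registered generator and featurizing a SMILES string.

# Illustration only: fetch the 'morgan' generator registered above and apply it to a SMILES string.
gen = get_features_generator('morgan')
fp = gen('CCO')  # 2048-element binary Morgan fingerprint as a numpy array
print(fp.shape, int(fp.sum()))
print(get_available_features_generators())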
grover/data/molgraph.py ADDED
@@ -0,0 +1,378 @@
+ """
+ The data structure of Molecules.
+ This implementation is adapted from
+ https://github.com/chemprop/chemprop/blob/master/chemprop/features/featurization.py
+ """
+ from argparse import Namespace
+ from typing import List, Tuple, Union
+
+ import numpy as np
+ import torch
+ from rdkit import Chem
+
+ # Atom feature sizes
+ MAX_ATOMIC_NUM = 100
+
+
+ ATOM_FEATURES = {
+     'atomic_num': list(range(MAX_ATOMIC_NUM)),
+     'degree': [0, 1, 2, 3, 4, 5],
+     'formal_charge': [-1, -2, 1, 2, 0],
+     'chiral_tag': [0, 1, 2, 3],
+     'num_Hs': [0, 1, 2, 3, 4],
+     'hybridization': [
+         Chem.rdchem.HybridizationType.SP,
+         Chem.rdchem.HybridizationType.SP2,
+         Chem.rdchem.HybridizationType.SP3,
+         Chem.rdchem.HybridizationType.SP3D,
+         Chem.rdchem.HybridizationType.SP3D2
+     ],
+ }
+
+ # len(choices) + 1 to include room for uncommon values; + 2 at end for IsAromatic and mass
+ ATOM_FDIM = sum(len(choices) + 1 for choices in ATOM_FEATURES.values()) + 2
+ BOND_FDIM = 14
+
+
+ def get_atom_fdim() -> int:
+     """
+     Gets the dimensionality of atom features.
+     """
+     return ATOM_FDIM + 18
+
+
+ def get_bond_fdim() -> int:
+     """
+     Gets the dimensionality of bond features.
+     """
+     return BOND_FDIM
+
+
+ def onek_encoding_unk(value: int, choices: List[int]) -> List[int]:
+     """
+     Creates a one-hot encoding.
+
+     :param value: The value for which the encoding should be one.
+     :param choices: A list of possible values.
+     :return: A one-hot encoding of the value in a list of length len(choices) + 1.
+              If value is not in the list of choices, then the final element in the encoding is 1.
+     """
+     encoding = [0] * (len(choices) + 1)
+     if min(choices) < 0:
+         index = value
+     else:
+         index = choices.index(value) if value in choices else -1
+     encoding[index] = 1
+
+     return encoding
+
+
+ class MolGraph:
+     """
+     A MolGraph represents the graph structure and featurization of a single molecule.
+
+     A MolGraph computes the following attributes:
+     - smiles: Smiles string.
+     - n_atoms: The number of atoms in the molecule.
+     - n_bonds: The number of bonds in the molecule.
+     - f_atoms: A mapping from an atom index to a list of atom features.
+     - f_bonds: A mapping from a bond index to a list of bond features.
+     - a2b: A mapping from an atom index to a list of incoming bond indices.
+     - b2a: A mapping from a bond index to the index of the atom the bond originates from.
+     - b2revb: A mapping from a bond index to the index of the reverse bond.
+     """
+
+     def __init__(self, smiles: str, args: Namespace):
+         """
+         Computes the graph structure and featurization of a molecule.
+
+         :param smiles: A smiles string.
+         :param args: Arguments.
+         """
+         self.smiles = smiles
+         self.args = args
+         self.n_atoms = 0  # number of atoms
+         self.n_bonds = 0  # number of bonds
+         self.f_atoms = []  # mapping from atom index to atom features
+         self.f_bonds = []  # mapping from bond index to concat(in_atom, bond) features
+         self.a2b = []  # mapping from atom index to incoming bond indices
+         self.b2a = []  # mapping from bond index to the index of the atom the bond is coming from
+         self.b2revb = []  # mapping from bond index to the index of the reverse bond
+
+         # Convert smiles to molecule
+         mol = Chem.MolFromSmiles(smiles)
+
+         self.hydrogen_donor = Chem.MolFromSmarts("[$([N;!H0;v3,v4&+1]),$([O,S;H1;+0]),n&H1&+0]")
+         self.hydrogen_acceptor = Chem.MolFromSmarts(
+             "[$([O,S;H1;v2;!$(*-*=[O,N,P,S])]),$([O,S;H0;v2]),$([O,S;-]),$([N;v3;!$(N-*=[O,N,P,S])]),"
+             "n&H0&+0,$([o,s;+0;!$([o,s]:n);!$([o,s]:c:n)])]")
+         self.acidic = Chem.MolFromSmarts("[$([C,S](=[O,S,P])-[O;H1,-1])]")
+         self.basic = Chem.MolFromSmarts(
+             "[#7;+,$([N;H2&+0][$([C,a]);!$([C,a](=O))]),$([N;H1&+0]([$([C,a]);!$([C,a](=O))])[$([C,a]);"
+             "!$([C,a](=O))]),$([N;H0&+0]([C;!$(C(=O))])([C;!$(C(=O))])[C;!$(C(=O))])]")
+
+         self.hydrogen_donor_match = sum(mol.GetSubstructMatches(self.hydrogen_donor), ())
+         self.hydrogen_acceptor_match = sum(mol.GetSubstructMatches(self.hydrogen_acceptor), ())
+         self.acidic_match = sum(mol.GetSubstructMatches(self.acidic), ())
+         self.basic_match = sum(mol.GetSubstructMatches(self.basic), ())
+         self.ring_info = mol.GetRingInfo()
+
+         # fake the number of "atoms" if we are collapsing substructures
+         self.n_atoms = mol.GetNumAtoms()
+
+         # Get atom features
+         for _, atom in enumerate(mol.GetAtoms()):
+             self.f_atoms.append(self.atom_features(atom))
+         self.f_atoms = [self.f_atoms[i] for i in range(self.n_atoms)]
+
+         for _ in range(self.n_atoms):
+             self.a2b.append([])
+
+         # Get bond features
+         for a1 in range(self.n_atoms):
+             for a2 in range(a1 + 1, self.n_atoms):
+                 bond = mol.GetBondBetweenAtoms(a1, a2)
+
+                 if bond is None:
+                     continue
+
+                 if args.bond_drop_rate > 0:
+                     if np.random.binomial(1, args.bond_drop_rate):
+                         continue
+
+                 f_bond = self.bond_features(bond)
+
+                 # Always treat the bond as directed.
+                 self.f_bonds.append(self.f_atoms[a1] + f_bond)
+                 self.f_bonds.append(self.f_atoms[a2] + f_bond)
+
+                 # Update index mappings
+                 b1 = self.n_bonds
+                 b2 = b1 + 1
+                 self.a2b[a2].append(b1)  # b1 = a1 --> a2
+                 self.b2a.append(a1)
+                 self.a2b[a1].append(b2)  # b2 = a2 --> a1
+                 self.b2a.append(a2)
+                 self.b2revb.append(b2)
+                 self.b2revb.append(b1)
+                 self.n_bonds += 2
+
+     def atom_features(self, atom: Chem.rdchem.Atom) -> List[Union[bool, int, float]]:
+         """
+         Builds a feature vector for an atom.
+
+         :param atom: An RDKit atom.
+         :return: A list containing the atom features.
+         """
+         features = onek_encoding_unk(atom.GetAtomicNum() - 1, ATOM_FEATURES['atomic_num']) + \
+                    onek_encoding_unk(atom.GetTotalDegree(), ATOM_FEATURES['degree']) + \
+                    onek_encoding_unk(atom.GetFormalCharge(), ATOM_FEATURES['formal_charge']) + \
+                    onek_encoding_unk(int(atom.GetChiralTag()), ATOM_FEATURES['chiral_tag']) + \
+                    onek_encoding_unk(int(atom.GetTotalNumHs()), ATOM_FEATURES['num_Hs']) + \
+                    onek_encoding_unk(int(atom.GetHybridization()), ATOM_FEATURES['hybridization']) + \
+                    [1 if atom.GetIsAromatic() else 0] + \
+                    [atom.GetMass() * 0.01]
+         atom_idx = atom.GetIdx()
+         features = features + \
+                    onek_encoding_unk(atom.GetImplicitValence(), [0, 1, 2, 3, 4, 5, 6]) + \
+                    [atom_idx in self.hydrogen_acceptor_match] + \
+                    [atom_idx in self.hydrogen_donor_match] + \
+                    [atom_idx in self.acidic_match] + \
+                    [atom_idx in self.basic_match] + \
+                    [self.ring_info.IsAtomInRingOfSize(atom_idx, 3),
+                     self.ring_info.IsAtomInRingOfSize(atom_idx, 4),
+                     self.ring_info.IsAtomInRingOfSize(atom_idx, 5),
+                     self.ring_info.IsAtomInRingOfSize(atom_idx, 6),
+                     self.ring_info.IsAtomInRingOfSize(atom_idx, 7),
+                     self.ring_info.IsAtomInRingOfSize(atom_idx, 8)]
+         return features
+
+     def bond_features(self, bond: Chem.rdchem.Bond) -> List[Union[bool, int, float]]:
+         """
+         Builds a feature vector for a bond.
+
+         :param bond: An RDKit bond.
+         :return: A list containing the bond features.
+         """
+         if bond is None:
+             fbond = [1] + [0] * (BOND_FDIM - 1)
+         else:
+             bt = bond.GetBondType()
+             fbond = [
+                 0,  # bond is not None
+                 bt == Chem.rdchem.BondType.SINGLE,
+                 bt == Chem.rdchem.BondType.DOUBLE,
+                 bt == Chem.rdchem.BondType.TRIPLE,
+                 bt == Chem.rdchem.BondType.AROMATIC,
+                 (bond.GetIsConjugated() if bt is not None else 0),
+                 (bond.IsInRing() if bt is not None else 0)
+             ]
+             fbond += onek_encoding_unk(int(bond.GetStereo()), list(range(6)))
+         return fbond
+
+
+ class BatchMolGraph:
+     """
+     A BatchMolGraph represents the graph structure and featurization of a batch of molecules.
+
+     A BatchMolGraph contains the attributes of a MolGraph plus:
+     - smiles_batch: A list of smiles strings.
+     - n_mols: The number of molecules in the batch.
+     - atom_fdim: The dimensionality of the atom features.
+     - bond_fdim: The dimensionality of the bond features (technically the combined atom/bond features).
+     - a_scope: A list of tuples indicating the start and end atom indices for each molecule.
+     - b_scope: A list of tuples indicating the start and end bond indices for each molecule.
+     - max_num_bonds: The maximum number of bonds neighboring an atom in this batch.
+     - b2b: (Optional) A mapping from a bond index to incoming bond indices.
+     - a2a: (Optional) A mapping from an atom index to neighboring atom indices.
+     """
+
+     def __init__(self, mol_graphs: List[MolGraph], args: Namespace):
+         self.smiles_batch = [mol_graph.smiles for mol_graph in mol_graphs]
+         self.n_mols = len(self.smiles_batch)
+
+         self.atom_fdim = get_atom_fdim()
+         self.bond_fdim = get_bond_fdim() + self.atom_fdim
+
+         # Start n_atoms and n_bonds at 1 b/c zero padding
+         self.n_atoms = 1  # number of atoms (start at 1 b/c need index 0 as padding)
+         self.n_bonds = 1  # number of bonds (start at 1 b/c need index 0 as padding)
+         self.a_scope = []  # list of tuples indicating (start_atom_index, num_atoms) for each molecule
+         self.b_scope = []  # list of tuples indicating (start_bond_index, num_bonds) for each molecule
+
+         # All start with zero padding so that indexing with zero padding returns zeros
+         f_atoms = [[0] * self.atom_fdim]  # atom features
+         f_bonds = [[0] * self.bond_fdim]  # combined atom/bond features
+         a2b = [[]]  # mapping from atom index to incoming bond indices
+         b2a = [0]  # mapping from bond index to the index of the atom the bond is coming from
+         b2revb = [0]  # mapping from bond index to the index of the reverse bond
+
+         for mol_graph in mol_graphs:
+             f_atoms.extend(mol_graph.f_atoms)
+             f_bonds.extend(mol_graph.f_bonds)
+
+             for a in range(mol_graph.n_atoms):
+                 a2b.append([b + self.n_bonds for b in mol_graph.a2b[a]])
+
+             for b in range(mol_graph.n_bonds):
+                 b2a.append(self.n_atoms + mol_graph.b2a[b])
+                 b2revb.append(self.n_bonds + mol_graph.b2revb[b])
+
+             self.a_scope.append((self.n_atoms, mol_graph.n_atoms))
+             self.b_scope.append((self.n_bonds, mol_graph.n_bonds))
+             self.n_atoms += mol_graph.n_atoms
+             self.n_bonds += mol_graph.n_bonds
+
+         # max with 1 to fix a crash in rare case of all single-heavy-atom mols
+         self.max_num_bonds = max(1, max(len(in_bonds) for in_bonds in a2b))
+
+         self.f_atoms = torch.FloatTensor(f_atoms)
+         self.f_bonds = torch.FloatTensor(f_bonds)
+         self.a2b = torch.LongTensor([a2b[a] + [0] * (self.max_num_bonds - len(a2b[a])) for a in range(self.n_atoms)])
+         self.b2a = torch.LongTensor(b2a)
+         self.b2revb = torch.LongTensor(b2revb)
+         self.b2b = None  # try to avoid computing b2b b/c O(n_atoms^3)
+         self.a2a = self.b2a[self.a2b]  # only needed if using atom messages
+         self.a_scope = torch.LongTensor(self.a_scope)
+         self.b_scope = torch.LongTensor(self.b_scope)
+
+     def set_new_atom_feature(self, f_atoms):
+         """
+         Sets the new atom features. Does not update the bond features.
+         :param f_atoms:
+         """
+         self.f_atoms = f_atoms
+
+     def get_components(self) -> Tuple[torch.FloatTensor, torch.FloatTensor,
+                                       torch.LongTensor, torch.LongTensor, torch.LongTensor,
+                                       List[Tuple[int, int]], List[Tuple[int, int]]]:
+         """
+         Returns the components of the BatchMolGraph.
+
+         :return: A tuple containing PyTorch tensors with the atom features, bond features, and graph structure
+                  and two lists indicating the scope of the atoms and bonds (i.e. which molecules they belong to).
+         """
+         return self.f_atoms, self.f_bonds, self.a2b, self.b2a, self.b2revb, self.a_scope, self.b_scope, self.a2a
+
+     def get_b2b(self) -> torch.LongTensor:
+         """
+         Computes (if necessary) and returns a mapping from each bond index to all the incoming bond indices.
+
+         :return: A PyTorch tensor containing the mapping from each bond index to all the incoming bond indices.
+         """
+         if self.b2b is None:
+             b2b = self.a2b[self.b2a]  # num_bonds x max_num_bonds
+             # b2b includes the reverse edge for each bond, so it needs to be masked out
+             revmask = (b2b != self.b2revb.unsqueeze(1).repeat(1, b2b.size(1))).long()  # num_bonds x max_num_bonds
+             self.b2b = b2b * revmask
+
+         return self.b2b
+
+     def get_a2a(self) -> torch.LongTensor:
+         """
+         Computes (if necessary) and returns a mapping from each atom index to all neighboring atom indices.
+
+         :return: A PyTorch tensor containing the mapping from each atom index to all neighboring atom indices.
+         """
+         if self.a2a is None:
+             # b = a1 --> a2
+             # a2b maps a2 to all incoming bonds b
+             # b2a maps each bond b to the atom it comes from a1
+             # thus b2a[a2b] maps atom a2 to neighboring atoms a1
+             self.a2a = self.b2a[self.a2b]  # num_atoms x max_num_bonds
+
+         return self.a2a
+
+
+ def mol2graph(smiles_batch: List[str], shared_dict,
+               args: Namespace) -> BatchMolGraph:
+     """
+     Converts a list of SMILES strings to a BatchMolGraph containing the batch of molecular graphs.
+
+     :param smiles_batch: A list of SMILES strings.
+     :param args: Arguments.
+     :return: A BatchMolGraph containing the combined molecular graph for the molecules.
+     """
+     mol_graphs = []
+     for smiles in smiles_batch:
+         if smiles in shared_dict:
+             mol_graph = shared_dict[smiles]
+         else:
+             mol_graph = MolGraph(smiles, args)
+             if not args.no_cache:
+                 shared_dict[smiles] = mol_graph
+         mol_graphs.append(mol_graph)
+
+     return BatchMolGraph(mol_graphs, args)
+
+
+ class MolCollator(object):
+     """
+     Collator for the PyTorch DataLoader.
+     :param shared_dict: a shared dict for multiprocessing.
+     :param args: Arguments.
+     """
+     def __init__(self, shared_dict, args):
+         self.args = args
+         self.shared_dict = shared_dict
+
+     def __call__(self, batch):
+         smiles_batch = [d.smiles for d in batch]
+         features_batch = [d.features for d in batch]
+         target_batch = [d.targets for d in batch]
+         batch_mol_graph = mol2graph(smiles_batch, self.shared_dict, self.args)
+         batch = batch_mol_graph.get_components()
+         mask = torch.Tensor([[x is not None for x in tb] for tb in target_batch])
+         targets = torch.Tensor([[0 if x is None else x for x in tb] for tb in target_batch])
+         return smiles_batch, batch, features_batch, mask, targets
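A small sketch (not part of this commit) of building a batched graph; the Namespace fields below (bond_drop_rate, no_cache) appear to be the only ones these code paths read.

# Illustration only: featurize two molecules into one BatchMolGraph.
from argparse import Namespace

args = Namespace(bond_drop_rate=0, no_cache=True)
batch = mol2graph(["CCO", "c1ccccc1"], shared_dict={}, args=args)
f_atoms, f_bonds, a2b, b2a, b2revb, a_scope, b_scope, a2a = batch.get_components()
print(f_atoms.shape)  # (1 + total atoms) x get_atom_fdim(), row 0 is zero padding
print(a_scope)        # per-molecule (start_atom_index, num_atoms)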
grover/data/scaler.py ADDED
@@ -0,0 +1,70 @@
+ """
+ The scaler for the regression task.
+ This implementation is adapted from
+ https://github.com/chemprop/chemprop/blob/master/chemprop/data/scaler.py
+ """
+ from typing import Any, List
+ import numpy as np
+
+
+ class StandardScaler:
+     """A StandardScaler normalizes a dataset.
+
+     When fit on a dataset, the StandardScaler learns the mean and standard deviation across the 0th axis.
+     When transforming a dataset, the StandardScaler subtracts the means and divides by the standard deviations.
+     """
+
+     def __init__(self, means: np.ndarray = None, stds: np.ndarray = None, replace_nan_token: Any = None):
+         """
+         Initialize StandardScaler, optionally with means and standard deviations precomputed.
+
+         :param means: An optional 1D numpy array of precomputed means.
+         :param stds: An optional 1D numpy array of precomputed standard deviations.
+         :param replace_nan_token: The token to use in place of nans.
+         """
+         self.means = means
+         self.stds = stds
+         self.replace_nan_token = replace_nan_token
+
+     def fit(self, X: List[List[float]]) -> 'StandardScaler':
+         """
+         Learns means and standard deviations across the 0th axis.
+
+         :param X: A list of lists of floats.
+         :return: The fitted StandardScaler.
+         """
+         X = np.array(X).astype(float)
+         self.means = np.nanmean(X, axis=0)
+         self.stds = np.nanstd(X, axis=0)
+         self.means = np.where(np.isnan(self.means), np.zeros(self.means.shape), self.means)
+         self.stds = np.where(np.isnan(self.stds), np.ones(self.stds.shape), self.stds)
+         self.stds = np.where(self.stds == 0, np.ones(self.stds.shape), self.stds)
+
+         return self
+
+     def transform(self, X: List[List[float]]):
+         """
+         Transforms the data by subtracting the means and dividing by the standard deviations.
+
+         :param X: A list of lists of floats.
+         :return: The transformed data.
+         """
+         X = np.array(X).astype(float)
+         transformed_with_nan = (X - self.means) / self.stds
+         transformed_with_none = np.where(np.isnan(transformed_with_nan), self.replace_nan_token, transformed_with_nan)
+
+         return transformed_with_none
+
+     def inverse_transform(self, X: List[List[float]]):
+         """
+         Performs the inverse transformation by multiplying by the standard deviations and adding the means.
+
+         :param X: A list of lists of floats.
+         :return: The inverse transformed data.
+         """
+         if isinstance(X, np.ndarray) or isinstance(X, list):
+             X = np.array(X).astype(float)
+         transformed_with_nan = X * self.stds + self.means
+         transformed_with_none = np.where(np.isnan(transformed_with_nan),
+                                          self.replace_nan_token, transformed_with_nan)
+         return transformed_with_none
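A minimal sketch (not part of this commit) of fitting the scaler and round-tripping a small feature matrix.

# Illustration only: fit on two feature rows, then transform and invert.
import numpy as np

scaler = StandardScaler(replace_nan_token=0)
X = np.array([[1.0, 2.0], [3.0, 6.0]])
scaler.fit(X)
Z = scaler.transform(X)             # zero mean, unit variance per column
print(Z)
print(scaler.inverse_transform(Z))  # recovers the original values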
grover/data/task_labels.py ADDED
@@ -0,0 +1,116 @@
+ """
+ The label generator for the pretraining.
+ """
+ from collections import Counter
+ from typing import Callable, Union
+
+ import numpy as np
+ from rdkit import Chem
+ from descriptastorus.descriptors import rdDescriptors
+
+ from grover.data.molfeaturegenerator import register_features_generator
+
+ Molecule = Union[str, Chem.Mol]
+ FeaturesGenerator = Callable[[Molecule], np.ndarray]
+
+ # The functional group descriptors in RDKit.
+ RDKIT_PROPS = ['fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN',
+                'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH', 'fr_Ar_OH', 'fr_COO', 'fr_COO2',
+                'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_NH0',
+                'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2',
+                'fr_Nhpyrrole', 'fr_SH', 'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide',
+                'fr_allylic_oxid', 'fr_amide', 'fr_amidine', 'fr_aniline', 'fr_aryl_methyl',
+                'fr_azide', 'fr_azo', 'fr_barbitur', 'fr_benzene', 'fr_benzodiazepine',
+                'fr_bicyclic', 'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 'fr_ester',
+                'fr_ether', 'fr_furan', 'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone',
+                'fr_imidazole', 'fr_imide', 'fr_isocyan', 'fr_isothiocyan', 'fr_ketone',
+                'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy', 'fr_morpholine',
+                'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho',
+                'fr_nitroso', 'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol',
+                'fr_phenol_noOrthoHbond', 'fr_phos_acid', 'fr_phos_ester', 'fr_piperdine',
+                'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd', 'fr_pyridine', 'fr_quatN',
+                'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole',
+                'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea']
+
+ BOND_FEATURES = ['BondType', 'Stereo', 'BondDir']
+
+
+ # BOND_FEATURES = ['BondType', 'Stereo']
+ # BOND_FEATURES = ['Stereo']
+
+ @register_features_generator('fgtasklabel')
+ def rdkit_functional_group_label_features_generator(mol: Molecule) -> np.ndarray:
+     """
+     Generates a functional group label for a molecule using RDKit.
+
+     :param mol: A molecule (i.e. either a SMILES string or an RDKit molecule).
+     :return: A 1D numpy array containing the binary functional group labels.
+     """
+     smiles = Chem.MolToSmiles(mol, isomericSmiles=True) if type(mol) != str else mol
+     generator = rdDescriptors.RDKit2D(RDKIT_PROPS)
+     features = generator.process(smiles)[1:]
+     features = np.array(features)
+     features[features != 0] = 1
+     return features
+
+
+ def atom_to_vocab(mol, atom):
+     """
+     Convert an atom to a vocabulary entry. The convention is based on atom type and bond type.
+     :param mol: the molecule.
+     :param atom: the target atom.
+     :return: the generated atom vocabulary entry with its contexts.
+     """
+     nei = Counter()
+     for a in atom.GetNeighbors():
+         bond = mol.GetBondBetweenAtoms(atom.GetIdx(), a.GetIdx())
+         nei[str(a.GetSymbol()) + "-" + str(bond.GetBondType())] += 1
+     keys = nei.keys()
+     keys = list(keys)
+     keys.sort()
+     output = atom.GetSymbol()
+     for k in keys:
+         output = "%s_%s%d" % (output, k, nei[k])
+
+     # The generated atom_vocab is too long?
+     return output
+
+
+ def bond_to_vocab(mol, bond):
+     """
+     Convert a bond to a vocabulary entry. The convention is based on atom type and bond type.
+     Considers one-hop neighbor atoms.
+     :param mol: the molecule.
+     :param bond: the target bond.
+     :return: the generated bond vocabulary entry with its contexts.
+     """
+     nei = Counter()
+     two_neighbors = (bond.GetBeginAtom(), bond.GetEndAtom())
+     two_indices = [a.GetIdx() for a in two_neighbors]
+     for nei_atom in two_neighbors:
+         for a in nei_atom.GetNeighbors():
+             a_idx = a.GetIdx()
+             if a_idx in two_indices:
+                 continue
+             tmp_bond = mol.GetBondBetweenAtoms(nei_atom.GetIdx(), a_idx)
+             nei[str(nei_atom.GetSymbol()) + '-' + get_bond_feature_name(tmp_bond)] += 1
+     keys = list(nei.keys())
+     keys.sort()
+     output = get_bond_feature_name(bond)
+     for k in keys:
+         output = "%s_%s%d" % (output, k, nei[k])
+     return output
+
+
+ def get_bond_feature_name(bond):
+     """
+     Return the string format of the bond features.
+     Bond features are surrounded with ().
+     """
+     ret = []
+     for bond_feature in BOND_FEATURES:
+         fea = eval(f"bond.Get{bond_feature}")()
+         ret.append(str(fea))
+
+     return '(' + '-'.join(ret) + ')'
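A short sketch (not part of this commit) of the contextual vocabulary strings these helpers produce for a simple molecule.

# Illustration only: context vocabulary entries for ethanol's atoms and bonds.
from rdkit import Chem

mol = Chem.MolFromSmiles("CCO")
print([atom_to_vocab(mol, atom) for atom in mol.GetAtoms()])
# e.g. ['C_C-SINGLE1', 'C_C-SINGLE1_O-SINGLE1', 'O_C-SINGLE1']
print([bond_to_vocab(mol, bond) for bond in mol.GetBonds()])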
grover/data/torchvocab.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The contextual property.
3
+ """
4
+ import pickle
5
+ from collections import Counter
6
+ from multiprocessing import Pool
7
+
8
+ import tqdm
9
+ from rdkit import Chem
10
+
11
+ from grover.data.task_labels import atom_to_vocab
12
+ from grover.data.task_labels import bond_to_vocab
13
+
14
+
15
+ class TorchVocab(object):
16
+ """
17
+ Defines the vocabulary for atoms/bonds in molecular.
18
+ """
19
+
20
+ def __init__(self, counter, max_size=None, min_freq=1, specials=('<pad>', '<other>'), vocab_type='atom'):
21
+ """
22
+
23
+ :param counter:
24
+ :param max_size:
25
+ :param min_freq:
26
+ :param specials:
27
+ :param vocab_type: 'atom': atom atom_vocab; 'bond': bond atom_vocab.
28
+ """
29
+ self.freqs = counter
30
+ counter = counter.copy()
31
+ min_freq = max(min_freq, 1)
32
+ if vocab_type in ('atom', 'bond'):
33
+ self.vocab_type = vocab_type
34
+ else:
35
+ raise ValueError('Wrong input for vocab_type!')
36
+ self.itos = list(specials)
37
+
38
+ max_size = None if max_size is None else max_size + len(self.itos)
39
+ # sort by frequency, then alphabetically
40
+ words_and_frequencies = sorted(counter.items(), key=lambda tup: tup[0])
41
+ words_and_frequencies.sort(key=lambda tup: tup[1], reverse=True)
42
+
43
+ for word, freq in words_and_frequencies:
44
+ if freq < min_freq or len(self.itos) == max_size:
45
+ break
46
+ self.itos.append(word)
47
+ # stoi is simply a reverse dict for itos
48
+ self.stoi = {tok: i for i, tok in enumerate(self.itos)}
49
+ self.other_index = 1
50
+ self.pad_index = 0
51
+
52
+ def __eq__(self, other):
53
+ if self.freqs != other.freqs:
54
+ return False
55
+ if self.stoi != other.stoi:
56
+ return False
57
+ if self.itos != other.itos:
58
+ return False
59
+ # if self.vectors != other.vectors:
60
+ # return False
61
+ return True
62
+
63
+ def __len__(self):
64
+ return len(self.itos)
65
+
66
+ def vocab_rerank(self):
67
+ self.stoi = {word: i for i, word in enumerate(self.itos)}
68
+
69
+ def extend(self, v, sort=False):
70
+ words = sorted(v.itos) if sort else v.itos
71
+ for w in words:
72
+ if w not in self.stoi:
73
+ self.itos.append(w)
74
+ self.stoi[w] = len(self.itos) - 1
75
+ self.freqs[w] = 0
76
+ self.freqs[w] += v.freqs[w]
77
+
78
+ def mol_to_seq(self, mol, with_len=False):
79
+ mol = Chem.MolFromSmiles(mol) if type(mol) == str else mol
80
+ if self.vocab_type == 'atom':
81
+ seq = [self.stoi.get(atom_to_vocab(mol, atom), self.other_index) for i, atom in enumerate(mol.GetAtoms())]
82
+ else:
83
+ seq = [self.stoi.get(bond_to_vocab(mol, bond), self.other_index) for i, bond in enumerate(mol.GetBonds())]
84
+ return (seq, len(seq)) if with_len else seq
85
+
86
+ @staticmethod
87
+ def load_vocab(vocab_path: str) -> 'TorchVocab':
88
+ with open(vocab_path, "rb") as f:
89
+ return pickle.load(f)
90
+
91
+ def save_vocab(self, vocab_path):
92
+ with open(vocab_path, "wb") as f:
93
+ pickle.dump(self, f)
94
+
95
+
96
+ class MolVocab(TorchVocab):
97
+ def __init__(self, smiles, max_size=None, min_freq=1, vocab_type='atom'):
98
+ if vocab_type in ('atom', 'bond'):
99
+ self.vocab_type = vocab_type
100
+ else:
101
+ raise ValueError('Wrong input for vocab_type!')
102
+
103
+ print("Building %s vocab from smiles: %d" % (self.vocab_type, len(smiles)))
104
+ counter = Counter()
105
+
106
+ for smi in tqdm.tqdm(smiles):
107
+ mol = Chem.MolFromSmiles(smi)
108
+ if self.vocab_type == 'atom':
109
+ for _, atom in enumerate(mol.GetAtoms()):
110
+ v = atom_to_vocab(mol, atom)
111
+ counter[v] += 1
112
+ else:
113
+ for _, bond in enumerate(mol.GetBonds()):
114
+ v = bond_to_vocab(mol, bond)
115
+ counter[v] += 1
116
+ super().__init__(counter, max_size=max_size, min_freq=min_freq, vocab_type=vocab_type)
117
+
118
+ def __init__(self, file_path, max_size=None, min_freq=1, num_workers=1, total_lines=None, vocab_type='atom'):
119
+ if vocab_type in ('atom', 'bond'):
120
+ self.vocab_type = vocab_type
121
+ else:
122
+ raise ValueError('Wrong input for vocab_type!')
123
+ print("Building %s vocab from file: %s" % (self.vocab_type, file_path))
124
+
125
+ from rdkit import RDLogger
126
+ lg = RDLogger.logger()
127
+ lg.setLevel(RDLogger.CRITICAL)
128
+
129
+ if total_lines is None:
130
+ def file_len(fname):
131
+ f_len = 0
132
+ with open(fname) as f:
133
+ for f_len, _ in enumerate(f):
134
+ pass
135
+ return f_len + 1
136
+
137
+ total_lines = file_len(file_path)
138
+
139
+ counter = Counter()
140
+ pbar = tqdm.tqdm(total=total_lines)
141
+ pool = Pool(num_workers)
142
+ res = []
143
+ batch = 50000
144
+ callback = lambda a: pbar.update(batch)
145
+ for i in range(int(total_lines / batch + 1)):
146
+ start = int(batch * i)
147
+ end = min(total_lines, batch * (i + 1))
148
+ # print("Start: %d, End: %d"%(start, end))
149
+ res.append(pool.apply_async(MolVocab.read_smiles_from_file,
150
+ args=(file_path, start, end, vocab_type,),
151
+ callback=callback))
152
+ # read_smiles_from_file(lock, file_path, start, end)
153
+ pool.close()
154
+ pool.join()
155
+ for r in res:
156
+ sub_counter = r.get()
157
+ for k in sub_counter:
158
+ if k not in counter:
159
+ counter[k] = 0
160
+ counter[k] += sub_counter[k]
161
+ # print(counter)
162
+ super().__init__(counter, max_size=max_size, min_freq=min_freq, vocab_type=vocab_type)
163
+
164
+ @staticmethod
165
+ def read_smiles_from_file(file_path, start, end, vocab_type):
166
+ # print("start")
167
+ smiles = open(file_path, "r")
168
+ smiles.readline()
169
+ sub_counter = Counter()
170
+ for i, smi in enumerate(smiles):
171
+ if i < start:
172
+ continue
173
+ if i >= end:
174
+ break
175
+ mol = Chem.MolFromSmiles(smi)
176
+ if vocab_type == 'atom':
177
+ for atom in mol.GetAtoms():
178
+ v = atom_to_vocab(mol, atom)
179
+ sub_counter[v] += 1
180
+ else:
181
+ for bond in mol.GetBonds():
182
+ v = bond_to_vocab(mol, bond)
183
+ sub_counter[v] += 1
184
+ # print("end")
185
+ return sub_counter
186
+
187
+ @staticmethod
188
+ def load_vocab(vocab_path: str) -> 'MolVocab':
189
+ with open(vocab_path, "rb") as f:
190
+ return pickle.load(f)
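Note that MolVocab defines __init__ twice, so only the second, file-based constructor is active at runtime; it expects a text file whose first line is a header and whose remaining lines are SMILES strings, and it counts atom or bond contexts in 50 000-line chunks via a multiprocessing pool. A minimal usage sketch (the file names are hypothetical, not part of the commit):

# Editorial sketch, not part of the commit.
from grover.data.torchvocab import MolVocab

atom_vocab = MolVocab("smiles.csv", min_freq=1, num_workers=4, vocab_type="atom")
print(len(atom_vocab))                      # entries including the '<pad>' and '<other>' specials
atom_vocab.save_vocab("atom_vocab.pkl")

# Reload later and map a molecule to vocabulary indices; unseen atoms fall back to '<other>'.
vocab = MolVocab.load_vocab("atom_vocab.pkl")
print(vocab.mol_to_seq("CCO"))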
grover/model/layers.py ADDED
@@ -0,0 +1,902 @@
1
+ """
2
+ The basic building blocks in model.
3
+ """
4
+ import math
5
+ from argparse import Namespace
6
+ from typing import Union
7
+
8
+ import numpy
9
+ import scipy.stats as stats
10
+ import torch
11
+ from torch import nn as nn
12
+ from torch.nn import LayerNorm, functional as F
13
+
14
+ from grover.util.nn_utils import get_activation_function, select_neighbor_and_aggregate
15
+
16
+
17
+ class SelfAttention(nn.Module):
18
+ """
19
+ Self-attention layer.
20
+ Given $X \in \mathbb{R}^{n \times in\_feature}$, the attention is calculated by $a = Softmax(W_2 \tanh(W_1 X^T))$, where
21
+ $W_1 \in \mathbb{R}^{hidden \times in\_feature}$ and $W_2 \in \mathbb{R}^{out\_feature \times hidden}$.
22
+ The final output is $out = aX$, whose size is independent of the number of inputs $n$.
23
+ """
24
+
25
+ def __init__(self, *, hidden, in_feature, out_feature):
26
+ """
27
+ The init function.
28
+ :param hidden: the hidden dimension, can be viewed as the number of experts.
29
+ :param in_feature: the input feature dimension.
30
+ :param out_feature: the output feature dimension.
31
+ """
32
+ super(SelfAttention, self).__init__()
33
+ self.w1 = torch.nn.Parameter(torch.FloatTensor(hidden, in_feature))
34
+ self.w2 = torch.nn.Parameter(torch.FloatTensor(out_feature, hidden))
35
+ self.reset_parameters()
36
+
37
+ def reset_parameters(self):
38
+ """
39
+ Use xavier_normal method to initialize parameters.
40
+ """
41
+ nn.init.xavier_normal_(self.w1)
42
+ nn.init.xavier_normal_(self.w2)
43
+
44
+ def forward(self, X):
45
+ """
46
+ The forward function.
47
+ :param X: The input feature map. $X \in \mathbb{R}^{n \times in_feature}$.
48
+ :return: The final embeddings and attention matrix.
49
+ """
50
+ x = torch.tanh(torch.matmul(self.w1, X.transpose(1, 0)))
51
+ x = torch.matmul(self.w2, x)
52
+ attn = torch.nn.functional.softmax(x, dim=-1)
53
+ x = torch.matmul(attn, X)
54
+ return x, attn
55
+
56
+
57
+ class Readout(nn.Module):
58
+ """The readout function. Convert the node embeddings to the graph embeddings."""
59
+
60
+ def __init__(self,
61
+ rtype: str = "none",
62
+ hidden_size: int = 0,
63
+ attn_hidden: int = None,
64
+ attn_out: int = None,
65
+ ):
66
+ """
67
+ The readout function.
68
+ :param rtype: readout type, can be "mean" and "self_attention".
69
+ :param hidden_size: input hidden size
70
+ :param attn_hidden: only valid if rtype == "self_attention". The attention hidden size.
71
+ :param attn_out: only valid if rtype == "self_attention". The attention out size.
72
+ :param args: legacy use.
73
+ """
74
+ super(Readout, self).__init__()
75
+ # Cached zeros
76
+ self.cached_zero_vector = nn.Parameter(torch.zeros(hidden_size), requires_grad=False)
77
+ self.rtype = "mean"
78
+
79
+ if rtype == "self_attention":
80
+ self.attn = SelfAttention(hidden=attn_hidden,
81
+ in_feature=hidden_size,
82
+ out_feature=attn_out)
83
+ self.rtype = "self_attention"
84
+
85
+ def forward(self, embeddings, scope):
86
+ """
87
+ The forward function, given a batch node/edge embedding and a scope list,
88
+ produce the graph-level embedding by a scope.
89
+ :param embeddings: The embedding matrix, num_atoms or num_bonds \times hidden_size.
90
+ :param scope: a list whose elements are pairs [start, size]: `start` is the index of the first atom/bond of a molecule in `embeddings` and `size` is the number of its atoms/bonds.
91
+ :return:
92
+ """
93
+ # Readout
94
+ mol_vecs = []
95
+ self.attns = []
96
+ for _, (a_start, a_size) in enumerate(scope):
97
+ if a_size == 0:
98
+ mol_vecs.append(self.cached_zero_vector)
99
+ else:
100
+ cur_hiddens = embeddings.narrow(0, a_start, a_size)
101
+ if self.rtype == "self_attention":
102
+ cur_hiddens, attn = self.attn(cur_hiddens)
103
+ cur_hiddens = cur_hiddens.flatten()
104
+ # Temporarily disable. Enable it if you want to save attentions.
105
+ # self.attns.append(attn.cpu().detach().numpy())
106
+ else:
107
+ cur_hiddens = cur_hiddens.sum(dim=0) / a_size
108
+ mol_vecs.append(cur_hiddens)
109
+
110
+ mol_vecs = torch.stack(mol_vecs, dim=0) # (num_molecules, hidden_size)
111
+ return mol_vecs
112
+
113
+
114
+ class MPNEncoder(nn.Module):
115
+ """A message passing neural network for encoding a molecule."""
116
+
117
+ def __init__(self, args: Namespace,
118
+ atom_messages: bool,
119
+ init_message_dim: int,
120
+ attached_fea_fdim: int,
121
+ hidden_size: int,
122
+ bias: bool,
123
+ depth: int,
124
+ dropout: float,
125
+ undirected: bool,
126
+ dense: bool,
127
+ aggregate_to_atom: bool,
128
+ attach_fea: bool,
129
+ input_layer="fc",
130
+ dynamic_depth='none'
131
+ ):
132
+ """
133
+ Initializes the MPNEncoder.
134
+ :param args: the arguments.
135
+ :param atom_messages: enables atom_messages or not.
136
+ :param init_message_dim: the initial input message dimension.
137
+ :param attached_fea_fdim: the attached feature dimension.
138
+ :param hidden_size: the output message dimension during message passing.
139
+ :param bias: the bias in the message passing.
140
+ :param depth: the message passing depth.
141
+ :param dropout: the dropout rate.
142
+ :param undirected: the message passing is undirected or not.
143
+ :param dense: enables the dense connections.
144
+ :param attach_fea: enables the feature attachment during the message passing process.
145
+ :param dynamic_depth: enables the dynamic depth. Possible choices: "none", "uniform" and "truncnorm"
146
+ """
147
+ super(MPNEncoder, self).__init__()
148
+ self.init_message_dim = init_message_dim
149
+ self.attached_fea_fdim = attached_fea_fdim
150
+ self.hidden_size = hidden_size
151
+ self.bias = bias
152
+ self.depth = depth
153
+ self.dropout = dropout
154
+ self.input_layer = input_layer
155
+ self.layers_per_message = 1
156
+ self.undirected = undirected
157
+ self.atom_messages = atom_messages
158
+ self.dense = dense
159
+ self.aggreate_to_atom = aggregate_to_atom
160
+ self.attached_fea = attach_fea
161
+ self.dynamic_depth = dynamic_depth
162
+
163
+ # Dropout
164
+ self.dropout_layer = nn.Dropout(p=self.dropout)
165
+
166
+ # Activation
167
+ self.act_func = get_activation_function(args.activation)
168
+
169
+ # Input
170
+ if self.input_layer == "fc":
171
+ input_dim = self.init_message_dim
172
+ self.W_i = nn.Linear(input_dim, self.hidden_size, bias=self.bias)
173
+
174
+ if self.attached_fea:
175
+ w_h_input_size = self.hidden_size + self.attached_fea_fdim
176
+ else:
177
+ w_h_input_size = self.hidden_size
178
+
179
+ # Shared weight matrix across depths (default)
180
+ self.W_h = nn.Linear(w_h_input_size, self.hidden_size, bias=self.bias)
181
+
182
+ def forward(self,
183
+ init_messages,
184
+ init_attached_features,
185
+ a2nei,
186
+ a2attached,
187
+ b2a=None,
188
+ b2revb=None,
189
+ adjs=None
190
+ ) -> torch.FloatTensor:
191
+ """
192
+ The forward function.
193
+ :param init_messages: initial massages, can be atom features or bond features.
194
+ :param init_attached_features: initial attached_features.
195
+ :param a2nei: the relation of item to its neighbors. For the atom message passing, a2nei = a2a. For bond
196
+ messages a2nei = a2b
197
+ :param a2attached: the relation of item to the attached features during message passing. For the atom message
198
+ passing, a2attached = a2b. For the bond message passing a2attached = a2a
199
+ :param b2a: remove the reversed bond in bond message passing
200
+ :param b2revb: remove the revered atom in bond message passing
201
+ :return: if aggreate_to_atom or self.atom_messages, return num_atoms x hidden.
202
+ Otherwise, return num_bonds x hidden
203
+ """
204
+
205
+ # Input
206
+ if self.input_layer == 'fc':
207
+ input = self.W_i(init_messages) # num_bonds x hidden_size # f_bond
208
+ message = self.act_func(input) # num_bonds x hidden_size
209
+ elif self.input_layer == 'none':
210
+ input = init_messages
211
+ message = input
212
+
213
+ attached_fea = init_attached_features # f_atom / f_bond
214
+
215
+ # dynamic depth
216
+ # uniform sampling from depth - 1 to depth + 1
217
+ # only works in training.
218
+ if self.training and self.dynamic_depth != "none":
219
+ if self.dynamic_depth == "uniform":
220
+ # uniform sampling
221
+ ndepth = numpy.random.randint(self.depth - 3, self.depth + 3)
222
+ else:
223
+ # truncnorm
224
+ mu = self.depth
225
+ sigma = 1
226
+ lower = mu - 3 * sigma
227
+ upper = mu + 3 * sigma
228
+ X = stats.truncnorm((lower - mu) / sigma, (upper - mu) / sigma, loc=mu, scale=sigma)
229
+ ndepth = int(X.rvs(1))
230
+ else:
231
+ ndepth = self.depth
232
+
233
+ # Message passing
234
+ for _ in range(ndepth - 1):
235
+ if self.undirected:
236
+ # two directions should be the same
237
+ message = (message + message[b2revb]) / 2
238
+
239
+ nei_message = select_neighbor_and_aggregate(message, a2nei)
240
+ a_message = nei_message
241
+ if self.attached_fea:
242
+ attached_nei_fea = select_neighbor_and_aggregate(attached_fea, a2attached)
243
+ a_message = torch.cat((nei_message, attached_nei_fea), dim=1)
244
+
245
+ if not self.atom_messages:
246
+ rev_message = message[b2revb]
247
+ if self.attached_fea:
248
+ atom_rev_message = attached_fea[b2a[b2revb]]
249
+ rev_message = torch.cat((rev_message, atom_rev_message), dim=1)
250
+ # Exclude the reverse bond itself (w): \sum_{k \in N(u) \setminus \{w\}}
251
+ message = a_message[b2a] - rev_message # num_bonds x hidden
252
+ else:
253
+ message = a_message
254
+
255
+ message = self.W_h(message)
256
+
257
+ # BUG: by default MPNEncoder uses the dense connection in the message passing step.
259
+ # The correct condition should be `if not self.dense`.
259
+ if self.dense:
260
+ message = self.act_func(message) # num_bonds x hidden_size
261
+ else:
262
+ message = self.act_func(input + message)
263
+ message = self.dropout_layer(message) # num_bonds x hidden
264
+
265
+ output = message
266
+
267
+ return output # num_atoms x hidden
268
+
269
+
270
+ class PositionwiseFeedForward(nn.Module):
271
+ """Implements FFN equation."""
272
+
273
+ def __init__(self, d_model, d_ff, activation="PReLU", dropout=0.1, d_out=None):
274
+ """Initialization.
275
+
276
+ :param d_model: the input dimension.
277
+ :param d_ff: the hidden dimension.
278
+ :param activation: the activation function.
279
+ :param dropout: the dropout rate.
280
+ :param d_out: the output dimension, the default value is equal to d_model.
281
+ """
282
+ super(PositionwiseFeedForward, self).__init__()
283
+ if d_out is None:
284
+ d_out = d_model
285
+ # By default, bias is on.
286
+ self.W_1 = nn.Linear(d_model, d_ff)
287
+ self.W_2 = nn.Linear(d_ff, d_out)
288
+ self.dropout = nn.Dropout(dropout)
289
+ self.act_func = get_activation_function(activation)
290
+
291
+ def forward(self, x):
292
+ """
293
+ The forward function
294
+ :param x: input tensor.
295
+ :return:
296
+ """
297
+ return self.W_2(self.dropout(self.act_func(self.W_1(x))))
298
+
299
+
300
+ class SublayerConnection(nn.Module):
301
+ """
302
+ A residual connection followed by a layer norm.
303
+ Note for code simplicity the norm is first as opposed to last.
304
+ """
305
+
306
+ def __init__(self, size, dropout):
307
+ """Initialization.
308
+
309
+ :param size: the input dimension.
310
+ :param dropout: the dropout ratio.
311
+ """
312
+ super(SublayerConnection, self).__init__()
313
+ self.norm = LayerNorm(size, elementwise_affine=True)
314
+ self.dropout = nn.Dropout(dropout)
315
+
316
+ def forward(self, inputs, outputs):
317
+ """Apply residual connection to any sublayer with the same size."""
318
+ # return x + self.dropout(self.norm(x))
319
+ if inputs is None:
320
+ return self.dropout(self.norm(outputs))
321
+ return inputs + self.dropout(self.norm(outputs))
322
+
323
+
324
+ class Attention(nn.Module):
325
+ """
326
+ Compute scaled dot-product attention.
327
+ """
328
+
329
+ def forward(self, query, key, value, mask=None, dropout=None):
330
+ """
331
+ :param query:
332
+ :param key:
333
+ :param value:
334
+ :param mask:
335
+ :param dropout:
336
+ :return:
337
+ """
338
+ scores = torch.matmul(query, key.transpose(-2, -1)) \
339
+ / math.sqrt(query.size(-1))
340
+
341
+ if mask is not None:
342
+ scores = scores.masked_fill(mask == 0, -1e9)
343
+
344
+ p_attn = F.softmax(scores, dim=-1)
345
+
346
+ if dropout is not None:
347
+ p_attn = dropout(p_attn)
348
+
349
+ return torch.matmul(p_attn, value), p_attn
350
+
351
+
352
+ class MultiHeadedAttention(nn.Module):
353
+ """
354
+ The multi-head attention module. Take in model size and number of heads.
355
+ """
356
+
357
+ def __init__(self, h, d_model, dropout=0.1, bias=False):
358
+ """
359
+
360
+ :param h:
361
+ :param d_model:
362
+ :param dropout:
363
+ :param bias:
364
+ """
365
+ super().__init__()
366
+ assert d_model % h == 0
367
+
368
+ # We assume d_v always equals d_k
369
+ self.d_k = d_model // h
370
+ self.h = h # number of heads
371
+
372
+ self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)]) # why 3: query, key, value
373
+ self.output_linear = nn.Linear(d_model, d_model, bias)
374
+ self.attention = Attention()
375
+
376
+ self.dropout = nn.Dropout(p=dropout)
377
+
378
+ def forward(self, query, key, value, mask=None):
379
+ """
380
+
381
+ :param query:
382
+ :param key:
383
+ :param value:
384
+ :param mask:
385
+ :return:
386
+ """
387
+ batch_size = query.size(0)
388
+
389
+ # 1) Do all the linear projections in batch from d_model => h x d_k
390
+ query, key, value = [l(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
391
+ for l, x in zip(self.linear_layers, (query, key, value))]
392
+
393
+ # 2) Apply attention on all the projected vectors in batch.
394
+ x, _ = self.attention(query, key, value, mask=mask, dropout=self.dropout)
395
+
396
+ # 3) "Concat" using a view and apply a final linear.
397
+ x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
398
+
399
+ return self.output_linear(x)
400
+
401
+
402
+ class Head(nn.Module):
403
+ """
404
+ One head for multi-headed attention.
405
+ :return: (query, key, value)
406
+ """
407
+
408
+ def __init__(self, args, hidden_size, atom_messages=False):
409
+ """
410
+ Initialization.
411
+ :param args: The argument.
412
+ :param hidden_size: the dimension of hidden layer in Head.
413
+ :param atom_messages: the MPNEncoder type.
414
+ """
415
+ super(Head, self).__init__()
416
+ atom_fdim = hidden_size
417
+ bond_fdim = hidden_size
418
+ hidden_size = hidden_size
419
+ self.atom_messages = atom_messages
420
+ if self.atom_messages:
421
+ init_message_dim = atom_fdim
422
+ attached_fea_dim = bond_fdim
423
+ else:
424
+ init_message_dim = bond_fdim
425
+ attached_fea_dim = atom_fdim
426
+
427
+ # Here we use the message passing network as query, key and value.
428
+ self.mpn_q = MPNEncoder(args=args,
429
+ atom_messages=atom_messages,
430
+ init_message_dim=init_message_dim,
431
+ attached_fea_fdim=attached_fea_dim,
432
+ hidden_size=hidden_size,
433
+ bias=args.bias,
434
+ depth=args.depth,
435
+ dropout=args.dropout,
436
+ undirected=args.undirected,
437
+ dense=args.dense,
438
+ aggregate_to_atom=False,
439
+ attach_fea=False,
440
+ input_layer="none",
441
+ dynamic_depth="truncnorm")
442
+ self.mpn_k = MPNEncoder(args=args,
443
+ atom_messages=atom_messages,
444
+ init_message_dim=init_message_dim,
445
+ attached_fea_fdim=attached_fea_dim,
446
+ hidden_size=hidden_size,
447
+ bias=args.bias,
448
+ depth=args.depth,
449
+ dropout=args.dropout,
450
+ undirected=args.undirected,
451
+ dense=args.dense,
452
+ aggregate_to_atom=False,
453
+ attach_fea=False,
454
+ input_layer="none",
455
+ dynamic_depth="truncnorm")
456
+ self.mpn_v = MPNEncoder(args=args,
457
+ atom_messages=atom_messages,
458
+ init_message_dim=init_message_dim,
459
+ attached_fea_fdim=attached_fea_dim,
460
+ hidden_size=hidden_size,
461
+ bias=args.bias,
462
+ depth=args.depth,
463
+ dropout=args.dropout,
464
+ undirected=args.undirected,
465
+ dense=args.dense,
466
+ aggregate_to_atom=False,
467
+ attach_fea=False,
468
+ input_layer="none",
469
+ dynamic_depth="truncnorm")
470
+
471
+ def forward(self, f_atoms, f_bonds, a2b, a2a, b2a, b2revb):
472
+ """
473
+ The forward function.
474
+ :param f_atoms: the atom features, num_atoms * atom_dim
475
+ :param f_bonds: the bond features, num_bonds * bond_dim
476
+ :param a2b: mapping from atom index to incoming bond indices.
477
+ :param a2a: mapping from atom index to its neighbors. num_atoms * max_num_bonds
478
+ :param b2a: mapping from bond index to the index of the atom the bond is coming from.
479
+ :param b2revb: mapping from bond index to the index of the reverse bond.
480
+ :return:
481
+ """
482
+ if self.atom_messages:
483
+ init_messages = f_atoms
484
+ init_attached_features = f_bonds
485
+ a2nei = a2a
486
+ a2attached = a2b
487
+ b2a = b2a
488
+ b2revb = b2revb
489
+ else:
490
+ init_messages = f_bonds
491
+ init_attached_features = f_atoms
492
+ a2nei = a2b
493
+ a2attached = a2a
494
+ b2a = b2a
495
+ b2revb = b2revb
496
+
497
+ q = self.mpn_q(init_messages=init_messages,
498
+ init_attached_features=init_attached_features,
499
+ a2nei=a2nei,
500
+ a2attached=a2attached,
501
+ b2a=b2a,
502
+ b2revb=b2revb)
503
+ k = self.mpn_k(init_messages=init_messages,
504
+ init_attached_features=init_attached_features,
505
+ a2nei=a2nei,
506
+ a2attached=a2attached,
507
+ b2a=b2a,
508
+ b2revb=b2revb)
509
+ v = self.mpn_v(init_messages=init_messages,
510
+ init_attached_features=init_attached_features,
511
+ a2nei=a2nei,
512
+ a2attached=a2attached,
513
+ b2a=b2a,
514
+ b2revb=b2revb)
515
+ return q, k, v
516
+
517
+
518
+ class MTBlock(nn.Module):
519
+ """
520
+ The Multi-headed attention block.
521
+ """
522
+
523
+ def __init__(self,
524
+ args,
525
+ num_attn_head,
526
+ input_dim,
527
+ hidden_size,
528
+ activation="ReLU",
529
+ dropout=0.0,
530
+ bias=True,
531
+ atom_messages=False,
532
+ cuda=True,
533
+ res_connection=False):
534
+ """
535
+
536
+ :param args: the arguments.
537
+ :param num_attn_head: the number of attention head.
538
+ :param input_dim: the input dimension.
539
+ :param hidden_size: the hidden size of the model.
540
+ :param activation: the activation function.
541
+ :param dropout: the dropout ratio
542
+ :param bias: if true: all linear layer contains bias term.
543
+ :param atom_messages: the MPNEncoder type
544
+ :param cuda: if true, the model run with GPU.
545
+ :param res_connection: enables the skip-connection in MTBlock.
546
+ """
547
+ super(MTBlock, self).__init__()
548
+ # self.args = args
549
+ self.atom_messages = atom_messages
550
+ self.hidden_size = hidden_size
551
+ self.heads = nn.ModuleList()
552
+ self.input_dim = input_dim
553
+ self.cuda = cuda
554
+ self.res_connection = res_connection
555
+ self.act_func = get_activation_function(activation)
556
+ self.dropout_layer = nn.Dropout(p=dropout)
557
+ # Note: elementwise_affine has to be consistent with the pre-training phase
558
+ self.layernorm = nn.LayerNorm(self.hidden_size, elementwise_affine=True)
559
+
560
+ self.W_i = nn.Linear(self.input_dim, self.hidden_size, bias=bias)
561
+ self.attn = MultiHeadedAttention(h=num_attn_head,
562
+ d_model=self.hidden_size,
563
+ bias=bias,
564
+ dropout=dropout)
565
+ self.W_o = nn.Linear(self.hidden_size * num_attn_head, self.hidden_size, bias=bias)
566
+ self.sublayer = SublayerConnection(self.hidden_size, dropout)
567
+ for _ in range(num_attn_head):
568
+ self.heads.append(Head(args, hidden_size=hidden_size, atom_messages=atom_messages))
569
+
570
+ def forward(self, batch, features_batch=None):
571
+ """
572
+
573
+ :param batch: the graph batch generated by GroverCollator.
574
+ :param features_batch: the additional features of molecules. (deprecated)
575
+ :return:
576
+ """
577
+ f_atoms, f_bonds, a2b, b2a, b2revb, a_scope, b_scope, a2a = batch
578
+
579
+ if self.atom_messages:
580
+ # Only add linear transformation in the input feature.
581
+ if f_atoms.shape[1] != self.hidden_size:
582
+ f_atoms = self.W_i(f_atoms)
583
+ f_atoms = self.dropout_layer(self.layernorm(self.act_func(f_atoms)))
584
+
585
+ else: # bond messages
586
+ if f_bonds.shape[1] != self.hidden_size:
587
+ f_bonds = self.W_i(f_bonds)
588
+ f_bonds = self.dropout_layer(self.layernorm(self.act_func(f_bonds)))
589
+
590
+ queries = []
591
+ keys = []
592
+ values = []
593
+ for head in self.heads:
594
+ q, k, v = head(f_atoms, f_bonds, a2b, a2a, b2a, b2revb)
595
+ queries.append(q.unsqueeze(1))
596
+ keys.append(k.unsqueeze(1))
597
+ values.append(v.unsqueeze(1))
598
+ queries = torch.cat(queries, dim=1)
599
+ keys = torch.cat(keys, dim=1)
600
+ values = torch.cat(values, dim=1)
601
+
602
+ x_out = self.attn(queries, keys, values) # multi-headed attention
603
+ x_out = x_out.view(x_out.shape[0], -1)
604
+ x_out = self.W_o(x_out)
605
+
606
+ x_in = None
607
+ # support no residual connection in MTBlock.
608
+ if self.res_connection:
609
+ if self.atom_messages:
610
+ x_in = f_atoms
611
+ else:
612
+ x_in = f_bonds
613
+
614
+ if self.atom_messages:
615
+ f_atoms = self.sublayer(x_in, x_out)
616
+ else:
617
+ f_bonds = self.sublayer(x_in, x_out)
618
+
619
+ batch = f_atoms, f_bonds, a2b, b2a, b2revb, a_scope, b_scope, a2a
620
+ features_batch = features_batch
621
+ return batch, features_batch
622
+
623
+
624
+ class GTransEncoder(nn.Module):
625
+ def __init__(self,
626
+ args,
627
+ hidden_size,
628
+ edge_fdim,
629
+ node_fdim,
630
+ dropout=0.0,
631
+ activation="ReLU",
632
+ num_mt_block=1,
633
+ num_attn_head=4,
634
+ atom_emb_output: Union[bool, str] = False, # options: True, False, None, "atom", "bond", "both"
635
+ bias=False,
636
+ cuda=True,
637
+ res_connection=False):
638
+ """
639
+
640
+ :param args: the arguments.
641
+ :param hidden_size: the hidden size of the model.
642
+ :param edge_fdim: the dimension of additional feature for edge/bond.
643
+ :param node_fdim: the dimension of additional feature for node/atom.
644
+ :param dropout: the dropout ratio
645
+ :param activation: the activation function
646
+ :param num_mt_block: the number of mt block.
647
+ :param num_attn_head: the number of attention head.
648
+ :param atom_emb_output: enable the output aggregation after message passing.
649
+ atom_messages: True False
650
+ -False: no aggregating to atom. output size: (num_atoms, hidden_size) (num_bonds, hidden_size)
651
+ -True: aggregating to atom. output size: (num_atoms, hidden_size) (num_atoms, hidden_size)
652
+ -None: same as False
653
+ -"atom": same as True
654
+ -"bond": aggragating to bond. output size: (num_bonds, hidden_size) (num_bonds, hidden_size)
655
+ -"both": aggregating to atom&bond. output size: (num_atoms, hidden_size) (num_bonds, hidden_size)
656
+ (num_bonds, hidden_size) (num_atoms, hidden_size)
657
+ :param bias: enable bias term in all linear layers.
658
+ :param cuda: run with cuda.
659
+ :param res_connection: enables the skip-connection in MTBlock.
660
+ """
661
+ super(GTransEncoder, self).__init__()
662
+
663
+ # For the compatibility issue.
664
+ if atom_emb_output is False:
665
+ atom_emb_output = None
666
+ if atom_emb_output is True:
667
+ atom_emb_output = 'atom'
668
+
669
+ self.hidden_size = hidden_size
670
+ self.dropout = dropout
671
+ self.activation = activation
672
+ self.cuda = cuda
673
+ self.bias = bias
674
+ self.res_connection = res_connection
675
+ self.edge_blocks = nn.ModuleList()
676
+ self.node_blocks = nn.ModuleList()
677
+
678
+ edge_input_dim = edge_fdim
679
+ node_input_dim = node_fdim
680
+ edge_input_dim_i = edge_input_dim
681
+ node_input_dim_i = node_input_dim
682
+
683
+ for i in range(num_mt_block):
684
+ if i != 0:
685
+ edge_input_dim_i = self.hidden_size
686
+ node_input_dim_i = self.hidden_size
687
+ self.edge_blocks.append(MTBlock(args=args,
688
+ num_attn_head=num_attn_head,
689
+ input_dim=edge_input_dim_i,
690
+ hidden_size=self.hidden_size,
691
+ activation=activation,
692
+ dropout=dropout,
693
+ bias=self.bias,
694
+ atom_messages=False,
695
+ cuda=cuda))
696
+ self.node_blocks.append(MTBlock(args=args,
697
+ num_attn_head=num_attn_head,
698
+ input_dim=node_input_dim_i,
699
+ hidden_size=self.hidden_size,
700
+ activation=activation,
701
+ dropout=dropout,
702
+ bias=self.bias,
703
+ atom_messages=True,
704
+ cuda=cuda))
705
+
706
+ self.atom_emb_output = atom_emb_output
707
+
708
+ self.ffn_atom_from_atom = PositionwiseFeedForward(self.hidden_size + node_fdim,
709
+ self.hidden_size * 4,
710
+ activation=self.activation,
711
+ dropout=self.dropout,
712
+ d_out=self.hidden_size)
713
+
714
+ self.ffn_atom_from_bond = PositionwiseFeedForward(self.hidden_size + node_fdim,
715
+ self.hidden_size * 4,
716
+ activation=self.activation,
717
+ dropout=self.dropout,
718
+ d_out=self.hidden_size)
719
+
720
+ self.ffn_bond_from_atom = PositionwiseFeedForward(self.hidden_size + edge_fdim,
721
+ self.hidden_size * 4,
722
+ activation=self.activation,
723
+ dropout=self.dropout,
724
+ d_out=self.hidden_size)
725
+
726
+ self.ffn_bond_from_bond = PositionwiseFeedForward(self.hidden_size + edge_fdim,
727
+ self.hidden_size * 4,
728
+ activation=self.activation,
729
+ dropout=self.dropout,
730
+ d_out=self.hidden_size)
731
+
732
+ self.atom_from_atom_sublayer = SublayerConnection(size=self.hidden_size, dropout=self.dropout)
733
+ self.atom_from_bond_sublayer = SublayerConnection(size=self.hidden_size, dropout=self.dropout)
734
+ self.bond_from_atom_sublayer = SublayerConnection(size=self.hidden_size, dropout=self.dropout)
735
+ self.bond_from_bond_sublayer = SublayerConnection(size=self.hidden_size, dropout=self.dropout)
736
+
737
+ self.act_func_node = get_activation_function(self.activation)
738
+ self.act_func_edge = get_activation_function(self.activation)
739
+
740
+ self.dropout_layer = nn.Dropout(p=args.dropout)
741
+
742
+ def pointwise_feed_forward_to_atom_embedding(self, emb_output, atom_fea, index, ffn_layer):
743
+ """
744
+ The point-wise feed forward and long-range residual connection for atom view.
745
+ aggregate to atom.
746
+ :param emb_output: the output embedding from the previous multi-head attentions.
747
+ :param atom_fea: the atom/node feature embedding.
748
+ :param index: the index of neighborhood relations.
749
+ :param ffn_layer: the feed forward layer
750
+ :return:
751
+ """
752
+ aggr_output = select_neighbor_and_aggregate(emb_output, index)
753
+ aggr_outputx = torch.cat([atom_fea, aggr_output], dim=1)
754
+ return ffn_layer(aggr_outputx), aggr_output
755
+
756
+ def pointwise_feed_forward_to_bond_embedding(self, emb_output, bond_fea, a2nei, b2revb, ffn_layer):
757
+ """
758
+ The point-wise feed forward and long-range residual connection for bond view.
759
+ aggregate to bond.
760
+ :param emb_output: the output embedding from the previous multi-head attentions.
761
+ :param bond_fea: the bond/edge feature embedding.
762
+ :param index: the index of neighborhood relations.
763
+ :param ffn_layer: the feed forward layer
764
+ :return:
765
+ """
766
+ aggr_output = select_neighbor_and_aggregate(emb_output, a2nei)
767
+ # remove rev bond / atom --- need for bond view
768
+ aggr_output = self.remove_rev_bond_message(emb_output, aggr_output, b2revb)
769
+ aggr_outputx = torch.cat([bond_fea, aggr_output], dim=1)
770
+ return ffn_layer(aggr_outputx), aggr_output
771
+
772
+ @staticmethod
773
+ def remove_rev_bond_message(orginal_message, aggr_message, b2revb):
774
+ """
775
+
776
+ :param orginal_message:
777
+ :param aggr_message:
778
+ :param b2revb:
779
+ :return:
780
+ """
781
+ rev_message = orginal_message[b2revb]
782
+ return aggr_message - rev_message
783
+
784
+ def atom_bond_transform(self,
785
+ to_atom=True, # False: to bond
786
+ atomwise_input=None,
787
+ bondwise_input=None,
788
+ original_f_atoms=None,
789
+ original_f_bonds=None,
790
+ a2a=None,
791
+ a2b=None,
792
+ b2a=None,
793
+ b2revb=None
794
+ ):
795
+ """
796
+ Transfer the output of atom/bond multi-head attention to the final atom/bond output.
797
+ :param to_atom: if true, the output is the atom embedding; otherwise, the output is the bond embedding.
798
+ :param atomwise_input: the input embedding of atom/node.
799
+ :param bondwise_input: the input embedding of bond/edge.
800
+ :param original_f_atoms: the initial atom features.
801
+ :param original_f_bonds: the initial bond features.
802
+ :param a2a: mapping from atom index to its neighbors. num_atoms * max_num_bonds
803
+ :param a2b: mapping from atom index to incoming bond indices.
804
+ :param b2a: mapping from bond index to the index of the atom the bond is coming from.
805
+ :param b2revb: mapping from bond index to the index of the reverse bond.
806
+ :return:
807
+ """
808
+
809
+ if to_atom:
810
+ # atom input to atom output
811
+ atomwise_input, _ = self.pointwise_feed_forward_to_atom_embedding(atomwise_input, original_f_atoms, a2a,
812
+ self.ffn_atom_from_atom)
813
+ atom_in_atom_out = self.atom_from_atom_sublayer(None, atomwise_input)
814
+ # bond to atom
815
+ bondwise_input, _ = self.pointwise_feed_forward_to_atom_embedding(bondwise_input, original_f_atoms, a2b,
816
+ self.ffn_atom_from_bond)
817
+ bond_in_atom_out = self.atom_from_bond_sublayer(None, bondwise_input)
818
+ return atom_in_atom_out, bond_in_atom_out
819
+ else: # to bond embeddings
820
+
821
+ # atom input to bond output
822
+ atom_list_for_bond = torch.cat([b2a.unsqueeze(dim=1), a2a[b2a]], dim=1)
823
+ atomwise_input, _ = self.pointwise_feed_forward_to_bond_embedding(atomwise_input, original_f_bonds,
824
+ atom_list_for_bond,
825
+ b2a[b2revb], self.ffn_bond_from_atom)
826
+ atom_in_bond_out = self.bond_from_atom_sublayer(None, atomwise_input)
827
+ # bond input to bond output
828
+ bond_list_for_bond = a2b[b2a]
829
+ bondwise_input, _ = self.pointwise_feed_forward_to_bond_embedding(bondwise_input, original_f_bonds,
830
+ bond_list_for_bond,
831
+ b2revb, self.ffn_bond_from_bond)
832
+ bond_in_bond_out = self.bond_from_bond_sublayer(None, bondwise_input)
833
+ return atom_in_bond_out, bond_in_bond_out
834
+
835
+ def forward(self, batch, features_batch = None):
836
+ f_atoms, f_bonds, a2b, b2a, b2revb, a_scope, b_scope, a2a = batch
837
+ if self.cuda or next(self.parameters()).is_cuda:
838
+ f_atoms, f_bonds, a2b, b2a, b2revb = f_atoms.cuda(), f_bonds.cuda(), a2b.cuda(), b2a.cuda(), b2revb.cuda()
839
+ a2a = a2a.cuda()
840
+
841
+ node_batch = f_atoms, f_bonds, a2b, b2a, b2revb, a_scope, b_scope, a2a
842
+ edge_batch = f_atoms, f_bonds, a2b, b2a, b2revb, a_scope, b_scope, a2a
843
+
844
+ # opt pointwise_feed_forward
845
+ original_f_atoms, original_f_bonds = f_atoms, f_bonds
846
+
847
+ # Note: features_batch is not used here.
848
+ for nb in self.node_blocks: # atom messages. Multi-headed attention
849
+ node_batch, features_batch = nb(node_batch, features_batch)
850
+ for eb in self.edge_blocks: # bond messages. Multi-headed attention
851
+ edge_batch, features_batch = eb(edge_batch, features_batch)
852
+
853
+ atom_output, _, _, _, _, _, _, _ = node_batch # atom hidden states
854
+ _, bond_output, _, _, _, _, _, _ = edge_batch # bond hidden states
855
+
856
+ if self.atom_emb_output is None:
857
+ # output the embedding from multi-head attention directly.
858
+ return atom_output, bond_output
859
+
860
+ if self.atom_emb_output == 'atom':
861
+ return self.atom_bond_transform(to_atom=True, # False: to bond
862
+ atomwise_input=atom_output,
863
+ bondwise_input=bond_output,
864
+ original_f_atoms=original_f_atoms,
865
+ original_f_bonds=original_f_bonds,
866
+ a2a=a2a,
867
+ a2b=a2b,
868
+ b2a=b2a,
869
+ b2revb=b2revb)
870
+ elif self.atom_emb_output == 'bond':
871
+ return self.atom_bond_transform(to_atom=False, # False: to bond
872
+ atomwise_input=atom_output,
873
+ bondwise_input=bond_output,
874
+ original_f_atoms=original_f_atoms,
875
+ original_f_bonds=original_f_bonds,
876
+ a2a=a2a,
877
+ a2b=a2b,
878
+ b2a=b2a,
879
+ b2revb=b2revb)
880
+ else: # 'both'
881
+ atom_embeddings = self.atom_bond_transform(to_atom=True, # False: to bond
882
+ atomwise_input=atom_output,
883
+ bondwise_input=bond_output,
884
+ original_f_atoms=original_f_atoms,
885
+ original_f_bonds=original_f_bonds,
886
+ a2a=a2a,
887
+ a2b=a2b,
888
+ b2a=b2a,
889
+ b2revb=b2revb)
890
+
891
+ bond_embeddings = self.atom_bond_transform(to_atom=False, # False: to bond
892
+ atomwise_input=atom_output,
893
+ bondwise_input=bond_output,
894
+ original_f_atoms=original_f_atoms,
895
+ original_f_bonds=original_f_bonds,
896
+ a2a=a2a,
897
+ a2b=a2b,
898
+ b2a=b2a,
899
+ b2revb=b2revb)
900
+ # Notice: need to be consistent with output format of DualMPNN encoder
901
+ return ((atom_embeddings[0], bond_embeddings[0]),
902
+ (atom_embeddings[1], bond_embeddings[1]))
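A property of the SelfAttention readout defined above that is worth making explicit: because $a = Softmax(W_2 \tanh(W_1 X^T))$ has shape (out_feature, n) and $out = aX$ has shape (out_feature, in_feature), the result does not depend on the number of atoms n, and Readout then flattens it into one fixed-length molecule vector. A small shape check, illustrative only:

# Editorial sketch, not part of the commit.
import torch
from grover.model.layers import SelfAttention

attn = SelfAttention(hidden=16, in_feature=128, out_feature=4)
for n_atoms in (5, 50):
    X = torch.randn(n_atoms, 128)            # node embeddings of one molecule
    out, a = attn(X)
    print(out.shape, a.shape)                 # torch.Size([4, 128]) torch.Size([4, n_atoms])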
grover/model/models.py ADDED
@@ -0,0 +1,506 @@
1
+ """
2
+ The GROVER models for pretraining, finetuning and fingerprint generation.
3
+ """
4
+ from argparse import Namespace
5
+ from typing import List, Dict, Callable
6
+
7
+ import numpy as np
8
+ import torch
9
+ from torch import nn as nn
10
+
11
+ from grover.data import get_atom_fdim, get_bond_fdim
12
+ from grover.model.layers import Readout, GTransEncoder
13
+ from grover.util.nn_utils import get_activation_function
14
+
15
+
16
+ class GROVEREmbedding(nn.Module):
17
+ """
18
+ The GROVER Embedding class. It contains the GTransEncoder.
19
+ This GTransEncoder can be replaced by any valid encoder.
20
+ """
21
+
22
+ def __init__(self, args: Namespace):
23
+ """
24
+ Initialize the GROVEREmbedding class.
25
+ :param args:
26
+ """
27
+ super(GROVEREmbedding, self).__init__()
28
+ self.embedding_output_type = args.embedding_output_type
29
+ edge_dim = get_bond_fdim() + get_atom_fdim()
30
+ node_dim = get_atom_fdim()
31
+ if not hasattr(args, "backbone"):
32
+ print("No backbone specified in args, use gtrans backbone.")
33
+ args.backbone = "gtrans"
34
+ if args.backbone == "gtrans" or args.backbone == "dualtrans":
35
+ # dualtrans is the old name.
36
+ self.encoders = GTransEncoder(args,
37
+ hidden_size=args.hidden_size,
38
+ edge_fdim=edge_dim,
39
+ node_fdim=node_dim,
40
+ dropout=args.dropout,
41
+ activation=args.activation,
42
+ num_mt_block=args.num_mt_block,
43
+ num_attn_head=args.num_attn_head,
44
+ atom_emb_output=self.embedding_output_type,
45
+ bias=args.bias,
46
+ cuda=args.cuda)
47
+
48
+ def forward(self, graph_batch: List) -> Dict:
49
+ """
50
+ The forward function takes graph_batch as input and output a dict. The content of the dict is decided by
51
+ self.embedding_output_type.
52
+
53
+ :param graph_batch: the input graph batch generated by MolCollator.
54
+ :return: a dict containing the embedding results.
55
+ """
56
+ output = self.encoders(graph_batch)
57
+ if self.embedding_output_type == 'atom':
58
+ return {"atom_from_atom": output[0], "atom_from_bond": output[1],
59
+ "bond_from_atom": None, "bond_from_bond": None} # atom_from_atom, atom_from_bond
60
+ elif self.embedding_output_type == 'bond':
61
+ return {"atom_from_atom": None, "atom_from_bond": None,
62
+ "bond_from_atom": output[0], "bond_from_bond": output[1]} # bond_from_atom, bond_from_bond
63
+ elif self.embedding_output_type == "both":
64
+ return {"atom_from_atom": output[0][0], "bond_from_atom": output[0][1],
65
+ "atom_from_bond": output[1][0], "bond_from_bond": output[1][1]}
66
+
67
+
68
+ class AtomVocabPrediction(nn.Module):
69
+ """
70
+ The atom-wise vocabulary prediction task. The atom vocabulary is constructed by the context.
71
+ """
72
+ def __init__(self, args, vocab_size, hidden_size=None):
73
+ """
74
+ :param args: the argument.
75
+ :param vocab_size: the size of atom vocabulary.
76
+ """
77
+ super(AtomVocabPrediction, self).__init__()
78
+ if not hidden_size:
79
+ hidden_size = args.hidden_size
80
+ self.linear = nn.Linear(hidden_size, vocab_size)
81
+ self.logsoftmax = nn.LogSoftmax(dim=1)
82
+
83
+ def forward(self, embeddings):
84
+ """
85
+ If embeddings is None: do not go through forward pass.
86
+ :param embeddings: the atom embeddings, num_atom X fea_dim.
87
+ :return: the prediction for each atom, num_atom X vocab_size.
88
+ """
89
+ if embeddings is None:
90
+ return None
91
+ return self.logsoftmax(self.linear(embeddings))
92
+
93
+
94
+ class BondVocabPrediction(nn.Module):
95
+ """
96
+ The bond-wise vocabulary prediction task. The bond vocabulary is constructed by the context.
97
+ """
98
+ def __init__(self, args, vocab_size, hidden_size=None):
99
+ """
100
+ Might need to use different architecture for bond vocab prediction.
101
+ :param args:
102
+ :param vocab_size: size of bond vocab.
103
+ :param hidden_size: hidden size
104
+ """
105
+ super(BondVocabPrediction, self).__init__()
106
+ if not hidden_size:
107
+ hidden_size = args.hidden_size
108
+ self.linear = nn.Linear(hidden_size, vocab_size)
109
+
110
+ # ad-hoc here
111
+ # If TWO_FC_4_BOND_VOCAB, we will use two distinct fc layer to deal with the bond and rev bond.
112
+ self.TWO_FC_4_BOND_VOCAB = True
113
+ if self.TWO_FC_4_BOND_VOCAB:
114
+ self.linear_rev = nn.Linear(hidden_size, vocab_size)
115
+ self.logsoftmax = nn.LogSoftmax(dim=1)
116
+
117
+ def forward(self, embeddings):
118
+ """
119
+ If embeddings is None: do not go through forward pass.
120
+ :param embeddings: the atom embeddings, num_bond X fea_dim.
121
+ :return: the prediction for each atom, num_bond X vocab_size.
122
+ """
123
+ if embeddings is None:
124
+ return None
125
+ nm_bonds = embeddings.shape[0] # must be an odd number
126
+ # The bond and rev bond have odd and even ids respectively. See definition in molgraph.
127
+ ids1 = [0] + list(range(1, nm_bonds, 2))
128
+ ids2 = list(range(0, nm_bonds, 2))
129
+ if self.TWO_FC_4_BOND_VOCAB:
130
+ logits = self.linear(embeddings[ids1]) + self.linear_rev(embeddings[ids2])
131
+ else:
132
+ logits = self.linear(embeddings[ids1] + embeddings[ids2])
133
+
134
+ return self.logsoftmax(logits)
135
+
136
+
137
+ class FunctionalGroupPrediction(nn.Module):
138
+ """
139
+ The functional group (semantic motifs) prediction task. This is a graph-level task.
140
+ """
141
+ def __init__(self, args, fg_size):
142
+ """
143
+ :param args: The arguments.
144
+ :param fg_size: The size of semantic motifs.
145
+ """
146
+ super(FunctionalGroupPrediction, self).__init__()
147
+ first_linear_dim = args.hidden_size
148
+ hidden_size = args.hidden_size
149
+
150
+ # In order to retain maximal information in the encoder, we use a simple readout function here.
151
+ self.readout = Readout(rtype="mean", hidden_size=hidden_size)
152
+ # We have four branches here. But the input with less than four branch is OK.
153
+ # Since we use BCEWithLogitsLoss as the loss function, we only need to output logits here.
154
+ self.linear_atom_from_atom = nn.Linear(first_linear_dim, fg_size)
155
+ self.linear_atom_from_bond = nn.Linear(first_linear_dim, fg_size)
156
+ self.linear_bond_from_atom = nn.Linear(first_linear_dim, fg_size)
157
+ self.linear_bond_from_bond = nn.Linear(first_linear_dim, fg_size)
158
+
159
+ def forward(self, embeddings: Dict, ascope: List, bscope: List) -> Dict:
160
+ """
161
+ The forward function of semantic motif prediction. It takes the node/bond embeddings, and the corresponding
162
+ atom/bond scope as input and produce the prediction logits for different branches.
163
+ :param embeddings: The input embeddings are organized as dict. The output of GROVEREmbedding.
164
+ :param ascope: The scope for atoms. Please refer to BatchMolGraph for more details.
165
+ :param bscope: The scope for bonds. Please refer to BatchMolGraph for more details.
166
+ :return: a dict contains the predicted logits.
167
+ """
168
+
169
+ preds_atom_from_atom, preds_atom_from_bond, preds_bond_from_atom, preds_bond_from_bond = \
170
+ None, None, None, None
171
+
172
+ if embeddings["bond_from_atom"] is not None:
173
+ preds_bond_from_atom = self.linear_bond_from_atom(self.readout(embeddings["bond_from_atom"], bscope))
174
+ if embeddings["bond_from_bond"] is not None:
175
+ preds_bond_from_bond = self.linear_bond_from_bond(self.readout(embeddings["bond_from_bond"], bscope))
176
+
177
+ if embeddings["atom_from_atom"] is not None:
178
+ preds_atom_from_atom = self.linear_atom_from_atom(self.readout(embeddings["atom_from_atom"], ascope))
179
+ if embeddings["atom_from_bond"] is not None:
180
+ preds_atom_from_bond = self.linear_atom_from_bond(self.readout(embeddings["atom_from_bond"], ascope))
181
+
182
+ return {"atom_from_atom": preds_atom_from_atom, "atom_from_bond": preds_atom_from_bond,
183
+ "bond_from_atom": preds_bond_from_atom, "bond_from_bond": preds_bond_from_bond}
184
+
185
+
186
+ class GroverTask(nn.Module):
187
+ """
188
+ The pretrain module.
189
+ """
190
+ def __init__(self, args, grover, atom_vocab_size, bond_vocab_size, fg_size):
191
+ super(GroverTask, self).__init__()
192
+ self.grover = grover
193
+ self.av_task_atom = AtomVocabPrediction(args, atom_vocab_size)
194
+ self.av_task_bond = AtomVocabPrediction(args, atom_vocab_size)
195
+ self.bv_task_atom = BondVocabPrediction(args, bond_vocab_size)
196
+ self.bv_task_bond = BondVocabPrediction(args, bond_vocab_size)
197
+
198
+ self.fg_task_all = FunctionalGroupPrediction(args, fg_size)
199
+
200
+ self.embedding_output_type = args.embedding_output_type
201
+
202
+ @staticmethod
203
+ def get_loss_func(args: Namespace) -> Callable:
204
+ """
205
+ The loss function generator.
206
+ :param args: the arguments.
207
+ :return: the loss fucntion for GroverTask.
208
+ """
209
+ def loss_func(preds, targets, dist_coff=args.dist_coff):
210
+ """
211
+ The loss function for GroverTask.
212
+ :param preds: the predictions.
213
+ :param targets: the targets.
214
+ :param dist_coff: the default disagreement coefficient for the distances between different branches.
215
+ :return:
216
+ """
217
+ av_task_loss = nn.NLLLoss(ignore_index=0, reduction="mean") # same for av and bv
218
+
219
+ fg_task_loss = nn.BCEWithLogitsLoss(reduction="mean")
220
+ # av_task_dist_loss = nn.KLDivLoss(reduction="mean")
221
+ av_task_dist_loss = nn.MSELoss(reduction="mean")
222
+ fg_task_dist_loss = nn.MSELoss(reduction="mean")
223
+ sigmoid = nn.Sigmoid()
224
+
225
+ av_atom_loss, av_bond_loss, av_dist_loss = 0.0, 0.0, 0.0
226
+ fg_atom_from_atom_loss, fg_atom_from_bond_loss, fg_atom_dist_loss = 0.0, 0.0, 0.0
227
+ bv_atom_loss, bv_bond_loss, bv_dist_loss = 0.0, 0.0, 0.0
228
+ fg_bond_from_atom_loss, fg_bond_from_bond_loss, fg_bond_dist_loss = 0.0, 0.0, 0.0
229
+
230
+ if preds["av_task"][0] is not None:
231
+ av_atom_loss = av_task_loss(preds['av_task'][0], targets["av_task"])
232
+ fg_atom_from_atom_loss = fg_task_loss(preds["fg_task"]["atom_from_atom"], targets["fg_task"])
233
+
234
+ if preds["av_task"][1] is not None:
235
+ av_bond_loss = av_task_loss(preds['av_task'][1], targets["av_task"])
236
+ fg_atom_from_bond_loss = fg_task_loss(preds["fg_task"]["atom_from_bond"], targets["fg_task"])
237
+
238
+ if preds["bv_task"][0] is not None:
239
+ bv_atom_loss = av_task_loss(preds['bv_task'][0], targets["bv_task"])
240
+ fg_bond_from_atom_loss = fg_task_loss(preds["fg_task"]["bond_from_atom"], targets["fg_task"])
241
+
242
+ if preds["bv_task"][1] is not None:
243
+ bv_bond_loss = av_task_loss(preds['bv_task'][1], targets["bv_task"])
244
+ fg_bond_from_bond_loss = fg_task_loss(preds["fg_task"]["bond_from_bond"], targets["fg_task"])
245
+
246
+ if preds["av_task"][0] is not None and preds["av_task"][1] is not None:
247
+ av_dist_loss = av_task_dist_loss(preds['av_task'][0], preds['av_task'][1])
248
+ fg_atom_dist_loss = fg_task_dist_loss(sigmoid(preds["fg_task"]["atom_from_atom"]),
249
+ sigmoid(preds["fg_task"]["atom_from_bond"]))
250
+
251
+ if preds["bv_task"][0] is not None and preds["bv_task"][1] is not None:
252
+ bv_dist_loss = av_task_dist_loss(preds['bv_task'][0], preds['bv_task'][1])
253
+ fg_bond_dist_loss = fg_task_dist_loss(sigmoid(preds["fg_task"]["bond_from_atom"]),
254
+ sigmoid(preds["fg_task"]["bond_from_bond"]))
255
+
256
+ av_loss = av_atom_loss + av_bond_loss
257
+ bv_loss = bv_atom_loss + bv_bond_loss
258
+ fg_atom_loss = fg_atom_from_atom_loss + fg_atom_from_bond_loss
259
+ fg_bond_loss = fg_bond_from_atom_loss + fg_bond_from_bond_loss
260
+
261
+ fg_loss = fg_atom_loss + fg_bond_loss
262
+ fg_dist_loss = fg_atom_dist_loss + fg_bond_dist_loss
263
+
264
+ # dist_loss = av_dist_loss + bv_dist_loss + fg_dist_loss
265
+ # print("%.4f %.4f %.4f %.4f %.4f %.4f"%(av_atom_loss,
266
+ # av_bond_loss,
267
+ # fg_atom_loss,
268
+ # fg_bond_loss,
269
+ # av_dist_loss,
270
+ # fg_dist_loss))
271
+ # return av_loss + fg_loss + dist_coff * dist_loss
272
+ overall_loss = av_loss + bv_loss + fg_loss + dist_coff * av_dist_loss + \
273
+ dist_coff * bv_dist_loss + fg_dist_loss
274
+
275
+ return overall_loss, av_loss, bv_loss, fg_loss, av_dist_loss, bv_dist_loss, fg_dist_loss
276
+
277
+ return loss_func
278
+
279
+ def forward(self, graph_batch: List):
280
+ """
281
+ The forward function.
282
+ :param graph_batch:
283
+ :return:
284
+ """
285
+ _, _, _, _, _, a_scope, b_scope, _ = graph_batch
286
+ a_scope = a_scope.data.cpu().numpy().tolist()
287
+
288
+ embeddings = self.grover(graph_batch)
289
+
290
+ av_task_pred_atom = self.av_task_atom(
291
+ embeddings["atom_from_atom"]) # if None: means not go through this fowward
292
+ av_task_pred_bond = self.av_task_bond(embeddings["atom_from_bond"])
293
+
294
+ bv_task_pred_atom = self.bv_task_atom(embeddings["bond_from_atom"])
295
+ bv_task_pred_bond = self.bv_task_bond(embeddings["bond_from_bond"])
296
+
297
+ fg_task_pred_all = self.fg_task_all(embeddings, a_scope, b_scope)
298
+
299
+ return {"av_task": (av_task_pred_atom, av_task_pred_bond),
300
+ "bv_task": (bv_task_pred_atom, bv_task_pred_bond),
301
+ "fg_task": fg_task_pred_all}
302
+
303
+
304
+ class GroverFpGeneration(nn.Module):
305
+ """
306
+ GroverFpGeneration class.
307
+ It loads the pre-trained model and produce the fingerprints for input molecules.
308
+ """
309
+ def __init__(self, args):
310
+ """
311
+ Init function.
312
+ :param args: the arguments.
313
+ """
314
+ super(GroverFpGeneration, self).__init__()
315
+
316
+ self.fingerprint_source = args.fingerprint_source
317
+ self.iscuda = args.cuda
318
+
319
+ self.grover = GROVEREmbedding(args)
320
+ self.readout = Readout(rtype="mean", hidden_size=args.hidden_size)
321
+
322
+ def forward(self, batch, features_batch):
323
+ """
324
+ The forward function.
325
+ It takes graph batch and molecular feature batch as input and produce the fingerprints of this molecules.
326
+ :param batch:
327
+ :param features_batch:
328
+ :return:
329
+ """
330
+ _, _, _, _, _, a_scope, b_scope, _ = batch
331
+
332
+ output = self.grover(batch)
333
+ # Share readout
334
+ mol_atom_from_bond_output = self.readout(output["atom_from_bond"], a_scope)
335
+ mol_atom_from_atom_output = self.readout(output["atom_from_atom"], a_scope)
336
+
337
+ if self.fingerprint_source == "bond" or self.fingerprint_source == "both":
338
+ mol_bond_from_atom_output = self.readout(output["bond_from_atom"], b_scope)
339
+ mol_bond_from_bond_output = self.readout(output["bond_from_bond"], b_scope)
340
+
341
+ if features_batch[0] is not None:
342
+ features_batch = torch.from_numpy(np.stack(features_batch)).float()
343
+ if self.iscuda:
344
+ features_batch = features_batch.cuda()
345
+ features_batch = features_batch.to(output["atom_from_atom"])
346
+ if len(features_batch.shape) == 1:
347
+ features_batch = features_batch.view([1, features_batch.shape[0]])
348
+ else:
349
+ features_batch = None
350
+
351
+ if self.fingerprint_source == "atom":
352
+ fp = torch.cat([mol_atom_from_atom_output, mol_atom_from_bond_output], 1)
353
+ elif self.fingerprint_source == "bond":
354
+ fp = torch.cat([mol_bond_from_atom_output, mol_bond_from_bond_output], 1)
355
+ else:
356
+ # the both case.
357
+ fp = torch.cat([mol_atom_from_atom_output, mol_atom_from_bond_output,
358
+ mol_bond_from_atom_output, mol_bond_from_bond_output], 1)
359
+ if features_batch is not None:
360
+ fp = torch.cat([fp, features_batch], 1)
361
+ return fp
362
+
363
+
364
+ class GroverFinetuneTask(nn.Module):
365
+ """
366
+ The finetune task: loads the pre-trained GROVER embedding and adds readout plus FFN heads for property prediction.
367
+ """
368
+ def __init__(self, args):
369
+ super(GroverFinetuneTask, self).__init__()
370
+
371
+ self.hidden_size = args.hidden_size
372
+ self.iscuda = args.cuda
373
+
374
+ self.grover = GROVEREmbedding(args)
375
+
376
+ if args.self_attention:
377
+ self.readout = Readout(rtype="self_attention", hidden_size=self.hidden_size,
378
+ attn_hidden=args.attn_hidden,
379
+ attn_out=args.attn_out)
380
+ else:
381
+ self.readout = Readout(rtype="mean", hidden_size=self.hidden_size)
382
+
383
+ self.mol_atom_from_atom_ffn = self.create_ffn(args)
384
+ self.mol_atom_from_bond_ffn = self.create_ffn(args)
385
+ #self.ffn = nn.ModuleList()
386
+ #self.ffn.append(self.mol_atom_from_atom_ffn)
387
+ #self.ffn.append(self.mol_atom_from_bond_ffn)
388
+
389
+ self.classification = args.dataset_type == 'classification'
390
+ if self.classification:
391
+ self.sigmoid = nn.Sigmoid()
392
+
393
+ def create_ffn(self, args: Namespace):
394
+ """
395
+ Creates the feed-forward network for the model.
396
+
397
+ :param args: Arguments.
398
+ """
399
+ # Note: args.features_dim is set according the real loaded features data
400
+ if args.features_only:
401
+ first_linear_dim = args.features_size + args.features_dim
402
+ else:
403
+ if args.self_attention:
404
+ first_linear_dim = args.hidden_size * args.attn_out
405
+ # TODO: Ad-hoc!
406
+ # if args.use_input_features:
407
+ first_linear_dim += args.features_dim
408
+ else:
409
+ first_linear_dim = args.hidden_size + args.features_dim
410
+
411
+ dropout = nn.Dropout(args.dropout)
412
+ activation = get_activation_function(args.activation)
413
+ # TODO: ffn_hidden_size
414
+ # Create FFN layers
415
+ if args.ffn_num_layers == 1:
416
+ ffn = [
417
+ dropout,
418
+ nn.Linear(first_linear_dim, args.output_size)
419
+ ]
420
+ else:
421
+ ffn = [
422
+ dropout,
423
+ nn.Linear(first_linear_dim, args.ffn_hidden_size)
424
+ ]
425
+ for _ in range(args.ffn_num_layers - 2):
426
+ ffn.extend([
427
+ activation,
428
+ dropout,
429
+ nn.Linear(args.ffn_hidden_size, args.ffn_hidden_size),
430
+ ])
431
+ ffn.extend([
432
+ activation,
433
+ dropout,
434
+ nn.Linear(args.ffn_hidden_size, args.output_size),
435
+ ])
436
+
437
+ # Create FFN model
438
+ return nn.Sequential(*ffn)
439
+
440
+ @staticmethod
441
+ def get_loss_func(args):
442
+ def loss_func(preds, targets,
443
+ dt=args.dataset_type,
444
+ dist_coff=args.dist_coff):
445
+
446
+ if dt == 'classification':
447
+ pred_loss = nn.BCEWithLogitsLoss(reduction='none')
448
+ elif dt == 'regression':
449
+ pred_loss = nn.MSELoss(reduction='none')
450
+ else:
451
+ raise ValueError(f'Dataset type "{args.dataset_type}" not supported.')
452
+
453
+ # print(type(preds))
454
+ # TODO: Here, should we need to involve the model status? Using len(preds) is just a hack.
455
+ if type(preds) is not tuple:
456
+ # in eval mode.
457
+ return pred_loss(preds, targets)
458
+
459
+ # in train mode.
460
+ dist_loss = nn.MSELoss(reduction='none')
461
+ # dist_loss = nn.CosineSimilarity(dim=0)
462
+ # print(pred_loss)
463
+
464
+ dist = dist_loss(preds[0], preds[1])
465
+ pred_loss1 = pred_loss(preds[0], targets)
466
+ pred_loss2 = pred_loss(preds[1], targets)
467
+ return pred_loss1 + pred_loss2 + dist_coff * dist
468
+
469
+ return loss_func
470
+
471
+ def forward(self, batch, features_batch):
472
+ _, _, _, _, _, a_scope, _, _ = batch
473
+
474
+ output = self.grover(batch)
475
+ # Share readout
476
+ mol_atom_from_bond_output = self.readout(output["atom_from_bond"], a_scope)
477
+ mol_atom_from_atom_output = self.readout(output["atom_from_atom"], a_scope)
478
+
479
+ if features_batch[0] is not None:
480
+ features_batch = torch.from_numpy(np.stack(features_batch)).float()
481
+ if self.iscuda:
482
+ features_batch = features_batch.cuda()
483
+ features_batch = features_batch.to(output["atom_from_atom"])
484
+ if len(features_batch.shape) == 1:
485
+ features_batch = features_batch.view([1, features_batch.shape[0]])
486
+ else:
487
+ features_batch = None
488
+
489
+
490
+ if features_batch is not None:
491
+ mol_atom_from_atom_output = torch.cat([mol_atom_from_atom_output, features_batch], 1)
492
+ mol_atom_from_bond_output = torch.cat([mol_atom_from_bond_output, features_batch], 1)
493
+
494
+ if self.training:
495
+ atom_ffn_output = self.mol_atom_from_atom_ffn(mol_atom_from_atom_output)
496
+ bond_ffn_output = self.mol_atom_from_bond_ffn(mol_atom_from_bond_output)
497
+ return atom_ffn_output, bond_ffn_output
498
+ else:
499
+ atom_ffn_output = self.mol_atom_from_atom_ffn(mol_atom_from_atom_output)
500
+ bond_ffn_output = self.mol_atom_from_bond_ffn(mol_atom_from_bond_output)
501
+ if self.classification:
502
+ atom_ffn_output = self.sigmoid(atom_ffn_output)
503
+ bond_ffn_output = self.sigmoid(bond_ffn_output)
504
+ output = (atom_ffn_output + bond_ffn_output) / 2
505
+
506
+ return output
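A minimal sketch (not part of the commit) of how the finetune loss above combines the two branch outputs during training and how evaluation averages them; every tensor below is a dummy placeholder.

import torch
import torch.nn as nn

# Dummy stand-ins for the atom-branch and bond-branch FFN outputs and the targets.
preds_atom = torch.randn(4, 12)
preds_bond = torch.randn(4, 12)
targets = torch.randint(0, 2, (4, 12)).float()
dist_coff = 0.1

pred_loss = nn.BCEWithLogitsLoss(reduction='none')  # classification case
dist_loss = nn.MSELoss(reduction='none')

# Training: supervise both branches and penalise their disagreement.
loss = (pred_loss(preds_atom, targets) + pred_loss(preds_bond, targets)
        + dist_coff * dist_loss(preds_atom, preds_bond))

# Evaluation: average the sigmoid-activated outputs of the two branches.
output = (torch.sigmoid(preds_atom) + torch.sigmoid(preds_bond)) / 2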
grover/util/metrics.py ADDED
@@ -0,0 +1,122 @@
1
+ """
2
+ The evaluation metrics.
3
+ """
4
+ import math
5
+ from typing import List, Callable, Union
6
+
7
+ from sklearn.metrics import accuracy_score, mean_squared_error, roc_auc_score, mean_absolute_error, r2_score, \
8
+ precision_recall_curve, auc, recall_score, confusion_matrix
9
+
10
+
11
+ def accuracy(targets: List[int], preds: List[float], threshold: float = 0.5) -> float:
12
+ """
13
+ Computes the accuracy of a binary prediction task using a given threshold for generating hard predictions.
14
+
15
+ :param targets: A list of binary targets.
16
+ :param preds: A list of prediction probabilities.
17
+ :param threshold: The threshold above which a prediction is a 1 and below which (inclusive) a prediction is a 0
18
+ :return: The computed accuracy.
19
+ """
20
+ hard_preds = [1 if p > threshold else 0 for p in preds]
21
+ return accuracy_score(targets, hard_preds)
22
+
23
+
24
+ def recall(targets: List[int], preds: List[float], threshold: float = 0.5) -> float:
25
+ """
26
+ Computes the recall of a binary prediction task using a given threshold for generating hard predictions.
27
+
28
+ :param targets: A list of binary targets.
29
+ :param preds: A list of prediction probabilities.
30
+ :param threshold: The threshold above which a prediction is a 1 and below which (inclusive) a prediction is a 0
31
+ :return: The computed recall.
32
+ """
33
+ hard_preds = [1 if p > threshold else 0 for p in preds]
34
+ return recall_score(targets, hard_preds)
35
+
36
+
37
+ def sensitivity(targets: List[int], preds: List[float], threshold: float = 0.5) -> float:
38
+ """
39
+ Computes the sensitivity of a binary prediction task using a given threshold for generating hard predictions.
40
+
41
+ :param targets: A list of binary targets.
42
+ :param preds: A list of prediction probabilities.
43
+ :param threshold: The threshold above which a prediction is a 1 and below which (inclusive) a prediction is a 0
44
+ :return: The computed sensitivity.
45
+ """
46
+ return recall(targets, preds, threshold)
47
+
48
+
49
+ def specificity(targets: List[int], preds: List[float], threshold: float = 0.5) -> float:
50
+ """
51
+ Computes the specificity of a binary prediction task using a given threshold for generating hard predictions.
52
+
53
+ :param targets: A list of binary targets.
54
+ :param preds: A list of prediction probabilities.
55
+ :param threshold: The threshold above which a prediction is a 1 and below which (inclusive) a prediction is a 0
56
+ :return: The computed specificity.
57
+ """
58
+ hard_preds = [1 if p > threshold else 0 for p in preds]
59
+ tn, fp, _, _ = confusion_matrix(targets, hard_preds).ravel()
60
+ return tn / float(tn + fp)
61
+
62
+
63
+
64
+ def rmse(targets: List[float], preds: List[float]) -> float:
65
+ """
66
+ Computes the root mean squared error.
67
+
68
+ :param targets: A list of targets.
69
+ :param preds: A list of predictions.
70
+ :return: The computed rmse.
71
+ """
72
+ return math.sqrt(mean_squared_error(targets, preds))
73
+
74
+
75
+ def get_metric_func(metric: str) -> Callable[[Union[List[int], List[float]], List[float]], float]:
76
+ """
77
+ Gets the metric function corresponding to a given metric name.
78
+
79
+ :param metric: Metric name.
80
+ :return: A metric function which takes as arguments a list of targets and a list of predictions and returns the computed metric.
81
+ """
82
+ # Note: If you want to add a new metric, please also update the parser argument --metric in parsing.py.
83
+ if metric == 'auc':
84
+ return roc_auc_score
85
+
86
+ if metric == 'prc-auc':
87
+ return prc_auc
88
+
89
+ if metric == 'rmse':
90
+ return rmse
91
+
92
+ if metric == 'mae':
93
+ return mean_absolute_error
94
+
95
+ if metric == 'r2':
96
+ return r2_score
97
+
98
+ if metric == 'accuracy':
99
+ return accuracy
100
+
101
+ if metric == 'recall':
102
+ return recall
103
+
104
+ if metric == 'sensitivity':
105
+ return sensitivity
106
+
107
+ if metric == 'specificity':
108
+ return specificity
109
+
110
+ raise ValueError(f'Metric "{metric}" not supported.')
111
+
112
+
113
+ def prc_auc(targets: List[int], preds: List[float]) -> float:
114
+ """
115
+ Computes the area under the precision-recall curve.
116
+
117
+ :param targets: A list of binary targets.
118
+ :param preds: A list of prediction probabilities.
119
+ :return: The computed prc-auc.
120
+ """
121
+ precision, recall, _ = precision_recall_curve(targets, preds)
122
+ return auc(recall, precision)
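A short usage sketch (not part of the commit) of the metric helpers above; the target and prediction lists are made up.

from grover.util.metrics import get_metric_func, accuracy

targets = [0, 1, 1, 0, 1]
preds = [0.2, 0.8, 0.6, 0.4, 0.9]

metric_func = get_metric_func('prc-auc')
print(metric_func(targets, preds))  # area under the precision-recall curve
print(accuracy(targets, preds))     # hard predictions thresholded at 0.5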
grover/util/multi_gpu_wrapper.py ADDED
@@ -0,0 +1,110 @@
1
+ """
2
+ Wrapper for multi-GPU training.
3
+ """
4
+ # use Horovod for multi-GPU PyTorch training
5
+ try:
6
+ import horovod.torch as mgw
7
+ import torch
8
+
9
+ print('using Horovod for multi-GPU training')
10
+ except ImportError:
11
+ print('[WARNING] Horovod cannot be imported; multi-GPU training is unsupported')
12
+ pass
13
+
14
+
15
+ class MultiGpuWrapper(object):
16
+ """Wrapper for multi-GPU training."""
17
+
18
+ def __init__(self):
19
+ """Constructor function."""
20
+ pass
21
+
22
+ @classmethod
23
+ def init(cls, *args):
24
+ """Initialization."""
25
+
26
+ try:
27
+ return mgw.init(*args)
28
+ except NameError:
29
+ raise NameError('module <mgw> not imported')
30
+
31
+ @classmethod
32
+ def size(cls, *args):
33
+ """Get the number of workers at all nodes."""
34
+
35
+ try:
36
+ return mgw.size(*args)
37
+ except NameError:
38
+ raise NameError('module <mgw> not imported')
39
+
40
+ @classmethod
41
+ def rank(cls, *args):
42
+ """Get the rank of current worker at all nodes."""
43
+
44
+ try:
45
+ return mgw.rank(*args)
46
+ except NameError:
47
+ raise NameError('module <mgw> not imported')
48
+
49
+ @classmethod
50
+ def local_size(cls, *args):
51
+ """Get the number of workers at the current node."""
52
+
53
+ try:
54
+ return mgw.local_size(*args)
55
+ except NameError:
56
+ raise NameError('module <mgw> not imported')
57
+
58
+ @classmethod
59
+ def local_rank(cls, *args):
60
+ """Get the rank of current worker at the current node."""
61
+
62
+ try:
63
+ return mgw.local_rank(*args)
64
+ except NameError:
65
+ raise NameError('module <mgw> not imported')
66
+
67
+ @classmethod
68
+ def DistributedOptimizer(cls, *args, **kwargs):
69
+ """Get a distributed optimizer from the base optimizer."""
70
+
71
+ try:
72
+ return mgw.DistributedOptimizer(*args, **kwargs)
73
+ except NameError:
74
+ raise NameError('module <mgw> not imported')
75
+
76
+ @classmethod
77
+ def broadcast_parameters(cls, *args, **kwargs):
78
+ """Get a operation to broadcast all the parameters."""
79
+
80
+ try:
81
+ return mgw.broadcast_parameters(*args, **kwargs)
82
+ except NameError:
83
+ raise NameError('module <mgw> not imported')
84
+
85
+ @classmethod
86
+ def broadcast_optimizer_state(cls, *args, **kwargs):
87
+ """Get a operation to broadcast all the optimizer state."""
88
+
89
+ try:
90
+ return mgw.broadcast_optimizer_state(*args, **kwargs)
91
+ except NameError:
92
+ raise NameError('module <mgw> not imported')
93
+
94
+ @classmethod
95
+ def broadcast(cls, *args, **kwargs):
96
+ """Get a operation to broadcast all the optimizer state."""
97
+
98
+ try:
99
+ return mgw.broadcast(*args, **kwargs)
100
+ except NameError:
101
+ raise NameError('module <mgw> not imported')
102
+
103
+ @classmethod
104
+ def barrier(cls):
105
+ """Add a barrier to synchronize different processes"""
106
+
107
+ try:
108
+ return mgw.allreduce(torch.tensor(0), name='barrier')
109
+ except NameError:
110
+ raise NameError('module <mgw> not imported')
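A usage sketch (not part of the commit) of the wrapper above. The wrapper simply forwards to horovod.torch, so the keyword arguments below are the standard Horovod ones; Horovod must be installed and the script launched with one process per GPU (e.g. via horovodrun).

import torch
from grover.util.multi_gpu_wrapper import MultiGpuWrapper as mgw

mgw.init()
model = torch.nn.Linear(10, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3 * mgw.size())
optimizer = mgw.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
mgw.broadcast_parameters(model.state_dict(), root_rank=0)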
grover/util/nn_utils.py ADDED
@@ -0,0 +1,96 @@
1
+ """
2
+ The utility function for model construction.
3
+ This implementation is adapted from
4
+ https://github.com/chemprop/chemprop/blob/master/chemprop/nn_utils.py
5
+ """
6
+ import torch
7
+ from torch import nn as nn
8
+
9
+
10
+ def param_count(model: nn.Module) -> int:
11
+ """
12
+ Determines number of trainable parameters.
13
+ :param model: An nn.Module.
14
+ :return: The number of trainable parameters.
15
+ """
16
+ return sum(param.numel() for param in model.parameters() if param.requires_grad)
17
+
18
+
19
+ def index_select_nd(source: torch.Tensor, index: torch.Tensor) -> torch.Tensor:
20
+ """
21
+ Selects the message features from source corresponding to the atom or bond indices in index.
22
+
23
+ :param source: A tensor of shape (num_bonds, hidden_size) containing message features.
24
+ :param index: A tensor of shape (num_atoms/num_bonds, max_num_bonds) containing the atom or bond
25
+ indices to select from source.
26
+ :return: A tensor of shape (num_atoms/num_bonds, max_num_bonds, hidden_size) containing the message
27
+ features corresponding to the atoms/bonds specified in index.
28
+ """
29
+ index_size = index.size() # (num_atoms/num_bonds, max_num_bonds)
30
+ suffix_dim = source.size()[1:] # (hidden_size,)
31
+ final_size = index_size + suffix_dim # (num_atoms/num_bonds, max_num_bonds, hidden_size)
32
+
33
+ target = source.index_select(dim=0, index=index.view(-1)) # (num_atoms/num_bonds * max_num_bonds, hidden_size)
34
+ target = target.view(final_size) # (num_atoms/num_bonds, max_num_bonds, hidden_size)
35
+
36
+ return target
37
+
38
+
39
+ def get_activation_function(activation: str) -> nn.Module:
40
+ """
41
+ Gets an activation function module given the name of the activation.
42
+
43
+ :param activation: The name of the activation function.
44
+ :return: The activation function module.
45
+ """
46
+ if activation == 'ReLU':
47
+ return nn.ReLU()
48
+ elif activation == 'LeakyReLU':
49
+ return nn.LeakyReLU(0.1)
50
+ elif activation == 'PReLU':
51
+ return nn.PReLU()
52
+ elif activation == 'tanh':
53
+ return nn.Tanh()
54
+ elif activation == 'SELU':
55
+ return nn.SELU()
56
+ elif activation == 'ELU':
57
+ return nn.ELU()
58
+ elif activation == "Linear":
59
+ return lambda x: x
60
+ else:
61
+ raise ValueError(f'Activation "{activation}" not supported.')
62
+
63
+
64
+ def initialize_weights(model: nn.Module, distinct_init=False, model_idx=0):
65
+ """
66
+ Initializes the weights of a model in place.
67
+
68
+ :param model: An nn.Module.
69
+ """
70
+ init_fns = [nn.init.kaiming_normal_, nn.init.kaiming_uniform_,
71
+ nn.init.xavier_normal_, nn.init.xavier_uniform_]
72
+ for param in model.parameters():
73
+ if param.dim() == 1:
74
+ nn.init.constant_(param, 0)
75
+ else:
76
+ if distinct_init:
77
+ init_fn = init_fns[model_idx % 4]
78
+ if 'kaiming' in init_fn.__name__:
79
+ init_fn(param, nonlinearity='relu')
80
+ else:
81
+ init_fn(param)
82
+ else:
83
+ nn.init.xavier_normal_(param)
84
+
85
+
86
+ def select_neighbor_and_aggregate(feature, index):
87
+ """
88
+ The basic operation in message passing.
89
+ Caution: index_select_nd can cause reproducibility issues when training on CUDA.
90
+ See: https://pytorch.org/docs/stable/notes/randomness.html
91
+ :param feature: the candidate feature for aggregate. (n_nodes, hidden)
92
+ :param index: the selected index (neighbor indexes).
93
+ :return: The aggregated neighbor features, summed over the neighbor dimension.
94
+ """
95
+ neighbor = index_select_nd(feature, index)
96
+ return neighbor.sum(dim=1)
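A quick shape check (not part of the commit) for index_select_nd and select_neighbor_and_aggregate defined above; the sizes are arbitrary.

import torch
from grover.util.nn_utils import index_select_nd, select_neighbor_and_aggregate

source = torch.randn(7, 5)                    # e.g. 7 bond messages with hidden size 5
index = torch.tensor([[1, 2, 0], [3, 4, 6]])  # 2 atoms, up to 3 incoming bonds each
neighbors = index_select_nd(source, index)    # shape (2, 3, 5)
aggregated = select_neighbor_and_aggregate(source, index)  # shape (2, 5), summed over dim 1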
grover/util/parsing.py ADDED
@@ -0,0 +1,487 @@
1
+ """
2
+ The parsing functions for the argument input.
3
+ """
4
+ import os
5
+ import pickle
6
+ from argparse import ArgumentParser, Namespace
7
+ from tempfile import TemporaryDirectory
8
+
9
+ import torch
10
+
11
+ from grover.data.molfeaturegenerator import get_available_features_generators
12
+ from grover.util.utils import makedirs
13
+
14
+
15
+ def add_common_args(parser: ArgumentParser):
16
+ parser.add_argument('--no_cache', action='store_true', default=True,
17
+ help='Turn off caching mol2graph computation')
18
+ parser.add_argument('--gpu', type=int, default=0,
19
+ choices=list(range(torch.cuda.device_count())),
20
+ help='Which GPU to use')
21
+ parser.add_argument('--no_cuda', action='store_true', default=False,
22
+ help='Turn off cuda')
23
+ parser.add_argument('--batch_size', type=int, default=32,
24
+ help='Batch size')
25
+
26
+
27
+ def add_predict_args(parser: ArgumentParser):
28
+ """
29
+ Adds predict arguments to an ArgumentParser.
30
+
31
+ :param parser: An ArgumentParser.
32
+ """
33
+ add_common_args(parser)
34
+
35
+ parser.add_argument('--data_path', type=str,
36
+ help='Path to CSV file containing testing data for which predictions will be made')
37
+
38
+ parser.add_argument('--output_path', type=str,
39
+ help='Path to CSV file where predictions will be saved')
40
+ parser.add_argument('--checkpoint_dir', type=str,
41
+ help='Directory from which to load model checkpoints '
42
+ '(walks directory and ensembles all models that are found)')
43
+
44
+ parser.add_argument('--features_generator', type=str, nargs='*',
45
+ choices=get_available_features_generators(),
46
+ help='Method of generating additional features')
47
+ parser.add_argument('--features_path', type=str, nargs='*',
48
+ help='Path to features to use in FNN (instead of features_generator)')
49
+ parser.add_argument('--no_features_scaling', action='store_true', default=False,
50
+ help='Turn off scaling of features')
51
+
52
+
53
+ def add_fingerprint_args(parser):
54
+ add_common_args(parser)
55
+ # parameters for fingerprints generation
56
+ parser.add_argument('--data_path', type=str, help='Input csv file which contains SMILES')
57
+ parser.add_argument('--output_path', type=str,
58
+ help='Path to npz file where predictions will be saved')
59
+ parser.add_argument('--features_path', type=str, nargs='*',
60
+ help='Path to features to use in FNN (instead of features_generator)')
61
+ parser.add_argument('--fingerprint_source', type=str,
62
+ choices=['atom', 'bond', 'both'], default='both',
63
+ help='The source to generate the fingerprints.')
64
+ parser.add_argument('--checkpoint_path', type=str, help='model path')
65
+
66
+
67
+ def add_finetune_args(parser: ArgumentParser):
68
+ """
69
+ Adds training arguments to an ArgumentParser.
70
+
71
+ :param parser: An ArgumentParser.
72
+ """
73
+
74
+ # General arguments
75
+ add_common_args(parser)
76
+ parser.add_argument('--tensorboard', action='store_true', default=False, help='Add tensorboard logger')
77
+
78
+ # Data arguments
79
+ parser.add_argument('--data_path', type=str,
80
+ help='Path to data CSV file.')
81
+ parser.add_argument('--use_compound_names', action='store_true', default=False,
82
+ help='Use when test data file contains compound names in addition to SMILES strings')
83
+ parser.add_argument('--max_data_size', type=int,
84
+ help='Maximum number of data points to load')
85
+ # Disable this option due to some bugs.
86
+ # parser.add_argument('--test', action='store_true', default=False,
87
+ # help='Whether to skip training and only test the model')
88
+ parser.add_argument('--features_only', action='store_true', default=False,
89
+ help='Use only the additional features in an FFN, no graph network')
90
+ parser.add_argument('--features_generator', type=str, nargs='*',
91
+ choices=get_available_features_generators(),
92
+ help='Method of generating additional features.')
93
+ parser.add_argument('--features_path', type=str, nargs='*',
94
+ help='Path to features to use in FNN (instead of features_generator).')
95
+ parser.add_argument('--save_dir', type=str, default=None,
96
+ help='Directory where model checkpoints will be saved')
97
+ parser.add_argument('--save_smiles_splits', action='store_true', default=False,
98
+ help='Save smiles for each train/val/test splits for prediction convenience later')
99
+ parser.add_argument('--checkpoint_dir', type=str, default=None,
100
+ help='Directory from which to load model checkpoints '
101
+ '(walks directory and ensembles all models that are found)')
102
+ parser.add_argument('--checkpoint_path', type=str, default=None,
103
+ help='Path to model checkpoint (.pt file)')
104
+
105
+ # Data splitting.
106
+ parser.add_argument('--dataset_type', type=str,
107
+ choices=['classification', 'regression'], default='classification',
108
+ help='Type of dataset, e.g. classification or regression. '
109
+ 'This determines the loss function used during training.')
110
+ parser.add_argument('--separate_val_path', type=str,
111
+ help='Path to separate val set, optional')
112
+ parser.add_argument('--separate_val_features_path', type=str, nargs='*',
113
+ help='Path to file with features for separate val set')
114
+ parser.add_argument('--separate_test_path', type=str,
115
+ help='Path to separate test set, optional')
116
+ parser.add_argument('--separate_test_features_path', type=str, nargs='*',
117
+ help='Path to file with features for separate test set')
118
+ parser.add_argument('--split_type', type=str, default='random',
119
+ choices=['random', 'scaffold_balanced', 'predetermined', 'crossval', 'index_predetermined'],
120
+ help='Method of splitting the data into train/val/test')
121
+ parser.add_argument('--split_sizes', type=float, nargs=3, default=[0.8, 0.1, 0.1],
122
+ help='Split proportions for train/validation/test sets')
123
+ parser.add_argument('--num_folds', type=int, default=1,
124
+ help='Number of folds when performing cross validation')
125
+ parser.add_argument('--folds_file', type=str, default=None,
126
+ help='Optional file of fold labels')
127
+ parser.add_argument('--val_fold_index', type=int, default=None,
128
+ help='Which fold to use as val for leave-one-out cross val')
129
+ parser.add_argument('--test_fold_index', type=int, default=None,
130
+ help='Which fold to use as test for leave-one-out cross val')
131
+ parser.add_argument('--crossval_index_dir', type=str,
132
+ help='Directory in which to find cross validation index files')
133
+ parser.add_argument('--crossval_index_file', type=str,
134
+ help='Indices of files to use as train/val/test. '
135
+ 'Overrides --num_folds and --seed.')
136
+ parser.add_argument('--seed', type=int, default=0,
137
+ help='Random seed to use when splitting data into train/val/test sets. '
138
+ 'When `num_folds` > 1, the first fold uses this seed and all '
139
+ 'subsequent folds add 1 to the seed.')
140
+
141
+ # Metric
142
+ parser.add_argument('--metric', type=str, default=None,
143
+ choices=['auc',
144
+ 'prc-auc',
145
+ 'rmse',
146
+ 'mae',
147
+ 'r2',
148
+ 'accuracy',
149
+ 'recall',
150
+ 'sensitivity',
151
+ 'specificity',
152
+ 'matthews_corrcoef'],
153
+ help='Metric to use during evaluation. '
154
+ 'Note: Does NOT affect loss function used during training '
155
+ '(loss is determined by the `dataset_type` argument). '
156
+ 'Note: Defaults to "auc" for classification and "rmse" for regression.')
157
+ parser.add_argument('--show_individual_scores', action='store_true', default=False,
158
+ help='Show all scores for individual targets, not just average, at the end')
159
+
160
+
161
+
162
+
163
+ # Training arguments
164
+ parser.add_argument('--epochs', type=int, default=30,
165
+ help='Number of epochs to run')
166
+ parser.add_argument('--warmup_epochs', type=float, default=2.0,
167
+ help='Number of epochs during which learning rate increases linearly from '
168
+ 'init_lr to max_lr. Afterwards, learning rate decreases exponentially '
169
+ 'from max_lr to final_lr.')
170
+ parser.add_argument('--init_lr', type=float, default=1e-4,
171
+ help='Initial learning rate')
172
+ parser.add_argument('--max_lr', type=float, default=1e-3,
173
+ help='Maximum learning rate')
174
+ parser.add_argument('--final_lr', type=float, default=1e-4,
175
+ help='Final learning rate')
176
+ parser.add_argument('--no_features_scaling', action='store_true', default=False,
177
+ help='Turn off scaling of features')
178
+ parser.add_argument('--early_stop_epoch', type=int, default=1000, help='If the validation loss has not dropped within '
179
+ 'this many epochs, stop training')
180
+
181
+ # Model arguments
182
+ parser.add_argument('--ensemble_size', type=int, default=1,
183
+ help='Number of models for ensemble prediction.')
184
+ parser.add_argument('--dropout', type=float, default=0.0,
185
+ help='Dropout probability')
186
+ parser.add_argument('--activation', type=str, default='ReLU',
187
+ choices=['ReLU', 'LeakyReLU', 'PReLU', 'tanh', 'SELU', 'ELU'],
188
+ help='Activation function')
189
+ parser.add_argument('--ffn_hidden_size', type=int, default=None,
190
+ help='Hidden dim for higher-capacity FFN (defaults to hidden_size)')
191
+ parser.add_argument('--ffn_num_layers', type=int, default=2,
192
+ help='Number of layers in FFN after MPN encoding')
193
+ parser.add_argument('--weight_decay', type=float, default=0.0, help='weight_decay')
194
+ parser.add_argument('--select_by_loss', action='store_true', default=False,
195
+ help='Use validation loss as the reference standard to select the best model for prediction')
196
+
197
+ parser.add_argument("--embedding_output_type", default="atom", choices=["atom", "bond", "both"],
198
+ help="This the model parameters for pretrain model. The current finetuning task only use the "
199
+ "embeddings from atom branch. ")
200
+
201
+ # Self-attentive readout.
202
+ parser.add_argument('--self_attention', action='store_true', default=False, help='Use self attention layer. '
203
+ 'Otherwise use mean aggregation '
204
+ 'layer.')
205
+ parser.add_argument('--attn_hidden', type=int, default=4, nargs='?', help='Self attention layer '
206
+ 'hidden layer size.')
207
+ parser.add_argument('--attn_out', type=int, default=128, nargs='?', help='Self attention layer '
208
+ 'output feature size.')
209
+
210
+ parser.add_argument('--dist_coff', type=float, default=0.1, help='The coefficient of the distance loss between the outputs of the two branches.')
211
+
212
+
213
+ parser.add_argument('--bond_drop_rate', type=float, default=0, help='Drop-out rate for bonds in the molecular graph.')
214
+ parser.add_argument('--distinct_init', action='store_true', default=False,
215
+ help='Using distinct weight init for model ensemble')
216
+ parser.add_argument('--fine_tune_coff', type=float, default=1,
217
+ help='Enable a distinct fine-tuning learning rate for the fully-connected layers and the other layers')
218
+
219
+ # For multi-gpu finetune.
220
+ parser.add_argument('--enbl_multi_gpu', dest='enbl_multi_gpu',
221
+ action='store_true', default=False,
222
+ help='enable multi-GPU training')
223
+
224
+
225
+ def add_pretrain_args(parser: ArgumentParser):
226
+ parser.add_argument('--cuda', type=bool, default=True,
227
+ help='Enable GPU training or not.')
228
+ parser.add_argument('--enable_multi_gpu', dest='enable_multi_gpu',
229
+ action='store_true', default=False,
230
+ help='enable multi-GPU training')
231
+
232
+ # Data arguments
233
+ parser.add_argument('--data_path', type=str,
234
+ help='Path to data CSV file')
235
+ parser.add_argument('--fg_label_path', type=str, nargs='*',
236
+ help='Path to the label of fg task.')
237
+ parser.add_argument('--atom_vocab_path', type=str, help="Path to the vocabulary.")
238
+ parser.add_argument('--bond_vocab_path', type=str,
239
+ help="Path to the bond vocabulary.")
240
+
241
+ # Model arguments
242
+ parser.add_argument('--embedding_output_type', type=str, default='both', nargs='?',
243
+ choices=("atom", "bond", "both"),
244
+ help="Type of output embeddings. Options: atom, bond, both")
245
+
246
+ #parser.add_argument('--source_branch', type=str, default='both', nargs='?', choices=("atom", "bond", "both"),
247
+ # help="Type of source branch in gtrans. Options: atom, bond, both")
248
+
249
+ parser.add_argument('--save_dir', type=str, default=None,
250
+ help='Directory where model checkpoints will be saved')
251
+ parser.add_argument('--save_interval', type=int, default=9999999999, help='The model saving interval.')
252
+ parser.add_argument('--hidden_size', type=float, default=3,
253
+ help='Dimensionality of hidden layers. The actual dimension is hidden_size * 100.')
254
+ parser.add_argument('--bias', action='store_true', default=False,
255
+ help='Whether to add bias to linear layers')
256
+ parser.add_argument('--depth', type=int, default=3,
257
+ help='Number of message passing steps')
258
+ parser.add_argument('--dropout', type=float, default=0.0,
259
+ help='Dropout probability')
260
+ parser.add_argument('--activation', type=str, default='PReLU',
261
+ choices=['ReLU', 'LeakyReLU', 'PReLU', 'tanh', 'SELU', 'ELU'],
262
+ help='Activation function')
263
+ parser.add_argument('--undirected', action='store_true', default=False,
264
+ help='Undirected edges (always sum the two relevant bond vectors)')
265
+ parser.add_argument('--weight_decay', type=float, default=0.0, help='weight_decay')
266
+ parser.add_argument('--num_attn_head', type=int, default=4, help='The attention head in MTBlock.')
267
+ parser.add_argument('--num_mt_block', type=int, default=1, help="The number of MTBlock.")
268
+ parser.add_argument('--dist_coff', type=float, default=0.1, help='The disagreement coefficient for '
269
+ 'the atom and bond branch.')
270
+
271
+
272
+ # Training arguments
273
+ parser.add_argument("--backbone", default="gtrans", choices=["gtrans"])
274
+ parser.add_argument('--epochs', type=int, default=30,
275
+ help='Number of epochs to run')
276
+ parser.add_argument('--batch_size', type=int, default=32,
277
+ help='Batch size')
278
+ parser.add_argument('--warmup_epochs', type=float, default=2.0,
279
+ help='Number of epochs during which learning rate increases linearly from '
280
+ 'init_lr to max_lr. Afterwards, learning rate decreases exponentially '
281
+ 'from max_lr to final_lr.')
282
+ parser.add_argument('--init_lr', type=float, default=1e-4,
283
+ help='Initial learning rate')
284
+ parser.add_argument('--max_lr', type=float, default=1e-3,
285
+ help='Maximum learning rate')
286
+ parser.add_argument('--final_lr', type=float, default=1e-4,
287
+ help='Final learning rate')
288
+ parser.add_argument('--bond_drop_rate', type=float, default=0, help='Drop out bond in molecular')
289
+
290
+
291
+
292
+ def update_checkpoint_args(args: Namespace):
293
+ """
294
+ Walks the checkpoint directory to find all checkpoints, updating args.checkpoint_paths and args.ensemble_size.
295
+
296
+ :param args: Arguments.
297
+ """
298
+ if hasattr(args, 'checkpoint_paths') and args.checkpoint_paths is not None:
299
+ return
300
+ if not hasattr(args, 'checkpoint_path'):
301
+ args.checkpoint_path = None
302
+
303
+ if not hasattr(args, 'checkpoint_dir'):
304
+ args.checkpoint_dir = None
305
+
306
+ if args.checkpoint_dir is not None and args.checkpoint_path is not None:
307
+ raise ValueError('Only one of checkpoint_dir and checkpoint_path can be specified.')
308
+
309
+ if args.checkpoint_dir is None:
310
+ args.checkpoint_paths = [args.checkpoint_path] if args.checkpoint_path is not None else None
311
+ return
312
+
313
+ args.checkpoint_paths = []
314
+
315
+ for root, _, files in os.walk(args.checkpoint_dir):
316
+ for fname in files:
317
+ if fname.endswith('.pt'):
318
+ args.checkpoint_paths.append(os.path.join(root, fname))
319
+
320
+ if args.parser_name == "eval":
321
+ assert args.ensemble_size * args.num_folds == len(args.checkpoint_paths)
322
+
323
+ args.ensemble_size = len(args.checkpoint_paths)
324
+
325
+
326
+
327
+ if args.ensemble_size == 0:
328
+ raise ValueError(f'Failed to find any model checkpoints in directory "{args.checkpoint_dir}"')
329
+
330
+
331
+ def modify_predict_args(args: Namespace):
332
+ """
333
+ Modifies and validates predicting args in place.
334
+
335
+ :param args: Arguments.
336
+ """
337
+ assert args.data_path
338
+ assert args.output_path
339
+ assert args.checkpoint_dir is not None or args.checkpoint_path is not None or args.checkpoint_paths is not None
340
+
341
+ update_checkpoint_args(args)
342
+
343
+ args.cuda = not args.no_cuda and torch.cuda.is_available()
344
+ del args.no_cuda
345
+
346
+ # Create directory for preds path
347
+ makedirs(args.output_path, isfile=True)
348
+ setattr(args, 'fingerprint', False)
349
+
350
+
351
+ def modify_fingerprint_args(args):
352
+ assert args.data_path
353
+ assert args.output_path
354
+ assert args.checkpoint_path is not None or args.checkpoint_paths is not None
355
+
356
+
357
+ update_checkpoint_args(args)
358
+ args.cuda = not args.no_cuda and torch.cuda.is_available()
359
+ del args.no_cuda
360
+ makedirs(args.output_path, isfile=True)
361
+ setattr(args, 'fingerprint', True)
362
+
363
+
364
+ def get_newest_train_args():
365
+ """
366
+ For backward compatibility.
367
+
368
+ :return: A Namespace containing the newest training arguments
369
+ """
370
+ dummy_parser = ArgumentParser()
371
+ add_finetune_args(dummy_parser)
372
+ args = dummy_parser.parse_args(args=[])
373
+ args.data_path = ''
374
+ modify_train_args(args)
375
+ return args
376
+
377
+
378
+ def modify_train_args(args: Namespace):
379
+ """
380
+ Modifies and validates training arguments in place.
381
+
382
+ :param args: Arguments.
383
+ """
384
+ global TEMP_DIR # Prevents the temporary directory from being deleted upon function return
385
+
386
+ assert args.data_path is not None
387
+ assert args.dataset_type is not None
388
+
389
+ if args.save_dir is not None:
390
+ makedirs(args.save_dir)
391
+ else:
392
+ TEMP_DIR = TemporaryDirectory()
393
+ args.save_dir = TEMP_DIR.name
394
+
395
+ args.cuda = not args.no_cuda and torch.cuda.is_available()
396
+ del args.no_cuda
397
+
398
+ args.features_scaling = not args.no_features_scaling
399
+ del args.no_features_scaling
400
+
401
+ if args.metric is None:
402
+ if args.dataset_type == 'classification':
403
+ args.metric = 'auc'
404
+ else:
405
+ args.metric = 'rmse'
406
+
407
+ if not ((args.dataset_type == 'classification' and args.metric in ['auc', 'prc-auc', 'accuracy']) or
408
+ (args.dataset_type == 'regression' and args.metric in ['rmse', 'mae', 'r2'])):
409
+ raise ValueError(f'Metric "{args.metric}" invalid for dataset type "{args.dataset_type}".')
410
+
411
+ args.minimize_score = args.metric in ['rmse', 'mae']
412
+
413
+ update_checkpoint_args(args)
414
+
415
+ if args.features_only:
416
+ assert args.features_generator or args.features_path
417
+
418
+ args.use_input_features = args.features_generator or args.features_path
419
+
420
+ if args.features_generator is not None and 'rdkit_2d_normalized' in args.features_generator:
421
+ assert not args.features_scaling
422
+
423
+ args.num_lrs = 1
424
+
425
+
426
+
427
+ assert (args.split_type == 'predetermined') == (args.folds_file is not None) == (args.test_fold_index is not None)
428
+ assert (args.split_type == 'crossval') == (args.crossval_index_dir is not None)
429
+ assert (args.split_type in ['crossval', 'index_predetermined']) == (args.crossval_index_file is not None)
430
+ if args.split_type in ['crossval', 'index_predetermined']:
431
+ with open(args.crossval_index_file, 'rb') as rf:
432
+ args.crossval_index_sets = pickle.load(rf)
433
+ args.num_folds = len(args.crossval_index_sets)
434
+ args.seed = 0
435
+
436
+
437
+ if args.bond_drop_rate > 0:
438
+ args.no_cache = True
439
+
440
+ setattr(args, 'fingerprint', False)
441
+
442
+
443
+ def modify_pretrain_args(args: Namespace):
444
+ """
445
+ Modifies and validates pretraining arguments in place.
446
+ :param args: Arguments.
447
+ :return: None; args is updated in place.
448
+ """
449
+ args.dense = False
450
+ args.fine_tune_coff = 1
451
+ args.no_cache = True
452
+ args.hidden_size = int(args.hidden_size)
453
+
454
+
455
+ def parse_args() -> Namespace:
456
+ """
457
+ Parses arguments for training and testing (includes modifying/validating arguments).
458
+
459
+ :return: A Namespace containing the parsed, modified, and validated args.
460
+ """
461
+ parser = ArgumentParser()
462
+ subparser = parser.add_subparsers(title="subcommands",
463
+ dest="parser_name",
464
+ help="Subcommands for fintune, prediction, and fingerprint.")
465
+ parser_finetune = subparser.add_parser('finetune', help="Fine tune the pre-trained model.")
466
+ add_finetune_args(parser_finetune)
467
+ parser_eval = subparser.add_parser('eval', help="Evaluate the results of the pre-trained model.")
468
+ add_finetune_args(parser_eval)
469
+ parser_predict = subparser.add_parser('predict', help="Predict results from fine tuned model.")
470
+ add_predict_args(parser_predict)
471
+ parser_fp = subparser.add_parser('fingerprint', help="Get the fingerprints of SMILES.")
472
+ add_fingerprint_args(parser_fp)
473
+ parser_pretrain = subparser.add_parser('pretrain', help="Pretrain with unlabelled SMILES.")
474
+ add_pretrain_args(parser_pretrain)
475
+
476
+ args = parser.parse_args()
477
+
478
+ if args.parser_name == 'finetune' or args.parser_name == 'eval':
479
+ modify_train_args(args)
480
+ elif args.parser_name == "pretrain":
481
+ modify_pretrain_args(args)
482
+ elif args.parser_name == 'predict':
483
+ modify_predict_args(args)
484
+ elif args.parser_name == 'fingerprint':
485
+ modify_fingerprint_args(args)
486
+
487
+ return args
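A small usage sketch (not part of the commit) of the parser defined above, driven programmatically through sys.argv; the entry-script name main.py and all file paths are placeholders.

import sys
from grover.util.parsing import parse_args

# Simulate: python main.py finetune --data_path ./data/train.csv ...
sys.argv = ['main.py', 'finetune', '--data_path', './data/train.csv',
            '--dataset_type', 'classification', '--epochs', '30',
            '--save_dir', './ckpts', '--self_attention']
args = parse_args()
print(args.metric)  # defaults to 'auc' for classification datasets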
grover/util/scheduler.py ADDED
@@ -0,0 +1,97 @@
1
+ """
2
+ The learning rate scheduler.
3
+ This implementation is adapted from
4
+ https://github.com/chemprop/chemprop/blob/master/chemprop/nn_utils.py
5
+ """
6
+ from typing import List, Union
7
+
8
+ import numpy as np
9
+ from torch.optim.lr_scheduler import _LRScheduler
10
+
11
+
12
+ class NoamLR(_LRScheduler):
13
+ """
14
+ Noam learning rate scheduler with piecewise linear increase and exponential decay.
15
+
16
+ The learning rate increases linearly from init_lr to max_lr over the course of
17
+ the first warmup_steps (where warmup_steps = warmup_epochs * steps_per_epoch).
18
+ Then the learning rate decreases exponentially from max_lr to final_lr over the
19
+ course of the remaining total_steps - warmup_steps (where total_steps =
20
+ total_epochs * steps_per_epoch). This is roughly based on the learning rate
21
+ schedule from Attention Is All You Need, section 5.3 (https://arxiv.org/abs/1706.03762).
22
+ """
23
+ def __init__(self,
24
+ optimizer,
25
+ warmup_epochs: List[Union[float, int]],
26
+ total_epochs: List[int],
27
+ steps_per_epoch: int,
28
+ init_lr: List[float],
29
+ max_lr: List[float],
30
+ final_lr: List[float],
31
+ fine_tune_coff: float = 1.0,
32
+ fine_tune_param_idx: int = 0):
33
+ """
34
+ Initializes the learning rate scheduler.
35
+
36
+
37
+ :param optimizer: A PyTorch optimizer.
38
+ :param warmup_epochs: The number of epochs during which to linearly increase the learning rate.
39
+ :param total_epochs: The total number of epochs.
40
+ :param steps_per_epoch: The number of steps (batches) per epoch.
41
+ :param init_lr: The initial learning rate.
42
+ :param max_lr: The maximum learning rate (achieved after warmup_epochs).
43
+ :param final_lr: The final learning rate (achieved after total_epochs).
44
+ :param fine_tune_coff: The fine tune coefficient for the target param group. The true learning rate for the
45
+ target param group would be lr*fine_tune_coff.
46
+ :param fine_tune_param_idx: The index of target param group. Default is index 0.
47
+ """
48
+
49
+ # assert len(optimizer.param_groups) == len(warmup_epochs) == len(total_epochs) == len(init_lr) == \
50
+ # len(max_lr) == len(final_lr)
51
+
52
+ self.num_lrs = len(optimizer.param_groups)
53
+
54
+ self.optimizer = optimizer
55
+ self.warmup_epochs = np.array([warmup_epochs] * self.num_lrs)
56
+ self.total_epochs = np.array([total_epochs] * self.num_lrs)
57
+ self.steps_per_epoch = steps_per_epoch
58
+ self.init_lr = np.array([init_lr] * self.num_lrs)
59
+ self.max_lr = np.array([max_lr] * self.num_lrs)
60
+ self.final_lr = np.array([final_lr] * self.num_lrs)
61
+ self.lr_coff = np.array([1] * self.num_lrs)
62
+ self.fine_tune_param_idx = fine_tune_param_idx
63
+ self.lr_coff[self.fine_tune_param_idx] = fine_tune_coff
64
+
65
+ self.current_step = 0
66
+ self.lr = [init_lr] * self.num_lrs
67
+ self.warmup_steps = (self.warmup_epochs * self.steps_per_epoch).astype(int)
68
+ self.total_steps = self.total_epochs * self.steps_per_epoch
69
+ self.linear_increment = (self.max_lr - self.init_lr) / self.warmup_steps
70
+
71
+ self.exponential_gamma = (self.final_lr / self.max_lr) ** (1 / (self.total_steps - self.warmup_steps))
72
+ super(NoamLR, self).__init__(optimizer)
73
+
74
+ def get_lr(self) -> List[float]:
75
+ """Gets a list of the current learning rates."""
76
+ return list(self.lr)
77
+
78
+ def step(self, current_step: int = None):
79
+ """
80
+ Updates the learning rate by taking a step.
81
+
82
+ :param current_step: Optionally specify what step to set the learning rate to.
83
+ If None, current_step = self.current_step + 1.
84
+ """
85
+ if current_step is not None:
86
+ self.current_step = current_step
87
+ else:
88
+ self.current_step += 1
89
+ for i in range(self.num_lrs):
90
+ if self.current_step <= self.warmup_steps[i]:
91
+ self.lr[i] = self.init_lr[i] + self.current_step * self.linear_increment[i]
92
+ elif self.current_step <= self.total_steps[i]:
93
+ self.lr[i] = self.max_lr[i] * (self.exponential_gamma[i] ** (self.current_step - self.warmup_steps[i]))
94
+ else: # theoretically this case should never be reached since training should stop at total_steps
95
+ self.lr[i] = self.final_lr[i]
96
+ self.lr[i] *= self.lr_coff[i]
97
+ self.optimizer.param_groups[i]['lr'] = self.lr[i]
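A minimal usage sketch (not part of the commit) of the NoamLR scheduler above, using a single parameter group and made-up epoch/step counts.

import torch
from grover.util.scheduler import NoamLR

model = torch.nn.Linear(10, 1)
optimizer = torch.optim.Adam([{'params': model.parameters(), 'lr': 1e-4}])
scheduler = NoamLR(optimizer,
                   warmup_epochs=2.0, total_epochs=30, steps_per_epoch=100,
                   init_lr=1e-4, max_lr=1e-3, final_lr=1e-4)

for step in range(30 * 100):
    optimizer.step()
    scheduler.step()  # linear warm-up for the first 200 steps, then exponential decay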
grover/util/utils.py ADDED
@@ -0,0 +1,797 @@
1
+ """
2
+ The general utility functions.
3
+ """
4
+ import csv
5
+ import logging
6
+ import os
7
+ import pickle
8
+ import random
9
+ from argparse import Namespace
10
+ from collections import defaultdict
11
+ from logging import Logger
12
+ from typing import List, Set, Tuple, Union, Dict
13
+
14
+ import numpy as np
15
+ import torch
16
+ from rdkit import Chem
17
+ from rdkit.Chem.Scaffolds import MurckoScaffold
18
+ from torch import nn as nn
19
+ from tqdm import tqdm as core_tqdm
20
+
21
+ from grover.data import MoleculeDatapoint, MoleculeDataset, StandardScaler
22
+ from grover.model.models import GroverFpGeneration, GroverFinetuneTask
23
+ from grover.util.nn_utils import initialize_weights
24
+ from grover.util.scheduler import NoamLR
25
+
26
+
27
+ np.float = float
28
+
29
+
30
+ def get_model_args():
31
+ """
32
+ Get model structure related parameters
33
+
34
+ :return: a list containing parameters
35
+ """
36
+ return ['model_type', 'ensemble_size', 'input_layer', 'hidden_size', 'bias', 'depth',
37
+ 'dropout', 'activation', 'undirected', 'ffn_hidden_size', 'ffn_num_layers',
38
+ 'atom_message', 'weight_decay', 'select_by_loss', 'skip_epoch', 'backbone',
39
+ 'embedding_output_type', 'self_attention', 'attn_hidden', 'attn_out', 'dense',
40
+ 'bond_drop_rate', 'distinct_init', 'aug_rate', 'fine_tune_coff', 'nencoders',
41
+ 'dist_coff', 'no_attach_fea', 'coord', "num_attn_head", "num_mt_block",
42
+ ]
43
+
44
+
45
+ def save_features(path: str, features: List[np.ndarray]):
46
+ """
47
+ Saves features to a compressed .npz file with array name "features".
48
+
49
+ :param path: Path to a .npz file where the features will be saved.
50
+ :param features: A list of 1D numpy arrays containing the features for molecules.
51
+ """
52
+ np.savez_compressed(path, features=features)
53
+
54
+
55
+ def load_features(path: str) -> np.ndarray:
56
+ """
57
+ Loads features saved in a variety of formats.
58
+
59
+ Supported formats:
60
+ - .npz compressed (assumes features are saved with name "features")
61
+
62
+ All formats assume that the SMILES strings loaded elsewhere in the code are in the same
63
+ order as the features loaded here.
64
+
65
+ :param path: Path to a file containing features.
66
+ :return: A 2D numpy array of size (num_molecules, features_size) containing the features.
67
+ """
68
+ extension = os.path.splitext(path)[1]
69
+
70
+ if extension == '.npz':
71
+ features = np.load(path)['features']
72
+ else:
73
+ raise ValueError(f'Features path extension {extension} not supported.')
74
+
75
+ return features
76
+
77
+
78
+ class tqdm(core_tqdm):
79
+ def __init__(self, *args, **kwargs):
80
+ kwargs.setdefault("ascii", True)
81
+ super(tqdm, self).__init__(*args, **kwargs)
82
+
83
+
84
+ def get_task_names(path: str, use_compound_names: bool = False) -> List[str]:
85
+ """
86
+ Gets the task names from a data CSV file.
87
+
88
+ :param path: Path to a CSV file.
89
+ :param use_compound_names: Whether file has compound names in addition to smiles strings.
90
+ :return: A list of task names.
91
+ """
92
+ index = 2 if use_compound_names else 1
93
+ task_names = get_header(path)[index:]
94
+
95
+ return task_names
96
+
97
+
98
+ def get_header(path: str) -> List[str]:
99
+ """
100
+ Returns the header of a data CSV file.
101
+
102
+ :param path: Path to a CSV file.
103
+ :return: A list of strings containing the strings in the comma-separated header.
104
+ """
105
+ with open(path) as f:
106
+ header = next(csv.reader(f))
107
+
108
+ return header
109
+
110
+
111
+ def get_num_tasks(path: str) -> int:
112
+ """
113
+ Gets the number of tasks in a data CSV file.
114
+
115
+ :param path: Path to a CSV file.
116
+ :return: The number of tasks.
117
+ """
118
+ return len(get_header(path)) - 1
119
+
120
+
121
+
122
+ def filter_invalid_smiles(data: MoleculeDataset) -> MoleculeDataset:
123
+ """
124
+ Filters out invalid SMILES.
125
+
126
+ :param data: A MoleculeDataset.
127
+ :return: A MoleculeDataset with only valid molecules.
128
+ """
129
+ datapoint_list = []
130
+ for idx, datapoint in enumerate(data):
131
+ if datapoint.smiles == '':
132
+ print(f'invalid smiles {idx}: {datapoint.smiles}')
133
+ continue
134
+ mol = Chem.MolFromSmiles(datapoint.smiles)
135
+ if mol.GetNumHeavyAtoms() == 0:
136
+ print(f'invalid heavy {idx}')
137
+ continue
138
+ datapoint_list.append(datapoint)
139
+ return MoleculeDataset(datapoint_list)
140
+
141
+
142
+ def get_data(path: str,
143
+ skip_invalid_smiles: bool = True,
144
+ args: Namespace = None,
145
+ features_path: List[str] = None,
146
+ max_data_size: int = None,
147
+ use_compound_names: bool = None,
148
+ logger: Logger = None) -> MoleculeDataset:
149
+ """
150
+ Gets smiles string and target values (and optionally compound names if provided) from a CSV file.
151
+
152
+ :param path: Path to a CSV file.
153
+ :param skip_invalid_smiles: Whether to skip and filter out invalid smiles.
154
+ :param args: Arguments.
155
+ :param features_path: A list of paths to files containing features. If provided, it is used
156
+ in place of args.features_path.
157
+ :param max_data_size: The maximum number of data points to load.
158
+ :param use_compound_names: Whether file has compound names in addition to smiles strings.
159
+ :param logger: Logger.
160
+ :return: A MoleculeDataset containing smiles strings and target values along
161
+ with other info such as additional features and compound names when desired.
162
+ """
163
+ debug = logger.debug if logger is not None else print
164
+
165
+ if args is not None:
166
+ # Prefer explicit function arguments but default to args if not provided
167
+ features_path = features_path if features_path is not None else args.features_path
168
+ max_data_size = max_data_size if max_data_size is not None else args.max_data_size
169
+ use_compound_names = use_compound_names if use_compound_names is not None else args.use_compound_names
170
+ else:
171
+ use_compound_names = False
172
+
173
+ max_data_size = max_data_size or float('inf')
174
+
175
+ # Load features
176
+ if features_path is not None:
177
+ features_data = []
178
+ for feat_path in features_path:
179
+ features_data.append(load_features(feat_path)) # each is num_data x num_features
180
+ features_data = np.concatenate(features_data, axis=1)
181
+ args.features_dim = len(features_data[0])
182
+ else:
183
+ features_data = None
184
+ if args is not None:
185
+ args.features_dim = 0
186
+
187
+ skip_smiles = set()
188
+
189
+ # Load data
190
+ with open(path) as f:
191
+ reader = csv.reader(f)
192
+ next(reader) # skip header
193
+
194
+ lines = []
195
+ for line in reader:
196
+ smiles = line[0]
197
+
198
+ if smiles in skip_smiles:
199
+ continue
200
+
201
+ lines.append(line)
202
+
203
+ if len(lines) >= max_data_size:
204
+ break
205
+
206
+ data = MoleculeDataset([
207
+ MoleculeDatapoint(
208
+ line=line,
209
+ args=args,
210
+ features=features_data[i] if features_data is not None else None,
211
+ use_compound_names=use_compound_names
212
+ ) for i, line in tqdm(enumerate(lines), total=len(lines), disable=True)
213
+ ])
214
+
215
+ # Filter out invalid SMILES
216
+ if skip_invalid_smiles:
217
+ original_data_len = len(data)
218
+ data = filter_invalid_smiles(data)
219
+
220
+ if len(data) < original_data_len:
221
+ debug(f'Warning: {original_data_len - len(data)} SMILES are invalid.')
222
+
223
+ return data
224
+
225
+
226
+ def get_data_from_smiles(smiles: List[str], skip_invalid_smiles: bool = True, logger: Logger = None,
227
+ args: Namespace = None) -> MoleculeDataset:
228
+ """
229
+ Converts SMILES to a MoleculeDataset.
230
+
231
+ :param smiles: A list of SMILES strings.
232
+ :param skip_invalid_smiles: Whether to skip and filter out invalid smiles.
233
+ :param logger: Logger.
234
+ :return: A MoleculeDataset with all of the provided SMILES.
235
+ """
236
+ debug = logger.debug if logger is not None else print
237
+
238
+ data = MoleculeDataset([MoleculeDatapoint(line=[smile], args=args) for smile in smiles])
239
+
240
+ # Filter out invalid SMILES
241
+ if skip_invalid_smiles:
242
+ original_data_len = len(data)
243
+ data = filter_invalid_smiles(data)
244
+
245
+ if len(data) < original_data_len:
246
+ debug(f'Warning: {original_data_len - len(data)} SMILES are invalid.')
247
+
248
+ return data
249
+
250
+
251
+ def split_data(data: MoleculeDataset,
252
+ split_type: str = 'random',
253
+ sizes: Tuple[float, float, float] = (0.8, 0.1, 0.1),
254
+ seed: int = 0,
255
+ args: Namespace = None,
256
+ logger: Logger = None) -> Tuple[MoleculeDataset,
257
+ MoleculeDataset,
258
+ MoleculeDataset]:
259
+ """
260
+ Splits data into training, validation, and test splits.
261
+
262
+ :param data: A MoleculeDataset.
263
+ :param split_type: Split type.
264
+ :param sizes: A length-3 tuple with the proportions of data in the
265
+ train, validation, and test sets.
266
+ :param seed: The random seed to use before shuffling data.
267
+ :param args: Namespace of arguments.
268
+ :param logger: A logger.
269
+ :return: A tuple containing the train, validation, and test splits of the data.
270
+ """
271
+ assert len(sizes) == 3 and sum(sizes) == 1
272
+
273
+ if args is not None:
274
+ folds_file, val_fold_index, test_fold_index = \
275
+ args.folds_file, args.val_fold_index, args.test_fold_index
276
+ else:
277
+ folds_file = val_fold_index = test_fold_index = None
278
+
279
+ if split_type == 'crossval':
280
+ index_set = args.crossval_index_sets[args.seed]
281
+ data_split = []
282
+ for split in range(3):
283
+ split_indices = []
284
+ for index in index_set[split]:
285
+ with open(os.path.join(args.crossval_index_dir, f'{index}.pkl'), 'rb') as rf:
286
+ split_indices.extend(pickle.load(rf))
287
+ data_split.append([data[i] for i in split_indices])
288
+ train, val, test = tuple(data_split)
289
+ return MoleculeDataset(train), MoleculeDataset(val), MoleculeDataset(test)
290
+
291
+ elif split_type == 'index_predetermined':
292
+ split_indices = args.crossval_index_sets[args.seed]
293
+ assert len(split_indices) == 3
294
+ data_split = []
295
+ for split in range(3):
296
+ data_split.append([data[i] for i in split_indices[split]])
297
+ train, val, test = tuple(data_split)
298
+ return MoleculeDataset(train), MoleculeDataset(val), MoleculeDataset(test)
299
+
300
+ elif split_type == 'predetermined':
301
+ if not val_fold_index:
302
+ assert sizes[2] == 0 # test set is created separately so use all of the other data for train and val
303
+ assert folds_file is not None
304
+ assert test_fold_index is not None
305
+
306
+ try:
307
+ with open(folds_file, 'rb') as f:
308
+ all_fold_indices = pickle.load(f)
309
+ except UnicodeDecodeError:
310
+ with open(folds_file, 'rb') as f:
311
+ all_fold_indices = pickle.load(f, encoding='latin1') # in case we're loading indices from python2
312
+ # assert len(data) == sum([len(fold_indices) for fold_indices in all_fold_indices])
313
+
314
+ log_scaffold_stats(data, all_fold_indices, logger=logger)
315
+
316
+ folds = [[data[i] for i in fold_indices] for fold_indices in all_fold_indices]
317
+
318
+ test = folds[test_fold_index]
319
+ if val_fold_index is not None:
320
+ val = folds[val_fold_index]
321
+
322
+ train_val = []
323
+ for i in range(len(folds)):
324
+ if i != test_fold_index and (val_fold_index is None or i != val_fold_index):
325
+ train_val.extend(folds[i])
326
+
327
+ if val_fold_index is not None:
328
+ train = train_val
329
+ else:
330
+ random.seed(seed)
331
+ random.shuffle(train_val)
332
+ train_size = int(sizes[0] * len(train_val))
333
+ train = train_val[:train_size]
334
+ val = train_val[train_size:]
335
+
336
+ return MoleculeDataset(train), MoleculeDataset(val), MoleculeDataset(test)
337
+
338
+ elif split_type == 'scaffold_balanced':
339
+ return scaffold_split(data, sizes=sizes, balanced=True, seed=seed, logger=logger)
340
+
341
+ elif split_type == 'random':
342
+ data.shuffle(seed=seed)
343
+
344
+ train_size = int(sizes[0] * len(data))
345
+ train_val_size = int((sizes[0] + sizes[1]) * len(data))
346
+
347
+ train = data[:train_size]
348
+ val = data[train_size:train_val_size]
349
+ test = data[train_val_size:]
350
+
351
+ return MoleculeDataset(train), MoleculeDataset(val), MoleculeDataset(test)
352
+
353
+ else:
354
+ raise ValueError(f'split_type "{split_type}" not supported.')
355
+
356
+
357
+ def get_class_sizes(data: MoleculeDataset) -> List[List[float]]:
358
+ """
359
+ Determines the proportions of the different classes in the classification dataset.
360
+
361
+ :param data: A classification dataset
362
+ :return: A list of lists of class proportions. Each inner list contains the class proportions
363
+ for a task.
364
+ """
365
+ targets = data.targets()
366
+
367
+ # Filter out Nones
368
+ valid_targets = [[] for _ in range(data.num_tasks())]
369
+ for i in range(len(targets)):
370
+ for task_num in range(len(targets[i])):
371
+ if targets[i][task_num] is not None:
372
+ valid_targets[task_num].append(targets[i][task_num])
373
+
374
+ class_sizes = []
375
+ for task_targets in valid_targets:
376
+ # Make sure we're dealing with a binary classification task
377
+ assert set(np.unique(task_targets)) <= {0, 1}
378
+
379
+ try:
380
+ ones = np.count_nonzero(task_targets) / len(task_targets)
381
+ except ZeroDivisionError:
382
+ ones = float('nan')
383
+ print('Warning: class has no targets')
384
+ class_sizes.append([1 - ones, ones])
385
+
386
+ return class_sizes
387
+
388
+
389
+ def generate_scaffold(mol: Union[str, Chem.Mol], include_chirality: bool = False) -> str:
390
+ """
391
+ Compute the Bemis-Murcko scaffold for a SMILES string.
392
+
393
+ :param mol: A smiles string or an RDKit molecule.
394
+ :param include_chirality: Whether to include chirality.
395
+ :return:
396
+ """
397
+ mol = Chem.MolFromSmiles(mol) if type(mol) == str else mol
398
+ scaffold = MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=include_chirality)
399
+
400
+ return scaffold
401
+
402
+
403
+ def scaffold_to_smiles(mols: Union[List[str], List[Chem.Mol]],
404
+ use_indices: bool = False) -> Dict[str, Union[Set[str], Set[int]]]:
405
+ """
406
+ Computes scaffold for each smiles string and returns a mapping from scaffolds to sets of smiles.
407
+
408
+ :param mols: A list of smiles strings or RDKit molecules.
409
+ :param use_indices: Whether to map to the smiles' index in all_smiles rather than mapping
410
+ to the smiles string itself. This is necessary if there are duplicate smiles.
411
+ :return: A dictionary mapping each unique scaffold to all smiles (or smiles indices) which have that scaffold.
412
+ """
413
+ scaffolds = defaultdict(set)
414
+ for i, mol in tqdm(enumerate(mols), total=len(mols)):
415
+ scaffold = generate_scaffold(mol)
416
+ if use_indices:
417
+ scaffolds[scaffold].add(i)
418
+ else:
419
+ scaffolds[scaffold].add(mol)
420
+
421
+ return scaffolds
422
+
423
+
424
+ def scaffold_split(data: MoleculeDataset,
425
+ sizes: Tuple[float, float, float] = (0.8, 0.1, 0.1),
426
+ balanced: bool = False,
427
+ seed: int = 0,
428
+ logger: logging.Logger = None) -> Tuple[MoleculeDataset,
429
+ MoleculeDataset,
430
+ MoleculeDataset]:
431
+ """
432
+ Split a dataset by scaffold so that no molecules sharing a scaffold are in the same split.
433
+
434
+ :param data: A MoleculeDataset.
435
+ :param sizes: A length-3 tuple with the proportions of data in the
436
+ train, validation, and test sets.
437
+ :param balanced: Try to balance sizes of scaffolds in each set, rather than just putting smallest in test set.
438
+ :param seed: Seed for shuffling when doing balanced splitting.
439
+ :param logger: A logger.
440
+ :return: A tuple containing the train, validation, and test splits of the data.
441
+ """
442
+ assert sum(sizes) == 1
443
+
444
+ # Split
445
+ train_size, val_size, test_size = sizes[0] * len(data), sizes[1] * len(data), sizes[2] * len(data)
446
+ train, val, test = [], [], []
447
+ train_scaffold_count, val_scaffold_count, test_scaffold_count = 0, 0, 0
448
+
449
+ # Map from scaffold to index in the data
450
+ scaffold_to_indices = scaffold_to_smiles(data.smiles(), use_indices=True)
451
+
452
+ if balanced: # Put stuff that's bigger than half the val/test size into train, rest just order randomly
453
+ index_sets = list(scaffold_to_indices.values())
454
+ big_index_sets = []
455
+ small_index_sets = []
456
+ for index_set in index_sets:
457
+ if len(index_set) > val_size / 2 or len(index_set) > test_size / 2:
458
+ big_index_sets.append(index_set)
459
+ else:
460
+ small_index_sets.append(index_set)
461
+ random.seed(seed)
462
+ random.shuffle(big_index_sets)
463
+ random.shuffle(small_index_sets)
464
+ index_sets = big_index_sets + small_index_sets
465
+ else: # Sort from largest to smallest scaffold sets
466
+ index_sets = sorted(list(scaffold_to_indices.values()),
467
+ key=lambda index_set: len(index_set),
468
+ reverse=True)
469
+
470
+ for index_set in index_sets:
471
+ if len(train) + len(index_set) <= train_size:
472
+ train += index_set
473
+ train_scaffold_count += 1
474
+ elif len(val) + len(index_set) <= val_size:
475
+ val += index_set
476
+ val_scaffold_count += 1
477
+ else:
478
+ test += index_set
479
+ test_scaffold_count += 1
480
+
481
+ if logger is not None:
482
+ logger.debug(f'Total scaffolds = {len(scaffold_to_indices):,} | '
483
+ f'train scaffolds = {train_scaffold_count:,} | '
484
+ f'val scaffolds = {val_scaffold_count:,} | '
485
+ f'test scaffolds = {test_scaffold_count:,}')
486
+
487
+ log_scaffold_stats(data, index_sets, logger=logger)
488
+
489
+ # Map from indices to data
490
+ train = [data[i] for i in train]
491
+ val = [data[i] for i in val]
492
+ test = [data[i] for i in test]
493
+
494
+ return MoleculeDataset(train), MoleculeDataset(val), MoleculeDataset(test)
495
+
496
+
497
+ def log_scaffold_stats(data: MoleculeDataset,
498
+ index_sets: List[Set[int]],
499
+ num_scaffolds: int = 10,
500
+ num_labels: int = 20,
501
+ logger: logging.Logger = None) -> List[Tuple[List[float], List[int]]]:
502
+ """
503
+ Logs and returns statistics about counts and average target values in molecular scaffolds.
504
+
505
+ :param data: A MoleculeDataset.
506
+ :param index_sets: A list of sets of indices representing splits of the data.
507
+ :param num_scaffolds: The number of scaffolds about which to display statistics.
508
+ :param num_labels: The number of labels about which to display statistics.
509
+ :param logger: A Logger.
510
+ :return: A list of tuples where each tuple contains a list of average target values
511
+ across the first num_labels labels and a list of the number of non-zero values for
512
+ the first num_scaffolds scaffolds, sorted in decreasing order of scaffold frequency.
513
+ """
514
+ # print some statistics about scaffolds
515
+ target_avgs = []
516
+ counts = []
517
+ for index_set in index_sets:
518
+ data_set = [data[i] for i in index_set]
519
+ targets = [d.targets for d in data_set]
520
+ targets = np.array(targets, dtype=float)  # np.float was removed in recent NumPy releases
521
+ target_avgs.append(np.nanmean(targets, axis=0))
522
+ counts.append(np.count_nonzero(~np.isnan(targets), axis=0))
523
+ stats = [(target_avgs[i][:num_labels], counts[i][:num_labels]) for i in range(min(num_scaffolds, len(target_avgs)))]
524
+
525
+ if logger is not None:
526
+ logger.debug('Label averages per scaffold, in decreasing order of scaffold frequency, '
527
+ f'capped at {num_scaffolds} scaffolds and {num_labels} labels: {stats}')
528
+
529
+ return stats
530
+
531
+
532
+ def makedirs(path: str, isfile: bool = False):
533
+ """
534
+ Creates a directory given a path to either a directory or file.
535
+
536
+ If a directory is provided, creates that directory. If a file is provided (i.e. isfile == True),
537
+ creates the parent directory for that file.
538
+
539
+ :param path: Path to a directory or file.
540
+ :param isfile: Whether the provided path points to a file rather than a directory.
541
+ """
542
+ if isfile:
543
+ path = os.path.dirname(path)
544
+ if path != '':
545
+ os.makedirs(path, exist_ok=True)
546
+
547
+
548
+ def load_args(path: str) -> Namespace:
549
+ """
550
+ Loads the arguments a model was trained with.
551
+
552
+ :param path: Path where model checkpoint is saved.
553
+ :return: The arguments Namespace that the model was trained with.
554
+ """
555
+ return torch.load(path, map_location=lambda storage, loc: storage)['args']
556
+
557
+
558
+
559
+ def get_ffn_layer_id(model: GroverFinetuneTask):
560
+ """
561
+ Get the ids of the FFN-head parameters of a GroverFinetuneTask. (Ad hoc!)
562
+ :param model: A GroverFinetuneTask model.
563
+ :return: A list of ids of the FFN parameters (excluding the GROVER encoder).
564
+ """
565
+ return [id(param) for name, param in model.named_parameters() if "grover" not in name and "ffn" in name]
566
+
567
+
568
+ def build_optimizer(model: nn.Module, args: Namespace):
569
+ """
570
+ Builds an Optimizer.
571
+
572
+ :param model: The model to optimize.
573
+ :param args: Arguments.
574
+ :return: An initialized Optimizer.
575
+ """
576
+
577
+ # Only adjust the learning rate for the GroverFinetuneTask.
578
+ if type(model) == GroverFinetuneTask:
579
+ ffn_params = get_ffn_layer_id(model)
580
+ else:
581
+ # if not, init adam optimizer normally.
582
+ return torch.optim.Adam(model.parameters(), lr=args.init_lr, weight_decay=args.weight_decay)
583
+ base_params = filter(lambda p: id(p) not in ffn_params, model.parameters())
584
+ ffn_params = filter(lambda p: id(p) in ffn_params, model.parameters())
585
+ if args.fine_tune_coff == 0:
586
+ for param in base_params:
587
+ param.requires_grad = False
588
+
589
+ optimizer = torch.optim.Adam([
590
+ {'params': base_params, 'lr': args.init_lr * args.fine_tune_coff},
591
+ {'params': ffn_params, 'lr': args.init_lr}
592
+ ], lr=args.init_lr, weight_decay=args.weight_decay)
593
+
594
+ return optimizer
595
+
596
+
597
+ def build_lr_scheduler(optimizer, args: Namespace, total_epochs: List[int] = None):
598
+ """
599
+ Builds a learning rate scheduler.
600
+
601
+ :param optimizer: The Optimizer whose learning rate will be scheduled.
602
+ :param args: Arguments.
603
+ :param total_epochs: The total number of epochs for which the model will be trained.
604
+ :return: An initialized learning rate scheduler.
605
+ """
606
+
607
+ # Learning rate scheduler
608
+ # Divide the parameters into two groups for finetuning.
609
+ return NoamLR(
610
+ optimizer=optimizer,
611
+ warmup_epochs=args.warmup_epochs,
612
+ total_epochs=args.epochs,
613
+ steps_per_epoch=args.train_data_size // args.batch_size,
614
+ init_lr=args.init_lr,
615
+ max_lr=args.max_lr,
616
+ final_lr=args.final_lr,
617
+ fine_tune_coff=args.fine_tune_coff
618
+ )
619
+
620
+
621
+ def create_logger(name: str, save_dir: str = None, quiet: bool = False) -> logging.Logger:
622
+ """
623
+ Creates a logger with a stream handler and two file handlers.
624
+
625
+ The stream handler prints to the screen depending on the value of `quiet`.
626
+ One file handler (verbose.log) saves all logs, the other (quiet.log) only saves important info.
627
+
628
+ :param name: The name of the logger.
629
+ :param save_dir: The directory in which to save the logs.
630
+ :param quiet: Whether the stream handler should be quiet (i.e. print only important info).
631
+ :return: The logger.
632
+ """
633
+ logger = logging.getLogger(name)
634
+ logger.setLevel(logging.DEBUG)
635
+ logger.propagate = False
636
+
637
+ # Set logger depending on desired verbosity
638
+ ch = logging.StreamHandler()
639
+ if quiet:
640
+ ch.setLevel(logging.INFO)
641
+ else:
642
+ ch.setLevel(logging.DEBUG)
643
+ logger.addHandler(ch)
644
+
645
+ if save_dir is not None:
646
+ makedirs(save_dir)
647
+ fh_v = logging.FileHandler(os.path.join(save_dir, 'verbose.log'))
648
+ fh_v.setLevel(logging.DEBUG)
649
+ fh_q = logging.FileHandler(os.path.join(save_dir, 'quiet.log'))
650
+ fh_q.setLevel(logging.INFO)
651
+
652
+ logger.addHandler(fh_v)
653
+ logger.addHandler(fh_q)
654
+
655
+ return logger
656
+
657
+
658
+ def load_checkpoint(path: str,
659
+ current_args: Namespace = None,
660
+ cuda: bool = None,
661
+ logger: logging.Logger = None):
662
+ """
663
+ Loads a model checkpoint.
664
+
665
+ :param path: Path where checkpoint is saved.
666
+ :param current_args: The current arguments. Replaces the arguments loaded from the checkpoint if provided.
667
+ :param cuda: Whether to move model to cuda.
668
+ :param logger: A logger.
669
+ :return: The loaded MPNN.
670
+ """
671
+ debug = logger.debug if logger is not None else print
672
+
673
+ # Load model and args
674
+ state = torch.load(path, map_location=lambda storage, loc: storage)
675
+ args, loaded_state_dict = state['args'], state['state_dict']
676
+ model_related_args = get_model_args()
677
+
678
+ if current_args is not None:
679
+ for key, value in vars(args).items():
680
+ if key in model_related_args:
681
+ setattr(current_args, key, value)
682
+ else:
683
+ current_args = args
684
+
685
+ # args.cuda = cuda if cuda is not None else args.cuda
686
+
687
+ # Build model
688
+ model = build_model(current_args)
689
+ model_state_dict = model.state_dict()
690
+
691
+ # Skip missing parameters and parameters of mismatched size
692
+ pretrained_state_dict = {}
693
+ for param_name in loaded_state_dict.keys():
694
+ new_param_name = param_name
695
+ if new_param_name not in model_state_dict:
696
+ debug(f'Pretrained parameter "{param_name}" cannot be found in model parameters.')
697
+ elif model_state_dict[new_param_name].shape != loaded_state_dict[param_name].shape:
698
+ debug(f'Pretrained parameter "{param_name}" '
699
+ f'of shape {loaded_state_dict[param_name].shape} does not match corresponding '
700
+ f'model parameter of shape {model_state_dict[new_param_name].shape}.')
701
+ else:
702
+ debug(f'Loading pretrained parameter "{param_name}".')
703
+ pretrained_state_dict[new_param_name] = loaded_state_dict[param_name]
704
+ # Load pretrained weights
705
+ model_state_dict.update(pretrained_state_dict)
706
+ model.load_state_dict(model_state_dict)
707
+
708
+ if cuda:
709
+ debug('Moving model to cuda')
710
+ model = model.cuda()
711
+
712
+ return model
713
+
714
+
715
+ def get_loss_func(args: Namespace, model=None):
716
+ """
717
+ Gets the loss function corresponding to a given dataset type.
718
+
719
+ :param args: Namespace containing the dataset type ("classification" or "regression").
720
+ :return: A PyTorch loss function.
721
+ """
722
+ if hasattr(model, "get_loss_func"):
723
+ return model.get_loss_func(args)
724
+ if args.dataset_type == 'classification':
725
+ return nn.BCEWithLogitsLoss(reduction='none')
726
+ if args.dataset_type == 'regression':
727
+ return nn.MSELoss(reduction='none')
728
+
729
+ raise ValueError(f'Dataset type "{args.dataset_type}" not supported.')
730
+
731
+
732
+ def load_scalars(path: str):
733
+ """
734
+ Loads the scalars a model was trained with.
735
+
736
+ :param path: Path where model checkpoint is saved.
737
+ :return: A tuple with the data scaler and the features scaler.
738
+ """
739
+ state = torch.load(path, map_location=lambda storage, loc: storage)
740
+
741
+ scaler = StandardScaler(state['data_scaler']['means'],
742
+ state['data_scaler']['stds']) if state['data_scaler'] is not None else None
743
+ features_scaler = StandardScaler(state['features_scaler']['means'],
744
+ state['features_scaler']['stds'],
745
+ replace_nan_token=0) if state['features_scaler'] is not None else None
746
+
747
+ return scaler, features_scaler
748
+
749
+
750
+ def save_checkpoint(path: str,
751
+ model,
752
+ scaler,
753
+ features_scaler,
754
+ args: Namespace = None):
755
+ """
756
+ Saves a model checkpoint.
757
+
758
+ :param model: A MPNN.
759
+ :param scaler: A StandardScaler fitted on the data.
760
+ :param features_scaler: A StandardScaler fitted on the features.
761
+ :param args: Arguments namespace.
762
+ :param path: Path where checkpoint will be saved.
763
+ """
764
+ state = {
765
+ 'args': args,
766
+ 'state_dict': model.state_dict(),
767
+ 'data_scaler': {
768
+ 'means': scaler.means,
769
+ 'stds': scaler.stds
770
+ } if scaler is not None else None,
771
+ 'features_scaler': {
772
+ 'means': features_scaler.means,
773
+ 'stds': features_scaler.stds
774
+ } if features_scaler is not None else None
775
+ }
776
+ torch.save(state, path)
777
+
778
+
779
+ def build_model(args: Namespace, model_idx=0):
780
+ """
781
+ Builds an MPNN, i.e. a message passing neural network followed by feed-forward layers.
782
+
783
+ :param args: Arguments.
784
+ :return: A MPNN containing the MPN encoder along with final linear layers with parameters initialized.
785
+ """
786
+ if hasattr(args, 'num_tasks'):
787
+ args.output_size = args.num_tasks
788
+ else:
789
+ args.output_size = 1
790
+
791
+ if args.parser_name == "fingerprint":
792
+ model = GroverFpGeneration(args)
793
+ else:
794
+ # finetune and evaluation case.
795
+ model = GroverFinetuneTask(args)
796
+ initialize_weights(model=model, model_idx=model_idx)
797
+ return model
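
A minimal usage sketch of the checkpoint/optimizer helpers above may be useful. Everything below is illustrative: the hyperparameter values are assumptions rather than values used in this repository, and the model is expected to be a GroverFinetuneTask (any other nn.Module takes the plain-Adam path).

    from argparse import Namespace

    import torch.nn as nn

    from grover.util.utils import build_lr_scheduler, build_optimizer, get_loss_func

    def build_training_objects(model: nn.Module, train_data_size: int):
        # All hyperparameter values here are illustrative placeholders.
        args = Namespace(dataset_type='classification',      # -> BCEWithLogitsLoss
                         init_lr=1e-4, max_lr=1e-3, final_lr=1e-4,
                         weight_decay=1e-7,
                         fine_tune_coff=1.0,                  # 0 would freeze the GROVER backbone
                         warmup_epochs=2, epochs=30, batch_size=32,
                         train_data_size=train_data_size)
        loss_func = get_loss_func(args, model)                # per-element loss, masked by the caller
        optimizer = build_optimizer(model, args)              # separate LR group for the FFN head
        scheduler = build_lr_scheduler(optimizer, args)       # NoamLR warmup/decay schedule
        return loss_func, optimizer, scheduler
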
prepare_data.py CHANGED
@@ -17,4 +17,5 @@ val_path = "./tox21/tox21_validation.csv"
17
  train_path_clean = "./tox21/tox21_train_clean.csv"
18
  val_path_clean = "./tox21/tox21_validation_clean.csv"
19
 
20
- prepare_data(test_path, test_path_clean, "./tox21/valid_mask_test.npy")
 
 
17
  train_path_clean = "./tox21/tox21_train_clean.csv"
18
  val_path_clean = "./tox21/tox21_validation_clean.csv"
19
 
20
+ prepare_data(train_path, train_path_clean, "./tox21/valid_mask_train.npy")
21
+ prepare_data(val_path, val_path_clean, "./tox21/valid_mask_val.npy")
requirements.txt ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py @ file:///home/conda/feedstock_root/build_artifacts/absl-py_1751547525079/work
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.13.2
4
+ aiosignal==1.4.0
5
+ anyio==4.12.0
6
+ async-timeout==5.0.1
7
+ attrs==25.4.0
8
+ Brotli @ file:///home/conda/feedstock_root/build_artifacts/brotli-split_1749229842835/work
9
+ certifi @ file:///home/conda/feedstock_root/build_artifacts/certifi_1754231422783/work/certifi
10
+ cffi @ file:///home/conda/feedstock_root/build_artifacts/cffi_1725571112467/work
11
+ charset-normalizer @ file:///home/conda/feedstock_root/build_artifacts/charset-normalizer_1754767332901/work
12
+ click==8.1.8
13
+ colorama @ file:///home/conda/feedstock_root/build_artifacts/colorama_1733218098505/work
14
+ datasets==4.4.1
15
+ descriptastorus==2.8.0
16
+ dill==0.4.0
17
+ exceptiongroup==1.3.1
18
+ filelock @ file:///home/conda/feedstock_root/build_artifacts/filelock_1755216263872/work
19
+ frozenlist==1.8.0
20
+ fsspec==2025.10.0
21
+ git-filter-repo @ file:///home/conda/feedstock_root/build_artifacts/git-filter-repo_1735551402582/work
22
+ gmpy2 @ file:///home/conda/feedstock_root/build_artifacts/gmpy2_1745509363867/work
23
+ grpcio @ file:///home/conda/feedstock_root/build_artifacts/grpc-split_1754634529307/work
24
+ h11==0.16.0
25
+ h2 @ file:///home/conda/feedstock_root/build_artifacts/h2_1738578511449/work
26
+ hf-xet==1.2.0
27
+ hpack @ file:///home/conda/feedstock_root/build_artifacts/hpack_1737618293087/work
28
+ httpcore==1.0.9
29
+ httpx==0.28.1
30
+ huggingface_hub==1.1.7
31
+ hyperframe @ file:///home/conda/feedstock_root/build_artifacts/hyperframe_1737618333194/work
32
+ idna @ file:///home/conda/feedstock_root/build_artifacts/idna_1733211830134/work
33
+ importlib_metadata @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_importlib-metadata_1747934053/work
34
+ Jinja2 @ file:///home/conda/feedstock_root/build_artifacts/jinja2_1741263328855/work
35
+ joblib @ file:///home/conda/feedstock_root/build_artifacts/joblib_1748019130050/work
36
+ Markdown @ file:///home/conda/feedstock_root/build_artifacts/markdown_1750360292101/work
37
+ MarkupSafe @ file:///home/conda/feedstock_root/build_artifacts/markupsafe_1733219680183/work
38
+ mpmath @ file:///home/conda/feedstock_root/build_artifacts/mpmath_1733302684489/work
39
+ multidict==6.7.0
40
+ multiprocess==0.70.18
41
+ networkx @ file:///home/conda/feedstock_root/build_artifacts/networkx_1698504735452/work
42
+ numpy @ file:///home/conda/feedstock_root/build_artifacts/numpy_1707225342954/work/dist/numpy-1.26.4-cp39-cp39-linux_x86_64.whl#sha256=c799942b5898f6e6c60264d1663a6469a475290e758c654aeeb78e2596463abd
43
+ packaging @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_packaging_1745345660/work
44
+ pandas @ file:///home/conda/feedstock_root/build_artifacts/pandas_1752081702369/work
45
+ pandas_flavor==0.7.0
46
+ pillow @ file:///home/conda/feedstock_root/build_artifacts/pillow_1751482006338/work
47
+ propcache==0.4.1
48
+ protobuf @ file:///home/conda/feedstock_root/build_artifacts/protobuf_1751668301193/work/bazel-bin/python/dist/protobuf-6.31.1-cp39-abi3-linux_x86_64.whl#sha256=91a4a00a210b50fbca2de99b20633990d9f00a443829a9badc867ec313e0fecc
49
+ pyarrow==21.0.0
50
+ pycairo==1.28.0
51
+ pycparser @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_pycparser_1733195786/work
52
+ PySocks @ file:///home/conda/feedstock_root/build_artifacts/pysocks_1733217236728/work
53
+ python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_python-dateutil_1751104122/work
54
+ pytz @ file:///home/conda/feedstock_root/build_artifacts/pytz_1742920838005/work
55
+ PyYAML @ file:///home/conda/feedstock_root/build_artifacts/pyyaml_1737454647378/work
56
+ rdkit==2025.9.1
57
+ rdkit-pypi==2022.9.5
58
+ requests @ file:///home/conda/feedstock_root/build_artifacts/requests_1755614211359/work
59
+ scikit-learn @ file:///home/conda/feedstock_root/build_artifacts/scikit-learn_1736496755362/work/dist/scikit_learn-1.6.1-cp39-cp39-linux_x86_64.whl#sha256=e8f978e37bb47e04e1337a63f75697b723d6d25f58e477734555faed033884ba
60
+ scipy==1.10.1
61
+ shellingham==1.5.4
62
+ six @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_six_1753199211/work
63
+ sympy @ file:///home/conda/feedstock_root/build_artifacts/sympy_1745946051654/work
64
+ tensorboard @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_tensorboard_1752825441/work/tensorboard-2.20.0-py3-none-any.whl#sha256=9dc9f978cb84c0723acf9a345d96c184f0293d18f166bb8d59ee098e6cfaaba6
65
+ tensorboard_data_server @ file:///home/conda/feedstock_root/build_artifacts/tensorboard-data-server_1728639721704/work/tensorboard_data_server-0.7.0-py3-none-manylinux2014_x86_64.whl#sha256=3b7dc7cf17b685028f955453a839cc9b2871818de53e7911eae158fe6b3a80cf
66
+ threadpoolctl @ file:///home/conda/feedstock_root/build_artifacts/threadpoolctl_1741878222898/work
67
+ torch==2.4.0
68
+ torchvision==0.19.0
69
+ tqdm @ file:///home/conda/feedstock_root/build_artifacts/tqdm_1735661334605/work
70
+ triton==3.0.0
71
+ typer-slim==0.20.0
72
+ typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_typing_extensions_1751643513/work
73
+ tzdata @ file:///home/conda/feedstock_root/build_artifacts/python-tzdata_1742745135198/work
74
+ urllib3 @ file:///home/conda/feedstock_root/build_artifacts/urllib3_1750271362675/work
75
+ Werkzeug @ file:///home/conda/feedstock_root/build_artifacts/werkzeug_1733160440960/work
76
+ xarray==2024.7.0
77
+ xxhash==3.6.0
78
+ yarl==1.22.0
79
+ zipp @ file:///home/conda/feedstock_root/build_artifacts/zipp_1749421620841/work
80
+ zstandard==0.23.0
81
+ fastapi
82
+ uvicorn[standard]
scripts/__init__.py ADDED
File without changes
scripts/build_vocab.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The vocabulary building scripts.
3
+ """
4
+ import os
5
+
6
+ from grover.data.torchvocab import MolVocab
7
+
8
+
9
+ def build():
10
+ import argparse
11
+ parser = argparse.ArgumentParser()
12
+ parser.add_argument('--data_path', default="../../dataset/grover_new_dataset/druglike_merged_refine2.csv", type=str)
13
+ parser.add_argument('--vocab_save_folder', default="../../dataset/grover_new_dataset", type=str)
14
+ parser.add_argument('--dataset_name', type=str, default=None,
15
+ help="Will be the first part of the vocab file name. If it is None,"
16
+ "the vocab files will be: atom_vocab.pkl and bond_vocab.pkl")
17
+ parser.add_argument('--vocab_max_size', type=int, default=None)
18
+ parser.add_argument('--vocab_min_freq', type=int, default=1)
19
+ args = parser.parse_args()
20
+
21
+ # fin = open(args.data_path, 'r')
22
+ # lines = fin.readlines()
23
+
24
+ for vocab_type in ['atom', 'bond']:
25
+ vocab_file = f"{vocab_type}_vocab.pkl"
26
+ if args.dataset_name is not None:
27
+ vocab_file = args.dataset_name + '_' + vocab_file
28
+ vocab_save_path = os.path.join(args.vocab_save_folder, vocab_file)
29
+
30
+ os.makedirs(os.path.dirname(vocab_save_path), exist_ok=True)
31
+ vocab = MolVocab(file_path=args.data_path,
32
+ max_size=args.vocab_max_size,
33
+ min_freq=args.vocab_min_freq,
34
+ num_workers=100,
35
+ vocab_type=vocab_type)
36
+ print(f"{vocab_type} vocab size", len(vocab))
37
+ vocab.save_vocab(vocab_save_path)
38
+
39
+
40
+ if __name__ == '__main__':
41
+ build()
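
The same vocabulary construction can also be driven from Python. A sketch, assuming the Tox21 training CSV used elsewhere in this repository (the paths, worker count and output location are illustrative):

    from grover.data.torchvocab import MolVocab

    # Build and save the atom-level vocabulary; the bond vocabulary works the same way
    # with vocab_type="bond".
    atom_vocab = MolVocab(file_path="./tox21/tox21_train_clean.csv",
                          max_size=None, min_freq=1,
                          num_workers=4, vocab_type="atom")
    print("atom vocab size:", len(atom_vocab))
    atom_vocab.save_vocab("./tox21/atom_vocab.pkl")
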
scripts/save_features.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Computes and saves molecular features for a dataset.
3
+ """
4
+ import os
5
+ import shutil
6
+ import sys
7
+ from argparse import ArgumentParser, Namespace
8
+ from multiprocessing import Pool
9
+ from typing import List, Tuple
10
+
11
+ from tqdm import tqdm
12
+
13
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
14
+
15
+ from grover.util.utils import get_data, makedirs, load_features, save_features
16
+ from grover.data.molfeaturegenerator import get_available_features_generators, \
17
+ get_features_generator
18
+ from grover.data.task_labels import rdkit_functional_group_label_features_generator
19
+
20
+
21
+
22
+ def load_temp(temp_dir: str) -> Tuple[List[List[float]], int]:
23
+ """
24
+ Loads all features saved as .npz files in load_dir.
25
+
26
+ Assumes temporary files are named in order 0.npz, 1.npz, ...
27
+
28
+ :param temp_dir: Directory in which temporary .npz files containing features are stored.
29
+ :return: A tuple with a list of molecule features, where each molecule's features is a list of floats,
30
+ and the number of temporary files.
31
+ """
32
+ features = []
33
+ temp_num = 0
34
+ temp_path = os.path.join(temp_dir, f'{temp_num}.npz')
35
+
36
+ while os.path.exists(temp_path):
37
+ features.extend(load_features(temp_path))
38
+ temp_num += 1
39
+ temp_path = os.path.join(temp_dir, f'{temp_num}.npz')
40
+
41
+ return features, temp_num
42
+
43
+
44
+ def generate_and_save_features(args: Namespace):
45
+ """
46
+ Computes and saves features for a dataset of molecules as a 2D array in a .npz file.
47
+
48
+ :param args: Arguments.
49
+ """
50
+ # Create directory for save_path
51
+ makedirs(args.save_path, isfile=True)
52
+
53
+ # Get data and features function
54
+ data = get_data(path=args.data_path, max_data_size=None)
55
+ features_generator = get_features_generator(args.features_generator)
56
+ temp_save_dir = args.save_path + '_temp'
57
+
58
+ # Load partially complete data
59
+ if args.restart:
60
+ if os.path.exists(args.save_path):
61
+ os.remove(args.save_path)
62
+ if os.path.exists(temp_save_dir):
63
+ shutil.rmtree(temp_save_dir)
64
+ else:
65
+ if os.path.exists(args.save_path):
66
+ raise ValueError(f'"{args.save_path}" already exists and args.restart is False.')
67
+
68
+ if os.path.exists(temp_save_dir):
69
+ features, temp_num = load_temp(temp_save_dir)
70
+
71
+ if not os.path.exists(temp_save_dir):
72
+ makedirs(temp_save_dir)
73
+ features, temp_num = [], 0
74
+
75
+ # Build features map function
76
+ data = data[len(features):] # restrict to data for which features have not been computed yet
77
+ mols = (d.smiles for d in data)
78
+
79
+ if args.sequential:
80
+ features_map = map(features_generator, mols)
81
+ else:
82
+ features_map = Pool(30).imap(features_generator, mols)
83
+
84
+ # Get features
85
+ temp_features = []
86
+ for i, feats in tqdm(enumerate(features_map), total=len(data)):
87
+ temp_features.append(feats)
88
+
89
+ # Save temporary features every save_frequency
90
+ if (i > 0 and (i + 1) % args.save_frequency == 0) or i == len(data) - 1:
91
+ save_features(os.path.join(temp_save_dir, f'{temp_num}.npz'), temp_features)
92
+ features.extend(temp_features)
93
+ temp_features = []
94
+ temp_num += 1
95
+
96
+ try:
97
+ # Save all features
98
+ save_features(args.save_path, features)
99
+
100
+ # Remove temporary features
101
+ shutil.rmtree(temp_save_dir)
102
+ except OverflowError:
103
+ print('Features array is too large to save as a single file. Instead keeping features as a directory of files.')
104
+
105
+
106
+ if __name__ == '__main__':
107
+
108
+ parser = ArgumentParser()
109
+ parser.add_argument('--data_path', type=str, required=True,
110
+ help='Path to data CSV')
111
+ parser.add_argument('--features_generator', type=str, required=True,
112
+ choices=get_available_features_generators(),
113
+ help='Type of features to generate')
114
+ parser.add_argument('--save_path', type=str, default=None,
115
+ help='Path to .npz file where features will be saved as a compressed numpy archive')
116
+ parser.add_argument('--save_frequency', type=int, default=10000,
117
+ help='Frequency with which to save the features')
118
+ parser.add_argument('--restart', action='store_true', default=False,
119
+ help='Whether to not load partially complete featurization and instead start from scratch')
120
+ parser.add_argument('--max_data_size', type=int,
121
+ help='Maximum number of data points to load')
122
+ parser.add_argument('--sequential', action='store_true', default=False,
123
+ help='Whether to run the featurization sequentially rather than in parallel')
124
+ args = parser.parse_args()
125
+ if args.save_path is None:
126
+ args.save_path = args.data_path.split('csv')[0] + 'npz'
127
+ generate_and_save_features(args)
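
For reference, a hedged sketch of invoking the featurization script above from Python. The generator name 'rdkit_2d_normalized' is an assumption (the valid names come from get_available_features_generators()), and the Tox21 paths mirror the ones used by generate_features.py:

    import subprocess

    subprocess.run([
        "python", "scripts/save_features.py",
        "--data_path", "./tox21/tox21_train_clean.csv",
        "--features_generator", "rdkit_2d_normalized",   # assumed generator name
        "--save_path", "./tox21/tox21_train_clean.npz",
        "--restart",
    ], check=True)
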
scripts/split_data.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The data splitting script for pretraining.
3
+ """
4
+ import os
5
+ from argparse import ArgumentParser
6
+ import csv
7
+ import shutil
8
+ import numpy as np
9
+
10
+
11
+ import grover.util.utils as fea_utils
12
+
13
+
14
+ parser = ArgumentParser()
15
+ parser.add_argument("--data_path", default="../drug_data/grover_data/delaneyfreesolvlipo.csv")
16
+ parser.add_argument("--features_path", default="../drug_data/grover_data/delaneyfreesolvlipo_molbert.npz")
17
+ parser.add_argument("--sample_per_file", type=int, default=1000)
18
+ parser.add_argument("--output_path", default="../drug_data/grover_data/delaneyfreesolvlipo")
19
+
20
+
21
+ def load_smiles(data_path):
22
+ with open(data_path) as f:
23
+ reader = csv.reader(f)
24
+ header = next(reader)
25
+ res = []
26
+ for line in reader:
27
+ res.append(line)
28
+ return res, header
29
+
30
+
31
+ def load_features(data_path):
32
+ fea = fea_utils.load_features(data_path)
33
+ return fea
34
+
35
+
36
+ def save_smiles(data_path, index, data, header):
37
+ fn = os.path.join(data_path, str(index) + ".csv")
38
+ with open(fn, "w") as f:
39
+ fw = csv.writer(f)
40
+ fw.writerow(header)
41
+ for d in data:
42
+ fw.writerow(d)
43
+
44
+
45
+ def save_features(data_path, index, data):
46
+ fn = os.path.join(data_path, str(index) + ".npz")
47
+ np.savez_compressed(fn, features=data)
48
+
49
+
50
+ def run():
51
+ args = parser.parse_args()
52
+ res, header = load_smiles(data_path=args.data_path)
53
+ fea = load_features(data_path=args.features_path)
54
+ assert len(res) == fea.shape[0]
55
+
56
+ n_graphs = len(res)
57
+ perm = np.random.permutation(n_graphs)
58
+
59
+ nfold = int(n_graphs / args.sample_per_file + 1)
60
+ print("Number of files: %d" % nfold)
61
+ if os.path.exists(args.output_path):
62
+ shutil.rmtree(args.output_path)
63
+ os.makedirs(args.output_path, exist_ok=True)
64
+ graph_path = os.path.join(args.output_path, "graph")
65
+ fea_path = os.path.join(args.output_path, "feature")
66
+ os.makedirs(graph_path, exist_ok=True)
67
+ os.makedirs(fea_path, exist_ok=True)
68
+
69
+ for i in range(nfold):
70
+ sidx = i * args.sample_per_file
71
+ eidx = min((i + 1) * args.sample_per_file, n_graphs)
72
+ indexes = perm[sidx:eidx]
73
+ sres = [res[j] for j in indexes]
74
+ sfea = fea[indexes]
75
+ save_smiles(graph_path, i, sres, header)
76
+ save_features(fea_path, i, sfea)
77
+
78
+ summary_path = os.path.join(args.output_path, "summary.txt")
79
+ summary_fout = open(summary_path, 'w')
80
+ summary_fout.write("n_files:%d\n" % nfold)
81
+ summary_fout.write("n_samples:%d\n" % n_graphs)
82
+ summary_fout.write("sample_per_file:%d\n" % args.sample_per_file)
83
+ summary_fout.close()
84
+
85
+
86
+ if __name__ == "__main__":
87
+ run()
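
A sketch of how the pretraining split script above would typically be invoked; all paths and the shard size are illustrative. The script writes graph/<i>.csv and feature/<i>.npz shards plus a summary.txt into the output folder:

    import subprocess

    subprocess.run([
        "python", "scripts/split_data.py",
        "--data_path", "./pretrain/molecules.csv",       # CSV with a header row and SMILES
        "--features_path", "./pretrain/molecules.npz",   # features aligned row-for-row with the CSV
        "--sample_per_file", "1000",
        "--output_path", "./pretrain/molecules_split",
    ], check=True)
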
src/commands.py CHANGED
@@ -16,6 +16,16 @@ def generate_features(data_path, save_path):
16
  f"--restart"
17
  )
18
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  def finetune(train_path, val_path, train_features_path, val_features_path,
21
  save_dir, checkpoint_path, args
 
16
  f"--restart"
17
  )
18
 
19
+ def predict_from_csv(data_path, features_path, checkpoint_dir, output_path):
20
+ predict_cmd = (
21
+ f"python main.py predict "
22
+ f"--data_path {data_path} "
23
+ f"--features_path {features_path} "
24
+ f"--checkpoint_dir {checkpoint_dir} "
25
+ f"--no_features_scaling "
26
+ f"--output {output_path}"
27
+ )
28
+
29
 
30
  def finetune(train_path, val_path, train_features_path, val_features_path,
31
  save_dir, checkpoint_path, args
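
Note that predict_from_csv above only assembles the command string; the diff does not show it being executed or returned. A hedged sketch of how it would presumably be used, with the os.system call and the Tox21 paths being assumptions:

    import os

    predict_cmd = (
        "python main.py predict "
        "--data_path ./tox21/tox21_validation_clean.csv "
        "--features_path ./tox21/tox21_validation_clean.npz "
        "--checkpoint_dir ./model "
        "--no_features_scaling "
        "--output ./tox21/predictions.csv"
    )
    os.system(predict_cmd)  # assumed execution step; predict_from_csv itself does not run the command
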
task/__init__.py ADDED
File without changes
task/cross_validate.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The cross validation function for finetuning.
3
+ This implementation is adapted from
4
+ https://github.com/chemprop/chemprop/blob/master/chemprop/train/cross_validate.py
5
+ """
6
+ import os
7
+ import time
8
+ from argparse import Namespace
9
+ from logging import Logger
10
+ from typing import Tuple
11
+
12
+ import numpy as np
13
+
14
+ from grover.util.utils import get_task_names
15
+ from grover.util.utils import makedirs
16
+ from task.run_evaluation import run_evaluation
17
+ from task.train import run_training
18
+
19
+
20
+ def cross_validate(args: Namespace, logger: Logger = None) -> Tuple[float, float]:
21
+ """
22
+ k-fold cross validation.
23
+
24
+ :return: A tuple of mean_score and std_score.
25
+ """
26
+ info = logger.info if logger is not None else print
27
+
28
+ # Initialize relevant variables
29
+ init_seed = args.seed
30
+ save_dir = args.save_dir
31
+ task_names = get_task_names(args.data_path)
32
+
33
+ # Run training with different random seeds for each fold
34
+ all_scores = []
35
+ time_start = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
36
+ for fold_num in range(args.num_folds):
37
+ info(f'Fold {fold_num}')
38
+ args.seed = init_seed + fold_num
39
+ args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
40
+ makedirs(args.save_dir)
41
+ if args.parser_name == "finetune":
42
+ model_scores = run_training(args, time_start, logger)
43
+ else:
44
+ model_scores = run_evaluation(args, logger)
45
+ all_scores.append(model_scores)
46
+ all_scores = np.array(all_scores)
47
+
48
+ # Report scores for each fold
49
+ info(f'{args.num_folds}-fold cross validation')
50
+
51
+ for fold_num, scores in enumerate(all_scores):
52
+ info(f'Seed {init_seed + fold_num} ==> test {args.metric} = {np.nanmean(scores):.6f}')
53
+
54
+ if args.show_individual_scores:
55
+ for task_name, score in zip(task_names, scores):
56
+ info(f'Seed {init_seed + fold_num} ==> test {task_name} {args.metric} = {score:.6f}')
57
+
58
+ # Report scores across models
59
+ avg_scores = np.nanmean(all_scores, axis=1) # average score for each model across tasks
60
+ mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
61
+ info(f'overall_{args.split_type}_test_{args.metric}={mean_score:.6f}')
62
+ info(f'std={std_score:.6f}')
63
+
64
+ if args.show_individual_scores:
65
+ for task_num, task_name in enumerate(task_names):
66
+ info(f'Overall test {task_name} {args.metric} = '
67
+ f'{np.nanmean(all_scores[:, task_num]):.6f} +/- {np.nanstd(all_scores[:, task_num]):.6f}')
68
+
69
+ return mean_score, std_score
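
A usage sketch for cross_validate. The Namespace below lists only the fields this function reads directly (the metric name "auc" is an assumption); run_training / run_evaluation expect the full finetuning configuration on top of these:

    from argparse import Namespace

    from task.cross_validate import cross_validate

    args = Namespace(parser_name="finetune",
                     data_path="./tox21/tox21_train_clean.csv",
                     save_dir="./finetune_runs",
                     num_folds=3, seed=0,
                     metric="auc", split_type="scaffold_balanced",
                     show_individual_scores=False)
    mean_score, std_score = cross_validate(args)
    print(f"{args.metric}: {mean_score:.4f} +/- {std_score:.4f}")
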
task/fingerprint.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The fingerprint generation function.
3
+ """
4
+ from argparse import Namespace
5
+ from logging import Logger
6
+ from typing import List
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ from torch.utils.data import DataLoader
11
+
12
+ from grover.data import MolCollator
13
+ from grover.data import MoleculeDataset
14
+ from grover.util.utils import get_data, create_logger, load_checkpoint
15
+
16
+
17
+ def do_generate(model: nn.Module,
18
+ data: MoleculeDataset,
19
+ args: Namespace,
20
+ ) -> List[List[float]]:
21
+ """
22
+ Do the fingerprint generation on a dataset using the pre-trained models.
23
+
24
+ :param model: A model.
25
+ :param data: A MoleculeDataset.
26
+ :param args: Arguments.
27
+ :return: A list of fingerprints.
28
+ """
29
+ model.eval()
30
+ args.bond_drop_rate = 0
31
+ preds = []
32
+
33
+ mol_collator = MolCollator(args=args, shared_dict={})
34
+
35
+ num_workers = 4
36
+ mol_loader = DataLoader(data,
37
+ batch_size=32,
38
+ shuffle=False,
39
+ num_workers=num_workers,
40
+ collate_fn=mol_collator)
41
+ for item in mol_loader:
42
+ _, batch, features_batch, _, _ = item
43
+ with torch.no_grad():
44
+ batch_preds = model(batch, features_batch)
45
+ preds.extend(batch_preds.data.cpu().numpy())
46
+ return preds
47
+
48
+
49
+ def generate_fingerprints(args: Namespace, logger: Logger = None) -> List[List[float]]:
50
+ """
51
+ Generate the fingerprints.
52
+
53
+ :param logger:
54
+ :param args: Arguments.
55
+ :return: A list of lists of target fingerprints.
56
+ """
57
+
58
+ checkpoint_path = args.checkpoint_paths[0]
59
+ if logger is None:
60
+ logger = create_logger('fingerprints', quiet=False)
61
+ print('Loading data')
62
+ test_data = get_data(path=args.data_path,
63
+ args=args,
64
+ use_compound_names=False,
65
+ max_data_size=float("inf"),
66
+ skip_invalid_smiles=False)
67
+ test_data = MoleculeDataset(test_data)
68
+
69
+ logger.info(f'Total size = {len(test_data):,}')
70
+ logger.info(f'Generating...')
71
+ # Load model
72
+ model = load_checkpoint(checkpoint_path, cuda=args.cuda, current_args=args, logger=logger)
73
+ model_preds = do_generate(
74
+ model=model,
75
+ data=test_data,
76
+ args=args
77
+ )
78
+
79
+ return model_preds
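
A sketch of extracting GROVER fingerprints with the function above, assuming a pretrained checkpoint on disk. Only the fields used directly in this file are shown; get_data and load_checkpoint read further options from args and from the checkpoint itself:

    from argparse import Namespace

    from task.fingerprint import generate_fingerprints

    args = Namespace(parser_name="fingerprint",                     # selects GroverFpGeneration in build_model
                     checkpoint_paths=["./model/grover_large.pt"],  # assumed checkpoint location
                     data_path="./tox21/tox21_validation_clean.csv",
                     cuda=False)
    fingerprints = generate_fingerprints(args)                      # one embedding vector per molecule
    print(len(fingerprints), len(fingerprints[0]))
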
task/grovertrainer.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The GROVER trainer.
3
+ """
4
+ import os
5
+ import time
6
+ from logging import Logger
7
+ from typing import List, Tuple
8
+ from collections.abc import Callable
9
+ import torch
10
+ from torch.nn import Module
11
+ from torch.utils.data import DataLoader
12
+
13
+ from grover.model.models import GroverTask
14
+ from grover.util.multi_gpu_wrapper import MultiGpuWrapper as mgw
15
+
16
+
17
+ class GROVERTrainer:
18
+ def __init__(self,
19
+ args,
20
+ embedding_model: Module,
21
+ atom_vocab_size: int, # atom vocab size
22
+ bond_vocab_size: int,
23
+ fg_szie: int,
24
+ train_dataloader: DataLoader,
25
+ test_dataloader: DataLoader,
26
+ optimizer_builder: Callable,
27
+ scheduler_builder: Callable,
28
+ logger: Logger = None,
29
+ with_cuda: bool = False,
30
+ enable_multi_gpu: bool = False):
31
+ """
32
+ The init function of GROVERTrainer
33
+ :param args: the input arguments.
34
+ :param embedding_model: the model to generate atom/bond embeddings.
35
+ :param atom_vocab_size: the vocabulary size of atoms.
36
+ :param bond_vocab_size: the vocabulary size of bonds.
37
+ :param fg_szie: the size of semantic motifs (functional groups)
38
+ :param train_dataloader: the data loader of train data.
39
+ :param test_dataloader: the data loader of validation data.
40
+ :param optimizer_builder: the function of building the optimizer.
41
+ :param scheduler_builder: the function of building the scheduler.
42
+ :param logger: the logger
43
+ :param with_cuda: enable gpu training.
44
+ :param enable_multi_gpu: enable multi-GPU training.
45
+ """
46
+
47
+ self.args = args
48
+ self.with_cuda = with_cuda
49
+ self.grover = embedding_model
50
+ self.model = GroverTask(args, embedding_model, atom_vocab_size, bond_vocab_size, fg_szie)
51
+ self.loss_func = self.model.get_loss_func(args)
52
+ self.enable_multi_gpu = enable_multi_gpu
53
+
54
+ self.atom_vocab_size = atom_vocab_size
55
+ self.bond_vocab_size = bond_vocab_size
56
+ self.debug = logger.debug if logger is not None else print
57
+
58
+ if self.with_cuda:
59
+ # print("Using %d GPUs for training." % (torch.cuda.device_count()))
60
+ self.model = self.model.cuda()
61
+
62
+ self.train_data = train_dataloader
63
+ self.test_data = test_dataloader
64
+
65
+ self.optimizer = optimizer_builder(self.model, self.args)
66
+ self.scheduler = scheduler_builder(self.optimizer, self.args)
67
+ if self.enable_multi_gpu:
68
+ self.optimizer = mgw.DistributedOptimizer(self.optimizer,
69
+ named_parameters=self.model.named_parameters())
70
+ self.args = args
71
+ self.n_iter = 0
72
+
73
+ def broadcast_parameters(self) -> None:
74
+ """
75
+ Broadcast parameters before training.
76
+ :return: no return.
77
+ """
78
+ if self.enable_multi_gpu:
79
+ # broadcast parameters & optimizer state.
80
+ mgw.broadcast_parameters(self.model.state_dict(), root_rank=0)
81
+ mgw.broadcast_optimizer_state(self.optimizer, root_rank=0)
82
+
83
+ def train(self, epoch: int) -> List:
84
+ """
85
+ The training iteration
86
+ :param epoch: the current epoch number.
87
+ :return: the loss terms of current epoch.
88
+ """
89
+ # return self.mock_iter(epoch, self.train_data, train=True)
90
+ return self.iter(epoch, self.train_data, train=True)
91
+
92
+ def test(self, epoch: int) -> List:
93
+ """
94
+ The test/validation iteration.
95
+ :param epoch: the current epoch number.
96
+ :return: the loss terms as a list
97
+ """
98
+ # return self.mock_iter(epoch, self.test_data, train=False)
99
+ return self.iter(epoch, self.test_data, train=False)
100
+
101
+ def mock_iter(self, epoch: int, data_loader: DataLoader, train: bool = True) -> List:
102
+ """
103
+ Perform a mock iteration. For test only.
104
+ :param epoch: the current epoch number.
105
+ :param data_loader: the data loader.
106
+ :param train: True for a training pass, False for a validation pass.
107
+ :return: the loss terms as a list
108
+ """
109
+
110
+ for _, _ in enumerate(data_loader):
111
+ self.scheduler.step()
112
+ cum_loss_sum = 0.0
113
+ self.n_iter += self.args.batch_size
114
+ return self.n_iter, cum_loss_sum, (0, 0, 0, 0, 0, 0)
115
+
116
+ def iter(self, epoch, data_loader, train=True) -> List:
117
+ """
118
+ Perform a training / validation iteration.
119
+ :param epoch: the current epoch number.
120
+ :param data_loader: the data loader.
121
+ :param train: True for a training pass, False for a validation pass.
122
+ :return: the loss terms as a list
123
+ """
124
+
125
+ if train:
126
+ self.model.train()
127
+ else:
128
+ self.model.eval()
129
+
130
+ loss_sum, iter_count = 0, 0
131
+ cum_loss_sum, cum_iter_count = 0, 0
132
+ av_loss_sum, bv_loss_sum, fg_loss_sum, av_dist_loss_sum, bv_dist_loss_sum, fg_dist_loss_sum = 0, 0, 0, 0, 0, 0
133
+ # loss_func = self.model.get_loss_func(self.args)
134
+
135
+ for _, item in enumerate(data_loader):
136
+ batch_graph = item["graph_input"]
137
+ targets = item["targets"]
138
+
139
+ if next(self.model.parameters()).is_cuda:
140
+ targets["av_task"] = targets["av_task"].cuda()
141
+ targets["bv_task"] = targets["bv_task"].cuda()
142
+ targets["fg_task"] = targets["fg_task"].cuda()
143
+
144
+ preds = self.model(batch_graph)
145
+
146
+ # # ad-hoc code, for visualizing a model, comment this block when it is not needed
147
+ # import dglt.contrib.grover.vis_model as vis_model
148
+ # for task in ['av_task', 'bv_task', 'fg_task']:
149
+ # vis_graph = vis_model.make_dot(self.model(batch_graph)[task],
150
+ # params=dict(self.model.named_parameters()))
151
+ # # vis_graph.view()
152
+ # vis_graph.render(f"{self.args.backbone}_model_{task}_vis.png", format="png")
153
+ # exit()
154
+
155
+ loss, av_loss, bv_loss, fg_loss, av_dist_loss, bv_dist_loss, fg_dist_loss = self.loss_func(preds, targets)
156
+
157
+ loss_sum += loss.item()
158
+ iter_count += self.args.batch_size
159
+
160
+ if train:
161
+ cum_loss_sum += loss.item()
162
+ # Run model
163
+ self.model.zero_grad()
164
+ self.optimizer.zero_grad()
165
+ loss.backward()
166
+ self.optimizer.step()
167
+ self.scheduler.step()
168
+ else:
169
+ # For the eval pass, only accumulate the losses of the three pretraining tasks.
170
+ cum_loss_sum += av_loss.item()
171
+ cum_loss_sum += bv_loss.item()
172
+ cum_loss_sum += fg_loss.item()
173
+
174
+ av_loss_sum += av_loss.item()
175
+ bv_loss_sum += bv_loss.item()
176
+ fg_loss_sum += fg_loss.item()
177
+ av_dist_loss_sum += av_dist_loss.item() if type(av_dist_loss) != float else av_dist_loss
178
+ bv_dist_loss_sum += bv_dist_loss.item() if type(bv_dist_loss) != float else bv_dist_loss
179
+ fg_dist_loss_sum += fg_dist_loss.item() if type(fg_dist_loss) != float else fg_dist_loss
180
+
181
+ cum_iter_count += 1
182
+ self.n_iter += self.args.batch_size
183
+
184
+ # Debug only.
185
+ # if i % 50 == 0:
186
+ # print(f"epoch: {epoch}, batch_id: {i}, av_loss: {av_loss}, bv_loss: {bv_loss}, "
187
+ # f"fg_loss: {fg_loss}, av_dist_loss: {av_dist_loss}, bv_dist_loss: {bv_dist_loss}, "
188
+ # f"fg_dist_loss: {fg_dist_loss}")
189
+
190
+ cum_loss_sum /= cum_iter_count
191
+ av_loss_sum /= cum_iter_count
192
+ bv_loss_sum /= cum_iter_count
193
+ fg_loss_sum /= cum_iter_count
194
+ av_dist_loss_sum /= cum_iter_count
195
+ bv_dist_loss_sum /= cum_iter_count
196
+ fg_dist_loss_sum /= cum_iter_count
197
+
198
+ return self.n_iter, cum_loss_sum, (av_loss_sum, bv_loss_sum, fg_loss_sum, av_dist_loss_sum,
199
+ bv_dist_loss_sum, fg_dist_loss_sum)
200
+
201
+ def save(self, epoch, file_path, name=None) -> str:
202
+ """
203
+ Save the intermediate models during training.
204
+ :param epoch: the epoch number.
205
+ :param file_path: the file_path to save the model.
206
+ :return: the output path.
207
+ """
208
+ # add a timestamp to the model file name in order to distinguish different saved models
209
+ now = time.localtime()
210
+ if name is None:
211
+ name = "_%04d_%02d_%02d_%02d_%02d_%02d" % (
212
+ now.tm_year, now.tm_mon, now.tm_mday, now.tm_hour, now.tm_min, now.tm_sec)
213
+ output_path = file_path + name + ".ep%d" % epoch
214
+ scaler = None
215
+ features_scaler = None
216
+ state = {
217
+ 'args': self.args,
218
+ 'state_dict': self.model.state_dict(),
219
+ 'optimizer': self.optimizer.state_dict(),
220
+ 'scheduler_step': self.scheduler.current_step,
221
+ "epoch": epoch,
222
+ 'data_scaler': {
223
+ 'means': scaler.means,
224
+ 'stds': scaler.stds
225
+ } if scaler is not None else None,
226
+ 'features_scaler': {
227
+ 'means': features_scaler.means,
228
+ 'stds': features_scaler.stds
229
+ } if features_scaler is not None else None
230
+ }
231
+ torch.save(state, output_path)
232
+
233
+ # Is this necessary?
234
+ # if self.with_cuda:
235
+ # self.model = self.model.cuda()
236
+ print("EP:%d Model Saved on:" % epoch, output_path)
237
+ return output_path
238
+
239
+ def save_tmp(self, epoch, file_path, rank=0):
240
+ """
241
+ Save the models for auto-restore during training.
242
+ The model is stored in the file_path/tmp folder and is replaced on each epoch.
243
+ :param epoch: the epoch number.
244
+ :param file_path: the file_path to store the model.
245
+ :param rank: the current rank (deprecated).
246
+ :return:
247
+ """
248
+ store_path = os.path.join(file_path, "tmp")
249
+ if not os.path.exists(store_path):
250
+ os.makedirs(store_path, exist_ok=True)
251
+ store_path = os.path.join(store_path, "model.%d" % rank)
252
+ state = {
253
+ 'args': self.args,
254
+ 'state_dict': self.model.state_dict(),
255
+ 'optimizer': self.optimizer.state_dict(),
256
+ 'scheduler_step': self.scheduler.current_step,
257
+ "epoch": epoch
258
+ }
259
+ torch.save(state, store_path)
260
+
261
+ def restore(self, file_path, rank=0) -> Tuple[int, int]:
262
+ """
263
+ Restore the training state saved by save_tmp.
264
+ :param file_path: the file_path to store the model.
265
+ :param rank: the current rank (deprecated).
266
+ :return: the restored epoch number and the scheduler_step in scheduler.
267
+ """
268
+ cpt_path = os.path.join(file_path, "tmp", "model.%d" % rank)
269
+ if not os.path.exists(cpt_path):
270
+ print("No checkpoint found %d")
271
+ return 0, 0
272
+ cpt = torch.load(cpt_path)
273
+ self.model.load_state_dict(cpt["state_dict"])
274
+ self.optimizer.load_state_dict(cpt["optimizer"])
275
+ epoch = cpt["epoch"]
276
+ scheduler_step = cpt["scheduler_step"]
277
+ self.scheduler.current_step = scheduler_step
278
+ print("Restore checkpoint, current epoch: %d" % (epoch))
279
+ return epoch, scheduler_step
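
A hedged sketch of the epoch loop around GROVERTrainer; the embedding model, vocabularies and data loaders are assumed to be built elsewhere (see task/pretrain.py), and only methods defined above are used:

    import os

    import torch

    from grover.util.utils import build_lr_scheduler, build_optimizer
    from task.grovertrainer import GROVERTrainer

    def pretrain_loop(args, grover_model, atom_vocab, bond_vocab, fg_size, train_loader, val_loader):
        trainer = GROVERTrainer(args=args, embedding_model=grover_model,
                                atom_vocab_size=len(atom_vocab), bond_vocab_size=len(bond_vocab),
                                fg_szie=fg_size,               # parameter name as defined above
                                train_dataloader=train_loader, test_dataloader=val_loader,
                                optimizer_builder=build_optimizer, scheduler_builder=build_lr_scheduler,
                                with_cuda=torch.cuda.is_available())
        trainer.broadcast_parameters()
        for epoch in range(args.epochs):
            _, train_loss, _ = trainer.train(epoch)
            _, val_loss, _ = trainer.test(epoch)
            trainer.save_tmp(epoch, args.save_dir)             # rolling restore point, overwritten each epoch
        return trainer.save(args.epochs, os.path.join(args.save_dir, "model"))
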
task/predict.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The predict function, which uses the finetuned model to make predictions.
3
+ """
4
+ from argparse import Namespace
5
+ from typing import List
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import torch
10
+ import torch.nn as nn
11
+ from torch.utils.data import DataLoader
12
+
13
+ from grover.data import MolCollator
14
+ from grover.data import MoleculeDataset
15
+ from grover.data import StandardScaler
16
+ from grover.util.utils import get_data, get_data_from_smiles, create_logger, load_args, get_task_names, tqdm, \
17
+ load_checkpoint, load_scalars
18
+
19
+
20
+ def predict(model: nn.Module,
21
+ data: MoleculeDataset,
22
+ args: Namespace,
23
+ batch_size: int,
24
+ loss_func,
25
+ logger,
26
+ shared_dict,
27
+ scaler: StandardScaler = None
28
+ ) -> List[List[float]]:
29
+ """
30
+ Makes predictions on a dataset using an ensemble of models.
31
+
32
+ :param model: A model.
33
+ :param data: A MoleculeDataset.
34
+ :param batch_size: Batch size.
35
+ :param scaler: A StandardScaler object fit on the training targets.
36
+ :return: A list of lists of predictions. The outer list is examples
37
+ while the inner list is tasks.
38
+ """
39
+ # debug = logger.debug if logger is not None else print
40
+ model.eval()
41
+ args.bond_drop_rate = 0
42
+ preds = []
43
+
44
+ # num_iters, iter_step = len(data), batch_size
45
+ loss_sum, iter_count = 0, 0
46
+
47
+ mol_collator = MolCollator(args=args, shared_dict=shared_dict)
48
+ # mol_dataset = MoleculeDataset(data)
49
+
50
+ num_workers = 4
51
+ mol_loader = DataLoader(data, batch_size=batch_size, shuffle=False, num_workers=num_workers,
52
+ collate_fn=mol_collator)
53
+ for _, item in enumerate(mol_loader):
54
+ _, batch, features_batch, mask, targets = item
55
+ class_weights = torch.ones(targets.shape)
56
+ if next(model.parameters()).is_cuda:
57
+ targets = targets.cuda()
58
+ mask = mask.cuda()
59
+ class_weights = class_weights.cuda()
60
+ with torch.no_grad():
61
+ batch_preds = model(batch, features_batch)
62
+ iter_count += 1
63
+ if args.fingerprint:
64
+ preds.extend(batch_preds.data.cpu().numpy())
65
+ continue
66
+
67
+ if loss_func is not None:
68
+ loss = loss_func(batch_preds, targets) * class_weights * mask
69
+ loss = loss.sum() / mask.sum()
70
+ loss_sum += loss.item()
71
+ # Collect vectors
72
+ batch_preds = batch_preds.data.cpu().numpy().tolist()
73
+ if scaler is not None:
74
+ batch_preds = scaler.inverse_transform(batch_preds)
75
+ preds.extend(batch_preds)
76
+
77
+ loss_avg = loss_sum / iter_count
78
+ return preds, loss_avg
79
+
80
+
81
+ def make_predictions(args: Namespace, newest_train_args=None, smiles: List[str] = None):
82
+ """
83
+ Makes predictions. If smiles is provided, makes predictions on smiles.
84
+ Otherwise makes predictions on args.test_data.
85
+
86
+ :param args: Arguments.
87
+ :param smiles: Smiles to make predictions on.
88
+ :return: A list of lists of target predictions.
89
+ """
90
+ if args.gpu is not None:
91
+ torch.cuda.set_device(args.gpu)
92
+
93
+ print('Loading training args')
94
+
95
+ path = args.checkpoint_paths[0]
96
+ scaler, features_scaler = load_scalars(path)
97
+ train_args = load_args(path)
98
+
99
+ # Update args with training arguments saved in checkpoint
100
+ for key, value in vars(train_args).items():
101
+ if not hasattr(args, key):
102
+ setattr(args, key, value)
103
+
104
+ # update args with newest training args
105
+ if newest_train_args is not None:
106
+ for key, value in vars(newest_train_args).items():
107
+ if not hasattr(args, key):
108
+ setattr(args, key, value)
109
+
110
+
111
+ # deal with multiprocess problem
112
+ args.debug = True
113
+
114
+ logger = create_logger('predict', quiet=False)
115
+ print('Loading data')
116
+ args.task_names = get_task_names(args.data_path)
117
+ if smiles is not None:
118
+ test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
119
+ else:
120
+ test_data = get_data(path=args.data_path, args=args,
121
+ use_compound_names=args.use_compound_names, skip_invalid_smiles=False)
122
+
123
+
124
+ args.num_tasks = test_data.num_tasks()
125
+ args.features_size = test_data.features_size()
126
+
127
+ print('Validating SMILES')
128
+ valid_indices = [i for i in range(len(test_data))]
129
+ full_data = test_data
130
+ # test_data = MoleculeDataset([test_data[i] for i in valid_indices])
131
+ test_data_list = []
132
+ for i in valid_indices:
133
+ test_data_list.append(test_data[i])
134
+ test_data = MoleculeDataset(test_data_list)
135
+
136
+ # Edge case if empty list of smiles is provided
137
+ if len(test_data) == 0:
138
+ return [None] * len(full_data)
139
+
140
+ print(f'Test size = {len(test_data):,}')
141
+
142
+ # Normalize features
143
+ if hasattr(train_args, 'features_scaling'):
144
+ if train_args.features_scaling:
145
+ test_data.normalize_features(features_scaler)
146
+
147
+ # Predict with each model individually and sum predictions
148
+ if hasattr(args, 'num_tasks'):
149
+ sum_preds = np.zeros((len(test_data), args.num_tasks))
150
+ print(f'Predicting...')
151
+ shared_dict = {}
152
+ # loss_func = torch.nn.BCEWithLogitsLoss()
153
+ count = 0
154
+ for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
155
+ # Load model
156
+ model = load_checkpoint(checkpoint_path, cuda=args.cuda, current_args=args, logger=logger)
157
+ model_preds, _ = predict(
158
+ model=model,
159
+ data=test_data,
160
+ batch_size=args.batch_size,
161
+ scaler=scaler,
162
+ shared_dict=shared_dict,
163
+ args=args,
164
+ logger=logger,
165
+ loss_func=None
166
+ )
167
+
168
+ if args.fingerprint:
169
+ return model_preds
170
+
171
+ sum_preds += np.array(model_preds, dtype=float)
172
+ count += 1
173
+
174
+ # Ensemble predictions
175
+ avg_preds = sum_preds / len(args.checkpoint_paths)
176
+
177
+ # Save predictions
178
+ assert len(test_data) == len(avg_preds)
179
+
180
+ # Put Nones for invalid smiles
181
+ args.valid_indices = valid_indices
182
+ avg_preds = np.array(avg_preds)
183
+ test_smiles = full_data.smiles()
184
+ return avg_preds, test_smiles
185
+
186
+
187
+ def write_prediction(avg_preds, test_smiles, args):
188
+ """
189
+ write prediction to disk
190
+ :param avg_preds: prediction value
191
+ :param test_smiles: input smiles
192
+ :param args: Arguments
193
+ """
194
+ if args.dataset_type == 'multiclass':
195
+ avg_preds = np.argmax(avg_preds, -1)
196
+ full_preds = [[None]] * len(test_smiles)
197
+ for i, si in enumerate(args.valid_indices):
198
+ full_preds[si] = avg_preds[i]
199
+ result = pd.DataFrame(data=full_preds, index=test_smiles, columns=args.task_names)
200
+ result.to_csv(args.output_path)
201
+ print(f'Saving predictions to {args.output_path}')
202
+
203
+
204
+
205
+ def evaluate_predictions(preds: List[List[float]],
206
+ targets: List[List[float]],
207
+ num_tasks: int,
208
+ metric_func,
209
+ dataset_type: str,
210
+ logger = None) -> List[float]:
211
+ """
212
+ Evaluates predictions using a metric function and filtering out invalid targets.
213
+
214
+ :param preds: A list of lists of shape (data_size, num_tasks) with model predictions.
215
+ :param targets: A list of lists of shape (data_size, num_tasks) with targets.
216
+ :param num_tasks: Number of tasks.
217
+ :param metric_func: Metric function which takes in a list of targets and a list of predictions.
218
+ :param dataset_type: Dataset type.
219
+ :param logger: Logger.
220
+ :return: A list with the score for each task based on `metric_func`.
221
+ """
222
+ if dataset_type == 'multiclass':
223
+ results = metric_func(np.argmax(preds, -1), [i[0] for i in targets])
224
+ return [results]
225
+
226
+ # info = logger.info if logger is not None else print
227
+
228
+ if len(preds) == 0:
229
+ return [float('nan')] * num_tasks
230
+
231
+ # Filter out empty targets
232
+ # valid_preds and valid_targets have shape (num_tasks, data_size)
233
+ valid_preds = [[] for _ in range(num_tasks)]
234
+ valid_targets = [[] for _ in range(num_tasks)]
235
+ for i in range(num_tasks):
236
+ for j in range(len(preds)):
237
+ if targets[j][i] is not None: # Skip those without targets
238
+ valid_preds[i].append(preds[j][i])
239
+ valid_targets[i].append(targets[j][i])
240
+
241
+ # Compute metric
242
+ results = []
243
+ for i in range(num_tasks):
244
+ # # Skip if all targets or preds are identical, otherwise we'll crash during classification
245
+ if dataset_type == 'classification':
246
+ nan = False
247
+ if all(target == 0 for target in valid_targets[i]) or all(target == 1 for target in valid_targets[i]):
248
+ nan = True
249
+ # info('Warning: Found a task with targets all 0s or all 1s')
250
+ if all(pred == 0 for pred in valid_preds[i]) or all(pred == 1 for pred in valid_preds[i]):
251
+ nan = True
252
+ # info('Warning: Found a task with predictions all 0s or all 1s')
253
+
254
+ if nan:
255
+ results.append(float('nan'))
256
+ continue
257
+
258
+ if len(valid_targets[i]) == 0:
259
+ continue
260
+
261
+ results.append(metric_func(valid_targets[i], valid_preds[i]))
262
+
263
+ return results
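
A toy illustration of the per-task filtering that evaluate_predictions performs (standalone sketch, with sklearn's roc_auc_score standing in for metric_func): entries whose target is None are dropped per task before the metric is computed.

from sklearn.metrics import roc_auc_score

preds = [[0.9, 0.2], [0.1, 0.8], [0.7, 0.6]]
targets = [[1, 0], [0, None], [1, 1]]   # task 1 has one missing label
num_tasks = 2

scores = []
for t in range(num_tasks):
    valid = [(p[t], y[t]) for p, y in zip(preds, targets) if y[t] is not None]
    task_preds, task_targets = zip(*valid)
    scores.append(roc_auc_score(task_targets, task_preds))
print(scores)
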
264
+
265
+
266
+ def evaluate(model: nn.Module,
267
+ data: MoleculeDataset,
268
+ num_tasks: int,
269
+ metric_func,
270
+ loss_func,
271
+ batch_size: int,
272
+ dataset_type: str,
273
+ args: Namespace,
274
+ shared_dict,
275
+ scaler: StandardScaler = None,
276
+ logger = None) -> List[float]:
277
+ """
278
+ Evaluates a model on a dataset.
279
+
280
+ :param model: A model.
281
+ :param data: A MoleculeDataset.
282
+ :param num_tasks: Number of tasks.
283
+ :param metric_func: Metric function which takes in a list of targets and a list of predictions.
284
+ :param batch_size: Batch size.
285
+ :param dataset_type: Dataset type.
286
+ :param scaler: A StandardScaler object fit on the training targets.
287
+ :param logger: Logger.
288
+ :return: A list with the score for each task based on `metric_func`.
289
+ """
290
+ preds, loss_avg = predict(
291
+ model=model,
292
+ data=data,
293
+ loss_func=loss_func,
294
+ batch_size=batch_size,
295
+ scaler=scaler,
296
+ shared_dict=shared_dict,
297
+ logger=logger,
298
+ args=args
299
+ )
300
+
301
+ targets = data.targets()
302
+ if scaler is not None:
303
+ targets = scaler.inverse_transform(targets)
304
+
305
+
306
+
307
+ results = evaluate_predictions(
308
+ preds=preds,
309
+ targets=targets,
310
+ num_tasks=num_tasks,
311
+ metric_func=metric_func,
312
+ dataset_type=dataset_type,
313
+ logger=logger
314
+ )
315
+
316
+ return results, loss_avg
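
The scaler handling above follows the usual regression convention: targets are standardized for training and un-standardized before scoring. A minimal sketch, using sklearn's StandardScaler as a stand-in for grover.data.scaler.StandardScaler:

import numpy as np
from sklearn.preprocessing import StandardScaler

train_targets = np.array([[1.0], [2.0], [3.0]])
scaler = StandardScaler().fit(train_targets)

scaled = scaler.transform(train_targets)       # what the model is trained against
recovered = scaler.inverse_transform(scaled)   # what the metric is computed on
print(np.allclose(recovered, train_targets))   # True
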
task/pretrain.py ADDED
@@ -0,0 +1,241 @@
1
+ """
2
+ The GROVER pretrain function.
3
+ """
4
+ import os
5
+ import time
6
+ from argparse import Namespace
7
+ from logging import Logger
8
+
9
+ import torch
10
+ from torch.utils.data import DataLoader
11
+
12
+ from grover.data.dist_sampler import DistributedSampler
13
+ from grover.data.groverdataset import get_data, split_data, GroverCollator, BatchMolDataset
14
+ from grover.data.torchvocab import MolVocab
15
+ from grover.model.models import GROVEREmbedding
16
+ from grover.util.multi_gpu_wrapper import MultiGpuWrapper as mgw
17
+ from grover.util.nn_utils import param_count
18
+ from grover.util.utils import build_optimizer, build_lr_scheduler
19
+ from task.grovertrainer import GROVERTrainer
20
+
21
+
22
+ def pretrain_model(args: Namespace, logger: Logger = None):
23
+ """
24
+ The entry point of pretraining.
25
+ :param args: the arguments.
26
+ :param logger: the logger.
27
+ :return:
28
+ """
29
+
30
+ # avoid the unused import being auto-removed by PyCharm.
31
+ a = MolVocab
32
+ s_time = time.time()
33
+ run_training(args=args, logger=logger)
34
+ e_time = time.time()
35
+ print("Total Time: %.3f" % (e_time - s_time))
36
+
37
+
38
+ def pre_load_data(dataset: BatchMolDataset, rank: int, num_replicas: int, sample_per_file: int = None, epoch: int = 0):
39
+ """
40
+ Pre-load data at the beginning of each epoch.
41
+ :param dataset: the training dataset.
42
+ :param rank: the rank of the current worker.
43
+ :param num_replicas: the replicas.
44
+ :param sample_per_file: the number of the data points in each file. When sample_per_file is None, all data will be
45
+ loaded. It implies the testing phase. (TODO: bad design here.)
46
+ :param epoch: the epoch number.
47
+ :return:
48
+ """
49
+ mock_sampler = DistributedSampler(dataset, num_replicas=num_replicas, rank=rank, shuffle=False,
50
+ sample_per_file=sample_per_file)
51
+ mock_sampler.set_epoch(epoch)
52
+ pre_indices = mock_sampler.get_indices()
53
+ for i in pre_indices:
54
+ dataset.load_data(i)
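
A hypothetical sketch of what the per-rank index selection could look like (the actual partitioning lives in grover.data.dist_sampler.DistributedSampler.get_indices() and may differ): each worker only loads the shards assigned to its rank.

def rank_indices(num_shards, rank, num_replicas):
    # Round-robin assignment: rank r is responsible for shards r, r + num_replicas, ...
    return list(range(rank, num_shards, num_replicas))

print(rank_indices(10, rank=0, num_replicas=4))  # [0, 4, 8]
print(rank_indices(10, rank=1, num_replicas=4))  # [1, 5, 9]
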
55
+
56
+
57
+ def run_training(args, logger):
58
+ """
59
+ Run the pretrain task.
60
+ :param args:
61
+ :param logger:
62
+ :return:
63
+ """
64
+
65
+ # initialize the logger.
66
+ if logger is not None:
67
+ debug, _ = logger.debug, logger.info
68
+ else:
69
+ debug = print
70
+
71
+ # initialize the horovod library
72
+ if args.enable_multi_gpu:
73
+ mgw.init()
74
+
75
+ # binding training to GPUs.
76
+ master_worker = (mgw.rank() == 0) if args.enable_multi_gpu else True
77
+ # pin GPU to local rank. By default, we use gpu:0 for training.
78
+ local_gpu_idx = mgw.local_rank() if args.enable_multi_gpu else 0
79
+ with_cuda = args.cuda
80
+ if with_cuda:
81
+ torch.cuda.set_device(local_gpu_idx)
82
+
83
+ # get rank and number of workers
84
+ rank = mgw.rank() if args.enable_multi_gpu else 0
85
+ num_replicas = mgw.size() if args.enable_multi_gpu else 1
86
+ # print("Rank: %d Rep: %d" % (rank, num_replicas))
87
+
88
+ # load file paths of the data.
89
+ if master_worker:
90
+ print(args)
91
+ if args.enable_multi_gpu:
92
+ debug("Total workers: %d" % (mgw.size()))
93
+ debug('Loading data')
94
+ data, sample_per_file = get_data(data_path=args.data_path)
95
+
96
+ # data splitting
97
+ if master_worker:
98
+ debug(f'Splitting data with seed 0.')
99
+ train_data, test_data, _ = split_data(data=data, sizes=(0.9, 0.1, 0.0), seed=0, logger=logger)
100
+
101
+ # Here the true train data size is len(train_data) divided by the number of GPUs
102
+ if args.enable_multi_gpu:
103
+ args.train_data_size = len(train_data) // mgw.size()
104
+ else:
105
+ args.train_data_size = len(train_data)
106
+ if master_worker:
107
+ debug(f'Total size = {len(data):,} | '
108
+ f'train size = {len(train_data):,} | val size = {len(test_data):,}')
109
+
110
+ # load atom and bond vocabulary and the semantic motif labels.
111
+ atom_vocab = MolVocab.load_vocab(args.atom_vocab_path)
112
+ bond_vocab = MolVocab.load_vocab(args.bond_vocab_path)
113
+ atom_vocab_size, bond_vocab_size = len(atom_vocab), len(bond_vocab)
114
+
115
+ # Hard-coded here, since we haven't loaded any data yet!
116
+ fg_size = 85
117
+ shared_dict = {}
118
+ mol_collator = GroverCollator(shared_dict=shared_dict, atom_vocab=atom_vocab, bond_vocab=bond_vocab, args=args)
119
+ if master_worker:
120
+ debug("atom vocab size: %d, bond vocab size: %d, Number of FG tasks: %d" % (atom_vocab_size,
121
+ bond_vocab_size, fg_size))
122
+
123
+ # Define the distributed sampler. If using the single card, the sampler will be None.
124
+ train_sampler = None
125
+ test_sampler = None
126
+ shuffle = True
127
+ if args.enable_multi_gpu:
128
+ # Without shuffling, performance may degrade.
129
+ train_sampler = DistributedSampler(
130
+ train_data, num_replicas=mgw.size(), rank=mgw.rank(), shuffle=True, sample_per_file=sample_per_file)
131
+ # Here sample_per_file in test_sampler is None, indicating the test sampler would not divide the test samples by
132
+ # rank. (TODO: bad design here.)
133
+ test_sampler = DistributedSampler(
134
+ test_data, num_replicas=mgw.size(), rank=mgw.rank(), shuffle=False)
135
+ train_sampler.set_epoch(args.epochs)
136
+ test_sampler.set_epoch(1)
137
+ # If multi-GPU training is enabled, shuffle should be disabled.
138
+ shuffle = False
139
+
140
+ # Pre-load data. (Maybe unnecessary.)
141
+ pre_load_data(train_data, rank, num_replicas, sample_per_file)
142
+ pre_load_data(test_data, rank, num_replicas)
143
+ if master_worker:
144
+ # print("Pre-loaded training data: %d" % train_data.count_loaded_datapoints())
145
+ print("Pre-loaded test data: %d" % test_data.count_loaded_datapoints())
146
+
147
+ # Build dataloader
148
+ train_data_dl = DataLoader(train_data,
149
+ batch_size=args.batch_size,
150
+ shuffle=shuffle,
151
+ num_workers=12,
152
+ sampler=train_sampler,
153
+ collate_fn=mol_collator)
154
+ test_data_dl = DataLoader(test_data,
155
+ batch_size=args.batch_size,
156
+ shuffle=shuffle,
157
+ num_workers=10,
158
+ sampler=test_sampler,
159
+ collate_fn=mol_collator)
160
+
161
+ # Build the embedding model.
162
+ grover_model = GROVEREmbedding(args)
163
+
164
+ # Build the trainer.
165
+ trainer = GROVERTrainer(args=args,
166
+ embedding_model=grover_model,
167
+ atom_vocab_size=atom_vocab_size,
168
+ bond_vocab_size=bond_vocab_size,
169
+ fg_szie=fg_size,
170
+ train_dataloader=train_data_dl,
171
+ test_dataloader=test_data_dl,
172
+ optimizer_builder=build_optimizer,
173
+ scheduler_builder=build_lr_scheduler,
174
+ logger=logger,
175
+ with_cuda=with_cuda,
176
+ enable_multi_gpu=args.enable_multi_gpu)
177
+
178
+ # Restore the interrupted training.
179
+ model_dir = os.path.join(args.save_dir, "model")
180
+ resume_from_epoch = 0
181
+ resume_scheduler_step = 0
182
+ if master_worker:
183
+ resume_from_epoch, resume_scheduler_step = trainer.restore(model_dir)
184
+ if args.enable_multi_gpu:
185
+ resume_from_epoch = mgw.broadcast(torch.tensor(resume_from_epoch), root_rank=0, name="resume_from_epoch").item()
186
+ resume_scheduler_step = mgw.broadcast(torch.tensor(resume_scheduler_step),
187
+ root_rank=0, name="resume_scheduler_step").item()
188
+ trainer.scheduler.current_step = resume_scheduler_step
189
+ print("Restored epoch: %d Restored scheduler step: %d" % (resume_from_epoch, trainer.scheduler.current_step))
190
+ trainer.broadcast_parameters()
191
+
192
+ # Print model details.
193
+ if master_worker:
194
+ # Change order here.
195
+ print(grover_model)
196
+ print("Total parameters: %d" % param_count(trainer.grover))
197
+
198
+ # Perform training.
199
+ for epoch in range(resume_from_epoch + 1, args.epochs):
200
+ s_time = time.time()
201
+
202
+ # Data pre-loading.
203
+ if args.enable_multi_gpu:
204
+ train_sampler.set_epoch(epoch)
205
+ train_data.clean_cache()
206
+ idxs = train_sampler.get_indices()
207
+ for local_gpu_idx in idxs:
208
+ train_data.load_data(local_gpu_idx)
209
+ d_time = time.time() - s_time
210
+
211
+ # perform training and validation.
212
+ s_time = time.time()
213
+ _, train_loss, _ = trainer.train(epoch)
214
+ t_time = time.time() - s_time
215
+ s_time = time.time()
216
+ _, val_loss, detailed_loss_val = trainer.test(epoch)
217
+ val_av_loss, val_bv_loss, val_fg_loss, _, _, _ = detailed_loss_val
218
+ v_time = time.time() - s_time
219
+
220
+ # print information.
221
+ if master_worker:
222
+ print('Epoch: {:04d}'.format(epoch),
223
+ 'loss_train: {:.6f}'.format(train_loss),
224
+ 'loss_val: {:.6f}'.format(val_loss),
225
+ 'loss_val_av: {:.6f}'.format(val_av_loss),
226
+ 'loss_val_bv: {:.6f}'.format(val_bv_loss),
227
+ 'loss_val_fg: {:.6f}'.format(val_fg_loss),
228
+ 'cur_lr: {:.5f}'.format(trainer.scheduler.get_lr()[0]),
229
+ 't_time: {:.4f}s'.format(t_time),
230
+ 'v_time: {:.4f}s'.format(v_time),
231
+ 'd_time: {:.4f}s'.format(d_time), flush=True)
232
+
233
+ if epoch % args.save_interval == 0:
234
+ trainer.save(epoch, model_dir)
235
+
236
+
237
+ trainer.save_tmp(epoch, model_dir, rank)
238
+
239
+ # Only save final version.
240
+ if master_worker:
241
+ trainer.save(args.epochs, model_dir, "")
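
The sampler/loader pairing above follows the standard PyTorch pattern: when a sampler is supplied, shuffle is False on the DataLoader and per-epoch shuffling is driven by sampler.set_epoch(). A self-contained sketch with PyTorch's built-in DistributedSampler standing in for the custom GROVER sampler:

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

dataset = TensorDataset(torch.arange(10).float())
# rank/num_replicas passed explicitly, so no process group is needed for this demo
sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True)
loader = DataLoader(dataset, batch_size=2, shuffle=False, sampler=sampler)

for epoch in range(2):
    sampler.set_epoch(epoch)   # changes the shuffling every epoch
    print([batch[0].tolist() for batch in loader])
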
task/run_evaluation.py ADDED
@@ -0,0 +1,157 @@
1
+ """
2
+ The evaluation function.
3
+ """
4
+ from argparse import Namespace
5
+ from logging import Logger
6
+ from typing import List
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.utils.data.distributed
11
+
12
+ from grover.data.scaler import StandardScaler
13
+ from grover.util.utils import get_class_sizes, get_data, split_data, get_task_names, get_loss_func
14
+ from grover.util.utils import load_checkpoint
15
+ from task.predict import evaluate_predictions
16
+ from grover.util.metrics import get_metric_func
17
+ from grover.util.nn_utils import param_count
18
+ from task.predict import predict
19
+
20
+
21
+ def run_evaluation(args: Namespace, logger: Logger = None) -> List[float]:
22
+ """
23
+ Evaluates a trained model checkpoint on the test split and returns test scores.
24
+
25
+ :param args: Arguments.
26
+ :param logger: Logger.
27
+ :return: A list of ensemble scores for each task.
28
+ """
29
+ if logger is not None:
30
+ debug, info = logger.debug, logger.info
31
+ else:
32
+ debug = info = print
33
+
34
+ torch.cuda.set_device(0)
35
+
36
+ # Get data
37
+ debug('Loading data')
38
+ args.task_names = get_task_names(args.data_path)
39
+ data = get_data(path=args.data_path, args=args, logger=logger)
40
+ args.num_tasks = data.num_tasks()
41
+ args.features_size = data.features_size()
42
+ debug(f'Number of tasks = {args.num_tasks}')
43
+
44
+ # Split data
45
+ debug(f'Splitting data with seed {args.seed}')
46
+
47
+ train_data, val_data, test_data = split_data(data=data,
48
+ split_type=args.split_type,
49
+ sizes=[0.8, 0.1, 0.1],
50
+ seed=args.seed,
51
+ args=args,
52
+ logger=logger)
53
+
54
+ if args.dataset_type == 'classification':
55
+ class_sizes = get_class_sizes(data)
56
+ debug('Class sizes')
57
+ for i, task_class_sizes in enumerate(class_sizes):
58
+ debug(f'{args.task_names[i]} '
59
+ f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}')
60
+
61
+ if args.features_scaling:
62
+ features_scaler = train_data.normalize_features(replace_nan_token=0)
63
+ val_data.normalize_features(features_scaler)
64
+ test_data.normalize_features(features_scaler)
65
+ else:
66
+ features_scaler = None
67
+
68
+ args.train_data_size = len(train_data)
69
+
70
+ debug(f'Total size = {len(data):,} | '
71
+ f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}')
72
+
73
+ # Initialize scaler (regression only)
74
+ scaler = None
75
+ if args.dataset_type == 'regression':
76
+ debug('Fitting scaler')
77
+ _, train_targets = train_data.smiles(), train_data.targets()
78
+ scaler = StandardScaler().fit(train_targets)
79
+ scaled_targets = scaler.transform(train_targets).tolist()
80
+ train_data.set_targets(scaled_targets)
81
+
82
+ val_targets = val_data.targets()
83
+ scaled_val_targets = scaler.transform(val_targets).tolist()
84
+ val_data.set_targets(scaled_val_targets)
85
+
86
+ metric_func = get_metric_func(metric=args.metric)
87
+
88
+ # Set up test set evaluation
89
+ test_smiles, test_targets = test_data.smiles(), test_data.targets()
90
+ sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))
91
+
92
+ # Load/build model
93
+ if args.checkpoint_paths is not None:
94
+ cur_model = args.seed
95
+ target_path = []
96
+ for path in args.checkpoint_paths:
97
+ if "fold_%d" % cur_model in path:
98
+ target_path = path
99
+ debug(f'Loading model {args.seed} from {target_path}')
100
+ model = load_checkpoint(target_path, current_args=args, cuda=args.cuda, logger=logger)
101
+ # Get loss and metric functions
102
+ loss_func = get_loss_func(args, model)
103
+
104
+ debug(f'Number of parameters = {param_count(model):,}')
105
+
106
+ test_preds, _ = predict(
107
+ model=model,
108
+ data=test_data,
109
+ batch_size=args.batch_size,
110
+ loss_func=loss_func,
111
+ logger=logger,
112
+ shared_dict={},
113
+ scaler=scaler,
114
+ args=args
115
+ )
116
+
117
+ test_scores = evaluate_predictions(
118
+ preds=test_preds,
119
+ targets=test_targets,
120
+ num_tasks=args.num_tasks,
121
+ metric_func=metric_func,
122
+ dataset_type=args.dataset_type,
123
+ logger=logger
124
+ )
125
+
126
+ if len(test_preds) != 0:
127
+ sum_test_preds += np.array(test_preds, dtype=float)
128
+
129
+ # Average test score
130
+ avg_test_score = np.nanmean(test_scores)
131
+ info(f'Model test {args.metric} = {avg_test_score:.6f}')
132
+
133
+ if args.show_individual_scores:
134
+ # Individual test scores
135
+ for task_name, test_score in zip(args.task_names, test_scores):
136
+ info(f'Model test {task_name} {args.metric} = {test_score:.6f}')
137
+
138
+ # Evaluate ensemble on test set
139
+ avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()
140
+
141
+ ensemble_scores = evaluate_predictions(
142
+ preds=avg_test_preds,
143
+ targets=test_targets,
144
+ num_tasks=args.num_tasks,
145
+ metric_func=metric_func,
146
+ dataset_type=args.dataset_type,
147
+ logger=logger
148
+ )
149
+
150
+ # If you want to save the prediction result, uncomment these lines.
151
+ # ind = [['preds'] * args.num_tasks + ['targets'] * args.num_tasks, args.task_names * 2]
152
+ # ind = pd.MultiIndex.from_tuples(list(zip(*ind)))
153
+ # data = np.concatenate([np.array(avg_test_preds), np.array(test_targets)], 1)
154
+ # test_result = pd.DataFrame(data, index=test_smiles, columns=ind)
155
+ # test_result.to_csv(os.path.join(args.save_dir, 'test_result.csv'))
156
+
157
+ return ensemble_scores
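
A toy sketch of the checkpoint lookup near the top of run_evaluation (illustration only, with hypothetical paths): the path whose name contains fold_<seed> is the one evaluated.

checkpoint_paths = [
    "ckpts/fold_0/model_0/model.pt",
    "ckpts/fold_1/model_0/model.pt",
    "ckpts/fold_2/model_0/model.pt",
]
seed = 1
target_path = None
for path in checkpoint_paths:
    if "fold_%d" % seed in path:
        target_path = path
print(target_path)  # ckpts/fold_1/model_0/model.pt
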
task/train.py ADDED
@@ -0,0 +1,454 @@
1
+ """
2
+ The training function used in the finetuning task.
3
+ """
4
+ import csv
5
+ import logging
6
+ import os
7
+ import pickle
8
+ import time
9
+ from argparse import Namespace
10
+ from logging import Logger
11
+ from typing import List
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ import torch
16
+ from torch.optim.lr_scheduler import ExponentialLR
17
+ from torch.utils.data import DataLoader
18
+
19
+ from grover.data import MolCollator
20
+ from grover.data import StandardScaler
21
+ from grover.util.metrics import get_metric_func
22
+ from grover.util.nn_utils import initialize_weights, param_count
23
+ from grover.util.scheduler import NoamLR
24
+ from grover.util.utils import build_optimizer, build_lr_scheduler, makedirs, load_checkpoint, get_loss_func, \
25
+ save_checkpoint, build_model
26
+ from grover.util.utils import get_class_sizes, get_data, split_data, get_task_names
27
+ from task.predict import predict, evaluate, evaluate_predictions
28
+
29
+
30
+
31
+ def train(epoch, model, data, loss_func, optimizer, scheduler,
32
+ shared_dict, args: Namespace, n_iter: int = 0,
33
+ logger: logging.Logger = None):
34
+ """
35
+ Trains a model for an epoch.
36
+
37
+ :param model: Model.
38
+ :param data: A MoleculeDataset (or a list of MoleculeDatasets if using moe).
39
+ :param loss_func: Loss function.
40
+ :param optimizer: An Optimizer.
41
+ :param scheduler: A learning rate scheduler.
42
+ :param args: Arguments.
43
+ :param n_iter: The number of iterations (training examples) trained on so far.
44
+ :param logger: A logger for printing intermediate results.
45
+ :param writer: A tensorboardX SummaryWriter.
46
+ :return: The total number of iterations (training examples) trained on so far.
47
+ """
48
+ # debug = logger.debug if logger is not None else print
49
+
50
+ model.train()
51
+
52
+ # data.shuffle()
53
+
54
+ loss_sum, iter_count = 0, 0
55
+ cum_loss_sum, cum_iter_count = 0, 0
56
+
57
+
58
+ mol_collator = MolCollator(shared_dict=shared_dict, args=args)
59
+
60
+ num_workers = 4
61
+ if type(data) == DataLoader:
62
+ mol_loader = data
63
+ else:
64
+ mol_loader = DataLoader(data, batch_size=args.batch_size, shuffle=True,
65
+ num_workers=num_workers, collate_fn=mol_collator)
66
+
67
+ for _, item in enumerate(mol_loader):
68
+ _, batch, features_batch, mask, targets = item
69
+ if next(model.parameters()).is_cuda:
70
+ mask, targets = mask.cuda(), targets.cuda()
71
+ class_weights = torch.ones(targets.shape)
72
+
73
+ if args.cuda:
74
+ class_weights = class_weights.cuda()
75
+
76
+ # Run model
77
+ model.zero_grad()
78
+ preds = model(batch, features_batch)
79
+ loss = loss_func(preds, targets) * class_weights * mask
80
+ loss = loss.sum() / mask.sum()
81
+
82
+ loss_sum += loss.item()
83
+ iter_count += args.batch_size
84
+
85
+ cum_loss_sum += loss.item()
86
+ cum_iter_count += 1
87
+
88
+ loss.backward()
89
+ optimizer.step()
90
+
91
+ if isinstance(scheduler, NoamLR):
92
+ scheduler.step()
93
+
94
+ n_iter += args.batch_size
95
+
96
+ #if (n_iter // args.batch_size) % args.log_frequency == 0:
97
+ # lrs = scheduler.get_lr()
98
+ # loss_avg = loss_sum / iter_count
99
+ # loss_sum, iter_count = 0, 0
100
+ # lrs_str = ', '.join(f'lr_{i} = {lr:.4e}' for i, lr in enumerate(lrs))
101
+
102
+ return n_iter, cum_loss_sum / cum_iter_count
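
A standalone sketch of the masked loss computed in the loop above, assuming a BCE-with-logits loss with reduction='none' for classification (the actual loss comes from get_loss_func): positions with a missing label are zeroed by the mask, and the loss is normalized by the number of observed labels.

import torch
import torch.nn as nn

loss_func = nn.BCEWithLogitsLoss(reduction='none')

preds   = torch.tensor([[0.3, -1.2], [2.0, 0.1]])    # logits, shape (batch, tasks)
targets = torch.tensor([[1.0,  0.0], [0.0, 0.0]])    # missing label filled with 0
mask    = torch.tensor([[1.0,  1.0], [1.0, 0.0]])    # 0 marks a missing label
class_weights = torch.ones_like(targets)

loss = loss_func(preds, targets) * class_weights * mask
loss = loss.sum() / mask.sum()                        # average over observed labels
print(loss.item())
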
103
+
104
+
105
+ def run_training(args: Namespace, time_start, logger: Logger = None) -> List[float]:
106
+ """
107
+ Trains a model and returns test scores on the model checkpoint with the highest validation score.
108
+
109
+ :param args: Arguments.
110
+ :param logger: Logger.
111
+ :return: A list of ensemble scores for each task.
112
+ """
113
+ if logger is not None:
114
+ debug, info = logger.debug, logger.info
115
+ else:
116
+ debug = info = print
117
+
118
+
119
+ # pin GPU to local rank.
120
+ idx = args.gpu
121
+ if args.gpu is not None:
122
+ torch.cuda.set_device(idx)
123
+
124
+ features_scaler, scaler, shared_dict, test_data, train_data, val_data = load_data(args, debug, logger)
125
+
126
+ metric_func = get_metric_func(metric=args.metric)
127
+
128
+ # Set up test set evaluation
129
+ test_smiles, test_targets = test_data.smiles(), test_data.targets()
130
+ sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))
131
+
132
+ # Train ensemble of models
133
+ for model_idx in range(args.ensemble_size):
134
+ # Tensorboard writer
135
+ save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
136
+ makedirs(save_dir)
137
+
138
+ # Load/build model
139
+ if args.checkpoint_paths is not None:
140
+ if len(args.checkpoint_paths) == 1:
141
+ cur_model = 0
142
+ else:
143
+ cur_model = model_idx
144
+ debug(f'Loading model {cur_model} from {args.checkpoint_paths[cur_model]}')
145
+ model = load_checkpoint(args.checkpoint_paths[cur_model], current_args=args, logger=logger)
146
+ else:
147
+ debug(f'Building model {model_idx}')
148
+ model = build_model(model_idx=model_idx, args=args)
149
+
150
+ if args.fine_tune_coff != 1 and args.checkpoint_paths is not None:
151
+ debug("Fine tune fc layer with different lr")
152
+ initialize_weights(model_idx=model_idx, model=model.ffn, distinct_init=args.distinct_init)
153
+
154
+
155
+ ############### FREEZE BLOCK ###########
156
+ # for name, param in model.named_parameters():
157
+ # if name.startswith("grover."):
158
+ # param.requires_grad = False
159
+
160
+ # # Train prediction layers (readout + two FFNs)
161
+ # else:
162
+ # param.requires_grad = True
163
+
164
+ # print("TRAINABLE PARAMETERS:")
165
+ # for name, p in model.named_parameters():
166
+ # if p.requires_grad:
167
+ # print(" ", name)
168
+ ############### FREEZE BLOCK ###########
169
+
170
+ # Get loss and metric functions
171
+ loss_func = get_loss_func(args, model)
172
+
173
+ optimizer = build_optimizer(model, args)
174
+
175
+ debug(model)
176
+ debug(f'Number of parameters = {param_count(model):,}')
177
+ if args.cuda:
178
+ debug('Moving model to cuda')
179
+ model = model.cuda()
180
+
181
+ # Ensure that model is saved in correct location for evaluation if 0 epochs
182
+ save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)
183
+
184
+ # Learning rate schedulers
185
+ scheduler = build_lr_scheduler(optimizer, args)
186
+
187
+ # Bulid data_loader
188
+ shuffle = True
189
+ mol_collator = MolCollator(shared_dict={}, args=args)
190
+ train_data = DataLoader(train_data,
191
+ batch_size=args.batch_size,
192
+ shuffle=shuffle,
193
+ num_workers=10,
194
+ collate_fn=mol_collator)
195
+
196
+ # Run training
197
+ best_score = float('inf') if args.minimize_score else -float('inf')
198
+ best_epoch, n_iter = 0, 0
199
+ min_val_loss = float('inf')
200
+ for epoch in range(args.epochs):
201
+ s_time = time.time()
202
+ n_iter, train_loss = train(
203
+ epoch=epoch,
204
+ model=model,
205
+ data=train_data,
206
+ loss_func=loss_func,
207
+ optimizer=optimizer,
208
+ scheduler=scheduler,
209
+ args=args,
210
+ n_iter=n_iter,
211
+ shared_dict=shared_dict,
212
+ logger=logger
213
+ )
214
+ t_time = time.time() - s_time
215
+ s_time = time.time()
216
+ val_scores, val_loss = evaluate(
217
+ model=model,
218
+ data=val_data,
219
+ loss_func=loss_func,
220
+ num_tasks=args.num_tasks,
221
+ metric_func=metric_func,
222
+ batch_size=args.batch_size,
223
+ dataset_type=args.dataset_type,
224
+ scaler=scaler,
225
+ shared_dict=shared_dict,
226
+ logger=logger,
227
+ args=args
228
+ )
229
+ v_time = time.time() - s_time
230
+ # Average validation score
231
+ avg_val_score = np.nanmean(val_scores)
232
+ # Logged after lr step
233
+ if isinstance(scheduler, ExponentialLR):
234
+ scheduler.step()
235
+
236
+ if args.show_individual_scores:
237
+ # Individual validation scores
238
+ for task_name, val_score in zip(args.task_names, val_scores):
239
+ debug(f'Validation {task_name} {args.metric} = {val_score:.6f}')
240
+ print('Epoch: {:04d}'.format(epoch),
241
+ 'loss_train: {:.6f}'.format(train_loss),
242
+ 'loss_val: {:.6f}'.format(val_loss),
243
+ f'{args.metric}_val: {avg_val_score:.4f}',
244
+ # 'auc_val: {:.4f}'.format(avg_val_score),
245
+ 'cur_lr: {:.5f}'.format(scheduler.get_lr()[-1]),
246
+ 't_time: {:.4f}s'.format(t_time),
247
+ 'v_time: {:.4f}s'.format(v_time))
248
+
249
+ if args.tensorboard:
250
+ writer.add_scalar('loss/train', train_loss, epoch)
251
+ writer.add_scalar('loss/val', val_loss, epoch)
252
+ writer.add_scalar(f'{args.metric}_val', avg_val_score, epoch)
253
+
254
+
255
+ # Save model checkpoint if improved validation score
256
+ if args.select_by_loss:
257
+ if val_loss < min_val_loss:
258
+ min_val_loss, best_epoch = val_loss, epoch
259
+ save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)
260
+ else:
261
+ if args.minimize_score and avg_val_score < best_score or \
262
+ not args.minimize_score and avg_val_score > best_score:
263
+ best_score, best_epoch = avg_val_score, epoch
264
+ save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)
265
+
266
+ if epoch - best_epoch > args.early_stop_epoch:
267
+ break
268
+
269
+ ensemble_scores = 0.0
270
+
271
+ # Evaluate on test set using model with best validation score
272
+ if args.select_by_loss:
273
+ info(f'Model {model_idx} best val loss = {min_val_loss:.6f} on epoch {best_epoch}')
274
+ else:
275
+ info(f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}')
276
+ model = load_checkpoint(os.path.join(save_dir, 'model.pt'), cuda=args.cuda, logger=logger)
277
+
278
+ test_preds, _ = predict(
279
+ model=model,
280
+ data=test_data,
281
+ loss_func=loss_func,
282
+ batch_size=args.batch_size,
283
+ logger=logger,
284
+ shared_dict=shared_dict,
285
+ scaler=scaler,
286
+ args=args
287
+ )
288
+
289
+ test_scores = evaluate_predictions(
290
+ preds=test_preds,
291
+ targets=test_targets,
292
+ num_tasks=args.num_tasks,
293
+ metric_func=metric_func,
294
+ dataset_type=args.dataset_type,
295
+ logger=logger
296
+ )
297
+
298
+ if len(test_preds) != 0:
299
+ sum_test_preds += np.array(test_preds, dtype=float)
300
+
301
+ # Average test score
302
+ avg_test_score = np.nanmean(test_scores)
303
+ info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
304
+
305
+ if args.show_individual_scores:
306
+ # Individual test scores
307
+ for task_name, test_score in zip(args.task_names, test_scores):
308
+ info(f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}')
309
+
310
+ # Evaluate ensemble on test set
311
+ avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()
312
+
313
+ ensemble_scores = evaluate_predictions(
314
+ preds=avg_test_preds,
315
+ targets=test_targets,
316
+ num_tasks=args.num_tasks,
317
+ metric_func=metric_func,
318
+ dataset_type=args.dataset_type,
319
+ logger=logger
320
+ )
321
+
322
+ ind = [['preds'] * args.num_tasks + ['targets'] * args.num_tasks, args.task_names * 2]
323
+ ind = pd.MultiIndex.from_tuples(list(zip(*ind)))
324
+ data = np.concatenate([np.array(avg_test_preds), np.array(test_targets)], 1)
325
+ test_result = pd.DataFrame(data, index=test_smiles, columns=ind)
326
+ test_result.to_csv(os.path.join(args.save_dir, 'test_result.csv'))
327
+
328
+ # Average ensemble score
329
+ avg_ensemble_test_score = np.nanmean(ensemble_scores)
330
+ info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')
331
+
332
+ # Individual ensemble scores
333
+ if args.show_individual_scores:
334
+ for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
335
+ info(f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}')
336
+
337
+ return ensemble_scores
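
For reference, a hedged sketch of what the commented-out FREEZE BLOCK earlier in run_training would do if enabled: parameters of the pretrained encoder (names starting with "grover.") are frozen and only the readout/FFN head stays trainable. `model` here is any torch module following that naming convention; this is an illustration, not the repository's code path.

import torch.nn as nn

def freeze_grover_encoder(model: nn.Module):
    # Freeze the pretrained encoder, keep the prediction head trainable.
    for name, param in model.named_parameters():
        param.requires_grad = not name.startswith("grover.")
    return [name for name, p in model.named_parameters() if p.requires_grad]

# Usage: call freeze_grover_encoder(model) before building the optimizer, then
# pass only parameters with requires_grad=True to the optimizer.
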
338
+
339
+
340
+ def load_data(args, debug, logger):
341
+ """
342
+ load the training data.
343
+ :param args:
344
+ :param debug:
345
+ :param logger:
346
+ :return:
347
+ """
348
+ # Get data
349
+ debug('Loading data')
350
+ args.task_names = get_task_names(args.data_path)
351
+ data = get_data(path=args.data_path, args=args, logger=logger)
352
+ if data.data[0].features is not None:
353
+ args.features_dim = len(data.data[0].features)
354
+ else:
355
+ args.features_dim = 0
356
+ shared_dict = {}
357
+ args.num_tasks = data.num_tasks()
358
+ args.features_size = data.features_size()
359
+ debug(f'Number of tasks = {args.num_tasks}')
360
+ # Split data
361
+ debug(f'Splitting data with seed {args.seed}')
362
+ if args.separate_test_path:
363
+ test_data = get_data(path=args.separate_test_path, args=args,
364
+ features_path=args.separate_test_features_path, logger=logger)
365
+ if args.separate_val_path:
366
+ val_data = get_data(path=args.separate_val_path, args=args,
367
+ features_path=args.separate_val_features_path, logger=logger)
368
+ if args.separate_val_path and args.separate_test_path:
369
+ train_data = data
370
+ elif args.separate_val_path:
371
+ train_data, _, test_data = split_data(data=data, split_type=args.split_type,
372
+ sizes=(0.8, 0.2, 0.0), seed=args.seed, args=args, logger=logger)
373
+ elif args.separate_test_path:
374
+ train_data, val_data, _ = split_data(data=data, split_type=args.split_type,
375
+ sizes=(0.8, 0.2, 0.0), seed=args.seed, args=args, logger=logger)
376
+ else:
377
+ train_data, val_data, test_data = split_data(data=data, split_type=args.split_type,
378
+ sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)
379
+ if args.dataset_type == 'classification':
380
+ class_sizes = get_class_sizes(data)
381
+ debug('Class sizes')
382
+ for i, task_class_sizes in enumerate(class_sizes):
383
+ debug(f'{args.task_names[i]} '
384
+ f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}')
385
+
386
+ #if args.save_smiles_splits:
387
+ # save_splits(args, test_data, train_data, val_data)
388
+
389
+ if args.features_scaling:
390
+ features_scaler = train_data.normalize_features(replace_nan_token=0)
391
+ val_data.normalize_features(features_scaler)
392
+ test_data.normalize_features(features_scaler)
393
+ else:
394
+ features_scaler = None
395
+ args.train_data_size = len(train_data)
396
+ debug(f'Total size = {len(data):,} | '
397
+ f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}')
398
+
399
+ # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
400
+ if args.dataset_type == 'regression':
401
+ debug('Fitting scaler')
402
+ _, train_targets = train_data.smiles(), train_data.targets()
403
+ scaler = StandardScaler().fit(train_targets)
404
+ scaled_targets = scaler.transform(train_targets).tolist()
405
+ train_data.set_targets(scaled_targets)
406
+
407
+ val_targets = val_data.targets()
408
+ scaled_val_targets = scaler.transform(val_targets).tolist()
409
+ val_data.set_targets(scaled_val_targets)
410
+ else:
411
+ scaler = None
412
+ return features_scaler, scaler, shared_dict, test_data, train_data, val_data
413
+
414
+
415
+ def save_splits(args, test_data, train_data, val_data):
416
+ """
417
+ Save the splits.
418
+ :param args:
419
+ :param test_data:
420
+ :param train_data:
421
+ :param val_data:
422
+ :return:
423
+ """
424
+ with open(args.data_path, 'r') as f:
425
+ reader = csv.reader(f)
426
+ header = next(reader)
427
+
428
+ lines_by_smiles = {}
429
+ indices_by_smiles = {}
430
+ for i, line in enumerate(reader):
431
+ smiles = line[0]
432
+ lines_by_smiles[smiles] = line
433
+ indices_by_smiles[smiles] = i
434
+
435
+ all_split_indices = []
436
+ for dataset, name in [(train_data, 'train'), (val_data, 'val'), (test_data, 'test')]:
437
+ with open(os.path.join(args.save_dir, name + '_smiles.csv'), 'w') as f:
438
+ writer = csv.writer(f)
439
+ writer.writerow(['smiles'])
440
+ for smiles in dataset.smiles():
441
+ writer.writerow([smiles])
442
+ with open(os.path.join(args.save_dir, name + '_full.csv'), 'w') as f:
443
+ writer = csv.writer(f)
444
+ writer.writerow(header)
445
+ for smiles in dataset.smiles():
446
+ writer.writerow(lines_by_smiles[smiles])
447
+ split_indices = []
448
+ for smiles in dataset.smiles():
449
+ split_indices.append(indices_by_smiles[smiles])
450
+ split_indices = sorted(split_indices)
451
+ all_split_indices.append(split_indices)
452
+ with open(os.path.join(args.save_dir, 'split_indices.pckl'), 'wb') as f:
453
+ pickle.dump(all_split_indices, f)
454
+ return writer
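
A short usage sketch for the artifact written at the end of save_splits (assuming the default file name under args.save_dir; the directory shown here is hypothetical): the pickle holds one sorted index list per split, in (train, val, test) order.

import os
import pickle

save_dir = "./save"   # hypothetical save_dir used when training was run
with open(os.path.join(save_dir, "split_indices.pckl"), "rb") as f:
    train_idx, val_idx, test_idx = pickle.load(f)
print(len(train_idx), len(val_idx), len(test_idx))
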