Spaces:
Sleeping
Sleeping
| """ | |
| Computes and saves molecular features for a dataset. | |
| """ | |
| import os | |
| import shutil | |
| import sys | |
| from argparse import ArgumentParser, Namespace | |
| from multiprocessing import Pool | |
| from typing import List, Tuple | |
| from tqdm import tqdm | |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) | |
| from grover.util.utils import get_data, makedirs, load_features, save_features | |
| from grover.data.molfeaturegenerator import get_available_features_generators, \ | |
| get_features_generator | |
| from grover.data.task_labels import rdkit_functional_group_label_features_generator | |
def load_temp(temp_dir: str) -> Tuple[List[List[float]], int]:
    """
    Gathers the features stored in the numbered temporary .npz files of a directory.

    Files are read in increasing order starting at 0.npz (0.npz, 1.npz, ...). Reading stops
    at the first index for which no file exists, so files after a gap in the numbering
    are not read.

    :param temp_dir: Directory in which the temporary .npz feature files are stored.
    :return: A tuple with the features of all files read, concatenated in file order,
    and the number of temporary files that were read.
    """
    collected = []
    index = 0

    while True:
        chunk_path = os.path.join(temp_dir, f'{index}.npz')
        if not os.path.exists(chunk_path):
            # First missing index marks the end of the saved chunks
            break
        collected.extend(load_features(chunk_path))
        index += 1

    return collected, index
def generate_and_save_features(args: Namespace):
    """
    Computes and saves features for a dataset of molecules as a 2D array in a .npz file.

    Features are computed in chunks of args.save_frequency molecules. Each chunk is written
    to a numbered file in the temporary directory "<save_path>_temp" so that an interrupted
    run can be resumed. Once all features are computed they are saved to args.save_path and
    the temporary directory is removed.

    :param args: Arguments. Reads data_path, features_generator, save_path, save_frequency,
    restart, sequential and (if present) max_data_size.
    :raises ValueError: If args.save_path already exists and args.restart is False.
    """
    # Create directory for save_path
    makedirs(args.save_path, isfile=True)

    # Get data and features function.
    # Forward --max_data_size instead of silently ignoring it; getattr keeps Namespaces
    # built without that attribute working (they load the full dataset, as before).
    data = get_data(path=args.data_path, max_data_size=getattr(args, 'max_data_size', None))
    features_generator = get_features_generator(args.features_generator)
    temp_save_dir = args.save_path + '_temp'

    # Load partially complete data
    if args.restart:
        # Start from scratch: discard both the final file and any temporary chunks
        if os.path.exists(args.save_path):
            os.remove(args.save_path)
        if os.path.exists(temp_save_dir):
            shutil.rmtree(temp_save_dir)
    else:
        if os.path.exists(args.save_path):
            raise ValueError(f'"{args.save_path}" already exists and args.restart is False.')
        if os.path.exists(temp_save_dir):
            features, temp_num = load_temp(temp_save_dir)

    if not os.path.exists(temp_save_dir):
        makedirs(temp_save_dir)
        features, temp_num = [], 0

    # Build features map function
    data = data[len(features):]  # restrict to data for which features have not been computed yet
    mols = (d.smiles for d in data)

    # The worker pool (parallel mode only) is owned by this function and is always
    # released in the finally block, including when featurization or saving raises.
    pool = None if args.sequential else Pool(30)
    try:
        if pool is None:
            features_map = map(features_generator, mols)
        else:
            features_map = pool.imap(features_generator, mols)

        # Get features
        temp_features = []
        for i, feats in tqdm(enumerate(features_map), total=len(data)):
            temp_features.append(feats)

            # Save temporary features every save_frequency (and after the last molecule)
            if (i > 0 and (i + 1) % args.save_frequency == 0) or i == len(data) - 1:
                save_features(os.path.join(temp_save_dir, f'{temp_num}.npz'), temp_features)
                features.extend(temp_features)
                temp_features = []
                temp_num += 1
    finally:
        if pool is not None:
            # On success every result has already been consumed; on failure the
            # remaining tasks are abandoned rather than waited for.
            pool.terminate()
            pool.join()

    try:
        # Save all features
        save_features(args.save_path, features)

        # Remove temporary features
        shutil.rmtree(temp_save_dir)
    except OverflowError:
        # The temporary directory is deliberately left in place as the result
        print('Features array is too large to save as a single file. Instead keeping features as a directory of files.')
def default_save_path(data_path: str) -> str:
    """
    Derives the default .npz save path from the data path.

    Only the final file extension is replaced, so a "csv" substring elsewhere in the
    path (e.g. a directory called "csv_files") does not truncate the result.

    :param data_path: Path to the data file.
    :return: The data path with its extension (if any) replaced by ".npz".
    """
    return os.path.splitext(data_path)[0] + '.npz'


if __name__ == '__main__':
    # Command line interface: parse the arguments and run the featurization
    parser = ArgumentParser()
    parser.add_argument('--data_path', type=str, required=True,
                        help='Path to data CSV')
    parser.add_argument('--features_generator', type=str, required=True,
                        choices=get_available_features_generators(),
                        help='Type of features to generate')
    parser.add_argument('--save_path', type=str, default=None,
                        help='Path to .npz file where features will be saved as a compressed numpy archive')
    parser.add_argument('--save_frequency', type=int, default=10000,
                        help='Frequency with which to save the features')
    parser.add_argument('--restart', action='store_true', default=False,
                        help='Whether to not load partially complete featurization and instead start from scratch')
    parser.add_argument('--max_data_size', type=int,
                        help='Maximum number of data points to load')
    parser.add_argument('--sequential', action='store_true', default=False,
                        help='Whether to task sequentially rather than in parallel')
    args = parser.parse_args()

    # Without an explicit --save_path, save next to the data file with a .npz extension
    if args.save_path is None:
        args.save_path = default_save_path(args.data_path)

    generate_and_save_features(args)