| import torch |
| import torch.nn as nn |
| import torch.optim as optim |
| from torch.utils.data import Dataset, DataLoader |
| from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, SequentialLR |
| import json |
| import numpy as np |
| from tqdm import tqdm |
|
|
| |
# Model / training hyperparameters.
ESM_DIM = 1280                    # per-residue ESM embedding width (input feature dim)
COMP_RATIO = 16                   # feature-axis compression factor
COMP_DIM = ESM_DIM // COMP_RATIO  # compressed feature width (1280 // 16 = 80)
MAX_SEQ_LEN = 50                  # expected sequence length of each precomputed embedding
BATCH_SIZE = 32
EPOCHS = 30
BASE_LR = 1e-3                    # peak learning rate reached after warmup
LR_MIN = 8e-5                     # cosine-annealing floor
WARMUP_STEPS = 10_000             # linear warmup duration, in optimizer steps
DEPTH = 4                         # total transformer layers (split as DEPTH//2 pre / DEPTH//2 post pooling)
HEADS = 8                         # attention heads per transformer layer
DIM_FF = ESM_DIM * 4              # transformer feed-forward width
POOLING = True                    # if True, halve sequence length via pairwise mean-pooling
|
|
| |
class PrecomputedEmbeddingDataset(Dataset):
    """Dataset of pre-computed per-sequence embeddings stored as .pt files.

    Every ``*.pt`` file under ``embeddings_path`` is expected to hold one 2D
    tensor of shape (seq_len, embed_dim). All valid embeddings are stacked
    into a single (N, seq_len, embed_dim) tensor kept in memory.
    """

    def __init__(self, embeddings_path):
        """
        Load pre-computed embeddings from the final_sequence_encoder.py output.

        Args:
            embeddings_path: Path to the directory containing individual .pt
                embedding files.

        Raises:
            ValueError: If no valid 2D embeddings are found, or the stacked
                result is not a 3D tensor.
        """
        print(f"Loading pre-computed embeddings from {embeddings_path}...")

        import glob
        import os

        # Sort so the sample order (and therefore index -> sequence mapping)
        # is deterministic across runs and filesystems.
        embedding_files = sorted(glob.glob(os.path.join(embeddings_path, "*.pt")))

        print(f"Found {len(embedding_files)} embedding files")

        embeddings_list = []
        expected_shape = None  # shape of the first valid embedding
        for file_path in embedding_files:
            try:
                # map_location='cpu' so tensors saved on a GPU machine still
                # load on CPU-only hosts; they are moved to device per batch.
                embedding = torch.load(file_path, map_location='cpu')
                if embedding.dim() != 2:
                    print(f"Warning: Skipping {file_path} - unexpected shape {embedding.shape}")
                    continue
                # torch.stack requires identical shapes; skip outliers with a
                # clear message instead of crashing at stack time.
                if expected_shape is None:
                    expected_shape = embedding.shape
                elif embedding.shape != expected_shape:
                    print(f"Warning: Skipping {file_path} - shape {embedding.shape} "
                          f"does not match {expected_shape}")
                    continue
                embeddings_list.append(embedding)
            except Exception as e:
                # Best-effort loading: report and move on to the next file.
                print(f"Warning: Could not load {file_path}: {e}")

        if not embeddings_list:
            raise ValueError("No valid embeddings found!")

        self.embeddings = torch.stack(embeddings_list)
        print(f"Loaded {len(self.embeddings)} embeddings with shape {self.embeddings.shape}")

        if len(self.embeddings.shape) != 3:
            raise ValueError(f"Expected 3D tensor, got shape {self.embeddings.shape}")

        # Mismatches are warnings, not errors: downstream modules adapt to
        # the actual sequence length, but the constants should normally agree.
        if self.embeddings.shape[1] != MAX_SEQ_LEN:
            print(f"Warning: Expected sequence length {MAX_SEQ_LEN}, got {self.embeddings.shape[1]}")

        if self.embeddings.shape[2] != ESM_DIM:
            print(f"Warning: Expected embedding dim {ESM_DIM}, got {self.embeddings.shape[2]}")

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return self.embeddings[idx]
|
|
| |
class Compressor(nn.Module):
    """Transformer autoencoder half that compresses ESM token embeddings.

    Pipeline: optional dataset-level normalization -> LayerNorm -> transformer
    stack -> (optional) pairwise mean-pooling along the sequence -> second
    transformer stack -> linear projection to ``out_dim`` squashed into
    [-1, 1] by tanh.
    """

    def __init__(self, in_dim=ESM_DIM, out_dim=COMP_DIM):
        super().__init__()
        self.norm = nn.LayerNorm(in_dim)

        def make_layer():
            # A fresh layer instance per stack; TransformerEncoder deep-copies
            # it internally for each of its num_layers.
            return nn.TransformerEncoderLayer(
                d_model=in_dim, nhead=HEADS, dim_feedforward=DIM_FF,
                batch_first=True)

        self.pre_tr = nn.TransformerEncoder(make_layer(), num_layers=DEPTH // 2)
        self.post_tr = nn.TransformerEncoder(make_layer(), num_layers=DEPTH // 2)
        self.proj = nn.Sequential(
            nn.LayerNorm(in_dim),
            nn.Linear(in_dim, out_dim),
            nn.Tanh()
        )
        self.pooling = POOLING

    def forward(self, x, stats=None):
        """Compress a batch of embeddings.

        Args:
            x: (B, L, in_dim) float tensor of embeddings.
            stats: Optional dict with 'mean', 'std', 'min', 'max' tensors (as
                produced by the training script) used to normalize x first.

        Returns:
            (B, L', out_dim) tensor in [-1, 1]; L' = L // 2 when pooling.
        """
        # Explicit None check (not dict truthiness) so an accidentally empty
        # stats dict cannot silently skip normalization.
        if stats is not None:
            m = stats['mean'].to(x.device)
            s = stats['std'].to(x.device)
            mn = stats['min'].to(x.device)
            mx = stats['max'].to(x.device)
            # Clamped z-score, then min-max rescale into [0, 1] using the
            # dataset-level extrema of the z-scored data.
            x = torch.clamp((x - m) / s, -4, 4)
            x = torch.clamp((x - mn) / (mx - mn + 1e-8), 0, 1)
        x = self.norm(x)
        x = self.pre_tr(x)
        if self.pooling:
            B, L, D = x.shape
            if L % 2:
                # Drop the trailing token so positions pair up evenly.
                x = x[:, :-1, :]
            x = x.view(B, L // 2, 2, D).mean(2)
        x = self.post_tr(x)
        return self.proj(x)
|
|
| |
class Decompressor(nn.Module):
    """Inverse of Compressor: expands compressed codes back to the full
    embedding width and refines them with a transformer encoder stack."""

    def __init__(self, in_dim=COMP_DIM, out_dim=ESM_DIM):
        super().__init__()
        # Project the compressed code up to the original embedding width.
        self.proj = nn.Sequential(
            nn.LayerNorm(in_dim),
            nn.Linear(in_dim, out_dim),
        )
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=out_dim,
            nhead=HEADS,
            dim_feedforward=DIM_FF,
            batch_first=True,
        )
        self.decoder = nn.TransformerEncoder(encoder_layer, num_layers=DEPTH // 2)
        self.pooling = POOLING

    def forward(self, z):
        """Map (B, L', in_dim) codes to (B, 2*L' if pooling else L', out_dim)."""
        expanded = self.proj(z)
        if self.pooling:
            # Undo the compressor's pairwise pooling by duplicating positions.
            expanded = expanded.repeat_interleave(2, dim=1)
        return self.decoder(expanded)
|
|
| |
def train_with_precomputed_embeddings(embeddings_path, device='cuda'):
    """
    Train compressor using pre-computed embeddings from final_sequence_encoder.py.

    Args:
        embeddings_path: Directory of per-sequence .pt embedding files.
        device: Torch device string to train on.

    Side effects:
        Writes normalization_stats.pt, checkpoint_epoch_*.pth every 5 epochs,
        and final compressor_final.pth / decompressor_final.pth weight files.
    """
    ds = PrecomputedEmbeddingDataset(embeddings_path)

    # Dataset-level normalization statistics, saved for reuse at inference.
    print("Computing normalization statistics...")
    flat = ds.embeddings.view(-1, ESM_DIM)
    mean = flat.mean(0)
    std = flat.std(0) + 1e-8
    # Compute the clamped z-score once; its per-dimension extrema drive the
    # min-max rescale performed inside Compressor.forward.
    z_scored = torch.clamp((flat - mean) / std, -4, 4)
    stats = {
        'mean': mean,
        'std': std,
        'min': z_scored.min(0)[0],
        'max': z_scored.max(0)[0],
    }

    torch.save(stats, 'normalization_stats.pt')
    print("Saved normalization statistics to normalization_stats.pt")

    dl = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True)

    comp = Compressor().to(device)
    decomp = Decompressor().to(device)

    opt = optim.AdamW(list(comp.parameters()) + list(decomp.parameters()), lr=BASE_LR)

    # Linear warmup followed by cosine decay. The cosine phase only runs for
    # the steps REMAINING after warmup (SequentialLR switches at the
    # milestone), so T_max must exclude WARMUP_STEPS or the LR never actually
    # anneals down to LR_MIN by the end of training.
    total_steps = EPOCHS * len(dl)
    warmup_sched = LinearLR(opt, start_factor=1e-8, end_factor=1.0, total_iters=WARMUP_STEPS)
    cosine_sched = CosineAnnealingLR(opt, T_max=max(1, total_steps - WARMUP_STEPS), eta_min=LR_MIN)
    sched = SequentialLR(opt, [warmup_sched, cosine_sched], milestones=[WARMUP_STEPS])

    print(f"Starting training for {EPOCHS} epochs...")
    print(f"Device: {device}")
    print(f"Batch size: {BATCH_SIZE}")
    print(f"Total batches per epoch: {len(dl)}")

    for epoch in range(1, EPOCHS + 1):
        total_loss = 0
        comp.train()
        decomp.train()

        for batch_idx, x in enumerate(tqdm(dl, desc=f"Epoch {epoch}/{EPOCHS}")):
            x = x.to(device)
            z = comp(x, stats)
            xr = decomp(z)
            loss = (x - xr).pow(2).mean()  # reconstruction MSE

            opt.zero_grad()
            loss.backward()
            opt.step()
            sched.step()  # per-optimizer-step schedule (warmup is step-based)

            total_loss += loss.item()

            if batch_idx % 100 == 0:
                print(f" Batch {batch_idx}/{len(dl)} - Loss: {loss.item():.6f}")

        avg_loss = total_loss / len(dl)
        print(f"Epoch {epoch}/{EPOCHS} — Average MSE: {avg_loss:.6f}")

        # Periodic checkpoint so long runs can resume after interruption.
        if epoch % 5 == 0:
            torch.save({
                'epoch': epoch,
                'compressor_state_dict': comp.state_dict(),
                'decompressor_state_dict': decomp.state_dict(),
                'optimizer_state_dict': opt.state_dict(),
                'loss': avg_loss,
            }, f'checkpoint_epoch_{epoch}.pth')

    torch.save(comp.state_dict(), 'compressor_final.pth')
    torch.save(decomp.state_dict(), 'decompressor_final.pth')
    print("Training completed! Models saved as compressor_final.pth and decompressor_final.pth")
|
|
| |
def load_and_test_models(compressor_path, decompressor_path, embeddings_path, device='cuda'):
    """
    Load trained models and test reconstruction quality.

    Args:
        compressor_path: Path to saved Compressor state dict.
        decompressor_path: Path to saved Decompressor state dict.
        embeddings_path: Directory of per-sequence .pt embedding files.
        device: Torch device string used for evaluation.

    Returns:
        Average per-sample reconstruction MSE (float).
    """
    print("Loading trained models...")
    comp = Compressor().to(device)
    decomp = Decompressor().to(device)

    # map_location lets CUDA-trained checkpoints load on CPU-only machines
    # (and vice versa) instead of raising a device deserialization error.
    comp.load_state_dict(torch.load(compressor_path, map_location=device))
    decomp.load_state_dict(torch.load(decompressor_path, map_location=device))

    comp.eval()
    decomp.eval()

    ds = PrecomputedEmbeddingDataset(embeddings_path)
    test_loader = DataLoader(ds, batch_size=16, shuffle=False)

    # Must be the same stats file written during training; Compressor.forward
    # moves the tensors to the right device itself.
    stats = torch.load('normalization_stats.pt', map_location='cpu')

    print("Testing reconstruction quality...")
    total_mse = 0
    total_samples = 0

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            x = batch.to(device)
            z = comp(x, stats)
            xr = decomp(z)
            mse = (x - xr).pow(2).mean()
            # Weight by batch size so the final average is truly per-sample
            # (the last batch may be smaller than 16).
            total_mse += mse.item() * len(x)
            total_samples += len(x)

    avg_mse = total_mse / total_samples
    print(f"Average reconstruction MSE: {avg_mse:.6f}")

    return avg_mse
|
|
| |
if __name__ == '__main__':
    import argparse

    # Command-line entry point: train by default, evaluate with --test.
    cli = argparse.ArgumentParser(description='Train protein compressor with pre-computed embeddings')
    cli.add_argument('--embeddings', type=str,
                     default='/data2/edwardsun/flow_project/compressor_dataset/peptide_embeddings.pt',
                     help='Path to pre-computed embeddings from final_sequence_encoder.py')
    cli.add_argument('--device', type=str, default='cuda', help='Device to use (cuda/cpu)')
    cli.add_argument('--test', action='store_true', help='Test existing models instead of training')
    args = cli.parse_args()

    # Fall back to CPU when CUDA is unavailable, regardless of the flag.
    device = torch.device(args.device if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    if args.test:
        # Evaluate previously trained weights on the given embeddings.
        load_and_test_models('compressor_final.pth', 'decompressor_final.pth', args.embeddings, device)
    else:
        # Default path: train the autoencoder from scratch.
        train_with_precomputed_embeddings(args.embeddings, device)