|
|
import types
from typing import List, Optional, Tuple

import torch
from torch import nn
from einops import rearrange

from utils.scheduler import SchedulerInterface, FlowMatchScheduler
from wan.modules.tokenizers import HuggingfaceTokenizer
from wan.modules.model import WanModel
from wan.modules.vae import _video_vae
from wan.modules.causal_model import CausalWanModel
|
|
|
|
|
|
|
|
class WanVAEWrapper(torch.nn.Module):
    """Frozen video VAE wrapper that encodes/decodes one sample at a time.

    The wrapper owns a ``_video_vae`` model (put in eval mode with gradients
    disabled) plus two 16-entry ``mean`` / ``std`` vectors.  On every call the
    vectors are moved to the input's device/dtype and handed to the VAE as
    ``[mean, 1 / std]``.
    """

    def __init__(self):
        super().__init__()
        # 16 values each, matching ``z_dim=16`` below -- presumably the
        # per-channel latent statistics the VAE uses for normalisation
        # (TODO confirm against ``wan.modules.vae``).
        mean = [
            -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
            0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
        ]
        std = [
            2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
            3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
        ]
        # Kept as plain tensor attributes (not buffers), so they do not follow
        # ``module.to(...)``; they are moved explicitly on each call instead.
        self.mean = torch.tensor(mean, dtype=torch.float32)
        self.std = torch.tensor(std, dtype=torch.float32)

        self.model = _video_vae(
            pretrained_path="skyreels_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
            z_dim=16,
        ).eval().requires_grad_(False)

    def _latent_scale(self, device, dtype):
        """Return ``[mean, 1 / std]`` on the requested device and dtype."""
        return [
            self.mean.to(device=device, dtype=dtype),
            1.0 / self.std.to(device=device, dtype=dtype),
        ]

    def encode_to_latent(self, pixel: torch.Tensor) -> torch.Tensor:
        """Encode a batch of inputs into latents.

        Args:
            pixel: batched input; it is iterated along dim 0 and every sample
                is passed to ``self.model.encode`` with a batch size of 1.

        Returns:
            The per-sample encodings, cast to float32 and stacked along dim 0.
        """
        scale = self._latent_scale(pixel.device, pixel.dtype)
        encoded = [
            self.model.encode(sample.unsqueeze(0), scale).float().squeeze(0)
            for sample in pixel
        ]
        return torch.stack(encoded, dim=0)

    def decode_to_pixel(self, latent: torch.Tensor, use_cache: bool = False) -> torch.Tensor:
        """Decode a batch of latents.

        Args:
            latent: batched latents in the same layout that
                ``encode_to_latent`` returns; iterated along dim 0.
            use_cache: when True, decode with ``self.model.cached_decode``
                instead of ``self.model.decode``.  Only a batch size of 1 is
                accepted in that mode.

        Returns:
            The per-sample decodings, cast to float32, clamped to ``[-1, 1]``
            and stacked along dim 0.

        Raises:
            AssertionError: if ``use_cache`` is True and the batch size is not 1.
        """
        if use_cache:
            assert latent.shape[0] == 1, "Batch size must be 1 when using cache"

        scale = self._latent_scale(latent.device, latent.dtype)
        decode_function = self.model.cached_decode if use_cache else self.model.decode

        # Iterate over ``latent`` itself: the previous loop referenced an
        # undefined name (``zs``) and raised NameError on every call.
        decoded = [
            decode_function(sample.unsqueeze(0), scale).float().clamp_(-1, 1).squeeze(0)
            for sample in latent
        ]
        return torch.stack(decoded, dim=0)
|
|
|
|
|
|
|
|
class WanDiffusionWrapper(torch.nn.Module):
    """Wraps a ``CausalWanModel`` together with a flow-matching scheduler.

    ``forward`` runs the model to obtain a flow prediction and additionally
    converts that prediction into an x0 estimate using the scheduler's sigma
    table.
    """

    def __init__(
        self,
        model_config="",
        timestep_shift=5.0,
        is_causal=True,
    ):
        """Build the model and scheduler.

        Args:
            model_config: passed to ``CausalWanModel.from_config``.
            timestep_shift: ``shift`` argument of ``FlowMatchScheduler``.
            is_causal: only controls ``uniform_timestep`` (set to
                ``not is_causal``); the model class is ``CausalWanModel``
                regardless of this flag.
        """
        super().__init__()
        print(model_config)
        self.model = CausalWanModel.from_config(model_config)
        self.model.eval()

        # When True, ``forward`` feeds only the first timestep column to the model.
        self.uniform_timestep = not is_causal

        self.scheduler = FlowMatchScheduler(
            shift=timestep_shift, sigma_min=0.0, extra_one_step=True
        )
        self.scheduler.set_timesteps(1000, training=True)

        # NOTE(review): meaning of the two factors is not visible here
        # (presumably frames x tokens-per-frame) -- confirm with the model code.
        self.seq_len = 15 * 880
        self.post_init()

    def enable_gradient_checkpointing(self) -> None:
        """Delegate to the wrapped model's ``enable_gradient_checkpointing``."""
        self.model.enable_gradient_checkpointing()

    @staticmethod
    def _lookup_sigma(scheduler, timestep: torch.Tensor, device) -> torch.Tensor:
        """Return sigma for each entry of ``timestep``, shaped ``[B, 1, 1, 1]``.

        Each requested timestep is matched to the nearest entry of
        ``scheduler.timesteps`` and the sigma at the same index is returned.
        The lookup runs in float64 on ``device``.
        """
        sigmas = scheduler.sigmas.double().to(device)
        timesteps = scheduler.timesteps.double().to(device)
        # Align devices so a timestep tensor living elsewhere does not break
        # the subtraction below.
        timestep = timestep.to(device)
        timestep_id = torch.argmin(
            (timesteps.unsqueeze(0) - timestep.unsqueeze(1)).abs(), dim=1)
        return sigmas[timestep_id].reshape(-1, 1, 1, 1)

    def _convert_flow_pred_to_x0(self, flow_pred: torch.Tensor, xt: torch.Tensor, timestep: torch.Tensor) -> torch.Tensor:
        """
        Convert flow matching's prediction to x0 prediction.
        flow_pred: the prediction with shape [B, C, H, W]
        xt: the input noisy data with shape [B, C, H, W]
        timestep: the timestep with shape [B]

        pred = noise - x0
        x_t = (1-sigma_t) * x0 + sigma_t * noise
        Substituting noise = pred + x0 gives x_t = x0 + sigma_t * pred,
        hence x0 = x_t - sigma_t * pred
        see derivations https://chatgpt.com/share/67bf8589-3d04-8008-bc6e-4cf1a24e2d0e

        The arithmetic is done in float64 and the result is cast back to the
        dtype of ``flow_pred``.
        """
        original_dtype = flow_pred.dtype
        device = flow_pred.device
        sigma_t = self._lookup_sigma(self.scheduler, timestep, device)
        x0_pred = xt.double().to(device) - sigma_t * flow_pred.double()
        return x0_pred.to(original_dtype)

    @staticmethod
    def _convert_x0_to_flow_pred(scheduler, x0_pred: torch.Tensor, xt: torch.Tensor, timestep: torch.Tensor) -> torch.Tensor:
        """
        Convert x0 prediction to flow matching's prediction.
        scheduler: object exposing ``sigmas`` and ``timesteps`` tensors
        x0_pred: the x0 prediction with shape [B, C, H, W]
        xt: the input noisy data with shape [B, C, H, W]
        timestep: the timestep with shape [B]

        pred = (x_t - x_0) / sigma_t

        The arithmetic is done in float64 and the result is cast back to the
        dtype of ``x0_pred``.  No guard is applied for sigma_t == 0.
        """
        original_dtype = x0_pred.dtype
        device = x0_pred.device
        sigma_t = WanDiffusionWrapper._lookup_sigma(scheduler, timestep, device)
        flow_pred = (xt.double().to(device) - x0_pred.double()) / sigma_t
        return flow_pred.to(original_dtype)

    def forward(
        self,
        noisy_image_or_video: torch.Tensor, conditional_dict: dict,
        timestep: torch.Tensor, kv_cache: Optional[List[dict]] = None, kv_cache_mouse: Optional[List[dict]] = None, kv_cache_keyboard: Optional[List[dict]] = None,
        crossattn_cache: Optional[List[dict]] = None,
        current_start: Optional[int] = None,
        cache_start: Optional[int] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the model and derive the x0 estimate.

        Args:
            noisy_image_or_video: 5-D tensor laid out as ``b c f h w`` with
                exactly 16 channels.
            conditional_dict: expanded as keyword arguments to the model.
            timestep: tensor indexed as ``[batch, frame]``; flattened over its
                first two dims for the x0 conversion.
            kv_cache, kv_cache_mouse, kv_cache_keyboard, crossattn_cache,
            current_start, cache_start: forwarded to the model only when
                ``kv_cache`` is not None; otherwise all of them are ignored.

        Returns:
            ``(flow_pred, pred_x0)``: the raw model output and the x0
            estimate, the latter laid out as ``b c f h w``.

        Raises:
            AssertionError: if the input does not have 16 channels.
        """
        assert noisy_image_or_video.shape[1] == 16

        input_timestep = timestep[:, 0] if self.uniform_timestep else timestep
        model_input = noisy_image_or_video.to(self.model.dtype)

        if kv_cache is not None:
            flow_pred = self.model(
                model_input,
                t=input_timestep, **conditional_dict,
                kv_cache=kv_cache,
                kv_cache_mouse=kv_cache_mouse, kv_cache_keyboard=kv_cache_keyboard,
                crossattn_cache=crossattn_cache,
                current_start=current_start,
                cache_start=cache_start
            )
        else:
            flow_pred = self.model(
                model_input,
                t=input_timestep, **conditional_dict)

        # Fold frames into the batch dim so every frame gets its own sigma.
        pred_x0 = self._convert_flow_pred_to_x0(
            flow_pred=rearrange(flow_pred, 'b c f h w -> (b f) c h w'),
            xt=rearrange(noisy_image_or_video, 'b c f h w -> (b f) c h w'),
            timestep=timestep.flatten(0, 1)
        )
        pred_x0 = rearrange(pred_x0, '(b f) c h w -> b c f h w', b=flow_pred.shape[0])

        return flow_pred, pred_x0

    def get_scheduler(self) -> "SchedulerInterface":
        """
        Update the current scheduler with the interface's static method

        ``convert_x0_to_noise``, ``convert_noise_to_x0`` and
        ``convert_velocity_to_x0`` from ``SchedulerInterface`` are bound onto
        ``self.scheduler`` as instance methods; the same scheduler object is
        returned.
        """
        scheduler = self.scheduler
        scheduler.convert_x0_to_noise = types.MethodType(
            SchedulerInterface.convert_x0_to_noise, scheduler)
        scheduler.convert_noise_to_x0 = types.MethodType(
            SchedulerInterface.convert_noise_to_x0, scheduler)
        scheduler.convert_velocity_to_x0 = types.MethodType(
            SchedulerInterface.convert_velocity_to_x0, scheduler)
        self.scheduler = scheduler
        return scheduler

    def post_init(self):
        """
        A few custom initialization steps that should be called after the object is created.
        Currently, the only one we have is to bind a few methods to scheduler.
        We can gradually add more methods here if needed.
        """
        self.get_scheduler()
|
|
|
|
|
|