| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| """VAE model for WorldEngine frame encoding/decoding.""" |
|
|
from dataclasses import dataclass
from typing import List, Optional, Tuple

import torch
from torch import Tensor

from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.models.modeling_utils import ModelMixin

from .dcae import Decoder, Encoder
|
|
|
|
@dataclass
class EncoderDecoderConfig:
    """Config object for Encoder/Decoder initialization.

    Bundles the architecture hyperparameters consumed by the DCAE
    ``Encoder``/``Decoder`` constructors (see ``.dcae``). The same dataclass
    is used for both sides; ``ch_0``/``ch_max`` differ per side while the
    remaining fields are shared.
    """

    # (height, width) of the input sample in pixels.
    sample_size: Tuple[int, int]
    # Number of image channels (e.g. 3 for RGB).
    channels: int
    # Number of channels in the latent representation.
    latent_channels: int
    # Base channel width at the first stage.
    ch_0: int
    # Upper bound on channel width across stages.
    ch_max: int
    # Residual-block count per encoder stage; length = number of stages.
    encoder_blocks_per_stage: List[int]
    # Residual-block count per decoder stage; length = number of stages.
    decoder_blocks_per_stage: List[int]
    # Whether to insert a middle block between encoder and decoder stages.
    use_middle_block: bool
    # If True, the encoder skips producing a log-variance head.
    skip_logvar: bool = False
    # If True, skip residual connections — TODO confirm exact semantics in .dcae.
    skip_residuals: bool = False
    # If True, normalize the posterior mean — TODO confirm exact semantics in .dcae.
    normalize_mu: bool = False
|
|
|
|
class WorldEngineVAE(ModelMixin, ConfigMixin):
    """
    VAE for encoding/decoding video frames using DCAE architecture.

    Encodes RGB uint8 images to latent space and decodes latents back to RGB.
    """

    _supports_gradient_checkpointing = False

    @register_to_config
    def __init__(
        self,
        sample_size: Tuple[int, int] = (360, 640),
        channels: int = 3,
        latent_channels: int = 16,
        encoder_ch_0: int = 64,
        encoder_ch_max: int = 256,
        encoder_blocks_per_stage: Optional[List[int]] = None,
        decoder_ch_0: int = 128,
        decoder_ch_max: int = 1024,
        decoder_blocks_per_stage: Optional[List[int]] = None,
        use_middle_block: bool = False,
        skip_logvar: bool = False,
        scale_factor: float = 1.0,
        shift_factor: float = 0.0,
    ):
        """Build the encoder/decoder pair from the given hyperparameters.

        Args:
            sample_size: (height, width) of input frames in pixels.
            channels: Number of image channels (3 for RGB).
            latent_channels: Channel count of the latent representation.
            encoder_ch_0 / encoder_ch_max: Base and max channel widths
                for the encoder stages.
            encoder_blocks_per_stage: Blocks per encoder stage; defaults
                to one block per stage over four stages.
            decoder_ch_0 / decoder_ch_max: Base and max channel widths
                for the decoder stages.
            decoder_blocks_per_stage: Blocks per decoder stage; defaults
                to one block per stage over four stages.
            use_middle_block: Whether to insert a middle block.
            skip_logvar: If True, the encoder skips the log-variance head.
            scale_factor / shift_factor: Registered in the config.
                NOTE(review): they are not applied in encode()/decode()
                below — presumably callers apply them externally; confirm.
        """
        super().__init__()

        # Use None sentinels instead of mutable list defaults, which would
        # be shared across instances.
        if encoder_blocks_per_stage is None:
            encoder_blocks_per_stage = [1, 1, 1, 1]
        if decoder_blocks_per_stage is None:
            decoder_blocks_per_stage = [1, 1, 1, 1]

        # Encoder and decoder share the sample/latent geometry but have
        # independent base/max channel widths.
        encoder_config = EncoderDecoderConfig(
            sample_size=tuple(sample_size),
            channels=channels,
            latent_channels=latent_channels,
            ch_0=encoder_ch_0,
            ch_max=encoder_ch_max,
            encoder_blocks_per_stage=list(encoder_blocks_per_stage),
            decoder_blocks_per_stage=list(decoder_blocks_per_stage),
            use_middle_block=use_middle_block,
            skip_logvar=skip_logvar,
        )

        decoder_config = EncoderDecoderConfig(
            sample_size=tuple(sample_size),
            channels=channels,
            latent_channels=latent_channels,
            ch_0=decoder_ch_0,
            ch_max=decoder_ch_max,
            encoder_blocks_per_stage=list(encoder_blocks_per_stage),
            decoder_blocks_per_stage=list(decoder_blocks_per_stage),
            use_middle_block=use_middle_block,
            skip_logvar=skip_logvar,
        )

        self.encoder = Encoder(encoder_config)
        self.decoder = Decoder(decoder_config)

    def encode(self, img: Tensor) -> Tensor:
        """Encode a single ``[H, W, C]`` uint8 image to a latent tensor.

        The comment "RGB -> RGB+D -> latent" in the original suggests a
        depth channel may be involved downstream — the visible code only
        normalizes and forwards the input; confirm against the Encoder.

        Args:
            img: ``[H, W, C]`` image tensor with values in ``[0, 255]``.

        Returns:
            Whatever ``self.encoder`` produces for the normalized frame.

        Raises:
            ValueError: If ``img`` is not a 3-dimensional tensor.
        """
        # Raise instead of assert: asserts are stripped under `python -O`,
        # silently disabling input validation.
        if img.dim() != 3:
            raise ValueError(
                f"Expected [H, W, C] image tensor, got {img.dim()} dims"
            )
        img = img.unsqueeze(0).to(device=self.device, dtype=self.dtype)
        # [1, H, W, C] -> [1, C, H, W], then map [0, 255] -> [-1, 1].
        rgb = img.permute(0, 3, 1, 2).contiguous().div(255).mul(2).sub(1)
        return self.encoder(rgb)

    @torch.compile
    def decode(self, latent: Tensor) -> Tensor:
        """Decode a latent back to a ``[H, W, 3]`` uint8 RGB image.

        Args:
            latent: Latent tensor as produced by :meth:`encode`.

        Returns:
            ``[H, W, 3]`` uint8 tensor with values in ``[0, 255]``. The
            trailing ``[..., :3]`` slice drops any extra decoder output
            channels (presumably depth — confirm against the Decoder).
        """
        decoded = self.decoder(latent)
        # Map [-1, 1] -> [0, 1], then quantize to uint8.
        decoded = (decoded / 2 + 0.5).clamp(0, 1)
        decoded = (decoded * 255).round().to(torch.uint8)
        return decoded.squeeze(0).permute(1, 2, 0)[..., :3]

    def forward(self, x: Tensor, encode: bool = True) -> Tensor:
        """
        Forward pass - encode or decode based on flag.

        Args:
            x: Input tensor (image for encode, latent for decode)
            encode: If True, encode; if False, decode

        Returns:
            Encoded latent or decoded image
        """
        if encode:
            return self.encode(x)
        else:
            return self.decode(x)
|
|