from PIL import Image
import torch
import numpy as np
from transformers import Qwen2_5_VLForConditionalGeneration
from diffusers import (
    QwenImagePipeline,
    QwenImageTransformer2DModel,
    QwenImageInpaintPipeline,
)
from optimum.quanto import quantize, qint8, freeze

prompt = (
    "equirectangular, a woman and a man sitting at a cafe, the woman has red hair "
    "and she's wearing purple sweater with a black scarf and a white hat, the man "
    "is sitting on the other side of the table and he's wearing a white shirt with "
    "a purple scarf and red hat, both of them are sipping their coffee while in the "
    "table there's some cake slices on their respective plates, each with forks and "
    "knives at each side."
)
negative_prompt = ""
output_filename = "qwen_int8.png"
width, height = 2048, 1024
true_cfg_scale = 4.0
num_inference_steps = 25
seed = 42
lora_model_id = "ProGamerGov/qwen-360-diffusion"
lora_filename = "qwen-360-diffusion-int8-bf16-v1.safetensors"

# Use the base fp16/bf16 model, not the nf4 variant
model_id = "Qwen/Qwen-Image"
torch_dtype = torch.bfloat16
device = "cuda"

fix_seam = True
inpaint_strength, seam_width = 0.5, 0.10


def shift_equirect(img):
    """Horizontal 50% shift using torch.roll."""
    t = torch.from_numpy(np.array(img)).permute(2, 0, 1).float() / 255.0
    t = torch.roll(t, shifts=(0, t.shape[2] // 2), dims=(1, 2))
    return Image.fromarray((t.permute(1, 2, 0).numpy() * 255).astype(np.uint8))


def create_seam_mask(w, h, frac=0.10):
    """Create vertical seam mask as PIL Image (center seam)."""
    mask = torch.zeros((h, w))
    seam_w = max(1, int(w * frac))
    c = w // 2
    mask[:, c - seam_w // 2:c + seam_w // 2] = 1.0
    return Image.fromarray((mask.numpy() * 255).astype("uint8"), "L")


def load_pipeline(text_encoder, transformer, mode="t2i"):
    pipe_class = QwenImagePipeline if mode == "t2i" else QwenImageInpaintPipeline
    pipe = pipe_class.from_pretrained(
        model_id,
        transformer=transformer,
        text_encoder=text_encoder,
        torch_dtype=torch_dtype,
        use_safetensors=True,
        low_cpu_mem_usage=True,
    )
    pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
    pipe.enable_model_cpu_offload()
    pipe.enable_vae_tiling()  # This still works with the quantized transformer
    return pipe


def main():
    # 1) Load and INT8-quantize transformer on CPU
    transformer = QwenImageTransformer2DModel.from_pretrained(
        model_id,
        subfolder="transformer",
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
    )
    quantize(transformer, weights=qint8)
    freeze(transformer)

    # 2) Load and INT8-quantize text encoder on CPU
    text_encoder = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        subfolder="text_encoder",
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        device_map={"": "cpu"},  # keep it on CPU; offload will move as needed
    )
    quantize(text_encoder, weights=qint8)
    freeze(text_encoder)

    # 3) Build T2I pipeline
    generator = torch.Generator(device=device).manual_seed(seed)
    pipe = load_pipeline(text_encoder, transformer, mode="t2i")

    # 4) First pass: base panorama
    image = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=width,
        height=height,
        num_inference_steps=num_inference_steps,
        true_cfg_scale=true_cfg_scale,
        generator=generator,
    ).images[0]
    image.save(output_filename)

    # 5) Optional seam-fix pass using inpainting
    if fix_seam:
        del pipe
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        shifted = shift_equirect(image)  # roll 50% to expose seam
        mask = create_seam_mask(width, height, frac=seam_width)
        pipe = load_pipeline(text_encoder, transformer, mode="i2i")
        image_fixed = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            image=shifted,
            mask_image=mask,
            strength=inpaint_strength,
            width=width,
            height=height,
            num_inference_steps=num_inference_steps,
            true_cfg_scale=true_cfg_scale,
            generator=generator,
        ).images[0]
        image_fixed = shift_equirect(image_fixed)
        image_fixed.save(output_filename.replace(".png", "_seamfix.png"))


if __name__ == "__main__":
    main()
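

# A minimal wrap-around sanity check, assuming the files saved above exist on
# disk. It compares the leftmost and rightmost pixel columns of the panorama;
# for a clean equirectangular wrap the mean absolute difference should drop
# after the seam-fix pass. `seam_error` is an illustrative helper sketched
# here, not part of the pipeline above.
def seam_error(path):
    arr = np.asarray(Image.open(path).convert("RGB")).astype(np.float32)
    return float(np.abs(arr[:, 0] - arr[:, -1]).mean())

# Example usage (after running main()):
# print("base :", seam_error("qwen_int8.png"))
# print("fixed:", seam_error("qwen_int8_seamfix.png"))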