diff --git a/invokeai/app/invocations/latent.py b/invokeai/app/invocations/latent.py
index d8250bc4c3..b6dd843a19 100644
--- a/invokeai/app/invocations/latent.py
+++ b/invokeai/app/invocations/latent.py
@@ -8,7 +8,8 @@ import numpy as np
 import torch
 import torchvision.transforms as T
 from diffusers.image_processor import VaeImageProcessor
-from diffusers.models import MultiAdapter, T2IAdapter, UNet2DConditionModel
+from diffusers.models import UNet2DConditionModel
+from diffusers.models.adapter import FullAdapterXL
 from diffusers.models.attention_processor import (
     AttnProcessor2_0,
     LoRAAttnProcessor2_0,
@@ -482,13 +483,28 @@ class DenoiseLatentsInvocation(BaseInvocation):
             )
             image = context.services.images.get_pil_image(t2i_adapter_field.image.image_name)
 
+            # The max_unet_downscale is the maximum amount that the UNet model downscales the latent image internally.
+            if t2i_adapter_field.t2i_adapter_model.base_model == BaseModelType.StableDiffusion1:
+                max_unet_downscale = 8
+            elif t2i_adapter_field.t2i_adapter_model.base_model == BaseModelType.StableDiffusionXL:
+                max_unet_downscale = 4
+            else:
+                raise ValueError(
+                    f"Unexpected T2I-Adapter base model type: '{t2i_adapter_field.t2i_adapter_model.base_model}'."
+                )
+
             t2i_adapter_data = []
             with t2i_adapter_model_info as t2i_adapter_model:
+                total_downscale_factor = t2i_adapter_model.total_downscale_factor
+                if isinstance(t2i_adapter_model.adapter, FullAdapterXL):
+                    # HACK(ryand): Work around a bug in FullAdapterXL. This will be fixed upstream in diffusers.
+                    total_downscale_factor = total_downscale_factor // 2
+
                 # Resize the T2I-Adapter input image.
-                # We select the resize dimensions so that after the T2I-Adapter's downsampling factor is applied, the result
-                # will match the latents_shape.
-                t2i_input_height = latents_shape[2] * t2i_adapter_model.total_downscale_factor
-                t2i_input_width = latents_shape[3] * t2i_adapter_model.total_downscale_factor
+                # We select the resize dimensions so that after the T2I-Adapter's total_downscale_factor is applied, the
+                # result will match the latent image's dimensions after max_unet_downscale is applied.
+                t2i_input_height = latents_shape[2] // max_unet_downscale * total_downscale_factor
+                t2i_input_width = latents_shape[3] // max_unet_downscale * total_downscale_factor
 
                 # Note: We have hard-coded `do_classifier_free_guidance=False`. This is because we only want to prepare
                 # a single image. If CFG is enabled, we will duplicate the resultant tensor after applying the
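For context, a quick standalone sketch of the resize math this patch introduces. The helper function and the concrete numbers below are illustrative only (they assume an SDXL pipeline at 1024x1024, where the VAE downscales pixels by 8 to give 128x128 latents, and assume the adapter's post-workaround total_downscale_factor is 16); none of this is part of the patch itself.

# Hypothetical sketch of the t2i_input_height/width computation above (not in the patch).
def t2i_input_dims(latent_h, latent_w, max_unet_downscale, total_downscale_factor):
    # Snap the latent dims down to a multiple of max_unet_downscale (the most the UNet
    # downscales internally), then scale back up by the adapter's total downscale factor
    # so the adapter's deepest feature map matches the UNet's most-downscaled resolution.
    return (
        latent_h // max_unet_downscale * total_downscale_factor,
        latent_w // max_unet_downscale * total_downscale_factor,
    )

# SDXL example: max_unet_downscale = 4 (per the patch); assume total_downscale_factor = 16
# after the FullAdapterXL halving workaround.
print(t2i_input_dims(128, 128, max_unet_downscale=4, total_downscale_factor=16))
# -> (512, 512)

The key change versus the removed code: the old `latents_shape[2] * total_downscale_factor` sized the adapter input so its output matched the full latent resolution, whereas the new `latents_shape[2] // max_unet_downscale * total_downscale_factor` sizes it so the adapter's feature maps line up with the UNet's internal (downscaled) resolutions.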