From f9af32a6d1d05738eefc9ef2826e1048b23c51ed Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 9 Apr 2024 15:25:20 -0400 Subject: [PATCH] Fix the padding behavior when max-pooling regional IP-Adapter masks to mirror the downscaling behavior of SD and SDXL. Prior to this change, denoising with input latent dimensions that were not evenly divisible by 8 would raise an exception. --- .../backend/stable_diffusion/diffusion/regional_ip_data.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/invokeai/backend/stable_diffusion/diffusion/regional_ip_data.py b/invokeai/backend/stable_diffusion/diffusion/regional_ip_data.py index d3b4505f58..792c97114d 100644 --- a/invokeai/backend/stable_diffusion/diffusion/regional_ip_data.py +++ b/invokeai/backend/stable_diffusion/diffusion/regional_ip_data.py @@ -59,8 +59,11 @@ class RegionalIPData: if downscale_factor <= max_downscale_factor: # We use max pooling because we downscale to a pretty low resolution, so we don't want small mask # regions to be lost entirely. + # + # ceil_mode=True is set to mirror the downsampling behavior of SD and SDXL. + # # TODO(ryand): In the future, we may want to experiment with other downsampling methods. - mask_tensor = torch.nn.functional.max_pool2d(mask_tensor, kernel_size=2, stride=2) + mask_tensor = torch.nn.functional.max_pool2d(mask_tensor, kernel_size=2, stride=2, ceil_mode=True) return masks_by_seq_len