Merge branch 'main' into feat/taesd

2024-08-30 20:32:17 +00:00 · 2023-08-31 20:12:00 -07:00
parent dff466244d a74e2108bb
commit bc1bce18b0
194 changed files with 6664 additions and 4050 deletions
--- a/invokeai/backend/image_util/seamless.py
+++ b/invokeai/backend/image_util/seamless.py
@ -20,7 +20,8 @@ def _conv_forward_asymmetric(self, input, weight, bias):

 def configure_model_padding(model, seamless, seamless_axes):
    """
-    Modifies the 2D convolution layers to use a circular padding mode based on the `seamless` and `seamless_axes` options.
+    Modifies the 2D convolution layers to use a circular padding mode based on
+    the `seamless` and `seamless_axes` options.
    """
    # TODO: get an explicit interface for this in diffusers: https://github.com/huggingface/diffusers/issues/556
    for m in model.modules():
--- a/invokeai/backend/install/migrate_to_3.py
+++ b/invokeai/backend/install/migrate_to_3.py
@ -492,10 +492,10 @@ def _parse_legacy_yamlfile(root: Path, initfile: Path) -> ModelPaths:
    loras = paths.get("lora_dir", "loras")
    controlnets = paths.get("controlnet_dir", "controlnets")
    return ModelPaths(
-        models=root / models,
-        embeddings=root / embeddings,
-        loras=root / loras,
-        controlnets=root / controlnets,
+        models=root / models if models else None,
+        embeddings=root / embeddings if embeddings else None,
+        loras=root / loras if loras else None,
+        controlnets=root / controlnets if controlnets else None,
    )


--- a/invokeai/backend/model_management/seamless.py
+++ b/invokeai/backend/model_management/seamless.py
@ -0,0 +1,102 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import List, Union
+
+import torch.nn as nn
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+
+
+def _conv_forward_asymmetric(self, input, weight, bias):
+    """
+    Patch for Conv2d._conv_forward that supports asymmetric padding
+
+    Pads `input` first with `self.asymmetric_padding["x"]` and then with `self.asymmetric_padding["y"]`,
+    each using its own mode from `self.asymmetric_padding_mode`, and then runs `conv2d` on the result.
+    Both attributes must have been set on the module beforehand (`set_seamless` does this).
+    """
+    working = nn.functional.pad(input, self.asymmetric_padding["x"], mode=self.asymmetric_padding_mode["x"])
+    working = nn.functional.pad(working, self.asymmetric_padding["y"], mode=self.asymmetric_padding_mode["y"])
+    return nn.functional.conv2d(
+        working,
+        weight,
+        bias,
+        self.stride,
+        # the input was already padded explicitly above, so the convolution itself adds no padding
+        nn.modules.utils._pair(0),
+        self.dilation,
+        self.groups,
+    )
+
+
+@contextmanager
+def set_seamless(model: Union[UNet2DConditionModel, AutoencoderKL], seamless_axes: List[str]):
+    """
+    Temporarily patch the model's 2D convolutions so that the axes in `seamless_axes` tile seamlessly.
+
+    Every `nn.Conv2d` / `nn.ConvTranspose2d` found via `model.named_modules()` gets its `_conv_forward`
+    replaced by `_conv_forward_asymmetric`, with "circular" padding on each axis listed in
+    `seamless_axes` and "constant" padding on the other. For a `UNet2DConditionModel`, modules under
+    `.attentions.` and the `.conv2` / `.conv_shortcut` layers under `.resnets.` are left unpatched.
+
+    On exit (normal or via an exception) the saved `_conv_forward` is put back and the helper
+    attributes are removed from every module that was patched.
+
+    :param model: UNet or VAE whose convolution layers should be patched.
+    :param seamless_axes: Axes to make seamless; only membership of "x" and "y" is checked.
+    """
+    # created before the `try` so the `finally` clause can always iterate it
+    to_restore = []
+    try:
+        for m_name, m in model.named_modules():
+            if isinstance(model, UNet2DConditionModel):
+                if ".attentions." in m_name:
+                    continue
+
+                if ".resnets." in m_name:
+                    if ".conv2" in m_name:
+                        continue
+                    if ".conv_shortcut" in m_name:
+                        continue
+
+            if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
+                m.asymmetric_padding_mode = {}
+                m.asymmetric_padding = {}
+                # entries 0-1 of the reversed padding go to the "x" pass, entries 2-3 to the "y" pass;
+                # the other pair is zeroed so each pass pads a single axis with its own mode
+                m.asymmetric_padding_mode["x"] = "circular" if ("x" in seamless_axes) else "constant"
+                m.asymmetric_padding["x"] = (
+                    m._reversed_padding_repeated_twice[0],
+                    m._reversed_padding_repeated_twice[1],
+                    0,
+                    0,
+                )
+                m.asymmetric_padding_mode["y"] = "circular" if ("y" in seamless_axes) else "constant"
+                m.asymmetric_padding["y"] = (
+                    0,
+                    0,
+                    m._reversed_padding_repeated_twice[2],
+                    m._reversed_padding_repeated_twice[3],
+                )
+
+                to_restore.append((m, m._conv_forward))
+                m._conv_forward = _conv_forward_asymmetric.__get__(m, nn.Conv2d)
+
+        yield
+
+    finally:
+        for module, orig_conv_forward in to_restore:
+            module._conv_forward = orig_conv_forward
+            # clean up the module that was actually patched, not the leftover loop variable `m`
+            if hasattr(module, "asymmetric_padding_mode"):
+                del module.asymmetric_padding_mode
+            if hasattr(module, "asymmetric_padding"):
+                del module.asymmetric_padding
--- a/invokeai/backend/stable_diffusion/diffusers_pipeline.py
+++ b/invokeai/backend/stable_diffusion/diffusers_pipeline.py
@ -144,7 +144,7 @@ def image_resized_to_grid_as_tensor(image: PIL.Image.Image, normalize: bool = Tr
    w, h = trim_to_multiple_of(*image.size, multiple_of=multiple_of)
    transformation = T.Compose(
        [
-            T.Resize((h, w), T.InterpolationMode.LANCZOS),
+            T.Resize((h, w), T.InterpolationMode.LANCZOS, antialias=True),
            T.ToTensor(),
        ]
    )
@ -358,6 +358,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
        callback: Callable[[PipelineIntermediateState], None] = None,
        control_data: List[ControlNetData] = None,
        mask: Optional[torch.Tensor] = None,
+        masked_latents: Optional[torch.Tensor] = None,
        seed: Optional[int] = None,
    ) -> tuple[torch.Tensor, Optional[AttentionMapSaver]]:
        if init_timestep.shape[0] == 0:
@ -376,28 +377,28 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
            latents = self.scheduler.add_noise(latents, noise, batched_t)

        if mask is not None:
+            # if no noise provided, noisify unmasked area based on seed(or 0 as fallback)
+            if noise is None:
+                noise = torch.randn(
+                    orig_latents.shape,
+                    dtype=torch.float32,
+                    device="cpu",
+                    generator=torch.Generator(device="cpu").manual_seed(seed or 0),
+                ).to(device=orig_latents.device, dtype=orig_latents.dtype)
+
+                latents = self.scheduler.add_noise(latents, noise, batched_t)
+                latents = torch.lerp(
+                    orig_latents, latents.to(dtype=orig_latents.dtype), mask.to(dtype=orig_latents.dtype)
+                )
+
            if is_inpainting_model(self.unet):
-                # You'd think the inpainting model wouldn't be paying attention to the area it is going to repaint
-                # (that's why there's a mask!) but it seems to really want that blanked out.
-                # masked_latents = latents * torch.where(mask < 0.5, 1, 0) TODO: inpaint/outpaint/infill
+                if masked_latents is None:
+                    raise Exception("Source image required for inpaint mask when inpaint model used!")

-                # TODO: we should probably pass this in so we don't have to try/finally around setting it.
-                self.invokeai_diffuser.model_forward_callback = AddsMaskLatents(self._unet_forward, mask, orig_latents)
+                self.invokeai_diffuser.model_forward_callback = AddsMaskLatents(
+                    self._unet_forward, mask, masked_latents
+                )
            else:
-                # if no noise provided, noisify unmasked area based on seed(or 0 as fallback)
-                if noise is None:
-                    noise = torch.randn(
-                        orig_latents.shape,
-                        dtype=torch.float32,
-                        device="cpu",
-                        generator=torch.Generator(device="cpu").manual_seed(seed or 0),
-                    ).to(device=orig_latents.device, dtype=orig_latents.dtype)
-
-                    latents = self.scheduler.add_noise(latents, noise, batched_t)
-                    latents = torch.lerp(
-                        orig_latents, latents.to(dtype=orig_latents.dtype), mask.to(dtype=orig_latents.dtype)
-                    )
-
                additional_guidance.append(AddsMaskGuidance(mask, orig_latents, self.scheduler, noise))

        try:
@ -557,12 +558,22 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
        # compute the previous noisy sample x_t -> x_t-1
        step_output = self.scheduler.step(noise_pred, timestep, latents, **conditioning_data.scheduler_args)

+        # TODO: issue to diffusers?
+        # undo the internal counter increment done by scheduler.step so the timestep resolves as before the call
+        # this is needed to be able to call scheduler.add_noise with the current timestep
+        if self.scheduler.order == 2:
+            self.scheduler._index_counter[timestep.item()] -= 1
+
        # TODO: this additional_guidance extension point feels redundant with InvokeAIDiffusionComponent.
        #    But the way things are now, scheduler runs _after_ that, so there was
        #    no way to use it to apply an operation that happens after the last scheduler.step.
        for guidance in additional_guidance:
            step_output = guidance(step_output, timestep, conditioning_data)

+        # restore internal counter
+        if self.scheduler.order == 2:
+            self.scheduler._index_counter[timestep.item()] += 1
+
        return step_output

    def _unet_forward(
--- a/invokeai/backend/stable_diffusion/diffusion/cross_attention_control.py
+++ b/invokeai/backend/stable_diffusion/diffusion/cross_attention_control.py
@ -265,7 +265,7 @@ class InvokeAICrossAttentionMixin:
        if q.shape[1] <= 4096:  # (512x512) max q.shape[1]: 4096
            return self.einsum_lowest_level(q, k, v, None, None, None)
        else:
-            slice_size = math.floor(2**30 / (q.shape[0] * q.shape[1]))
+            slice_size = math.floor(2 ** 30 / (q.shape[0] * q.shape[1]))
            return self.einsum_op_slice_dim1(q, k, v, slice_size)

    def einsum_op_mps_v2(self, q, k, v):
--- a/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py
+++ b/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py
@ -215,10 +215,7 @@ class InvokeAIDiffuserComponent:
                                dim=0,
                            ),
                        }
-                    (
-                        encoder_hidden_states,
-                        encoder_attention_mask,
-                    ) = self._concat_conditionings_for_batch(
+                    (encoder_hidden_states, encoder_attention_mask,) = self._concat_conditionings_for_batch(
                        conditioning_data.unconditioned_embeddings.embeds,
                        conditioning_data.text_embeddings.embeds,
                    )
@ -280,10 +277,7 @@ class InvokeAIDiffuserComponent:
        wants_cross_attention_control = len(cross_attention_control_types_to_do) > 0

        if wants_cross_attention_control:
-            (
-                unconditioned_next_x,
-                conditioned_next_x,
-            ) = self._apply_cross_attention_controlled_conditioning(
+            (unconditioned_next_x, conditioned_next_x,) = self._apply_cross_attention_controlled_conditioning(
                sample,
                timestep,
                conditioning_data,
@ -291,10 +285,7 @@ class InvokeAIDiffuserComponent:
                **kwargs,
            )
        elif self.sequential_guidance:
-            (
-                unconditioned_next_x,
-                conditioned_next_x,
-            ) = self._apply_standard_conditioning_sequentially(
+            (unconditioned_next_x, conditioned_next_x,) = self._apply_standard_conditioning_sequentially(
                sample,
                timestep,
                conditioning_data,
@ -302,10 +293,7 @@ class InvokeAIDiffuserComponent:
            )

        else:
-            (
-                unconditioned_next_x,
-                conditioned_next_x,
-            ) = self._apply_standard_conditioning(
+            (unconditioned_next_x, conditioned_next_x,) = self._apply_standard_conditioning(
                sample,
                timestep,
                conditioning_data,
--- a/invokeai/backend/stable_diffusion/image_degradation/bsrgan.py
+++ b/invokeai/backend/stable_diffusion/image_degradation/bsrgan.py
@ -395,7 +395,7 @@ def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
        D = np.diag(np.random.rand(3))
        U = orth(np.random.rand(3, 3))
        conv = np.dot(np.dot(np.transpose(U), D), U)
-        img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32)
+        img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
    img = np.clip(img, 0.0, 1.0)
    return img

@ -413,7 +413,7 @@ def add_speckle_noise(img, noise_level1=2, noise_level2=25):
        D = np.diag(np.random.rand(3))
        U = orth(np.random.rand(3, 3))
        conv = np.dot(np.dot(np.transpose(U), D), U)
-        img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32)
+        img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
    img = np.clip(img, 0.0, 1.0)
    return img

--- a/invokeai/backend/stable_diffusion/image_degradation/bsrgan_light.py
+++ b/invokeai/backend/stable_diffusion/image_degradation/bsrgan_light.py
@ -399,7 +399,7 @@ def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
        D = np.diag(np.random.rand(3))
        U = orth(np.random.rand(3, 3))
        conv = np.dot(np.dot(np.transpose(U), D), U)
-        img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32)
+        img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
    img = np.clip(img, 0.0, 1.0)
    return img

@ -417,7 +417,7 @@ def add_speckle_noise(img, noise_level1=2, noise_level2=25):
        D = np.diag(np.random.rand(3))
        U = orth(np.random.rand(3, 3))
        conv = np.dot(np.dot(np.transpose(U), D), U)
-        img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32)
+        img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
    img = np.clip(img, 0.0, 1.0)
    return img

--- a/invokeai/backend/stable_diffusion/image_degradation/utils_image.py
+++ b/invokeai/backend/stable_diffusion/image_degradation/utils_image.py
@ -562,14 +562,18 @@ def rgb2ycbcr(img, only_y=True):
    if only_y:
        rlt = np.dot(img, [65.481, 128.553, 24.966]) / 255.0 + 16.0
    else:
-        rlt = np.matmul(
-            img,
-            [
-                [65.481, -37.797, 112.0],
-                [128.553, -74.203, -93.786],
-                [24.966, 112.0, -18.214],
-            ],
-        ) / 255.0 + [16, 128, 128]
+        rlt = (
+            np.matmul(
+                img,
+                [
+                    [65.481, -37.797, 112.0],
+                    [128.553, -74.203, -93.786],
+                    [24.966, 112.0, -18.214],
+                ],
+            )
+            / 255.0
+            + [16, 128, 128]
+        )
    if in_img_type == np.uint8:
        rlt = rlt.round()
    else:
@ -588,14 +592,18 @@ def ycbcr2rgb(img):
    if in_img_type != np.uint8:
        img *= 255.0
    # convert
-    rlt = np.matmul(
-        img,
-        [
-            [0.00456621, 0.00456621, 0.00456621],
-            [0, -0.00153632, 0.00791071],
-            [0.00625893, -0.00318811, 0],
-        ],
-    ) * 255.0 + [-222.921, 135.576, -276.836]
+    rlt = (
+        np.matmul(
+            img,
+            [
+                [0.00456621, 0.00456621, 0.00456621],
+                [0, -0.00153632, 0.00791071],
+                [0.00625893, -0.00318811, 0],
+            ],
+        )
+        * 255.0
+        + [-222.921, 135.576, -276.836]
+    )
    if in_img_type == np.uint8:
        rlt = rlt.round()
    else:
@ -618,14 +626,18 @@ def bgr2ycbcr(img, only_y=True):
    if only_y:
        rlt = np.dot(img, [24.966, 128.553, 65.481]) / 255.0 + 16.0
    else:
-        rlt = np.matmul(
-            img,
-            [
-                [24.966, 112.0, -18.214],
-                [128.553, -74.203, -93.786],
-                [65.481, -37.797, 112.0],
-            ],
-        ) / 255.0 + [16, 128, 128]
+        rlt = (
+            np.matmul(
+                img,
+                [
+                    [24.966, 112.0, -18.214],
+                    [128.553, -74.203, -93.786],
+                    [65.481, -37.797, 112.0],
+                ],
+            )
+            / 255.0
+            + [16, 128, 128]
+        )
    if in_img_type == np.uint8:
        rlt = rlt.round()
    else:
@ -716,11 +728,11 @@ def ssim(img1, img2):

    mu1 = cv2.filter2D(img1, -1, window)[5:-5, 5:-5]  # valid
    mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5]
-    mu1_sq = mu1**2
-    mu2_sq = mu2**2
+    mu1_sq = mu1 ** 2
+    mu2_sq = mu2 ** 2
    mu1_mu2 = mu1 * mu2
-    sigma1_sq = cv2.filter2D(img1**2, -1, window)[5:-5, 5:-5] - mu1_sq
-    sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq
+    sigma1_sq = cv2.filter2D(img1 ** 2, -1, window)[5:-5, 5:-5] - mu1_sq
+    sigma2_sq = cv2.filter2D(img2 ** 2, -1, window)[5:-5, 5:-5] - mu2_sq
    sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2

    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))
@ -737,8 +749,8 @@ def ssim(img1, img2):
 # matlab 'imresize' function, now only support 'bicubic'
 def cubic(x):
    absx = torch.abs(x)
-    absx2 = absx**2
-    absx3 = absx**3
+    absx2 = absx ** 2
+    absx3 = absx ** 3
    return (1.5 * absx3 - 2.5 * absx2 + 1) * ((absx <= 1).type_as(absx)) + (
        -0.5 * absx3 + 2.5 * absx2 - 4 * absx + 2
    ) * (((absx > 1) * (absx <= 2)).type_as(absx))
--- a/invokeai/backend/training/textual_inversion_training.py
+++ b/invokeai/backend/training/textual_inversion_training.py
@ -475,10 +475,7 @@ class TextualInversionDataset(Dataset):

        if self.center_crop:
            crop = min(img.shape[0], img.shape[1])
-            (
-                h,
-                w,
-            ) = (
+            (h, w,) = (
                img.shape[0],
                img.shape[1],
            )
--- a/invokeai/backend/util/hotfixes.py
+++ b/invokeai/backend/util/hotfixes.py
@ -1,11 +1,11 @@
 from typing import Any, Dict, List, Optional, Tuple, Union

+import diffusers
 import torch
-from torch import nn
-
 from diffusers.configuration_utils import ConfigMixin, register_to_config
 from diffusers.loaders import FromOriginalControlnetMixin
 from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor
+from diffusers.models.controlnet import ControlNetConditioningEmbedding, ControlNetOutput, zero_module
 from diffusers.models.embeddings import (
    TextImageProjection,
    TextImageTimeEmbedding,
@ -14,16 +14,9 @@ from diffusers.models.embeddings import (
    Timesteps,
 )
 from diffusers.models.modeling_utils import ModelMixin
-from diffusers.models.unet_2d_blocks import (
-    CrossAttnDownBlock2D,
-    DownBlock2D,
-    UNetMidBlock2DCrossAttn,
-    get_down_block,
-)
+from diffusers.models.unet_2d_blocks import CrossAttnDownBlock2D, DownBlock2D, UNetMidBlock2DCrossAttn, get_down_block
 from diffusers.models.unet_2d_condition import UNet2DConditionModel
-
-import diffusers
-from diffusers.models.controlnet import ControlNetConditioningEmbedding, ControlNetOutput, zero_module
+from torch import nn

 from invokeai.backend.util.logging import InvokeAILogger

@ -45,7 +38,8 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
            Whether to flip the sin to cos in the time embedding.
        freq_shift (`int`, defaults to 0):
            The frequency shift to apply to the time embedding.
-        down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+        down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", \
+            "CrossAttnDownBlock2D", "DownBlock2D")`):
            The tuple of downsample blocks to use.
        only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
        block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
@ -147,7 +141,9 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
        # If `num_attention_heads` is not defined (which is the case for most models)
        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
        # The reason for this behavior is to correct for incorrectly named variables that were introduced
-        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # when this library was created...
+        # The incorrect naming was only discovered much ...
+        # later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
        # which is why we correct for the naming here.
        num_attention_heads = num_attention_heads or attention_head_dim
@ -155,17 +151,20 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
        # Check inputs
        if len(block_out_channels) != len(down_block_types):
            raise ValueError(
-                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. \
+                    `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
            )

        if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
            raise ValueError(
-                f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
+                f"Must provide the same number of `only_cross_attention` as `down_block_types`. \
+                    `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
            )

        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
            raise ValueError(
-                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. \
+                    `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
            )

        if isinstance(transformer_layers_per_block, int):
@ -202,7 +201,8 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
            self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
        elif encoder_hid_dim_type == "text_image_proj":
            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
-            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension ...
+            # for the currently only use
            # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)`
            self.encoder_hid_proj = TextImageProjection(
                text_embed_dim=encoder_hid_dim,
@ -250,8 +250,10 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
                text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
            )
        elif addition_embed_type == "text_image":
-            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
-            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`.
+            # To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension...
+            # for the currently only use
            # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)`
            self.add_embedding = TextImageTimeEmbedding(
                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
@ -673,12 +675,14 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
            elif self.config.addition_embed_type == "text_time":
                if "text_embeds" not in added_cond_kwargs:
                    raise ValueError(
-                        f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                        f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which \
+                            requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
                    )
                text_embeds = added_cond_kwargs.get("text_embeds")
                if "time_ids" not in added_cond_kwargs:
                    raise ValueError(
-                        f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                        f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which \
+                            requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
                    )
                time_ids = added_cond_kwargs.get("time_ids")
                time_embeds = self.add_time_proj(time_ids.flatten())
@ -761,3 +765,64 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):

 diffusers.ControlNetModel = ControlNetModel
 diffusers.models.controlnet.ControlNetModel = ControlNetModel
+
+
+# Patch LoRACompatibleConv to use the original Conv2D forward function.
+# This is needed to make the seamless patch work.
+# NOTE: with this patch, torch.compile crashes on torch 2.0 (already fixed in nightly)
+# https://github.com/huggingface/diffusers/pull/4315
+# https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/lora.py#L96C18-L96C18
+def new_LoRACompatibleConv_forward(self, x):
+    """
+    Replacement for `LoRACompatibleConv.forward` that delegates to the parent class's `forward`.
+
+    Returns the parent's output for `x`, plus `self.lora_layer(x)` when a LoRA layer is attached.
+    NOTE(review): presumably the parent `forward` dispatches through `_conv_forward`, which is the
+    method the seamless patch overrides - confirm against the torch version in use.
+    """
+    if self.lora_layer is None:
+        return super(diffusers.models.lora.LoRACompatibleConv, self).forward(x)
+    else:
+        return super(diffusers.models.lora.LoRACompatibleConv, self).forward(x) + self.lora_layer(x)
+
+
+diffusers.models.lora.LoRACompatibleConv.forward = new_LoRACompatibleConv_forward
+
+try:
+    import xformers
+
+    xformers_available = True
+except Exception:
+    xformers_available = False
+
+
+if xformers_available:
+    # TODO: remove when fixed in diffusers
+    _xformers_memory_efficient_attention = xformers.ops.memory_efficient_attention
+
+    def new_memory_efficient_attention(
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attn_bias=None,
+        p: float = 0.0,
+        scale: Optional[float] = None,
+        *,
+        op=None,
+    ):
+        """
+        Wrapper around the original `xformers.ops.memory_efficient_attention`.
+
+        When `attn_bias` is a plain `torch.Tensor`, it is copied into a zero-filled buffer whose last
+        dimension is rounded up to a multiple of 8, and a view of that buffer cut back to the original
+        size is passed on. All other arguments are forwarded unchanged, and the wrapped function's
+        result is returned.
+        """
+        # diffusers does not align the bias shape to 8, which is required by xformers
+        # the exact type check skips non-tensor bias objects (and tensor subclasses)
+        if attn_bias is not None and type(attn_bias) is torch.Tensor:
+            orig_size = attn_bias.shape[-1]
+            # round the last dimension up to the next multiple of 8
+            new_size = ((orig_size + 7) // 8) * 8
+            # assumes a 3-D bias: only shape[0], shape[1] and shape[-1] are used - TODO confirm
+            aligned_attn_bias = torch.zeros(
+                (attn_bias.shape[0], attn_bias.shape[1], new_size),
+                device=attn_bias.device,
+                dtype=attn_bias.dtype,
+            )
+            aligned_attn_bias[:, :, :orig_size] = attn_bias
+            # slice back to the original size: same logical shape, but a view over the padded buffer
+            attn_bias = aligned_attn_bias[:, :, :orig_size]
+
+        return _xformers_memory_efficient_attention(
+            query=query,
+            key=key,
+            value=value,
+            attn_bias=attn_bias,
+            p=p,
+            scale=scale,
+            op=op,
+        )
+
+    xformers.ops.memory_efficient_attention = new_memory_efficient_attention
--- a/invokeai/backend/util/mps_fixes.py
+++ b/invokeai/backend/util/mps_fixes.py
@ -203,7 +203,7 @@ class ChunkedSlicedAttnProcessor:
        if attn.upcast_attention:
            out_item_size = 4

-        chunk_size = 2**29
+        chunk_size = 2 ** 29

        out_size = query.shape[1] * key.shape[1] * out_item_size
        chunks_count = min(query.shape[1], math.ceil((out_size - 1) / chunk_size))
--- a/invokeai/backend/util/util.py
+++ b/invokeai/backend/util/util.py
@ -207,7 +207,7 @@ def parallel_data_prefetch(
        return gather_res


-def rand_perlin_2d(shape, res, device, fade=lambda t: 6 * t**5 - 15 * t**4 + 10 * t**3):
+def rand_perlin_2d(shape, res, device, fade=lambda t: 6 * t ** 5 - 15 * t ** 4 + 10 * t ** 3):
    delta = (res[0] / shape[0], res[1] / shape[1])
    d = (shape[0] // res[0], shape[1] // res[1])