mirror of
https://github.com/invoke-ai/InvokeAI
synced 2024-08-30 20:32:17 +00:00
Merge branch 'main' into feat/taesd
This commit is contained in:
@ -20,7 +20,8 @@ def _conv_forward_asymmetric(self, input, weight, bias):
|
||||
|
||||
def configure_model_padding(model, seamless, seamless_axes):
|
||||
"""
|
||||
Modifies the 2D convolution layers to use a circular padding mode based on the `seamless` and `seamless_axes` options.
|
||||
Modifies the 2D convolution layers to use a circular padding mode based on
|
||||
the `seamless` and `seamless_axes` options.
|
||||
"""
|
||||
# TODO: get an explicit interface for this in diffusers: https://github.com/huggingface/diffusers/issues/556
|
||||
for m in model.modules():
|
||||
|
@ -492,10 +492,10 @@ def _parse_legacy_yamlfile(root: Path, initfile: Path) -> ModelPaths:
|
||||
loras = paths.get("lora_dir", "loras")
|
||||
controlnets = paths.get("controlnet_dir", "controlnets")
|
||||
return ModelPaths(
|
||||
models=root / models,
|
||||
embeddings=root / embeddings,
|
||||
loras=root / loras,
|
||||
controlnets=root / controlnets,
|
||||
models=root / models if models else None,
|
||||
embeddings=root / embeddings if embeddings else None,
|
||||
loras=root / loras if loras else None,
|
||||
controlnets=root / controlnets if controlnets else None,
|
||||
)
|
||||
|
||||
|
||||
|
102
invokeai/backend/model_management/seamless.py
Normal file
102
invokeai/backend/model_management/seamless.py
Normal file
@ -0,0 +1,102 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from contextlib import contextmanager
|
||||
from typing import List, Union
|
||||
|
||||
import torch.nn as nn
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
|
||||
|
||||
def _conv_forward_asymmetric(self, input, weight, bias):
|
||||
"""
|
||||
Patch for Conv2d._conv_forward that supports asymmetric padding
|
||||
"""
|
||||
working = nn.functional.pad(input, self.asymmetric_padding["x"], mode=self.asymmetric_padding_mode["x"])
|
||||
working = nn.functional.pad(working, self.asymmetric_padding["y"], mode=self.asymmetric_padding_mode["y"])
|
||||
return nn.functional.conv2d(
|
||||
working,
|
||||
weight,
|
||||
bias,
|
||||
self.stride,
|
||||
nn.modules.utils._pair(0),
|
||||
self.dilation,
|
||||
self.groups,
|
||||
)
|
||||
|
||||
|
||||
@contextmanager
def set_seamless(model: Union[UNet2DConditionModel, AutoencoderKL], seamless_axes: List[str]):
    """
    Context manager that temporarily patches the convolutions of `model` to pad
    circularly along the requested axes.

    While the context is active, every `nn.Conv2d` / `nn.ConvTranspose2d` found via
    `model.named_modules()` gets:
      * `asymmetric_padding` / `asymmetric_padding_mode` dicts keyed by "x" and "y",
        with mode "circular" for an axis listed in `seamless_axes`, else "constant";
      * its `_conv_forward` replaced by `_conv_forward_asymmetric`.

    For a `UNet2DConditionModel`, modules whose name contains ".attentions.", and
    resnet modules whose name contains ".conv2" or ".conv_shortcut", are left alone.

    On exit (including on error) the original `_conv_forward` is restored and the
    two helper attributes are removed from every module that was patched.

    :param model: the model whose convolution layers are patched in place
    :param seamless_axes: axes to make seamless; "x" and/or "y"
    """
    # (module, original _conv_forward) pairs. Created before the try so the
    # finally clause can always iterate it.
    to_restore = []

    # The model type cannot change during iteration, so test it once.
    is_unet = isinstance(model, UNet2DConditionModel)

    try:
        for m_name, m in model.named_modules():
            if is_unet:
                if ".attentions." in m_name:
                    continue

                if ".resnets." in m_name:
                    if ".conv2" in m_name:
                        continue
                    if ".conv_shortcut" in m_name:
                        continue

            if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
                m.asymmetric_padding_mode = {}
                m.asymmetric_padding = {}

                # Entries 0/1 of _reversed_padding_repeated_twice go to the "x" pad,
                # entries 2/3 to the "y" pad; the other axis is padded by 0 in each.
                m.asymmetric_padding_mode["x"] = "circular" if ("x" in seamless_axes) else "constant"
                m.asymmetric_padding["x"] = (
                    m._reversed_padding_repeated_twice[0],
                    m._reversed_padding_repeated_twice[1],
                    0,
                    0,
                )
                m.asymmetric_padding_mode["y"] = "circular" if ("y" in seamless_axes) else "constant"
                m.asymmetric_padding["y"] = (
                    0,
                    0,
                    m._reversed_padding_repeated_twice[2],
                    m._reversed_padding_repeated_twice[3],
                )

                to_restore.append((m, m._conv_forward))
                m._conv_forward = _conv_forward_asymmetric.__get__(m, nn.Conv2d)

        yield

    finally:
        for module, orig_conv_forward in to_restore:
            module._conv_forward = orig_conv_forward
            # Clean up the module being restored. The loop variable `m` from the
            # patching loop would only ever refer to the last module visited.
            if hasattr(module, "asymmetric_padding_mode"):
                del module.asymmetric_padding_mode
            if hasattr(module, "asymmetric_padding"):
                del module.asymmetric_padding
|
@ -144,7 +144,7 @@ def image_resized_to_grid_as_tensor(image: PIL.Image.Image, normalize: bool = Tr
|
||||
w, h = trim_to_multiple_of(*image.size, multiple_of=multiple_of)
|
||||
transformation = T.Compose(
|
||||
[
|
||||
T.Resize((h, w), T.InterpolationMode.LANCZOS),
|
||||
T.Resize((h, w), T.InterpolationMode.LANCZOS, antialias=True),
|
||||
T.ToTensor(),
|
||||
]
|
||||
)
|
||||
@ -358,6 +358,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
|
||||
callback: Callable[[PipelineIntermediateState], None] = None,
|
||||
control_data: List[ControlNetData] = None,
|
||||
mask: Optional[torch.Tensor] = None,
|
||||
masked_latents: Optional[torch.Tensor] = None,
|
||||
seed: Optional[int] = None,
|
||||
) -> tuple[torch.Tensor, Optional[AttentionMapSaver]]:
|
||||
if init_timestep.shape[0] == 0:
|
||||
@ -376,28 +377,28 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
|
||||
latents = self.scheduler.add_noise(latents, noise, batched_t)
|
||||
|
||||
if mask is not None:
|
||||
# if no noise provided, noisify unmasked area based on seed(or 0 as fallback)
|
||||
if noise is None:
|
||||
noise = torch.randn(
|
||||
orig_latents.shape,
|
||||
dtype=torch.float32,
|
||||
device="cpu",
|
||||
generator=torch.Generator(device="cpu").manual_seed(seed or 0),
|
||||
).to(device=orig_latents.device, dtype=orig_latents.dtype)
|
||||
|
||||
latents = self.scheduler.add_noise(latents, noise, batched_t)
|
||||
latents = torch.lerp(
|
||||
orig_latents, latents.to(dtype=orig_latents.dtype), mask.to(dtype=orig_latents.dtype)
|
||||
)
|
||||
|
||||
if is_inpainting_model(self.unet):
|
||||
# You'd think the inpainting model wouldn't be paying attention to the area it is going to repaint
|
||||
# (that's why there's a mask!) but it seems to really want that blanked out.
|
||||
# masked_latents = latents * torch.where(mask < 0.5, 1, 0) TODO: inpaint/outpaint/infill
|
||||
if masked_latents is None:
|
||||
raise Exception("Source image required for inpaint mask when inpaint model used!")
|
||||
|
||||
# TODO: we should probably pass this in so we don't have to try/finally around setting it.
|
||||
self.invokeai_diffuser.model_forward_callback = AddsMaskLatents(self._unet_forward, mask, orig_latents)
|
||||
self.invokeai_diffuser.model_forward_callback = AddsMaskLatents(
|
||||
self._unet_forward, mask, masked_latents
|
||||
)
|
||||
else:
|
||||
# if no noise provided, noisify unmasked area based on seed(or 0 as fallback)
|
||||
if noise is None:
|
||||
noise = torch.randn(
|
||||
orig_latents.shape,
|
||||
dtype=torch.float32,
|
||||
device="cpu",
|
||||
generator=torch.Generator(device="cpu").manual_seed(seed or 0),
|
||||
).to(device=orig_latents.device, dtype=orig_latents.dtype)
|
||||
|
||||
latents = self.scheduler.add_noise(latents, noise, batched_t)
|
||||
latents = torch.lerp(
|
||||
orig_latents, latents.to(dtype=orig_latents.dtype), mask.to(dtype=orig_latents.dtype)
|
||||
)
|
||||
|
||||
additional_guidance.append(AddsMaskGuidance(mask, orig_latents, self.scheduler, noise))
|
||||
|
||||
try:
|
||||
@ -557,12 +558,22 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
step_output = self.scheduler.step(noise_pred, timestep, latents, **conditioning_data.scheduler_args)
|
||||
|
||||
# TODO: issue to diffusers?
|
||||
# undo internal counter increment done by scheduler.step, so timestep can be resolved as before call
|
||||
# this needed to be able call scheduler.add_noise with current timestep
|
||||
if self.scheduler.order == 2:
|
||||
self.scheduler._index_counter[timestep.item()] -= 1
|
||||
|
||||
# TODO: this additional_guidance extension point feels redundant with InvokeAIDiffusionComponent.
|
||||
# But the way things are now, scheduler runs _after_ that, so there was
|
||||
# no way to use it to apply an operation that happens after the last scheduler.step.
|
||||
for guidance in additional_guidance:
|
||||
step_output = guidance(step_output, timestep, conditioning_data)
|
||||
|
||||
# restore internal counter
|
||||
if self.scheduler.order == 2:
|
||||
self.scheduler._index_counter[timestep.item()] += 1
|
||||
|
||||
return step_output
|
||||
|
||||
def _unet_forward(
|
||||
|
@ -265,7 +265,7 @@ class InvokeAICrossAttentionMixin:
|
||||
if q.shape[1] <= 4096: # (512x512) max q.shape[1]: 4096
|
||||
return self.einsum_lowest_level(q, k, v, None, None, None)
|
||||
else:
|
||||
slice_size = math.floor(2**30 / (q.shape[0] * q.shape[1]))
|
||||
slice_size = math.floor(2 ** 30 / (q.shape[0] * q.shape[1]))
|
||||
return self.einsum_op_slice_dim1(q, k, v, slice_size)
|
||||
|
||||
def einsum_op_mps_v2(self, q, k, v):
|
||||
|
@ -215,10 +215,7 @@ class InvokeAIDiffuserComponent:
|
||||
dim=0,
|
||||
),
|
||||
}
|
||||
(
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
) = self._concat_conditionings_for_batch(
|
||||
(encoder_hidden_states, encoder_attention_mask,) = self._concat_conditionings_for_batch(
|
||||
conditioning_data.unconditioned_embeddings.embeds,
|
||||
conditioning_data.text_embeddings.embeds,
|
||||
)
|
||||
@ -280,10 +277,7 @@ class InvokeAIDiffuserComponent:
|
||||
wants_cross_attention_control = len(cross_attention_control_types_to_do) > 0
|
||||
|
||||
if wants_cross_attention_control:
|
||||
(
|
||||
unconditioned_next_x,
|
||||
conditioned_next_x,
|
||||
) = self._apply_cross_attention_controlled_conditioning(
|
||||
(unconditioned_next_x, conditioned_next_x,) = self._apply_cross_attention_controlled_conditioning(
|
||||
sample,
|
||||
timestep,
|
||||
conditioning_data,
|
||||
@ -291,10 +285,7 @@ class InvokeAIDiffuserComponent:
|
||||
**kwargs,
|
||||
)
|
||||
elif self.sequential_guidance:
|
||||
(
|
||||
unconditioned_next_x,
|
||||
conditioned_next_x,
|
||||
) = self._apply_standard_conditioning_sequentially(
|
||||
(unconditioned_next_x, conditioned_next_x,) = self._apply_standard_conditioning_sequentially(
|
||||
sample,
|
||||
timestep,
|
||||
conditioning_data,
|
||||
@ -302,10 +293,7 @@ class InvokeAIDiffuserComponent:
|
||||
)
|
||||
|
||||
else:
|
||||
(
|
||||
unconditioned_next_x,
|
||||
conditioned_next_x,
|
||||
) = self._apply_standard_conditioning(
|
||||
(unconditioned_next_x, conditioned_next_x,) = self._apply_standard_conditioning(
|
||||
sample,
|
||||
timestep,
|
||||
conditioning_data,
|
||||
|
@ -395,7 +395,7 @@ def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
|
||||
D = np.diag(np.random.rand(3))
|
||||
U = orth(np.random.rand(3, 3))
|
||||
conv = np.dot(np.dot(np.transpose(U), D), U)
|
||||
img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32)
|
||||
img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
return img
|
||||
|
||||
@ -413,7 +413,7 @@ def add_speckle_noise(img, noise_level1=2, noise_level2=25):
|
||||
D = np.diag(np.random.rand(3))
|
||||
U = orth(np.random.rand(3, 3))
|
||||
conv = np.dot(np.dot(np.transpose(U), D), U)
|
||||
img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32)
|
||||
img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
return img
|
||||
|
||||
|
@ -399,7 +399,7 @@ def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
|
||||
D = np.diag(np.random.rand(3))
|
||||
U = orth(np.random.rand(3, 3))
|
||||
conv = np.dot(np.dot(np.transpose(U), D), U)
|
||||
img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32)
|
||||
img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
return img
|
||||
|
||||
@ -417,7 +417,7 @@ def add_speckle_noise(img, noise_level1=2, noise_level2=25):
|
||||
D = np.diag(np.random.rand(3))
|
||||
U = orth(np.random.rand(3, 3))
|
||||
conv = np.dot(np.dot(np.transpose(U), D), U)
|
||||
img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32)
|
||||
img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
return img
|
||||
|
||||
|
@ -562,14 +562,18 @@ def rgb2ycbcr(img, only_y=True):
|
||||
if only_y:
|
||||
rlt = np.dot(img, [65.481, 128.553, 24.966]) / 255.0 + 16.0
|
||||
else:
|
||||
rlt = np.matmul(
|
||||
img,
|
||||
[
|
||||
[65.481, -37.797, 112.0],
|
||||
[128.553, -74.203, -93.786],
|
||||
[24.966, 112.0, -18.214],
|
||||
],
|
||||
) / 255.0 + [16, 128, 128]
|
||||
rlt = (
|
||||
np.matmul(
|
||||
img,
|
||||
[
|
||||
[65.481, -37.797, 112.0],
|
||||
[128.553, -74.203, -93.786],
|
||||
[24.966, 112.0, -18.214],
|
||||
],
|
||||
)
|
||||
/ 255.0
|
||||
+ [16, 128, 128]
|
||||
)
|
||||
if in_img_type == np.uint8:
|
||||
rlt = rlt.round()
|
||||
else:
|
||||
@ -588,14 +592,18 @@ def ycbcr2rgb(img):
|
||||
if in_img_type != np.uint8:
|
||||
img *= 255.0
|
||||
# convert
|
||||
rlt = np.matmul(
|
||||
img,
|
||||
[
|
||||
[0.00456621, 0.00456621, 0.00456621],
|
||||
[0, -0.00153632, 0.00791071],
|
||||
[0.00625893, -0.00318811, 0],
|
||||
],
|
||||
) * 255.0 + [-222.921, 135.576, -276.836]
|
||||
rlt = (
|
||||
np.matmul(
|
||||
img,
|
||||
[
|
||||
[0.00456621, 0.00456621, 0.00456621],
|
||||
[0, -0.00153632, 0.00791071],
|
||||
[0.00625893, -0.00318811, 0],
|
||||
],
|
||||
)
|
||||
* 255.0
|
||||
+ [-222.921, 135.576, -276.836]
|
||||
)
|
||||
if in_img_type == np.uint8:
|
||||
rlt = rlt.round()
|
||||
else:
|
||||
@ -618,14 +626,18 @@ def bgr2ycbcr(img, only_y=True):
|
||||
if only_y:
|
||||
rlt = np.dot(img, [24.966, 128.553, 65.481]) / 255.0 + 16.0
|
||||
else:
|
||||
rlt = np.matmul(
|
||||
img,
|
||||
[
|
||||
[24.966, 112.0, -18.214],
|
||||
[128.553, -74.203, -93.786],
|
||||
[65.481, -37.797, 112.0],
|
||||
],
|
||||
) / 255.0 + [16, 128, 128]
|
||||
rlt = (
|
||||
np.matmul(
|
||||
img,
|
||||
[
|
||||
[24.966, 112.0, -18.214],
|
||||
[128.553, -74.203, -93.786],
|
||||
[65.481, -37.797, 112.0],
|
||||
],
|
||||
)
|
||||
/ 255.0
|
||||
+ [16, 128, 128]
|
||||
)
|
||||
if in_img_type == np.uint8:
|
||||
rlt = rlt.round()
|
||||
else:
|
||||
@ -716,11 +728,11 @@ def ssim(img1, img2):
|
||||
|
||||
mu1 = cv2.filter2D(img1, -1, window)[5:-5, 5:-5] # valid
|
||||
mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5]
|
||||
mu1_sq = mu1**2
|
||||
mu2_sq = mu2**2
|
||||
mu1_sq = mu1 ** 2
|
||||
mu2_sq = mu2 ** 2
|
||||
mu1_mu2 = mu1 * mu2
|
||||
sigma1_sq = cv2.filter2D(img1**2, -1, window)[5:-5, 5:-5] - mu1_sq
|
||||
sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq
|
||||
sigma1_sq = cv2.filter2D(img1 ** 2, -1, window)[5:-5, 5:-5] - mu1_sq
|
||||
sigma2_sq = cv2.filter2D(img2 ** 2, -1, window)[5:-5, 5:-5] - mu2_sq
|
||||
sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2
|
||||
|
||||
ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))
|
||||
@ -737,8 +749,8 @@ def ssim(img1, img2):
|
||||
# matlab 'imresize' function, now only support 'bicubic'
|
||||
def cubic(x):
|
||||
absx = torch.abs(x)
|
||||
absx2 = absx**2
|
||||
absx3 = absx**3
|
||||
absx2 = absx ** 2
|
||||
absx3 = absx ** 3
|
||||
return (1.5 * absx3 - 2.5 * absx2 + 1) * ((absx <= 1).type_as(absx)) + (
|
||||
-0.5 * absx3 + 2.5 * absx2 - 4 * absx + 2
|
||||
) * (((absx > 1) * (absx <= 2)).type_as(absx))
|
||||
|
@ -475,10 +475,7 @@ class TextualInversionDataset(Dataset):
|
||||
|
||||
if self.center_crop:
|
||||
crop = min(img.shape[0], img.shape[1])
|
||||
(
|
||||
h,
|
||||
w,
|
||||
) = (
|
||||
(h, w,) = (
|
||||
img.shape[0],
|
||||
img.shape[1],
|
||||
)
|
||||
|
@ -1,11 +1,11 @@
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import diffusers
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from diffusers.configuration_utils import ConfigMixin, register_to_config
|
||||
from diffusers.loaders import FromOriginalControlnetMixin
|
||||
from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor
|
||||
from diffusers.models.controlnet import ControlNetConditioningEmbedding, ControlNetOutput, zero_module
|
||||
from diffusers.models.embeddings import (
|
||||
TextImageProjection,
|
||||
TextImageTimeEmbedding,
|
||||
@ -14,16 +14,9 @@ from diffusers.models.embeddings import (
|
||||
Timesteps,
|
||||
)
|
||||
from diffusers.models.modeling_utils import ModelMixin
|
||||
from diffusers.models.unet_2d_blocks import (
|
||||
CrossAttnDownBlock2D,
|
||||
DownBlock2D,
|
||||
UNetMidBlock2DCrossAttn,
|
||||
get_down_block,
|
||||
)
|
||||
from diffusers.models.unet_2d_blocks import CrossAttnDownBlock2D, DownBlock2D, UNetMidBlock2DCrossAttn, get_down_block
|
||||
from diffusers.models.unet_2d_condition import UNet2DConditionModel
|
||||
|
||||
import diffusers
|
||||
from diffusers.models.controlnet import ControlNetConditioningEmbedding, ControlNetOutput, zero_module
|
||||
from torch import nn
|
||||
|
||||
from invokeai.backend.util.logging import InvokeAILogger
|
||||
|
||||
@ -45,7 +38,8 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
|
||||
Whether to flip the sin to cos in the time embedding.
|
||||
freq_shift (`int`, defaults to 0):
|
||||
The frequency shift to apply to the time embedding.
|
||||
down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
|
||||
down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", \
|
||||
"CrossAttnDownBlock2D", "DownBlock2D")`):
|
||||
The tuple of downsample blocks to use.
|
||||
only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
|
||||
block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
|
||||
@ -147,7 +141,9 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
|
||||
# If `num_attention_heads` is not defined (which is the case for most models)
|
||||
# it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
|
||||
# The reason for this behavior is to correct for incorrectly named variables that were introduced
|
||||
# when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
|
||||
# when this library was created...
|
||||
# The incorrect naming was only discovered much ...
|
||||
# later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
|
||||
# Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
|
||||
# which is why we correct for the naming here.
|
||||
num_attention_heads = num_attention_heads or attention_head_dim
|
||||
@ -155,17 +151,20 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
|
||||
# Check inputs
|
||||
if len(block_out_channels) != len(down_block_types):
|
||||
raise ValueError(
|
||||
f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
|
||||
f"Must provide the same number of `block_out_channels` as `down_block_types`. \
|
||||
`block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
|
||||
)
|
||||
|
||||
if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
|
||||
raise ValueError(
|
||||
f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
|
||||
f"Must provide the same number of `only_cross_attention` as `down_block_types`. \
|
||||
`only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
|
||||
)
|
||||
|
||||
if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
|
||||
raise ValueError(
|
||||
f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
|
||||
f"Must provide the same number of `num_attention_heads` as `down_block_types`. \
|
||||
`num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
|
||||
)
|
||||
|
||||
if isinstance(transformer_layers_per_block, int):
|
||||
@ -202,7 +201,8 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
|
||||
self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
|
||||
elif encoder_hid_dim_type == "text_image_proj":
|
||||
# image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
|
||||
# they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
|
||||
# they are set to `cross_attention_dim` here as this is exactly the required dimension ...
|
||||
# for the currently only use
|
||||
# case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)`
|
||||
self.encoder_hid_proj = TextImageProjection(
|
||||
text_embed_dim=encoder_hid_dim,
|
||||
@ -250,8 +250,10 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
|
||||
text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
|
||||
)
|
||||
elif addition_embed_type == "text_image":
|
||||
# text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
|
||||
# they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
|
||||
# text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`.
|
||||
# To not clutter the __init__ too much
|
||||
# they are set to `cross_attention_dim` here as this is exactly the required dimension...
|
||||
# for the currently only use
|
||||
# case when `addition_embed_type == "text_image"` (Kadinsky 2.1)`
|
||||
self.add_embedding = TextImageTimeEmbedding(
|
||||
text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
|
||||
@ -673,12 +675,14 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
|
||||
elif self.config.addition_embed_type == "text_time":
|
||||
if "text_embeds" not in added_cond_kwargs:
|
||||
raise ValueError(
|
||||
f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
|
||||
f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which \
|
||||
requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
|
||||
)
|
||||
text_embeds = added_cond_kwargs.get("text_embeds")
|
||||
if "time_ids" not in added_cond_kwargs:
|
||||
raise ValueError(
|
||||
f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
|
||||
f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which \
|
||||
requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
|
||||
)
|
||||
time_ids = added_cond_kwargs.get("time_ids")
|
||||
time_embeds = self.add_time_proj(time_ids.flatten())
|
||||
@ -761,3 +765,64 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
|
||||
|
||||
diffusers.ControlNetModel = ControlNetModel
|
||||
diffusers.models.controlnet.ControlNetModel = ControlNetModel
|
||||
|
||||
|
||||
# patch LoRACompatibleConv to use original Conv2D forward function
|
||||
# this is needed to make the seamless patch work
|
||||
# NOTE: with this patch, torch.compile crashes on 2.0 torch(already fixed in nightly)
|
||||
# https://github.com/huggingface/diffusers/pull/4315
|
||||
# https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/lora.py#L96C18-L96C18
|
||||
def new_LoRACompatibleConv_forward(self, x):
    """
    Replacement `forward` for `diffusers.models.lora.LoRACompatibleConv`.

    Runs the `forward` of the class that follows `LoRACompatibleConv` in the
    method resolution order (presumably `nn.Conv2d`, per the comment above this
    patch) and, when a LoRA layer is attached, adds that layer's output.
    """
    lora_layer = self.lora_layer
    result = super(diffusers.models.lora.LoRACompatibleConv, self).forward(x)
    if lora_layer is not None:
        result = result + self.lora_layer(x)
    return result
|
||||
|
||||
|
||||
diffusers.models.lora.LoRACompatibleConv.forward = new_LoRACompatibleConv_forward
|
||||
|
||||
# xformers is optional: record whether it can be imported at all.
try:
    import xformers

    xformers_available = True
except Exception:
    xformers_available = False


if xformers_available:
    # TODO: remove when fixed in diffusers
    # Keep a handle on the original implementation before replacing it below.
    _xformers_memory_efficient_attention = xformers.ops.memory_efficient_attention

    def new_memory_efficient_attention(
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attn_bias=None,
        p: float = 0.0,
        scale: Optional[float] = None,
        *,
        op=None,
    ):
        """
        Wrapper around xformers' `memory_efficient_attention` that re-allocates a
        plain-tensor `attn_bias` in a buffer whose last dimension is rounded up to
        a multiple of 8, then passes a view cut back to the original width.

        All other arguments are forwarded unchanged. A bias that is not exactly a
        `torch.Tensor` (including `None`) is passed through untouched.
        """
        # diffusers does not align the bias shape to 8, which xformers requires
        # NOTE(review): the buffer is built from the first two dims only, so this
        # appears to assume a 3-D bias; confirm against callers.
        if type(attn_bias) is torch.Tensor:
            last_dim = attn_bias.shape[-1]
            padded_dim = ((last_dim + 7) // 8) * 8
            padded_bias = torch.zeros(
                (attn_bias.shape[0], attn_bias.shape[1], padded_dim),
                device=attn_bias.device,
                dtype=attn_bias.dtype,
            )
            padded_bias[:, :, :last_dim] = attn_bias
            # Same values and logical shape as before, but backed by the padded buffer.
            attn_bias = padded_bias[:, :, :last_dim]

        forwarded = {
            "query": query,
            "key": key,
            "value": value,
            "attn_bias": attn_bias,
            "p": p,
            "scale": scale,
            "op": op,
        }
        return _xformers_memory_efficient_attention(**forwarded)

    xformers.ops.memory_efficient_attention = new_memory_efficient_attention
|
||||
|
@ -203,7 +203,7 @@ class ChunkedSlicedAttnProcessor:
|
||||
if attn.upcast_attention:
|
||||
out_item_size = 4
|
||||
|
||||
chunk_size = 2**29
|
||||
chunk_size = 2 ** 29
|
||||
|
||||
out_size = query.shape[1] * key.shape[1] * out_item_size
|
||||
chunks_count = min(query.shape[1], math.ceil((out_size - 1) / chunk_size))
|
||||
|
@ -207,7 +207,7 @@ def parallel_data_prefetch(
|
||||
return gather_res
|
||||
|
||||
|
||||
def rand_perlin_2d(shape, res, device, fade=lambda t: 6 * t**5 - 15 * t**4 + 10 * t**3):
|
||||
def rand_perlin_2d(shape, res, device, fade=lambda t: 6 * t ** 5 - 15 * t ** 4 + 10 * t ** 3):
|
||||
delta = (res[0] / shape[0], res[1] / shape[1])
|
||||
d = (shape[0] // res[0], shape[1] // res[1])
|
||||
|
||||
|
Reference in New Issue
Block a user