Mirror of https://github.com/invoke-ai/InvokeAI, synced 2024-08-30 20:32:17 +00:00

Commit 7d19af2caa: Merge branch 'main' into lstein/feat/simple-mm2-api

invokeai/app/invocations/blend_latents.py (new file, 98 lines)
@@ -0,0 +1,98 @@
from typing import Any, Union

import numpy as np
import numpy.typing as npt
import torch

from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
from invokeai.app.invocations.fields import FieldDescriptions, Input, InputField, LatentsField
from invokeai.app.invocations.primitives import LatentsOutput
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.util.devices import TorchDevice


@invocation(
    "lblend",
    title="Blend Latents",
    tags=["latents", "blend"],
    category="latents",
    version="1.0.3",
)
class BlendLatentsInvocation(BaseInvocation):
    """Blend two latents using a given alpha. Latents must have same size."""

    latents_a: LatentsField = InputField(
        description=FieldDescriptions.latents,
        input=Input.Connection,
    )
    latents_b: LatentsField = InputField(
        description=FieldDescriptions.latents,
        input=Input.Connection,
    )
    alpha: float = InputField(default=0.5, description=FieldDescriptions.blend_alpha)

    def invoke(self, context: InvocationContext) -> LatentsOutput:
        latents_a = context.tensors.load(self.latents_a.latents_name)
        latents_b = context.tensors.load(self.latents_b.latents_name)

        if latents_a.shape != latents_b.shape:
            raise Exception("Latents to blend must be the same size.")

        device = TorchDevice.choose_torch_device()

        def slerp(
            t: Union[float, npt.NDArray[Any]],  # FIXME: maybe use np.float32 here?
            v0: Union[torch.Tensor, npt.NDArray[Any]],
            v1: Union[torch.Tensor, npt.NDArray[Any]],
            DOT_THRESHOLD: float = 0.9995,
        ) -> Union[torch.Tensor, npt.NDArray[Any]]:
            """
            Spherical linear interpolation

            Args:
                t (float/np.ndarray): Float value between 0.0 and 1.0
                v0 (np.ndarray): Starting vector
                v1 (np.ndarray): Final vector
                DOT_THRESHOLD (float): Threshold for considering the two vectors as
                    colineal. Not recommended to alter this.

            Returns:
                v2 (np.ndarray): Interpolation vector between v0 and v1
            """
            inputs_are_torch = False
            if not isinstance(v0, np.ndarray):
                inputs_are_torch = True
                v0 = v0.detach().cpu().numpy()
            if not isinstance(v1, np.ndarray):
                inputs_are_torch = True
                v1 = v1.detach().cpu().numpy()

            dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
            if np.abs(dot) > DOT_THRESHOLD:
                v2 = (1 - t) * v0 + t * v1
            else:
                theta_0 = np.arccos(dot)
                sin_theta_0 = np.sin(theta_0)
                theta_t = theta_0 * t
                sin_theta_t = np.sin(theta_t)
                s0 = np.sin(theta_0 - theta_t) / sin_theta_0
                s1 = sin_theta_t / sin_theta_0
                v2 = s0 * v0 + s1 * v1

            if inputs_are_torch:
                v2_torch: torch.Tensor = torch.from_numpy(v2).to(device)
                return v2_torch
            else:
                assert isinstance(v2, np.ndarray)
                return v2

        # blend
        bl = slerp(self.alpha, latents_a, latents_b)
        assert isinstance(bl, torch.Tensor)
        blended_latents: torch.Tensor = bl  # for type checking convenience

        # https://discuss.huggingface.co/t/memory-usage-by-later-pipeline-stages/23699
        blended_latents = blended_latents.to("cpu")

        TorchDevice.empty_cache()

        name = context.tensors.save(tensor=blended_latents)
        return LatentsOutput.build(latents_name=name, latents=blended_latents, seed=self.latents_a.seed)
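For reference, a minimal standalone sketch of the spherical-interpolation math used by this node, run on toy tensors. This is an editor's illustration, not part of the commit; the function name, shapes, and alpha value are arbitrary.

import numpy as np
import torch

def slerp_demo(t: float, v0: torch.Tensor, v1: torch.Tensor, dot_threshold: float = 0.9995) -> torch.Tensor:
    # Mirror of the nested slerp() above: work in numpy, then convert back to torch.
    a = v0.detach().cpu().numpy()
    b = v1.detach().cpu().numpy()
    dot = np.sum(a * b / (np.linalg.norm(a) * np.linalg.norm(b)))
    if np.abs(dot) > dot_threshold:
        out = (1 - t) * a + t * b  # nearly colinear vectors: plain linear interpolation
    else:
        theta_0 = np.arccos(dot)
        theta_t = theta_0 * t
        s0 = np.sin(theta_0 - theta_t) / np.sin(theta_0)
        s1 = np.sin(theta_t) / np.sin(theta_0)
        out = s0 * a + s1 * b
    return torch.from_numpy(out)

# Blend two same-shaped toy "latents" with alpha = 0.5.
latents_a = torch.randn(1, 4, 64, 64)
latents_b = torch.randn(1, 4, 64, 64)
blended = slerp_demo(0.5, latents_a, latents_b)
print(blended.shape)  # torch.Size([1, 4, 64, 64])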
invokeai/app/invocations/create_denoise_mask.py (new file, 80 lines)
@@ -0,0 +1,80 @@
from typing import Optional

import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import resize as tv_resize

from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
from invokeai.app.invocations.denoise_latents import DEFAULT_PRECISION
from invokeai.app.invocations.fields import FieldDescriptions, ImageField, Input, InputField
from invokeai.app.invocations.image_to_latents import ImageToLatentsInvocation
from invokeai.app.invocations.model import VAEField
from invokeai.app.invocations.primitives import DenoiseMaskOutput
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.stable_diffusion.diffusers_pipeline import image_resized_to_grid_as_tensor


@invocation(
    "create_denoise_mask",
    title="Create Denoise Mask",
    tags=["mask", "denoise"],
    category="latents",
    version="1.0.2",
)
class CreateDenoiseMaskInvocation(BaseInvocation):
    """Creates mask for denoising model run."""

    vae: VAEField = InputField(description=FieldDescriptions.vae, input=Input.Connection, ui_order=0)
    image: Optional[ImageField] = InputField(default=None, description="Image which will be masked", ui_order=1)
    mask: ImageField = InputField(description="The mask to use when pasting", ui_order=2)
    tiled: bool = InputField(default=False, description=FieldDescriptions.tiled, ui_order=3)
    fp32: bool = InputField(
        default=DEFAULT_PRECISION == "float32",
        description=FieldDescriptions.fp32,
        ui_order=4,
    )

    def prep_mask_tensor(self, mask_image: Image.Image) -> torch.Tensor:
        if mask_image.mode != "L":
            mask_image = mask_image.convert("L")
        mask_tensor: torch.Tensor = image_resized_to_grid_as_tensor(mask_image, normalize=False)
        if mask_tensor.dim() == 3:
            mask_tensor = mask_tensor.unsqueeze(0)
        # if shape is not None:
        #    mask_tensor = tv_resize(mask_tensor, shape, T.InterpolationMode.BILINEAR)
        return mask_tensor

    @torch.no_grad()
    def invoke(self, context: InvocationContext) -> DenoiseMaskOutput:
        if self.image is not None:
            image = context.images.get_pil(self.image.image_name)
            image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
            if image_tensor.dim() == 3:
                image_tensor = image_tensor.unsqueeze(0)
        else:
            image_tensor = None

        mask = self.prep_mask_tensor(
            context.images.get_pil(self.mask.image_name),
        )

        if image_tensor is not None:
            vae_info = context.models.load(self.vae.vae)

            img_mask = tv_resize(mask, image_tensor.shape[-2:], T.InterpolationMode.BILINEAR, antialias=False)
            masked_image = image_tensor * torch.where(img_mask < 0.5, 0.0, 1.0)
            # TODO:
            masked_latents = ImageToLatentsInvocation.vae_encode(vae_info, self.fp32, self.tiled, masked_image.clone())

            masked_latents_name = context.tensors.save(tensor=masked_latents)
        else:
            masked_latents_name = None

        mask_name = context.tensors.save(tensor=mask)

        return DenoiseMaskOutput.build(
            mask_name=mask_name,
            masked_latents_name=masked_latents_name,
            gradient=False,
        )
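A small sketch of the masking step above (editor's illustration, not part of the commit): torch.where(img_mask < 0.5, 0.0, 1.0) binarises the resized mask, and the multiplication zeroes out image pixels wherever the mask value falls below 0.5 before the masked image is VAE-encoded. The tensors below are toy stand-ins.

import torch

image_tensor = torch.ones(1, 3, 4, 4)                   # stand-in for an RGB image batch
mask = torch.tensor([0.0, 0.2, 0.6, 1.0]).repeat(4, 1)  # 4x4 soft mask, one value per column
mask = mask.unsqueeze(0).unsqueeze(0)                    # -> shape (1, 1, 4, 4)

keep = torch.where(mask < 0.5, 0.0, 1.0)                 # columns become 0, 0, 1, 1
masked_image = image_tensor * keep                       # left two columns zeroed, right two kept
print(masked_image[0, 0])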
invokeai/app/invocations/create_gradient_mask.py (new file, 138 lines)
@@ -0,0 +1,138 @@
from typing import Literal, Optional

import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image, ImageFilter
from torchvision.transforms.functional import resize as tv_resize

from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output
from invokeai.app.invocations.denoise_latents import DEFAULT_PRECISION
from invokeai.app.invocations.fields import (
    DenoiseMaskField,
    FieldDescriptions,
    ImageField,
    Input,
    InputField,
    OutputField,
)
from invokeai.app.invocations.image_to_latents import ImageToLatentsInvocation
from invokeai.app.invocations.model import UNetField, VAEField
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.model_manager import LoadedModel
from invokeai.backend.model_manager.config import MainConfigBase, ModelVariantType
from invokeai.backend.stable_diffusion.diffusers_pipeline import image_resized_to_grid_as_tensor


@invocation_output("gradient_mask_output")
class GradientMaskOutput(BaseInvocationOutput):
    """Outputs a denoise mask and an image representing the total gradient of the mask."""

    denoise_mask: DenoiseMaskField = OutputField(description="Mask for denoise model run")
    expanded_mask_area: ImageField = OutputField(
        description="Image representing the total gradient area of the mask. For paste-back purposes."
    )


@invocation(
    "create_gradient_mask",
    title="Create Gradient Mask",
    tags=["mask", "denoise"],
    category="latents",
    version="1.1.0",
)
class CreateGradientMaskInvocation(BaseInvocation):
    """Creates mask for denoising model run."""

    mask: ImageField = InputField(default=None, description="Image which will be masked", ui_order=1)
    edge_radius: int = InputField(
        default=16, ge=0, description="How far to blur/expand the edges of the mask", ui_order=2
    )
    coherence_mode: Literal["Gaussian Blur", "Box Blur", "Staged"] = InputField(default="Gaussian Blur", ui_order=3)
    minimum_denoise: float = InputField(
        default=0.0, ge=0, le=1, description="Minimum denoise level for the coherence region", ui_order=4
    )
    image: Optional[ImageField] = InputField(
        default=None,
        description="OPTIONAL: Only connect for specialized Inpainting models, masked_latents will be generated from the image with the VAE",
        title="[OPTIONAL] Image",
        ui_order=6,
    )
    unet: Optional[UNetField] = InputField(
        description="OPTIONAL: If the Unet is a specialized Inpainting model, masked_latents will be generated from the image with the VAE",
        default=None,
        input=Input.Connection,
        title="[OPTIONAL] UNet",
        ui_order=5,
    )
    vae: Optional[VAEField] = InputField(
        default=None,
        description="OPTIONAL: Only connect for specialized Inpainting models, masked_latents will be generated from the image with the VAE",
        title="[OPTIONAL] VAE",
        input=Input.Connection,
        ui_order=7,
    )
    tiled: bool = InputField(default=False, description=FieldDescriptions.tiled, ui_order=8)
    fp32: bool = InputField(
        default=DEFAULT_PRECISION == "float32",
        description=FieldDescriptions.fp32,
        ui_order=9,
    )

    @torch.no_grad()
    def invoke(self, context: InvocationContext) -> GradientMaskOutput:
        mask_image = context.images.get_pil(self.mask.image_name, mode="L")
        if self.edge_radius > 0:
            if self.coherence_mode == "Box Blur":
                blur_mask = mask_image.filter(ImageFilter.BoxBlur(self.edge_radius))
            else:  # Gaussian Blur OR Staged
                # Gaussian Blur uses standard deviation. 1/2 radius is a good approximation
                blur_mask = mask_image.filter(ImageFilter.GaussianBlur(self.edge_radius / 2))

            blur_tensor: torch.Tensor = image_resized_to_grid_as_tensor(blur_mask, normalize=False)

            # redistribute blur so that the original edges are 0 and blur outwards to 1
            blur_tensor = (blur_tensor - 0.5) * 2

            threshold = 1 - self.minimum_denoise

            if self.coherence_mode == "Staged":
                # wherever the blur_tensor is less than fully masked, convert it to threshold
                blur_tensor = torch.where((blur_tensor < 1) & (blur_tensor > 0), threshold, blur_tensor)
            else:
                # wherever the blur_tensor is above threshold but less than 1, drop it to threshold
                blur_tensor = torch.where((blur_tensor > threshold) & (blur_tensor < 1), threshold, blur_tensor)

        else:
            blur_tensor: torch.Tensor = image_resized_to_grid_as_tensor(mask_image, normalize=False)

        mask_name = context.tensors.save(tensor=blur_tensor.unsqueeze(1))

        # compute a [0, 1] mask from the blur_tensor
        expanded_mask = torch.where((blur_tensor < 1), 0, 1)
        expanded_mask_image = Image.fromarray((expanded_mask.squeeze(0).numpy() * 255).astype(np.uint8), mode="L")
        expanded_image_dto = context.images.save(expanded_mask_image)

        masked_latents_name = None
        if self.unet is not None and self.vae is not None and self.image is not None:
            # all three fields must be present at the same time
            main_model_config = context.models.get_config(self.unet.unet.key)
            assert isinstance(main_model_config, MainConfigBase)
            if main_model_config.variant is ModelVariantType.Inpaint:
                mask = blur_tensor
                vae_info: LoadedModel = context.models.load(self.vae.vae)
                image = context.images.get_pil(self.image.image_name)
                image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
                if image_tensor.dim() == 3:
                    image_tensor = image_tensor.unsqueeze(0)
                img_mask = tv_resize(mask, image_tensor.shape[-2:], T.InterpolationMode.BILINEAR, antialias=False)
                masked_image = image_tensor * torch.where(img_mask < 0.5, 0.0, 1.0)
                masked_latents = ImageToLatentsInvocation.vae_encode(
                    vae_info, self.fp32, self.tiled, masked_image.clone()
                )
                masked_latents_name = context.tensors.save(tensor=masked_latents)

        return GradientMaskOutput(
            denoise_mask=DenoiseMaskField(mask_name=mask_name, masked_latents_name=masked_latents_name, gradient=True),
            expanded_mask_area=ImageField(image_name=expanded_image_dto.image_name),
        )
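A numeric illustration of the gradient-mask math above (editor's sketch, not part of the commit). A blurred mask value of 0.5 sits on the original mask edge, so (x - 0.5) * 2 re-centres the range: edges map to 0 and fully unmasked pixels map to 1. With minimum_denoise = 0.2 the threshold is 0.8, and the two coherence behaviours differ as shown.

import torch

blur = torch.tensor([0.50, 0.75, 0.95, 0.975, 1.0])   # blurred mask samples in [0, 1]
blur = (blur - 0.5) * 2                                # ~[0.00, 0.50, 0.90, 0.95, 1.00]

minimum_denoise = 0.2
threshold = 1 - minimum_denoise                        # 0.8

# Gaussian/Box Blur modes: values between threshold and 1 are pulled down to threshold.
default_mode = torch.where((blur > threshold) & (blur < 1), threshold, blur)
# ~[0.00, 0.50, 0.80, 0.80, 1.00]

# "Staged" mode: every partially-masked value becomes exactly threshold.
staged_mode = torch.where((blur < 1) & (blur > 0), threshold, blur)
# ~[0.00, 0.80, 0.80, 0.80, 1.00]
print(default_mode, staged_mode)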
invokeai/app/invocations/crop_latents.py (new file, 61 lines)
@@ -0,0 +1,61 @@
from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR
from invokeai.app.invocations.fields import FieldDescriptions, Input, InputField, LatentsField
from invokeai.app.invocations.primitives import LatentsOutput
from invokeai.app.services.shared.invocation_context import InvocationContext


# The Crop Latents node was copied from @skunkworxdark's implementation here:
# https://github.com/skunkworxdark/XYGrid_nodes/blob/74647fa9c1fa57d317a94bd43ca689af7f0aae5e/images_to_grids.py#L1117C1-L1167C80
@invocation(
    "crop_latents",
    title="Crop Latents",
    tags=["latents", "crop"],
    category="latents",
    version="1.0.2",
)
# TODO(ryand): Named `CropLatentsCoreInvocation` to prevent a conflict with custom node `CropLatentsInvocation`.
# Currently, if the class names conflict then 'GET /openapi.json' fails.
class CropLatentsCoreInvocation(BaseInvocation):
    """Crops a latent-space tensor to a box specified in image-space. The box dimensions and coordinates must be
    divisible by the latent scale factor of 8.
    """

    latents: LatentsField = InputField(
        description=FieldDescriptions.latents,
        input=Input.Connection,
    )
    x: int = InputField(
        ge=0,
        multiple_of=LATENT_SCALE_FACTOR,
        description="The left x coordinate (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
    )
    y: int = InputField(
        ge=0,
        multiple_of=LATENT_SCALE_FACTOR,
        description="The top y coordinate (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
    )
    width: int = InputField(
        ge=1,
        multiple_of=LATENT_SCALE_FACTOR,
        description="The width (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
    )
    height: int = InputField(
        ge=1,
        multiple_of=LATENT_SCALE_FACTOR,
        description="The height (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
    )

    def invoke(self, context: InvocationContext) -> LatentsOutput:
        latents = context.tensors.load(self.latents.latents_name)

        x1 = self.x // LATENT_SCALE_FACTOR
        y1 = self.y // LATENT_SCALE_FACTOR
        x2 = x1 + (self.width // LATENT_SCALE_FACTOR)
        y2 = y1 + (self.height // LATENT_SCALE_FACTOR)

        cropped_latents = latents[..., y1:y2, x1:x2]

        name = context.tensors.save(tensor=cropped_latents)

        return LatentsOutput.build(latents_name=name, latents=cropped_latents)
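A quick worked example of the image-space to latent-space conversion above, with LATENT_SCALE_FACTOR == 8 (editor's sketch; the crop box and latent shape are arbitrary).

import torch

x, y, width, height = 64, 32, 256, 128            # crop box in image pixels, all multiples of 8
x1, y1 = x // 8, y // 8                            # (8, 4): top-left corner in latent cells
x2, y2 = x1 + width // 8, y1 + height // 8         # (40, 20): bottom-right corner in latent cells

latents = torch.randn(1, 4, 64, 64)                # latents for a hypothetical 512x512 image
cropped = latents[..., y1:y2, x1:x2]
print(cropped.shape)                               # torch.Size([1, 4, 16, 32]) -> decodes to 128x256 px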
@@ -1,32 +1,17 @@
 # Copyright (c) 2023 Kyle Schouviller (https://github.com/kyle0654)
 import inspect
-import math
 from contextlib import ExitStack
-from functools import singledispatchmethod
-from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 
-import einops
-import numpy as np
-import numpy.typing as npt
 import torch
 import torchvision
 import torchvision.transforms as T
 from diffusers.configuration_utils import ConfigMixin
-from diffusers.image_processor import VaeImageProcessor
 from diffusers.models.adapter import T2IAdapter
-from diffusers.models.attention_processor import (
-    AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
-    XFormersAttnProcessor,
-)
-from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL
-from diffusers.models.autoencoders.autoencoder_tiny import AutoencoderTiny
 from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
 from diffusers.schedulers.scheduling_dpmsolver_sde import DPMSolverSDEScheduler
 from diffusers.schedulers.scheduling_tcd import TCDScheduler
 from diffusers.schedulers.scheduling_utils import SchedulerMixin as Scheduler
-from PIL import Image, ImageFilter
 from pydantic import field_validator
 from torchvision.transforms.functional import resize as tv_resize
 from transformers import CLIPVisionModelWithProjection
@@ -36,24 +21,19 @@ from invokeai.app.invocations.fields import (
     ConditioningField,
     DenoiseMaskField,
     FieldDescriptions,
-    ImageField,
     Input,
     InputField,
     LatentsField,
-    OutputField,
     UIType,
-    WithBoard,
-    WithMetadata,
 )
 from invokeai.app.invocations.ip_adapter import IPAdapterField
-from invokeai.app.invocations.primitives import DenoiseMaskOutput, ImageOutput, LatentsOutput
+from invokeai.app.invocations.primitives import LatentsOutput
 from invokeai.app.invocations.t2i_adapter import T2IAdapterField
 from invokeai.app.services.shared.invocation_context import InvocationContext
 from invokeai.app.util.controlnet_utils import prepare_control_image
 from invokeai.backend.ip_adapter.ip_adapter import IPAdapter
 from invokeai.backend.lora import LoRAModelRaw
-from invokeai.backend.model_manager import BaseModelType, LoadedModel
-from invokeai.backend.model_manager.config import MainConfigBase, ModelVariantType
+from invokeai.backend.model_manager import BaseModelType
 from invokeai.backend.model_patcher import ModelPatcher
 from invokeai.backend.stable_diffusion import PipelineIntermediateState, set_seamless
 from invokeai.backend.stable_diffusion.diffusion.conditioning_data import (
@@ -72,221 +52,16 @@ from ...backend.stable_diffusion.diffusers_pipeline import (
     ControlNetData,
     StableDiffusionGeneratorPipeline,
     T2IAdapterData,
-    image_resized_to_grid_as_tensor,
 )
 from ...backend.stable_diffusion.schedulers import SCHEDULER_MAP
 from ...backend.util.devices import TorchDevice
-from .baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output
+from .baseinvocation import BaseInvocation, invocation
 from .controlnet_image_processors import ControlField
-from .model import ModelIdentifierField, UNetField, VAEField
+from .model import ModelIdentifierField, UNetField
 
 DEFAULT_PRECISION = TorchDevice.choose_torch_dtype()
 
 
-@invocation_output("scheduler_output")
-class SchedulerOutput(BaseInvocationOutput):
-    scheduler: SCHEDULER_NAME_VALUES = OutputField(description=FieldDescriptions.scheduler, ui_type=UIType.Scheduler)
-
-
-@invocation(
-    "scheduler",
-    title="Scheduler",
-    tags=["scheduler"],
-    category="latents",
-    version="1.0.0",
-)
-class SchedulerInvocation(BaseInvocation):
-    """Selects a scheduler."""
-
-    scheduler: SCHEDULER_NAME_VALUES = InputField(
-        default="euler",
-        description=FieldDescriptions.scheduler,
-        ui_type=UIType.Scheduler,
-    )
-
-    def invoke(self, context: InvocationContext) -> SchedulerOutput:
-        return SchedulerOutput(scheduler=self.scheduler)
-
-
-@invocation(
-    "create_denoise_mask",
-    title="Create Denoise Mask",
-    tags=["mask", "denoise"],
-    category="latents",
-    version="1.0.2",
-)
-class CreateDenoiseMaskInvocation(BaseInvocation):
-    """Creates mask for denoising model run."""
-
-    vae: VAEField = InputField(description=FieldDescriptions.vae, input=Input.Connection, ui_order=0)
-    image: Optional[ImageField] = InputField(default=None, description="Image which will be masked", ui_order=1)
-    mask: ImageField = InputField(description="The mask to use when pasting", ui_order=2)
-    tiled: bool = InputField(default=False, description=FieldDescriptions.tiled, ui_order=3)
-    fp32: bool = InputField(
-        default=DEFAULT_PRECISION == "float32",
-        description=FieldDescriptions.fp32,
-        ui_order=4,
-    )
-
-    def prep_mask_tensor(self, mask_image: Image.Image) -> torch.Tensor:
-        if mask_image.mode != "L":
-            mask_image = mask_image.convert("L")
-        mask_tensor: torch.Tensor = image_resized_to_grid_as_tensor(mask_image, normalize=False)
-        if mask_tensor.dim() == 3:
-            mask_tensor = mask_tensor.unsqueeze(0)
-        # if shape is not None:
-        #    mask_tensor = tv_resize(mask_tensor, shape, T.InterpolationMode.BILINEAR)
-        return mask_tensor
-
-    @torch.no_grad()
-    def invoke(self, context: InvocationContext) -> DenoiseMaskOutput:
-        if self.image is not None:
-            image = context.images.get_pil(self.image.image_name)
-            image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
-            if image_tensor.dim() == 3:
-                image_tensor = image_tensor.unsqueeze(0)
-        else:
-            image_tensor = None
-
-        mask = self.prep_mask_tensor(
-            context.images.get_pil(self.mask.image_name),
-        )
-
-        if image_tensor is not None:
-            vae_info = context.models.load(self.vae.vae)
-
-            img_mask = tv_resize(mask, image_tensor.shape[-2:], T.InterpolationMode.BILINEAR, antialias=False)
-            masked_image = image_tensor * torch.where(img_mask < 0.5, 0.0, 1.0)
-            # TODO:
-            masked_latents = ImageToLatentsInvocation.vae_encode(vae_info, self.fp32, self.tiled, masked_image.clone())
-
-            masked_latents_name = context.tensors.save(tensor=masked_latents)
-        else:
-            masked_latents_name = None
-
-        mask_name = context.tensors.save(tensor=mask)
-
-        return DenoiseMaskOutput.build(
-            mask_name=mask_name,
-            masked_latents_name=masked_latents_name,
-            gradient=False,
-        )
-
-
-@invocation_output("gradient_mask_output")
-class GradientMaskOutput(BaseInvocationOutput):
-    """Outputs a denoise mask and an image representing the total gradient of the mask."""
-
-    denoise_mask: DenoiseMaskField = OutputField(description="Mask for denoise model run")
-    expanded_mask_area: ImageField = OutputField(
-        description="Image representing the total gradient area of the mask. For paste-back purposes."
-    )
-
-
-@invocation(
-    "create_gradient_mask",
-    title="Create Gradient Mask",
-    tags=["mask", "denoise"],
-    category="latents",
-    version="1.1.0",
-)
-class CreateGradientMaskInvocation(BaseInvocation):
-    """Creates mask for denoising model run."""
-
-    mask: ImageField = InputField(default=None, description="Image which will be masked", ui_order=1)
-    edge_radius: int = InputField(
-        default=16, ge=0, description="How far to blur/expand the edges of the mask", ui_order=2
-    )
-    coherence_mode: Literal["Gaussian Blur", "Box Blur", "Staged"] = InputField(default="Gaussian Blur", ui_order=3)
-    minimum_denoise: float = InputField(
-        default=0.0, ge=0, le=1, description="Minimum denoise level for the coherence region", ui_order=4
-    )
-    image: Optional[ImageField] = InputField(
-        default=None,
-        description="OPTIONAL: Only connect for specialized Inpainting models, masked_latents will be generated from the image with the VAE",
-        title="[OPTIONAL] Image",
-        ui_order=6,
-    )
-    unet: Optional[UNetField] = InputField(
-        description="OPTIONAL: If the Unet is a specialized Inpainting model, masked_latents will be generated from the image with the VAE",
-        default=None,
-        input=Input.Connection,
-        title="[OPTIONAL] UNet",
-        ui_order=5,
-    )
-    vae: Optional[VAEField] = InputField(
-        default=None,
-        description="OPTIONAL: Only connect for specialized Inpainting models, masked_latents will be generated from the image with the VAE",
-        title="[OPTIONAL] VAE",
-        input=Input.Connection,
-        ui_order=7,
-    )
-    tiled: bool = InputField(default=False, description=FieldDescriptions.tiled, ui_order=8)
-    fp32: bool = InputField(
-        default=DEFAULT_PRECISION == "float32",
-        description=FieldDescriptions.fp32,
-        ui_order=9,
-    )
-
-    @torch.no_grad()
-    def invoke(self, context: InvocationContext) -> GradientMaskOutput:
-        mask_image = context.images.get_pil(self.mask.image_name, mode="L")
-        if self.edge_radius > 0:
-            if self.coherence_mode == "Box Blur":
-                blur_mask = mask_image.filter(ImageFilter.BoxBlur(self.edge_radius))
-            else:  # Gaussian Blur OR Staged
-                # Gaussian Blur uses standard deviation. 1/2 radius is a good approximation
-                blur_mask = mask_image.filter(ImageFilter.GaussianBlur(self.edge_radius / 2))
-
-            blur_tensor: torch.Tensor = image_resized_to_grid_as_tensor(blur_mask, normalize=False)
-
-            # redistribute blur so that the original edges are 0 and blur outwards to 1
-            blur_tensor = (blur_tensor - 0.5) * 2
-
-            threshold = 1 - self.minimum_denoise
-
-            if self.coherence_mode == "Staged":
-                # wherever the blur_tensor is less than fully masked, convert it to threshold
-                blur_tensor = torch.where((blur_tensor < 1) & (blur_tensor > 0), threshold, blur_tensor)
-            else:
-                # wherever the blur_tensor is above threshold but less than 1, drop it to threshold
-                blur_tensor = torch.where((blur_tensor > threshold) & (blur_tensor < 1), threshold, blur_tensor)
-
-        else:
-            blur_tensor: torch.Tensor = image_resized_to_grid_as_tensor(mask_image, normalize=False)
-
-        mask_name = context.tensors.save(tensor=blur_tensor.unsqueeze(1))
-
-        # compute a [0, 1] mask from the blur_tensor
-        expanded_mask = torch.where((blur_tensor < 1), 0, 1)
-        expanded_mask_image = Image.fromarray((expanded_mask.squeeze(0).numpy() * 255).astype(np.uint8), mode="L")
-        expanded_image_dto = context.images.save(expanded_mask_image)
-
-        masked_latents_name = None
-        if self.unet is not None and self.vae is not None and self.image is not None:
-            # all three fields must be present at the same time
-            main_model_config = context.models.get_config(self.unet.unet.key)
-            assert isinstance(main_model_config, MainConfigBase)
-            if main_model_config.variant is ModelVariantType.Inpaint:
-                mask = blur_tensor
-                vae_info: LoadedModel = context.models.load(self.vae.vae)
-                image = context.images.get_pil(self.image.image_name)
-                image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
-                if image_tensor.dim() == 3:
-                    image_tensor = image_tensor.unsqueeze(0)
-                img_mask = tv_resize(mask, image_tensor.shape[-2:], T.InterpolationMode.BILINEAR, antialias=False)
-                masked_image = image_tensor * torch.where(img_mask < 0.5, 0.0, 1.0)
-                masked_latents = ImageToLatentsInvocation.vae_encode(
-                    vae_info, self.fp32, self.tiled, masked_image.clone()
-                )
-                masked_latents_name = context.tensors.save(tensor=masked_latents)
-
-        return GradientMaskOutput(
-            denoise_mask=DenoiseMaskField(mask_name=mask_name, masked_latents_name=masked_latents_name, gradient=True),
-            expanded_mask_area=ImageField(image_name=expanded_image_dto.image_name),
-        )
-
-
 def get_scheduler(
     context: InvocationContext,
     scheduler_info: ModelIdentifierField,
@@ -1037,469 +812,3 @@ class DenoiseLatentsInvocation(BaseInvocation):
 
         name = context.tensors.save(tensor=result_latents)
         return LatentsOutput.build(latents_name=name, latents=result_latents, seed=None)
-
-
-@invocation(
-    "l2i",
-    title="Latents to Image",
-    tags=["latents", "image", "vae", "l2i"],
-    category="latents",
-    version="1.2.2",
-)
-class LatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard):
-    """Generates an image from latents."""
-
-    latents: LatentsField = InputField(
-        description=FieldDescriptions.latents,
-        input=Input.Connection,
-    )
-    vae: VAEField = InputField(
-        description=FieldDescriptions.vae,
-        input=Input.Connection,
-    )
-    tiled: bool = InputField(default=False, description=FieldDescriptions.tiled)
-    fp32: bool = InputField(default=DEFAULT_PRECISION == "float32", description=FieldDescriptions.fp32)
-
-    @torch.no_grad()
-    def invoke(self, context: InvocationContext) -> ImageOutput:
-        latents = context.tensors.load(self.latents.latents_name)
-
-        vae_info = context.models.load(self.vae.vae)
-        assert isinstance(vae_info.model, (UNet2DConditionModel, AutoencoderKL, AutoencoderTiny))
-        with set_seamless(vae_info.model, self.vae.seamless_axes), vae_info as vae:
-            assert isinstance(vae, torch.nn.Module)
-            latents = latents.to(vae.device)
-            if self.fp32:
-                vae.to(dtype=torch.float32)
-
-                use_torch_2_0_or_xformers = hasattr(vae.decoder, "mid_block") and isinstance(
-                    vae.decoder.mid_block.attentions[0].processor,
-                    (
-                        AttnProcessor2_0,
-                        XFormersAttnProcessor,
-                        LoRAXFormersAttnProcessor,
-                        LoRAAttnProcessor2_0,
-                    ),
-                )
-                # if xformers or torch_2_0 is used attention block does not need
-                # to be in float32 which can save lots of memory
-                if use_torch_2_0_or_xformers:
-                    vae.post_quant_conv.to(latents.dtype)
-                    vae.decoder.conv_in.to(latents.dtype)
-                    vae.decoder.mid_block.to(latents.dtype)
-                else:
-                    latents = latents.float()
-
-            else:
-                vae.to(dtype=torch.float16)
-                latents = latents.half()
-
-            if self.tiled or context.config.get().force_tiled_decode:
-                vae.enable_tiling()
-            else:
-                vae.disable_tiling()
-
-            # clear memory as vae decode can request a lot
-            TorchDevice.empty_cache()
-
-            with torch.inference_mode():
-                # copied from diffusers pipeline
-                latents = latents / vae.config.scaling_factor
-                image = vae.decode(latents, return_dict=False)[0]
-                image = (image / 2 + 0.5).clamp(0, 1)  # denormalize
-                # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
-                np_image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-
-                image = VaeImageProcessor.numpy_to_pil(np_image)[0]
-
-        TorchDevice.empty_cache()
-
-        image_dto = context.images.save(image=image)
-
-        return ImageOutput.build(image_dto)
-
-
-LATENTS_INTERPOLATION_MODE = Literal["nearest", "linear", "bilinear", "bicubic", "trilinear", "area", "nearest-exact"]
-
-
-@invocation(
-    "lresize",
-    title="Resize Latents",
-    tags=["latents", "resize"],
-    category="latents",
-    version="1.0.2",
-)
-class ResizeLatentsInvocation(BaseInvocation):
-    """Resizes latents to explicit width/height (in pixels). Provided dimensions are floor-divided by 8."""
-
-    latents: LatentsField = InputField(
-        description=FieldDescriptions.latents,
-        input=Input.Connection,
-    )
-    width: int = InputField(
-        ge=64,
-        multiple_of=LATENT_SCALE_FACTOR,
-        description=FieldDescriptions.width,
-    )
-    height: int = InputField(
-        ge=64,
-        multiple_of=LATENT_SCALE_FACTOR,
-        description=FieldDescriptions.width,
-    )
-    mode: LATENTS_INTERPOLATION_MODE = InputField(default="bilinear", description=FieldDescriptions.interp_mode)
-    antialias: bool = InputField(default=False, description=FieldDescriptions.torch_antialias)
-
-    def invoke(self, context: InvocationContext) -> LatentsOutput:
-        latents = context.tensors.load(self.latents.latents_name)
-        device = TorchDevice.choose_torch_device()
-
-        resized_latents = torch.nn.functional.interpolate(
-            latents.to(device),
-            size=(self.height // LATENT_SCALE_FACTOR, self.width // LATENT_SCALE_FACTOR),
-            mode=self.mode,
-            antialias=self.antialias if self.mode in ["bilinear", "bicubic"] else False,
-        )
-
-        # https://discuss.huggingface.co/t/memory-usage-by-later-pipeline-stages/23699
-        resized_latents = resized_latents.to("cpu")
-
-        TorchDevice.empty_cache()
-
-        name = context.tensors.save(tensor=resized_latents)
-        return LatentsOutput.build(latents_name=name, latents=resized_latents, seed=self.latents.seed)
-
-
-@invocation(
-    "lscale",
-    title="Scale Latents",
-    tags=["latents", "resize"],
-    category="latents",
-    version="1.0.2",
-)
-class ScaleLatentsInvocation(BaseInvocation):
-    """Scales latents by a given factor."""
-
-    latents: LatentsField = InputField(
-        description=FieldDescriptions.latents,
-        input=Input.Connection,
-    )
-    scale_factor: float = InputField(gt=0, description=FieldDescriptions.scale_factor)
-    mode: LATENTS_INTERPOLATION_MODE = InputField(default="bilinear", description=FieldDescriptions.interp_mode)
-    antialias: bool = InputField(default=False, description=FieldDescriptions.torch_antialias)
-
-    def invoke(self, context: InvocationContext) -> LatentsOutput:
-        latents = context.tensors.load(self.latents.latents_name)
-
-        device = TorchDevice.choose_torch_device()
-
-        # resizing
-        resized_latents = torch.nn.functional.interpolate(
-            latents.to(device),
-            scale_factor=self.scale_factor,
-            mode=self.mode,
-            antialias=self.antialias if self.mode in ["bilinear", "bicubic"] else False,
-        )
-
-        # https://discuss.huggingface.co/t/memory-usage-by-later-pipeline-stages/23699
-        resized_latents = resized_latents.to("cpu")
-        TorchDevice.empty_cache()
-
-        name = context.tensors.save(tensor=resized_latents)
-        return LatentsOutput.build(latents_name=name, latents=resized_latents, seed=self.latents.seed)
-
-
-@invocation(
-    "i2l",
-    title="Image to Latents",
-    tags=["latents", "image", "vae", "i2l"],
-    category="latents",
-    version="1.0.2",
-)
-class ImageToLatentsInvocation(BaseInvocation):
-    """Encodes an image into latents."""
-
-    image: ImageField = InputField(
-        description="The image to encode",
-    )
-    vae: VAEField = InputField(
-        description=FieldDescriptions.vae,
-        input=Input.Connection,
-    )
-    tiled: bool = InputField(default=False, description=FieldDescriptions.tiled)
-    fp32: bool = InputField(default=DEFAULT_PRECISION == "float32", description=FieldDescriptions.fp32)
-
-    @staticmethod
-    def vae_encode(vae_info: LoadedModel, upcast: bool, tiled: bool, image_tensor: torch.Tensor) -> torch.Tensor:
-        with vae_info as vae:
-            assert isinstance(vae, torch.nn.Module)
-            orig_dtype = vae.dtype
-            if upcast:
-                vae.to(dtype=torch.float32)
-
-                use_torch_2_0_or_xformers = hasattr(vae.decoder, "mid_block") and isinstance(
-                    vae.decoder.mid_block.attentions[0].processor,
-                    (
-                        AttnProcessor2_0,
-                        XFormersAttnProcessor,
-                        LoRAXFormersAttnProcessor,
-                        LoRAAttnProcessor2_0,
-                    ),
-                )
-                # if xformers or torch_2_0 is used attention block does not need
-                # to be in float32 which can save lots of memory
-                if use_torch_2_0_or_xformers:
-                    vae.post_quant_conv.to(orig_dtype)
-                    vae.decoder.conv_in.to(orig_dtype)
-                    vae.decoder.mid_block.to(orig_dtype)
-                # else:
-                #    latents = latents.float()
-
-            else:
-                vae.to(dtype=torch.float16)
-                # latents = latents.half()
-
-            if tiled:
-                vae.enable_tiling()
-            else:
-                vae.disable_tiling()
-
-            # non_noised_latents_from_image
-            image_tensor = image_tensor.to(device=vae.device, dtype=vae.dtype)
-            with torch.inference_mode():
-                latents = ImageToLatentsInvocation._encode_to_tensor(vae, image_tensor)
-
-            latents = vae.config.scaling_factor * latents
-            latents = latents.to(dtype=orig_dtype)
-
-        return latents
-
-    @torch.no_grad()
-    def invoke(self, context: InvocationContext) -> LatentsOutput:
-        image = context.images.get_pil(self.image.image_name)
-
-        vae_info = context.models.load(self.vae.vae)
-
-        image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
-        if image_tensor.dim() == 3:
-            image_tensor = einops.rearrange(image_tensor, "c h w -> 1 c h w")
-
-        latents = self.vae_encode(vae_info, self.fp32, self.tiled, image_tensor)
-
-        latents = latents.to("cpu")
-        name = context.tensors.save(tensor=latents)
-        return LatentsOutput.build(latents_name=name, latents=latents, seed=None)
-
-    @singledispatchmethod
-    @staticmethod
-    def _encode_to_tensor(vae: AutoencoderKL, image_tensor: torch.FloatTensor) -> torch.FloatTensor:
-        assert isinstance(vae, torch.nn.Module)
-        image_tensor_dist = vae.encode(image_tensor).latent_dist
-        latents: torch.Tensor = image_tensor_dist.sample().to(
-            dtype=vae.dtype
-        )  # FIXME: uses torch.randn. make reproducible!
-        return latents
-
-    @_encode_to_tensor.register
-    @staticmethod
-    def _(vae: AutoencoderTiny, image_tensor: torch.FloatTensor) -> torch.FloatTensor:
-        assert isinstance(vae, torch.nn.Module)
-        latents: torch.FloatTensor = vae.encode(image_tensor).latents
-        return latents
-
-
-@invocation(
-    "lblend",
-    title="Blend Latents",
-    tags=["latents", "blend"],
-    category="latents",
-    version="1.0.3",
-)
-class BlendLatentsInvocation(BaseInvocation):
-    """Blend two latents using a given alpha. Latents must have same size."""
-
-    latents_a: LatentsField = InputField(
-        description=FieldDescriptions.latents,
-        input=Input.Connection,
-    )
-    latents_b: LatentsField = InputField(
-        description=FieldDescriptions.latents,
-        input=Input.Connection,
-    )
-    alpha: float = InputField(default=0.5, description=FieldDescriptions.blend_alpha)
-
-    def invoke(self, context: InvocationContext) -> LatentsOutput:
-        latents_a = context.tensors.load(self.latents_a.latents_name)
-        latents_b = context.tensors.load(self.latents_b.latents_name)
-
-        if latents_a.shape != latents_b.shape:
-            raise Exception("Latents to blend must be the same size.")
-
-        device = TorchDevice.choose_torch_device()
-
-        def slerp(
-            t: Union[float, npt.NDArray[Any]],  # FIXME: maybe use np.float32 here?
-            v0: Union[torch.Tensor, npt.NDArray[Any]],
-            v1: Union[torch.Tensor, npt.NDArray[Any]],
-            DOT_THRESHOLD: float = 0.9995,
-        ) -> Union[torch.Tensor, npt.NDArray[Any]]:
-            """
-            Spherical linear interpolation
-
-            Args:
-                t (float/np.ndarray): Float value between 0.0 and 1.0
-                v0 (np.ndarray): Starting vector
-                v1 (np.ndarray): Final vector
-                DOT_THRESHOLD (float): Threshold for considering the two vectors as
-                    colineal. Not recommended to alter this.
-
-            Returns:
-                v2 (np.ndarray): Interpolation vector between v0 and v1
-            """
-            inputs_are_torch = False
-            if not isinstance(v0, np.ndarray):
-                inputs_are_torch = True
-                v0 = v0.detach().cpu().numpy()
-            if not isinstance(v1, np.ndarray):
-                inputs_are_torch = True
-                v1 = v1.detach().cpu().numpy()
-
-            dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
-            if np.abs(dot) > DOT_THRESHOLD:
-                v2 = (1 - t) * v0 + t * v1
-            else:
-                theta_0 = np.arccos(dot)
-                sin_theta_0 = np.sin(theta_0)
-                theta_t = theta_0 * t
-                sin_theta_t = np.sin(theta_t)
-                s0 = np.sin(theta_0 - theta_t) / sin_theta_0
-                s1 = sin_theta_t / sin_theta_0
-                v2 = s0 * v0 + s1 * v1
-
-            if inputs_are_torch:
-                v2_torch: torch.Tensor = torch.from_numpy(v2).to(device)
-                return v2_torch
-            else:
-                assert isinstance(v2, np.ndarray)
-                return v2
-
-        # blend
-        bl = slerp(self.alpha, latents_a, latents_b)
-        assert isinstance(bl, torch.Tensor)
-        blended_latents: torch.Tensor = bl  # for type checking convenience
-
-        # https://discuss.huggingface.co/t/memory-usage-by-later-pipeline-stages/23699
-        blended_latents = blended_latents.to("cpu")
-
-        TorchDevice.empty_cache()
-
-        name = context.tensors.save(tensor=blended_latents)
-        return LatentsOutput.build(latents_name=name, latents=blended_latents, seed=self.latents_a.seed)
-
-
-# The Crop Latents node was copied from @skunkworxdark's implementation here:
-# https://github.com/skunkworxdark/XYGrid_nodes/blob/74647fa9c1fa57d317a94bd43ca689af7f0aae5e/images_to_grids.py#L1117C1-L1167C80
-@invocation(
-    "crop_latents",
-    title="Crop Latents",
-    tags=["latents", "crop"],
-    category="latents",
-    version="1.0.2",
-)
-# TODO(ryand): Named `CropLatentsCoreInvocation` to prevent a conflict with custom node `CropLatentsInvocation`.
-# Currently, if the class names conflict then 'GET /openapi.json' fails.
-class CropLatentsCoreInvocation(BaseInvocation):
-    """Crops a latent-space tensor to a box specified in image-space. The box dimensions and coordinates must be
-    divisible by the latent scale factor of 8.
-    """
-
-    latents: LatentsField = InputField(
-        description=FieldDescriptions.latents,
-        input=Input.Connection,
-    )
-    x: int = InputField(
-        ge=0,
-        multiple_of=LATENT_SCALE_FACTOR,
-        description="The left x coordinate (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
-    )
-    y: int = InputField(
-        ge=0,
-        multiple_of=LATENT_SCALE_FACTOR,
-        description="The top y coordinate (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
-    )
-    width: int = InputField(
-        ge=1,
-        multiple_of=LATENT_SCALE_FACTOR,
-        description="The width (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
-    )
-    height: int = InputField(
-        ge=1,
-        multiple_of=LATENT_SCALE_FACTOR,
-        description="The height (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
-    )
-
-    def invoke(self, context: InvocationContext) -> LatentsOutput:
-        latents = context.tensors.load(self.latents.latents_name)
-
-        x1 = self.x // LATENT_SCALE_FACTOR
-        y1 = self.y // LATENT_SCALE_FACTOR
-        x2 = x1 + (self.width // LATENT_SCALE_FACTOR)
-        y2 = y1 + (self.height // LATENT_SCALE_FACTOR)
-
-        cropped_latents = latents[..., y1:y2, x1:x2]
-
-        name = context.tensors.save(tensor=cropped_latents)
-
-        return LatentsOutput.build(latents_name=name, latents=cropped_latents)
-
-
-@invocation_output("ideal_size_output")
-class IdealSizeOutput(BaseInvocationOutput):
-    """Base class for invocations that output an image"""
-
-    width: int = OutputField(description="The ideal width of the image (in pixels)")
-    height: int = OutputField(description="The ideal height of the image (in pixels)")
-
-
-@invocation(
-    "ideal_size",
-    title="Ideal Size",
-    tags=["latents", "math", "ideal_size"],
-    version="1.0.3",
-)
-class IdealSizeInvocation(BaseInvocation):
-    """Calculates the ideal size for generation to avoid duplication"""
-
-    width: int = InputField(default=1024, description="Final image width")
-    height: int = InputField(default=576, description="Final image height")
-    unet: UNetField = InputField(default=None, description=FieldDescriptions.unet)
-    multiplier: float = InputField(
-        default=1.0,
-        description="Amount to multiply the model's dimensions by when calculating the ideal size (may result in initial generation artifacts if too large)",
-    )
-
-    def trim_to_multiple_of(self, *args: int, multiple_of: int = LATENT_SCALE_FACTOR) -> Tuple[int, ...]:
-        return tuple((x - x % multiple_of) for x in args)
-
-    def invoke(self, context: InvocationContext) -> IdealSizeOutput:
-        unet_config = context.models.get_config(self.unet.unet.key)
-        aspect = self.width / self.height
-        dimension: float = 512
-        if unet_config.base == BaseModelType.StableDiffusion2:
-            dimension = 768
-        elif unet_config.base == BaseModelType.StableDiffusionXL:
-            dimension = 1024
-        dimension = dimension * self.multiplier
-        min_dimension = math.floor(dimension * 0.5)
-        model_area = dimension * dimension  # hardcoded for now since all models are trained on square images
-
-        if aspect > 1.0:
-            init_height = max(min_dimension, math.sqrt(model_area / aspect))
-            init_width = init_height * aspect
-        else:
-            init_width = max(min_dimension, math.sqrt(model_area * aspect))
-            init_height = init_width / aspect
-
-        scaled_width, scaled_height = self.trim_to_multiple_of(
-            math.floor(init_width),
-            math.floor(init_height),
-        )
-
-        return IdealSizeOutput(width=scaled_width, height=scaled_height)
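The removed LatentsToImageInvocation above carries the decode-side normalisation worth calling out: latents are divided by the VAE scaling factor before decoding, and the decoder output in [-1, 1] is mapped back to [0, 1]. A toy sketch (editor's illustration, not part of the commit; the scaling factor shown is the usual SD 1.x value and the decoder call is stubbed out):

import torch

scaling_factor = 0.18215                              # illustrative; the real value comes from vae.config
latents = torch.randn(1, 4, 64, 64) / scaling_factor  # mirrors "latents = latents / vae.config.scaling_factor"

decoded = torch.tanh(torch.randn(1, 3, 512, 512))     # stand-in for vae.decode(latents, return_dict=False)[0]
image = (decoded / 2 + 0.5).clamp(0, 1)               # denormalize to [0, 1]
np_image = image.cpu().permute(0, 2, 3, 1).float().numpy()
print(np_image.shape)                                 # (1, 512, 512, 3), ready for numpy_to_pil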
invokeai/app/invocations/ideal_size.py (new file, 65 lines)
@ -0,0 +1,65 @@
import math
from typing import Tuple

from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output
from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR
from invokeai.app.invocations.fields import FieldDescriptions, InputField, OutputField
from invokeai.app.invocations.model import UNetField
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.model_manager.config import BaseModelType


@invocation_output("ideal_size_output")
class IdealSizeOutput(BaseInvocationOutput):
    """Output holding the ideal width and height for generation"""

    width: int = OutputField(description="The ideal width of the image (in pixels)")
    height: int = OutputField(description="The ideal height of the image (in pixels)")


@invocation(
    "ideal_size",
    title="Ideal Size",
    tags=["latents", "math", "ideal_size"],
    version="1.0.3",
)
class IdealSizeInvocation(BaseInvocation):
    """Calculates the ideal size for generation to avoid duplication"""

    width: int = InputField(default=1024, description="Final image width")
    height: int = InputField(default=576, description="Final image height")
    unet: UNetField = InputField(default=None, description=FieldDescriptions.unet)
    multiplier: float = InputField(
        default=1.0,
        description="Amount to multiply the model's dimensions by when calculating the ideal size (may result in "
        "initial generation artifacts if too large)",
    )

    def trim_to_multiple_of(self, *args: int, multiple_of: int = LATENT_SCALE_FACTOR) -> Tuple[int, ...]:
        return tuple((x - x % multiple_of) for x in args)

    def invoke(self, context: InvocationContext) -> IdealSizeOutput:
        unet_config = context.models.get_config(self.unet.unet.key)
        aspect = self.width / self.height
        dimension: float = 512
        if unet_config.base == BaseModelType.StableDiffusion2:
            dimension = 768
        elif unet_config.base == BaseModelType.StableDiffusionXL:
            dimension = 1024
        dimension = dimension * self.multiplier
        min_dimension = math.floor(dimension * 0.5)
        model_area = dimension * dimension  # hardcoded for now since all models are trained on square images

        if aspect > 1.0:
            init_height = max(min_dimension, math.sqrt(model_area / aspect))
            init_width = init_height * aspect
        else:
            init_width = max(min_dimension, math.sqrt(model_area * aspect))
            init_height = init_width / aspect

        scaled_width, scaled_height = self.trim_to_multiple_of(
            math.floor(init_width),
            math.floor(init_height),
        )

        return IdealSizeOutput(width=scaled_width, height=scaled_height)
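For reference, the sizing arithmetic above can be exercised outside the node graph. The following standalone sketch is illustrative only and is not part of the changed files; the LATENT_SCALE_FACTOR value of 8 and the 1024 base dimension used in the example call are assumptions carried over from the code above.

import math
from typing import Tuple

LATENT_SCALE_FACTOR = 8  # assumed latent downscale factor


def trim_to_multiple_of(*args: int, multiple_of: int = LATENT_SCALE_FACTOR) -> Tuple[int, ...]:
    # Round each dimension down to the nearest multiple of the latent scale factor.
    return tuple((x - x % multiple_of) for x in args)


def ideal_size(width: int, height: int, dimension: float) -> Tuple[int, int]:
    """Pick generation dimensions close to the model's trained area while matching the target aspect ratio."""
    aspect = width / height
    min_dimension = math.floor(dimension * 0.5)
    model_area = dimension * dimension  # models assumed to be trained on square images
    if aspect > 1.0:
        init_height = max(min_dimension, math.sqrt(model_area / aspect))
        init_width = init_height * aspect
    else:
        init_width = max(min_dimension, math.sqrt(model_area * aspect))
        init_height = init_width / aspect
    return trim_to_multiple_of(math.floor(init_width), math.floor(init_height))


# An SDXL-sized base (1024) with a 1920x1080 target yields (1360, 768).
print(ideal_size(1920, 1080, 1024))

The point of the calculation is that the generated area stays near the model's training area, which is why a 1920x1080 target maps to 1360x768 for an initial pass rather than full resolution.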
125
invokeai/app/invocations/image_to_latents.py
Normal file
@ -0,0 +1,125 @@
from functools import singledispatchmethod

import einops
import torch
from diffusers.models.attention_processor import (
    AttnProcessor2_0,
    LoRAAttnProcessor2_0,
    LoRAXFormersAttnProcessor,
    XFormersAttnProcessor,
)
from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL
from diffusers.models.autoencoders.autoencoder_tiny import AutoencoderTiny

from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
from invokeai.app.invocations.denoise_latents import DEFAULT_PRECISION
from invokeai.app.invocations.fields import (
    FieldDescriptions,
    ImageField,
    Input,
    InputField,
)
from invokeai.app.invocations.model import VAEField
from invokeai.app.invocations.primitives import LatentsOutput
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.model_manager import LoadedModel
from invokeai.backend.stable_diffusion.diffusers_pipeline import image_resized_to_grid_as_tensor


@invocation(
    "i2l",
    title="Image to Latents",
    tags=["latents", "image", "vae", "i2l"],
    category="latents",
    version="1.0.2",
)
class ImageToLatentsInvocation(BaseInvocation):
    """Encodes an image into latents."""

    image: ImageField = InputField(
        description="The image to encode",
    )
    vae: VAEField = InputField(
        description=FieldDescriptions.vae,
        input=Input.Connection,
    )
    tiled: bool = InputField(default=False, description=FieldDescriptions.tiled)
    fp32: bool = InputField(default=DEFAULT_PRECISION == "float32", description=FieldDescriptions.fp32)

    @staticmethod
    def vae_encode(vae_info: LoadedModel, upcast: bool, tiled: bool, image_tensor: torch.Tensor) -> torch.Tensor:
        with vae_info as vae:
            assert isinstance(vae, torch.nn.Module)
            orig_dtype = vae.dtype
            if upcast:
                vae.to(dtype=torch.float32)

                use_torch_2_0_or_xformers = hasattr(vae.decoder, "mid_block") and isinstance(
                    vae.decoder.mid_block.attentions[0].processor,
                    (
                        AttnProcessor2_0,
                        XFormersAttnProcessor,
                        LoRAXFormersAttnProcessor,
                        LoRAAttnProcessor2_0,
                    ),
                )
                # if xformers or torch_2_0 is used attention block does not need
                # to be in float32 which can save lots of memory
                if use_torch_2_0_or_xformers:
                    vae.post_quant_conv.to(orig_dtype)
                    vae.decoder.conv_in.to(orig_dtype)
                    vae.decoder.mid_block.to(orig_dtype)
                # else:
                #     latents = latents.float()

            else:
                vae.to(dtype=torch.float16)
                # latents = latents.half()

            if tiled:
                vae.enable_tiling()
            else:
                vae.disable_tiling()

            # non_noised_latents_from_image
            image_tensor = image_tensor.to(device=vae.device, dtype=vae.dtype)
            with torch.inference_mode():
                latents = ImageToLatentsInvocation._encode_to_tensor(vae, image_tensor)

            latents = vae.config.scaling_factor * latents
            latents = latents.to(dtype=orig_dtype)

        return latents

    @torch.no_grad()
    def invoke(self, context: InvocationContext) -> LatentsOutput:
        image = context.images.get_pil(self.image.image_name)

        vae_info = context.models.load(self.vae.vae)

        image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
        if image_tensor.dim() == 3:
            image_tensor = einops.rearrange(image_tensor, "c h w -> 1 c h w")

        latents = self.vae_encode(vae_info, self.fp32, self.tiled, image_tensor)

        latents = latents.to("cpu")
        name = context.tensors.save(tensor=latents)
        return LatentsOutput.build(latents_name=name, latents=latents, seed=None)

    @singledispatchmethod
    @staticmethod
    def _encode_to_tensor(vae: AutoencoderKL, image_tensor: torch.FloatTensor) -> torch.FloatTensor:
        assert isinstance(vae, torch.nn.Module)
        image_tensor_dist = vae.encode(image_tensor).latent_dist
        latents: torch.Tensor = image_tensor_dist.sample().to(
            dtype=vae.dtype
        )  # FIXME: uses torch.randn. make reproducible!
        return latents

    @_encode_to_tensor.register
    @staticmethod
    def _(vae: AutoencoderTiny, image_tensor: torch.FloatTensor) -> torch.FloatTensor:
        assert isinstance(vae, torch.nn.Module)
        latents: torch.FloatTensor = vae.encode(image_tensor).latents
        return latents
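Outside the invocation and context machinery, the core of the encode step is a plain diffusers VAE call: sample from the posterior and apply the configured scaling factor. A minimal sketch, illustrative only and not part of the changed files; the checkpoint name is just an example, and the tiling and dtype handling above is omitted.

import torch
from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL

# Example checkpoint; any VAE with the standard SD config behaves the same way.
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")
vae.eval()

# Stand-in "image" batch in [-1, 1], shape (N, C, H, W); replace with a real preprocessed image.
image_tensor = torch.rand(1, 3, 512, 512) * 2 - 1

with torch.inference_mode():
    latents = vae.encode(image_tensor).latent_dist.sample()
    latents = vae.config.scaling_factor * latents  # same scaling the invocation applies

print(latents.shape)  # (1, 4, 64, 64) for a 512x512 input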
107
invokeai/app/invocations/latents_to_image.py
Normal file
@ -0,0 +1,107 @@
import torch
from diffusers.image_processor import VaeImageProcessor
from diffusers.models.attention_processor import (
    AttnProcessor2_0,
    LoRAAttnProcessor2_0,
    LoRAXFormersAttnProcessor,
    XFormersAttnProcessor,
)
from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL
from diffusers.models.autoencoders.autoencoder_tiny import AutoencoderTiny
from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel

from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
from invokeai.app.invocations.denoise_latents import DEFAULT_PRECISION
from invokeai.app.invocations.fields import (
    FieldDescriptions,
    Input,
    InputField,
    LatentsField,
    WithBoard,
    WithMetadata,
)
from invokeai.app.invocations.model import VAEField
from invokeai.app.invocations.primitives import ImageOutput
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.stable_diffusion import set_seamless
from invokeai.backend.util.devices import TorchDevice


@invocation(
    "l2i",
    title="Latents to Image",
    tags=["latents", "image", "vae", "l2i"],
    category="latents",
    version="1.2.2",
)
class LatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard):
    """Generates an image from latents."""

    latents: LatentsField = InputField(
        description=FieldDescriptions.latents,
        input=Input.Connection,
    )
    vae: VAEField = InputField(
        description=FieldDescriptions.vae,
        input=Input.Connection,
    )
    tiled: bool = InputField(default=False, description=FieldDescriptions.tiled)
    fp32: bool = InputField(default=DEFAULT_PRECISION == "float32", description=FieldDescriptions.fp32)

    @torch.no_grad()
    def invoke(self, context: InvocationContext) -> ImageOutput:
        latents = context.tensors.load(self.latents.latents_name)

        vae_info = context.models.load(self.vae.vae)
        assert isinstance(vae_info.model, (UNet2DConditionModel, AutoencoderKL, AutoencoderTiny))
        with set_seamless(vae_info.model, self.vae.seamless_axes), vae_info as vae:
            assert isinstance(vae, torch.nn.Module)
            latents = latents.to(vae.device)
            if self.fp32:
                vae.to(dtype=torch.float32)

                use_torch_2_0_or_xformers = hasattr(vae.decoder, "mid_block") and isinstance(
                    vae.decoder.mid_block.attentions[0].processor,
                    (
                        AttnProcessor2_0,
                        XFormersAttnProcessor,
                        LoRAXFormersAttnProcessor,
                        LoRAAttnProcessor2_0,
                    ),
                )
                # if xformers or torch_2_0 is used attention block does not need
                # to be in float32 which can save lots of memory
                if use_torch_2_0_or_xformers:
                    vae.post_quant_conv.to(latents.dtype)
                    vae.decoder.conv_in.to(latents.dtype)
                    vae.decoder.mid_block.to(latents.dtype)
                else:
                    latents = latents.float()

            else:
                vae.to(dtype=torch.float16)
                latents = latents.half()

            if self.tiled or context.config.get().force_tiled_decode:
                vae.enable_tiling()
            else:
                vae.disable_tiling()

            # clear memory as vae decode can request a lot
            TorchDevice.empty_cache()

            with torch.inference_mode():
                # copied from diffusers pipeline
                latents = latents / vae.config.scaling_factor
                image = vae.decode(latents, return_dict=False)[0]
                image = (image / 2 + 0.5).clamp(0, 1)  # denormalize
                # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
                np_image = image.cpu().permute(0, 2, 3, 1).float().numpy()

                image = VaeImageProcessor.numpy_to_pil(np_image)[0]

        TorchDevice.empty_cache()

        image_dto = context.images.save(image=image)

        return ImageOutput.build(image_dto)
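The decode path mirrors the encode path: undo the scaling factor, decode, denormalize, and convert to PIL. A minimal sketch using diffusers directly, illustrative only and not part of the changed files; the checkpoint name is an example and the seamless, tiling, and precision handling above is omitted.

import torch
from diffusers.image_processor import VaeImageProcessor
from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL

vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")  # example checkpoint
vae.eval()

latents = torch.randn(1, 4, 64, 64) * vae.config.scaling_factor  # stand-in latents

with torch.inference_mode():
    latents = latents / vae.config.scaling_factor
    image = vae.decode(latents, return_dict=False)[0]
    image = (image / 2 + 0.5).clamp(0, 1)  # denormalize to [0, 1]
    np_image = image.cpu().permute(0, 2, 3, 1).float().numpy()

pil_image = VaeImageProcessor.numpy_to_pil(np_image)[0]
pil_image.save("decoded.png")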
103
invokeai/app/invocations/resize_latents.py
Normal file
@ -0,0 +1,103 @@
from typing import Literal

import torch

from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR
from invokeai.app.invocations.fields import (
    FieldDescriptions,
    Input,
    InputField,
    LatentsField,
)
from invokeai.app.invocations.primitives import LatentsOutput
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.util.devices import TorchDevice

LATENTS_INTERPOLATION_MODE = Literal["nearest", "linear", "bilinear", "bicubic", "trilinear", "area", "nearest-exact"]


@invocation(
    "lresize",
    title="Resize Latents",
    tags=["latents", "resize"],
    category="latents",
    version="1.0.2",
)
class ResizeLatentsInvocation(BaseInvocation):
    """Resizes latents to explicit width/height (in pixels). Provided dimensions are floor-divided by 8."""

    latents: LatentsField = InputField(
        description=FieldDescriptions.latents,
        input=Input.Connection,
    )
    width: int = InputField(
        ge=64,
        multiple_of=LATENT_SCALE_FACTOR,
        description=FieldDescriptions.width,
    )
    height: int = InputField(
        ge=64,
        multiple_of=LATENT_SCALE_FACTOR,
        description=FieldDescriptions.height,
    )
    mode: LATENTS_INTERPOLATION_MODE = InputField(default="bilinear", description=FieldDescriptions.interp_mode)
    antialias: bool = InputField(default=False, description=FieldDescriptions.torch_antialias)

    def invoke(self, context: InvocationContext) -> LatentsOutput:
        latents = context.tensors.load(self.latents.latents_name)
        device = TorchDevice.choose_torch_device()

        resized_latents = torch.nn.functional.interpolate(
            latents.to(device),
            size=(self.height // LATENT_SCALE_FACTOR, self.width // LATENT_SCALE_FACTOR),
            mode=self.mode,
            antialias=self.antialias if self.mode in ["bilinear", "bicubic"] else False,
        )

        # https://discuss.huggingface.co/t/memory-usage-by-later-pipeline-stages/23699
        resized_latents = resized_latents.to("cpu")

        TorchDevice.empty_cache()

        name = context.tensors.save(tensor=resized_latents)
        return LatentsOutput.build(latents_name=name, latents=resized_latents, seed=self.latents.seed)


@invocation(
    "lscale",
    title="Scale Latents",
    tags=["latents", "resize"],
    category="latents",
    version="1.0.2",
)
class ScaleLatentsInvocation(BaseInvocation):
    """Scales latents by a given factor."""

    latents: LatentsField = InputField(
        description=FieldDescriptions.latents,
        input=Input.Connection,
    )
    scale_factor: float = InputField(gt=0, description=FieldDescriptions.scale_factor)
    mode: LATENTS_INTERPOLATION_MODE = InputField(default="bilinear", description=FieldDescriptions.interp_mode)
    antialias: bool = InputField(default=False, description=FieldDescriptions.torch_antialias)

    def invoke(self, context: InvocationContext) -> LatentsOutput:
        latents = context.tensors.load(self.latents.latents_name)

        device = TorchDevice.choose_torch_device()

        # resizing
        resized_latents = torch.nn.functional.interpolate(
            latents.to(device),
            scale_factor=self.scale_factor,
            mode=self.mode,
            antialias=self.antialias if self.mode in ["bilinear", "bicubic"] else False,
        )

        # https://discuss.huggingface.co/t/memory-usage-by-later-pipeline-stages/23699
        resized_latents = resized_latents.to("cpu")
        TorchDevice.empty_cache()

        name = context.tensors.save(tensor=resized_latents)
        return LatentsOutput.build(latents_name=name, latents=resized_latents, seed=self.latents.seed)
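Both nodes above are thin wrappers around torch.nn.functional.interpolate applied to the latent tensor. A short sketch, illustrative only and not part of the changed files; the factor of 8 is the assumed latent downscale.

import torch

LATENT_SCALE_FACTOR = 8  # assumed latent downscale factor

latents = torch.randn(1, 4, 64, 64)  # latents for a 512x512 image

# Resize to the latent grid of a 768-wide by 512-high image, as the Resize Latents node does.
resized = torch.nn.functional.interpolate(
    latents,
    size=(512 // LATENT_SCALE_FACTOR, 768 // LATENT_SCALE_FACTOR),
    mode="bilinear",
    antialias=True,
)
print(resized.shape)  # torch.Size([1, 4, 64, 96])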
34
invokeai/app/invocations/scheduler.py
Normal file
@ -0,0 +1,34 @@
from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output
from invokeai.app.invocations.constants import SCHEDULER_NAME_VALUES
from invokeai.app.invocations.fields import (
    FieldDescriptions,
    InputField,
    OutputField,
    UIType,
)
from invokeai.app.services.shared.invocation_context import InvocationContext


@invocation_output("scheduler_output")
class SchedulerOutput(BaseInvocationOutput):
    scheduler: SCHEDULER_NAME_VALUES = OutputField(description=FieldDescriptions.scheduler, ui_type=UIType.Scheduler)


@invocation(
    "scheduler",
    title="Scheduler",
    tags=["scheduler"],
    category="latents",
    version="1.0.0",
)
class SchedulerInvocation(BaseInvocation):
    """Selects a scheduler."""

    scheduler: SCHEDULER_NAME_VALUES = InputField(
        default="euler",
        description=FieldDescriptions.scheduler,
        ui_type=UIType.Scheduler,
    )

    def invoke(self, context: InvocationContext) -> SchedulerOutput:
        return SchedulerOutput(scheduler=self.scheduler)
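The node itself only passes the selected name through; downstream consumers map that name to a diffusers scheduler class. The table below is a hypothetical illustration of that pattern and is not the mapping used internally by InvokeAI.

from diffusers import DDIMScheduler, DPMSolverMultistepScheduler, EulerDiscreteScheduler

# Hypothetical name-to-class table, just to show how a scheduler string becomes an object.
SCHEDULER_CLASSES = {
    "ddim": DDIMScheduler,
    "euler": EulerDiscreteScheduler,
    "dpmpp_2m": DPMSolverMultistepScheduler,
}


def make_scheduler(name: str):
    # Real pipelines would build the scheduler from the loaded model's scheduler config via from_config().
    return SCHEDULER_CLASSES[name]()


print(make_scheduler("euler").__class__.__name__)  # EulerDiscreteScheduler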
@ -12,6 +12,7 @@ from invokeai.app.invocations.baseinvocation import (
     invocation_output,
 )
 from invokeai.app.invocations.constants import SCHEDULER_NAME_VALUES
+from invokeai.app.invocations.denoise_latents import SchedulerOutput
 from invokeai.app.invocations.fields import (
     BoardField,
     ColorField,
@ -31,7 +32,6 @@ from invokeai.app.invocations.fields import (
     WithMetadata,
     WithWorkflow,
 )
-from invokeai.app.invocations.latent import SchedulerOutput
 from invokeai.app.invocations.metadata import MetadataItemField, MetadataItemOutput, MetadataOutput
 from invokeai.app.invocations.model import (
     CLIPField,
@ -108,7 +108,7 @@ __all__ = [
     "WithBoard",
     "WithMetadata",
     "WithWorkflow",
-    # invokeai.app.invocations.latent
+    # invokeai.app.invocations.scheduler
     "SchedulerOutput",
     # invokeai.app.invocations.metadata
     "MetadataItemField",
@ -224,7 +224,7 @@ follow_imports = "skip" # skips type checking of the modules listed below
 module = [
     "invokeai.app.api.routers.models",
     "invokeai.app.invocations.compel",
-    "invokeai.app.invocations.latent",
+    "invokeai.app.invocations.denoise_latents",
     "invokeai.app.services.invocation_stats.invocation_stats_default",
     "invokeai.app.services.model_manager.model_manager_base",
     "invokeai.app.services.model_manager.model_manager_default",