Compare commits


31 Commits

SHA1 Message Date
59284c707e Change tiling strategy to make TiledStableDiffusionRefineInvocation work with more tile shapes and overlaps. 2024-06-10 16:40:13 -04:00
911792f258 Expose a few more params from TiledStableDiffusionRefineInvocation. 2024-06-10 15:38:55 -04:00
9567c6e196 Add support for LoRA models in TiledStableDiffusionRefineInvocation. 2024-06-10 11:40:46 -04:00
6e47bd14af Add naive ControlNet support to TiledStableDiffusionRefineInvocation 2024-06-10 10:52:14 -04:00
9ac9b6a014 Fix ControlNetModel type hint import source. 2024-06-07 16:18:50 -04:00
459d487620 Rough prototype of TiledStableDiffusionRefineInvocation is working. 2024-06-07 15:05:57 -04:00
787e1bbb5f WIP - TiledStableDiffusionRefine 2024-06-07 12:06:35 -04:00
bb5648983f Minor improvements to LatentsToImageInvocation type hints. 2024-06-07 11:45:42 -04:00
da066979cf Expose vae_decode(...) as a staticmethod on LatentsToImageInvocation. 2024-06-07 11:41:39 -04:00
2c03a0fa53 Fix return type of prepare_noise_and_latents(...). 2024-06-07 11:01:50 -04:00
ea9fc99ce7 Make init_scheduler() a staticmethod on DenoiseLatentsInvocation so that it can be called externally. 2024-06-07 11:00:37 -04:00
a406fb725a Only allow a single positive/negative prompt conditioning input for tiled refine. 2024-06-07 10:01:50 -04:00
fe4112c54e WIP on TiledStableDiffusionRefine 2024-06-06 17:39:34 -04:00
385ff0f86c Convert several methods in DenoiseLatentsInvocation to staticmethods so that they can be called externally. 2024-06-06 17:39:04 -04:00
5c3517e2a6 Simplify the logic in prepare_noise_and_latents(...). 2024-06-06 15:16:34 -04:00
7cb7f5107e Split out the prepare_noise_and_latents(...) logic in DenoiseLatentsInvocation so that it can be called from other invocations. 2024-06-06 15:10:04 -04:00
084ccccfff (minor) Add a TODO note to get_scheduler(...). 2024-06-06 15:04:31 -04:00
b2cf57d8ff Remove unused 'uses_inpainting_model' monkey-patch. I don't know the origin of this patch, but there are currently no other references to it in InvokeAI or diffusers. 2024-06-06 11:44:42 -04:00
f5bc616699 (minor) Use SilenceWarnings as a decorator rather than a context manager to save an indentation level. 2024-06-06 10:40:19 -04:00
50021dad94 Tidy SilenceWarnings context manager:
- Fix type errors
- Enable SilenceWarnings to be used as both a context manager and a decorator
- Remove duplicate implementation
- Check the initial verbosity on __enter__() rather than __init__()
2024-06-06 10:36:12 -04:00
dda98f7a4b Tidy latent.py imports to all use absolute import paths. 2024-06-06 09:30:49 -04:00
76c97ec411 Fix all comparisons against the DEFAULT_PRECISION constant. DEFAULT_PRECISION is a torch.dtype, but it was previously compared to a str in a number of places, so those comparisons always resolved to False. This is a bugfix that results in a change to the default behavior. In practice, it will not change the behavior for most users, because it only changes behavior when a user has configured float32 as their default precision. (See the dtype-comparison sketch after the commit list.) 2024-06-06 09:16:45 -04:00
78852228cd Move SchedulerInvocation to a new file. No functional changes. 2024-06-05 17:18:39 -04:00
dec0ffd47c Move CreateDenoiseMaskInvocation to its own file. No functional changes. 2024-06-05 14:59:45 -04:00
638bf33483 Move CreateGradientMaskInvocation to its own file. No functional changes. 2024-06-05 14:48:32 -04:00
b961495b57 Move LatentsToImageInvocation to its own file. No functional changes. 2024-06-05 13:53:53 -04:00
b35cde7db7 Move ImageToLatentsInvocation to its own file. No functional changes. 2024-06-05 13:47:38 -04:00
103e34691b Move ScaleLatentsInvocation and ResizeLatentsInvocation to their own file. No functional changes. 2024-06-05 11:05:44 -04:00
0d90999a19 Move BlendLatentsInvocation to its own file. No functional changes. 2024-06-05 11:04:17 -04:00
4cefa48307 Move CropLatentsCoreInvocation to its own file. No functional changes. 2024-06-05 10:53:24 -04:00
6ade5df25c Move IdealSizeInvocation to its own file. No functional changes. 2024-06-05 10:47:26 -04:00
17 changed files with 1431 additions and 898 deletions
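A note on the bugfix in commit 76c97ec411: a torch.dtype never compares equal to a str, so the old string comparisons silently evaluated to False. A minimal sketch of the failure mode (the values are illustrative; only DEFAULT_PRECISION itself comes from invokeai.app.invocations.constants):

import torch

DEFAULT_PRECISION = torch.float32  # a torch.dtype, as in invokeai.app.invocations.constants

print(DEFAULT_PRECISION == "float32")      # False -- a dtype never equals a str, so such branches were dead code
print(DEFAULT_PRECISION == torch.float32)  # True  -- the corrected dtype-to-dtype comparison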

View File

@@ -0,0 +1,98 @@
from typing import Any, Union
import numpy as np
import numpy.typing as npt
import torch
from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
from invokeai.app.invocations.fields import FieldDescriptions, Input, InputField, LatentsField
from invokeai.app.invocations.primitives import LatentsOutput
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.util.devices import TorchDevice
@invocation(
"lblend",
title="Blend Latents",
tags=["latents", "blend"],
category="latents",
version="1.0.3",
)
class BlendLatentsInvocation(BaseInvocation):
"""Blend two latents using a given alpha. Latents must have same size."""
latents_a: LatentsField = InputField(
description=FieldDescriptions.latents,
input=Input.Connection,
)
latents_b: LatentsField = InputField(
description=FieldDescriptions.latents,
input=Input.Connection,
)
alpha: float = InputField(default=0.5, description=FieldDescriptions.blend_alpha)
def invoke(self, context: InvocationContext) -> LatentsOutput:
latents_a = context.tensors.load(self.latents_a.latents_name)
latents_b = context.tensors.load(self.latents_b.latents_name)
if latents_a.shape != latents_b.shape:
raise Exception("Latents to blend must be the same size.")
device = TorchDevice.choose_torch_device()
def slerp(
t: Union[float, npt.NDArray[Any]], # FIXME: maybe use np.float32 here?
v0: Union[torch.Tensor, npt.NDArray[Any]],
v1: Union[torch.Tensor, npt.NDArray[Any]],
DOT_THRESHOLD: float = 0.9995,
) -> Union[torch.Tensor, npt.NDArray[Any]]:
"""
Spherical linear interpolation
Args:
t (float/np.ndarray): Float value between 0.0 and 1.0
v0 (np.ndarray): Starting vector
v1 (np.ndarray): Final vector
DOT_THRESHOLD (float): Threshold for considering the two vectors as
collinear. Not recommended to alter this.
Returns:
v2 (np.ndarray): Interpolation vector between v0 and v1
"""
inputs_are_torch = False
if not isinstance(v0, np.ndarray):
inputs_are_torch = True
v0 = v0.detach().cpu().numpy()
if not isinstance(v1, np.ndarray):
inputs_are_torch = True
v1 = v1.detach().cpu().numpy()
dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
if np.abs(dot) > DOT_THRESHOLD:
v2 = (1 - t) * v0 + t * v1
else:
theta_0 = np.arccos(dot)
sin_theta_0 = np.sin(theta_0)
theta_t = theta_0 * t
sin_theta_t = np.sin(theta_t)
s0 = np.sin(theta_0 - theta_t) / sin_theta_0
s1 = sin_theta_t / sin_theta_0
v2 = s0 * v0 + s1 * v1
if inputs_are_torch:
v2_torch: torch.Tensor = torch.from_numpy(v2).to(device)
return v2_torch
else:
assert isinstance(v2, np.ndarray)
return v2
# blend
bl = slerp(self.alpha, latents_a, latents_b)
assert isinstance(bl, torch.Tensor)
blended_latents: torch.Tensor = bl # for type checking convenience
# https://discuss.huggingface.co/t/memory-usage-by-later-pipeline-stages/23699
blended_latents = blended_latents.to("cpu")
TorchDevice.empty_cache()
name = context.tensors.save(tensor=blended_latents)
return LatentsOutput.build(latents_name=name, latents=blended_latents, seed=self.latents_a.seed)
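For readers unfamiliar with spherical linear interpolation, here is a minimal NumPy-only sketch of the slerp(...) helper above; the torch round-tripping and the near-collinear fallback are omitted for brevity:

import numpy as np

def slerp_sketch(t: float, v0: np.ndarray, v1: np.ndarray) -> np.ndarray:
    # Angle between the vectors (assumed not collinear), then walk along the arc.
    dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
    theta_0 = np.arccos(dot)
    theta_t = theta_0 * t
    s0 = np.sin(theta_0 - theta_t) / np.sin(theta_0)
    s1 = np.sin(theta_t) / np.sin(theta_0)
    return s0 * v0 + s1 * v1

v0 = np.array([1.0, 0.0])
v1 = np.array([0.0, 1.0])
print(slerp_sketch(0.5, v0, v1))  # ~[0.7071 0.7071] -- the midpoint stays on the unit circle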

View File

@@ -1,6 +1,7 @@
from typing import Literal
from invokeai.backend.stable_diffusion.schedulers import SCHEDULER_MAP
from invokeai.backend.util.devices import TorchDevice
LATENT_SCALE_FACTOR = 8
"""
@@ -15,3 +16,5 @@ SCHEDULER_NAME_VALUES = Literal[tuple(SCHEDULER_MAP.keys())]
IMAGE_MODES = Literal["L", "RGB", "RGBA", "CMYK", "YCbCr", "LAB", "HSV", "I", "F"]
"""A literal type for PIL image modes supported by Invoke"""
DEFAULT_PRECISION = TorchDevice.choose_torch_dtype()

View File

@@ -0,0 +1,80 @@
from typing import Optional
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import resize as tv_resize
from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
from invokeai.app.invocations.constants import DEFAULT_PRECISION
from invokeai.app.invocations.fields import FieldDescriptions, ImageField, Input, InputField
from invokeai.app.invocations.image_to_latents import ImageToLatentsInvocation
from invokeai.app.invocations.model import VAEField
from invokeai.app.invocations.primitives import DenoiseMaskOutput
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.stable_diffusion.diffusers_pipeline import image_resized_to_grid_as_tensor
@invocation(
"create_denoise_mask",
title="Create Denoise Mask",
tags=["mask", "denoise"],
category="latents",
version="1.0.2",
)
class CreateDenoiseMaskInvocation(BaseInvocation):
"""Creates mask for denoising model run."""
vae: VAEField = InputField(description=FieldDescriptions.vae, input=Input.Connection, ui_order=0)
image: Optional[ImageField] = InputField(default=None, description="Image which will be masked", ui_order=1)
mask: ImageField = InputField(description="The mask to use when pasting", ui_order=2)
tiled: bool = InputField(default=False, description=FieldDescriptions.tiled, ui_order=3)
fp32: bool = InputField(
default=DEFAULT_PRECISION == torch.float32,
description=FieldDescriptions.fp32,
ui_order=4,
)
def prep_mask_tensor(self, mask_image: Image.Image) -> torch.Tensor:
if mask_image.mode != "L":
mask_image = mask_image.convert("L")
mask_tensor: torch.Tensor = image_resized_to_grid_as_tensor(mask_image, normalize=False)
if mask_tensor.dim() == 3:
mask_tensor = mask_tensor.unsqueeze(0)
# if shape is not None:
# mask_tensor = tv_resize(mask_tensor, shape, T.InterpolationMode.BILINEAR)
return mask_tensor
@torch.no_grad()
def invoke(self, context: InvocationContext) -> DenoiseMaskOutput:
if self.image is not None:
image = context.images.get_pil(self.image.image_name)
image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
if image_tensor.dim() == 3:
image_tensor = image_tensor.unsqueeze(0)
else:
image_tensor = None
mask = self.prep_mask_tensor(
context.images.get_pil(self.mask.image_name),
)
if image_tensor is not None:
vae_info = context.models.load(self.vae.vae)
img_mask = tv_resize(mask, image_tensor.shape[-2:], T.InterpolationMode.BILINEAR, antialias=False)
masked_image = image_tensor * torch.where(img_mask < 0.5, 0.0, 1.0)
# TODO:
masked_latents = ImageToLatentsInvocation.vae_encode(vae_info, self.fp32, self.tiled, masked_image.clone())
masked_latents_name = context.tensors.save(tensor=masked_latents)
else:
masked_latents_name = None
mask_name = context.tensors.save(tensor=mask)
return DenoiseMaskOutput.build(
mask_name=mask_name,
masked_latents_name=masked_latents_name,
gradient=False,
)

View File

@@ -0,0 +1,138 @@
from typing import Literal, Optional
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image, ImageFilter
from torchvision.transforms.functional import resize as tv_resize
from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output
from invokeai.app.invocations.constants import DEFAULT_PRECISION
from invokeai.app.invocations.fields import (
DenoiseMaskField,
FieldDescriptions,
ImageField,
Input,
InputField,
OutputField,
)
from invokeai.app.invocations.image_to_latents import ImageToLatentsInvocation
from invokeai.app.invocations.model import UNetField, VAEField
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.model_manager import LoadedModel
from invokeai.backend.model_manager.config import MainConfigBase, ModelVariantType
from invokeai.backend.stable_diffusion.diffusers_pipeline import image_resized_to_grid_as_tensor
@invocation_output("gradient_mask_output")
class GradientMaskOutput(BaseInvocationOutput):
"""Outputs a denoise mask and an image representing the total gradient of the mask."""
denoise_mask: DenoiseMaskField = OutputField(description="Mask for denoise model run")
expanded_mask_area: ImageField = OutputField(
description="Image representing the total gradient area of the mask. For paste-back purposes."
)
@invocation(
"create_gradient_mask",
title="Create Gradient Mask",
tags=["mask", "denoise"],
category="latents",
version="1.1.0",
)
class CreateGradientMaskInvocation(BaseInvocation):
"""Creates mask for denoising model run."""
mask: ImageField = InputField(default=None, description="Image which will be masked", ui_order=1)
edge_radius: int = InputField(
default=16, ge=0, description="How far to blur/expand the edges of the mask", ui_order=2
)
coherence_mode: Literal["Gaussian Blur", "Box Blur", "Staged"] = InputField(default="Gaussian Blur", ui_order=3)
minimum_denoise: float = InputField(
default=0.0, ge=0, le=1, description="Minimum denoise level for the coherence region", ui_order=4
)
image: Optional[ImageField] = InputField(
default=None,
description="OPTIONAL: Only connect for specialized Inpainting models, masked_latents will be generated from the image with the VAE",
title="[OPTIONAL] Image",
ui_order=6,
)
unet: Optional[UNetField] = InputField(
description="OPTIONAL: If the Unet is a specialized Inpainting model, masked_latents will be generated from the image with the VAE",
default=None,
input=Input.Connection,
title="[OPTIONAL] UNet",
ui_order=5,
)
vae: Optional[VAEField] = InputField(
default=None,
description="OPTIONAL: Only connect for specialized Inpainting models, masked_latents will be generated from the image with the VAE",
title="[OPTIONAL] VAE",
input=Input.Connection,
ui_order=7,
)
tiled: bool = InputField(default=False, description=FieldDescriptions.tiled, ui_order=8)
fp32: bool = InputField(
default=DEFAULT_PRECISION == torch.float32,
description=FieldDescriptions.fp32,
ui_order=9,
)
@torch.no_grad()
def invoke(self, context: InvocationContext) -> GradientMaskOutput:
mask_image = context.images.get_pil(self.mask.image_name, mode="L")
if self.edge_radius > 0:
if self.coherence_mode == "Box Blur":
blur_mask = mask_image.filter(ImageFilter.BoxBlur(self.edge_radius))
else: # Gaussian Blur OR Staged
# Gaussian Blur uses standard deviation. 1/2 radius is a good approximation
blur_mask = mask_image.filter(ImageFilter.GaussianBlur(self.edge_radius / 2))
blur_tensor: torch.Tensor = image_resized_to_grid_as_tensor(blur_mask, normalize=False)
# redistribute blur so that the original edges are 0 and blur outwards to 1
blur_tensor = (blur_tensor - 0.5) * 2
threshold = 1 - self.minimum_denoise
if self.coherence_mode == "Staged":
# wherever the blur_tensor is less than fully masked, convert it to threshold
blur_tensor = torch.where((blur_tensor < 1) & (blur_tensor > 0), threshold, blur_tensor)
else:
# wherever the blur_tensor is above threshold but less than 1, drop it to threshold
blur_tensor = torch.where((blur_tensor > threshold) & (blur_tensor < 1), threshold, blur_tensor)
else:
blur_tensor: torch.Tensor = image_resized_to_grid_as_tensor(mask_image, normalize=False)
mask_name = context.tensors.save(tensor=blur_tensor.unsqueeze(1))
# compute a [0, 1] mask from the blur_tensor
expanded_mask = torch.where((blur_tensor < 1), 0, 1)
expanded_mask_image = Image.fromarray((expanded_mask.squeeze(0).numpy() * 255).astype(np.uint8), mode="L")
expanded_image_dto = context.images.save(expanded_mask_image)
masked_latents_name = None
if self.unet is not None and self.vae is not None and self.image is not None:
# all three fields must be present at the same time
main_model_config = context.models.get_config(self.unet.unet.key)
assert isinstance(main_model_config, MainConfigBase)
if main_model_config.variant is ModelVariantType.Inpaint:
mask = blur_tensor
vae_info: LoadedModel = context.models.load(self.vae.vae)
image = context.images.get_pil(self.image.image_name)
image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
if image_tensor.dim() == 3:
image_tensor = image_tensor.unsqueeze(0)
img_mask = tv_resize(mask, image_tensor.shape[-2:], T.InterpolationMode.BILINEAR, antialias=False)
masked_image = image_tensor * torch.where(img_mask < 0.5, 0.0, 1.0)
masked_latents = ImageToLatentsInvocation.vae_encode(
vae_info, self.fp32, self.tiled, masked_image.clone()
)
masked_latents_name = context.tensors.save(tensor=masked_latents)
return GradientMaskOutput(
denoise_mask=DenoiseMaskField(mask_name=mask_name, masked_latents_name=masked_latents_name, gradient=True),
expanded_mask_area=ImageField(image_name=expanded_image_dto.image_name),
)
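A rough numeric sketch of the threshold logic in CreateGradientMaskInvocation above, using a hypothetical 1-D blurred mask and the non-"Staged" branch (values chosen purely for illustration):

import torch

blur = torch.tensor([1.0, 0.9, 0.75, 0.5, 0.25, 0.0])  # hypothetical blurred mask values in [0, 1]
blur = (blur - 0.5) * 2          # re-centre so the original mask edge sits at 0
minimum_denoise = 0.3
threshold = 1 - minimum_denoise  # 0.7

# Wherever the value is above the threshold but less than 1, drop it to the threshold.
clamped = torch.where((blur > threshold) & (blur < 1), threshold, blur)
print(clamped)  # tensor([ 1.0000,  0.7000,  0.5000,  0.0000, -0.5000, -1.0000])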

View File

@@ -0,0 +1,61 @@
from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR
from invokeai.app.invocations.fields import FieldDescriptions, Input, InputField, LatentsField
from invokeai.app.invocations.primitives import LatentsOutput
from invokeai.app.services.shared.invocation_context import InvocationContext
# The Crop Latents node was copied from @skunkworxdark's implementation here:
# https://github.com/skunkworxdark/XYGrid_nodes/blob/74647fa9c1fa57d317a94bd43ca689af7f0aae5e/images_to_grids.py#L1117C1-L1167C80
@invocation(
"crop_latents",
title="Crop Latents",
tags=["latents", "crop"],
category="latents",
version="1.0.2",
)
# TODO(ryand): Named `CropLatentsCoreInvocation` to prevent a conflict with custom node `CropLatentsInvocation`.
# Currently, if the class names conflict then 'GET /openapi.json' fails.
class CropLatentsCoreInvocation(BaseInvocation):
"""Crops a latent-space tensor to a box specified in image-space. The box dimensions and coordinates must be
divisible by the latent scale factor of 8.
"""
latents: LatentsField = InputField(
description=FieldDescriptions.latents,
input=Input.Connection,
)
x: int = InputField(
ge=0,
multiple_of=LATENT_SCALE_FACTOR,
description="The left x coordinate (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
)
y: int = InputField(
ge=0,
multiple_of=LATENT_SCALE_FACTOR,
description="The top y coordinate (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
)
width: int = InputField(
ge=1,
multiple_of=LATENT_SCALE_FACTOR,
description="The width (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
)
height: int = InputField(
ge=1,
multiple_of=LATENT_SCALE_FACTOR,
description="The height (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
)
def invoke(self, context: InvocationContext) -> LatentsOutput:
latents = context.tensors.load(self.latents.latents_name)
x1 = self.x // LATENT_SCALE_FACTOR
y1 = self.y // LATENT_SCALE_FACTOR
x2 = x1 + (self.width // LATENT_SCALE_FACTOR)
y2 = y1 + (self.height // LATENT_SCALE_FACTOR)
cropped_latents = latents[..., y1:y2, x1:x2]
name = context.tensors.save(tensor=cropped_latents)
return LatentsOutput.build(latents_name=name, latents=cropped_latents)
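A worked example of the image-space to latent-space conversion performed by CropLatentsCoreInvocation above, using hypothetical crop values (all multiples of LATENT_SCALE_FACTOR, as the field validators require):

LATENT_SCALE_FACTOR = 8
x, y, width, height = 64, 128, 256, 256  # hypothetical crop box in pixels

x1 = x // LATENT_SCALE_FACTOR            # 8
y1 = y // LATENT_SCALE_FACTOR            # 16
x2 = x1 + width // LATENT_SCALE_FACTOR   # 40
y2 = y1 + height // LATENT_SCALE_FACTOR  # 48

# cropped_latents = latents[..., y1:y2, x1:x2]  ->  latents[..., 16:48, 8:40]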

View File

@@ -0,0 +1,65 @@
import math
from typing import Tuple
from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output
from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR
from invokeai.app.invocations.fields import FieldDescriptions, InputField, OutputField
from invokeai.app.invocations.model import UNetField
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.model_manager.config import BaseModelType
@invocation_output("ideal_size_output")
class IdealSizeOutput(BaseInvocationOutput):
"""Base class for invocations that output an image"""
width: int = OutputField(description="The ideal width of the image (in pixels)")
height: int = OutputField(description="The ideal height of the image (in pixels)")
@invocation(
"ideal_size",
title="Ideal Size",
tags=["latents", "math", "ideal_size"],
version="1.0.3",
)
class IdealSizeInvocation(BaseInvocation):
"""Calculates the ideal size for generation to avoid duplication"""
width: int = InputField(default=1024, description="Final image width")
height: int = InputField(default=576, description="Final image height")
unet: UNetField = InputField(default=None, description=FieldDescriptions.unet)
multiplier: float = InputField(
default=1.0,
description="Amount to multiply the model's dimensions by when calculating the ideal size (may result in "
"initial generation artifacts if too large)",
)
def trim_to_multiple_of(self, *args: int, multiple_of: int = LATENT_SCALE_FACTOR) -> Tuple[int, ...]:
return tuple((x - x % multiple_of) for x in args)
def invoke(self, context: InvocationContext) -> IdealSizeOutput:
unet_config = context.models.get_config(self.unet.unet.key)
aspect = self.width / self.height
dimension: float = 512
if unet_config.base == BaseModelType.StableDiffusion2:
dimension = 768
elif unet_config.base == BaseModelType.StableDiffusionXL:
dimension = 1024
dimension = dimension * self.multiplier
min_dimension = math.floor(dimension * 0.5)
model_area = dimension * dimension # hardcoded for now since all models are trained on square images
if aspect > 1.0:
init_height = max(min_dimension, math.sqrt(model_area / aspect))
init_width = init_height * aspect
else:
init_width = max(min_dimension, math.sqrt(model_area * aspect))
init_height = init_width / aspect
scaled_width, scaled_height = self.trim_to_multiple_of(
math.floor(init_width),
math.floor(init_height),
)
return IdealSizeOutput(width=scaled_width, height=scaled_height)
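A worked example of the sizing math in IdealSizeInvocation above, assuming an SDXL base model (dimension = 1024), multiplier = 1.0, and the default 1024x576 inputs:

import math

width, height = 1024, 576
aspect = width / height                      # ~1.778 (wider than tall)
dimension = 1024 * 1.0                       # SDXL training resolution * multiplier
min_dimension = math.floor(dimension * 0.5)  # 512
model_area = dimension * dimension           # 1048576

init_height = max(min_dimension, math.sqrt(model_area / aspect))  # 768.0
init_width = init_height * aspect                                 # ~1365.3

def trim(v: int, multiple_of: int = 8) -> int:
    return v - v % multiple_of  # mirrors trim_to_multiple_of(..., multiple_of=LATENT_SCALE_FACTOR)

print(trim(math.floor(init_width)), trim(math.floor(init_height)))  # 1360 768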

View File

@@ -0,0 +1,125 @@
from functools import singledispatchmethod
import einops
import torch
from diffusers.models.attention_processor import (
AttnProcessor2_0,
LoRAAttnProcessor2_0,
LoRAXFormersAttnProcessor,
XFormersAttnProcessor,
)
from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL
from diffusers.models.autoencoders.autoencoder_tiny import AutoencoderTiny
from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
from invokeai.app.invocations.constants import DEFAULT_PRECISION
from invokeai.app.invocations.fields import (
FieldDescriptions,
ImageField,
Input,
InputField,
)
from invokeai.app.invocations.model import VAEField
from invokeai.app.invocations.primitives import LatentsOutput
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.model_manager import LoadedModel
from invokeai.backend.stable_diffusion.diffusers_pipeline import image_resized_to_grid_as_tensor
@invocation(
"i2l",
title="Image to Latents",
tags=["latents", "image", "vae", "i2l"],
category="latents",
version="1.0.2",
)
class ImageToLatentsInvocation(BaseInvocation):
"""Encodes an image into latents."""
image: ImageField = InputField(
description="The image to encode",
)
vae: VAEField = InputField(
description=FieldDescriptions.vae,
input=Input.Connection,
)
tiled: bool = InputField(default=False, description=FieldDescriptions.tiled)
fp32: bool = InputField(default=DEFAULT_PRECISION == torch.float32, description=FieldDescriptions.fp32)
@staticmethod
def vae_encode(vae_info: LoadedModel, upcast: bool, tiled: bool, image_tensor: torch.Tensor) -> torch.Tensor:
with vae_info as vae:
assert isinstance(vae, torch.nn.Module)
orig_dtype = vae.dtype
if upcast:
vae.to(dtype=torch.float32)
use_torch_2_0_or_xformers = hasattr(vae.decoder, "mid_block") and isinstance(
vae.decoder.mid_block.attentions[0].processor,
(
AttnProcessor2_0,
XFormersAttnProcessor,
LoRAXFormersAttnProcessor,
LoRAAttnProcessor2_0,
),
)
# if xformers or torch_2_0 is used attention block does not need
# to be in float32 which can save lots of memory
if use_torch_2_0_or_xformers:
vae.post_quant_conv.to(orig_dtype)
vae.decoder.conv_in.to(orig_dtype)
vae.decoder.mid_block.to(orig_dtype)
# else:
# latents = latents.float()
else:
vae.to(dtype=torch.float16)
# latents = latents.half()
if tiled:
vae.enable_tiling()
else:
vae.disable_tiling()
# non_noised_latents_from_image
image_tensor = image_tensor.to(device=vae.device, dtype=vae.dtype)
with torch.inference_mode():
latents = ImageToLatentsInvocation._encode_to_tensor(vae, image_tensor)
latents = vae.config.scaling_factor * latents
latents = latents.to(dtype=orig_dtype)
return latents
@torch.no_grad()
def invoke(self, context: InvocationContext) -> LatentsOutput:
image = context.images.get_pil(self.image.image_name)
vae_info = context.models.load(self.vae.vae)
image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
if image_tensor.dim() == 3:
image_tensor = einops.rearrange(image_tensor, "c h w -> 1 c h w")
latents = self.vae_encode(vae_info, self.fp32, self.tiled, image_tensor)
latents = latents.to("cpu")
name = context.tensors.save(tensor=latents)
return LatentsOutput.build(latents_name=name, latents=latents, seed=None)
@singledispatchmethod
@staticmethod
def _encode_to_tensor(vae: AutoencoderKL, image_tensor: torch.FloatTensor) -> torch.FloatTensor:
assert isinstance(vae, torch.nn.Module)
image_tensor_dist = vae.encode(image_tensor).latent_dist
latents: torch.Tensor = image_tensor_dist.sample().to(
dtype=vae.dtype
) # FIXME: uses torch.randn. make reproducible!
return latents
@_encode_to_tensor.register
@staticmethod
def _(vae: AutoencoderTiny, image_tensor: torch.FloatTensor) -> torch.FloatTensor:
assert isinstance(vae, torch.nn.Module)
latents: torch.FloatTensor = vae.encode(image_tensor).latents
return latents
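The _encode_to_tensor(...) helper above uses functools.singledispatchmethod so the encoding path is chosen by the runtime type of the VAE (sampling a latent distribution for AutoencoderKL, reading .latents directly for AutoencoderTiny). A generic, self-contained sketch of that dispatch pattern; the class names here are placeholders, not the diffusers types:

from functools import singledispatch

class KLStyleVAE: ...
class TinyStyleVAE: ...

@singledispatch
def encode(vae, image):
    raise NotImplementedError(f"No encoder registered for {type(vae).__name__}")

@encode.register
def _(vae: KLStyleVAE, image):
    return "sampled from the latent distribution"  # KL-style path

@encode.register
def _(vae: TinyStyleVAE, image):
    return "read the .latents attribute directly"  # Tiny-style path

print(encode(KLStyleVAE(), "img"))   # dispatch is driven by the first argument's type
print(encode(TinyStyleVAE(), "img"))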

File diff suppressed because it is too large

View File

@@ -0,0 +1,127 @@
import torch
from diffusers.image_processor import VaeImageProcessor
from diffusers.models.attention_processor import (
AttnProcessor2_0,
LoRAAttnProcessor2_0,
LoRAXFormersAttnProcessor,
XFormersAttnProcessor,
)
from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL
from diffusers.models.autoencoders.autoencoder_tiny import AutoencoderTiny
from PIL import Image
from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
from invokeai.app.invocations.constants import DEFAULT_PRECISION
from invokeai.app.invocations.fields import (
FieldDescriptions,
Input,
InputField,
LatentsField,
WithBoard,
WithMetadata,
)
from invokeai.app.invocations.model import VAEField
from invokeai.app.invocations.primitives import ImageOutput
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.model_manager.load.load_base import LoadedModel
from invokeai.backend.stable_diffusion import set_seamless
from invokeai.backend.util.devices import TorchDevice
@invocation(
"l2i",
title="Latents to Image",
tags=["latents", "image", "vae", "l2i"],
category="latents",
version="1.2.2",
)
class LatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard):
"""Generates an image from latents."""
latents: LatentsField = InputField(
description=FieldDescriptions.latents,
input=Input.Connection,
)
vae: VAEField = InputField(
description=FieldDescriptions.vae,
input=Input.Connection,
)
tiled: bool = InputField(default=False, description=FieldDescriptions.tiled)
fp32: bool = InputField(default=DEFAULT_PRECISION == torch.float32, description=FieldDescriptions.fp32)
@staticmethod
def vae_decode(
context: InvocationContext,
vae_info: LoadedModel,
seamless_axes: list[str],
latents: torch.Tensor,
use_fp32: bool,
use_tiling: bool,
) -> Image.Image:
assert isinstance(vae_info.model, (AutoencoderKL, AutoencoderTiny))
with set_seamless(vae_info.model, seamless_axes), vae_info as vae:
assert isinstance(vae, (AutoencoderKL, AutoencoderTiny))
latents = latents.to(vae.device)
if use_fp32:
vae.to(dtype=torch.float32)
use_torch_2_0_or_xformers = hasattr(vae.decoder, "mid_block") and isinstance(
vae.decoder.mid_block.attentions[0].processor,
(
AttnProcessor2_0,
XFormersAttnProcessor,
LoRAXFormersAttnProcessor,
LoRAAttnProcessor2_0,
),
)
# if xformers or torch_2_0 is used attention block does not need
# to be in float32 which can save lots of memory
if use_torch_2_0_or_xformers:
vae.post_quant_conv.to(latents.dtype)
vae.decoder.conv_in.to(latents.dtype)
vae.decoder.mid_block.to(latents.dtype)
else:
latents = latents.float()
else:
vae.to(dtype=torch.float16)
latents = latents.half()
if use_tiling or context.config.get().force_tiled_decode:
vae.enable_tiling()
else:
vae.disable_tiling()
# clear memory as vae decode can request a lot
TorchDevice.empty_cache()
with torch.inference_mode():
# copied from diffusers pipeline
latents = latents / vae.config.scaling_factor
image = vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1) # denormalize
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
np_image = image.cpu().permute(0, 2, 3, 1).float().numpy()
image = VaeImageProcessor.numpy_to_pil(np_image)[0]
TorchDevice.empty_cache()
return image
@torch.no_grad()
def invoke(self, context: InvocationContext) -> ImageOutput:
latents = context.tensors.load(self.latents.latents_name)
vae_info = context.models.load(self.vae.vae)
image = self.vae_decode(
context=context,
vae_info=vae_info,
seamless_axes=self.vae.seamless_axes,
latents=latents,
use_fp32=self.fp32,
use_tiling=self.tiled,
)
image_dto = context.images.save(image=image)
return ImageOutput.build(image_dto)

View File

@@ -0,0 +1,103 @@
from typing import Literal
import torch
from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR
from invokeai.app.invocations.fields import (
FieldDescriptions,
Input,
InputField,
LatentsField,
)
from invokeai.app.invocations.primitives import LatentsOutput
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.util.devices import TorchDevice
LATENTS_INTERPOLATION_MODE = Literal["nearest", "linear", "bilinear", "bicubic", "trilinear", "area", "nearest-exact"]
@invocation(
"lresize",
title="Resize Latents",
tags=["latents", "resize"],
category="latents",
version="1.0.2",
)
class ResizeLatentsInvocation(BaseInvocation):
"""Resizes latents to explicit width/height (in pixels). Provided dimensions are floor-divided by 8."""
latents: LatentsField = InputField(
description=FieldDescriptions.latents,
input=Input.Connection,
)
width: int = InputField(
ge=64,
multiple_of=LATENT_SCALE_FACTOR,
description=FieldDescriptions.width,
)
height: int = InputField(
ge=64,
multiple_of=LATENT_SCALE_FACTOR,
description=FieldDescriptions.height,
)
mode: LATENTS_INTERPOLATION_MODE = InputField(default="bilinear", description=FieldDescriptions.interp_mode)
antialias: bool = InputField(default=False, description=FieldDescriptions.torch_antialias)
def invoke(self, context: InvocationContext) -> LatentsOutput:
latents = context.tensors.load(self.latents.latents_name)
device = TorchDevice.choose_torch_device()
resized_latents = torch.nn.functional.interpolate(
latents.to(device),
size=(self.height // LATENT_SCALE_FACTOR, self.width // LATENT_SCALE_FACTOR),
mode=self.mode,
antialias=self.antialias if self.mode in ["bilinear", "bicubic"] else False,
)
# https://discuss.huggingface.co/t/memory-usage-by-later-pipeline-stages/23699
resized_latents = resized_latents.to("cpu")
TorchDevice.empty_cache()
name = context.tensors.save(tensor=resized_latents)
return LatentsOutput.build(latents_name=name, latents=resized_latents, seed=self.latents.seed)
@invocation(
"lscale",
title="Scale Latents",
tags=["latents", "resize"],
category="latents",
version="1.0.2",
)
class ScaleLatentsInvocation(BaseInvocation):
"""Scales latents by a given factor."""
latents: LatentsField = InputField(
description=FieldDescriptions.latents,
input=Input.Connection,
)
scale_factor: float = InputField(gt=0, description=FieldDescriptions.scale_factor)
mode: LATENTS_INTERPOLATION_MODE = InputField(default="bilinear", description=FieldDescriptions.interp_mode)
antialias: bool = InputField(default=False, description=FieldDescriptions.torch_antialias)
def invoke(self, context: InvocationContext) -> LatentsOutput:
latents = context.tensors.load(self.latents.latents_name)
device = TorchDevice.choose_torch_device()
# resizing
resized_latents = torch.nn.functional.interpolate(
latents.to(device),
scale_factor=self.scale_factor,
mode=self.mode,
antialias=self.antialias if self.mode in ["bilinear", "bicubic"] else False,
)
# https://discuss.huggingface.co/t/memory-usage-by-later-pipeline-stages/23699
resized_latents = resized_latents.to("cpu")
TorchDevice.empty_cache()
name = context.tensors.save(tensor=resized_latents)
return LatentsOutput.build(latents_name=name, latents=resized_latents, seed=self.latents.seed)
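A quick sketch of ResizeLatentsInvocation above: the requested pixel dimensions are floor-divided by LATENT_SCALE_FACTOR before interpolation, so a width=512, height=768 pixel target becomes a (96, 64) latent grid. The latents below are hypothetical SD-style latents, not loaded from the tensor store:

import torch

LATENT_SCALE_FACTOR = 8
latents = torch.randn(1, 4, 32, 48)  # hypothetical (batch, channels, latent_height, latent_width)

resized = torch.nn.functional.interpolate(
    latents,
    size=(768 // LATENT_SCALE_FACTOR, 512 // LATENT_SCALE_FACTOR),  # (height, width) = (96, 64)
    mode="bilinear",
    antialias=False,
)
print(resized.shape)  # torch.Size([1, 4, 96, 64])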

View File

@@ -0,0 +1,34 @@
from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output
from invokeai.app.invocations.constants import SCHEDULER_NAME_VALUES
from invokeai.app.invocations.fields import (
FieldDescriptions,
InputField,
OutputField,
UIType,
)
from invokeai.app.services.shared.invocation_context import InvocationContext
@invocation_output("scheduler_output")
class SchedulerOutput(BaseInvocationOutput):
scheduler: SCHEDULER_NAME_VALUES = OutputField(description=FieldDescriptions.scheduler, ui_type=UIType.Scheduler)
@invocation(
"scheduler",
title="Scheduler",
tags=["scheduler"],
category="latents",
version="1.0.0",
)
class SchedulerInvocation(BaseInvocation):
"""Selects a scheduler."""
scheduler: SCHEDULER_NAME_VALUES = InputField(
default="euler",
description=FieldDescriptions.scheduler,
ui_type=UIType.Scheduler,
)
def invoke(self, context: InvocationContext) -> SchedulerOutput:
return SchedulerOutput(scheduler=self.scheduler)

View File

@@ -0,0 +1,384 @@
from contextlib import ExitStack
from typing import Iterator, Tuple
import numpy as np
import numpy.typing as npt
import torch
from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
from PIL import Image
from pydantic import field_validator
from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
from invokeai.app.invocations.constants import DEFAULT_PRECISION, LATENT_SCALE_FACTOR, SCHEDULER_NAME_VALUES
from invokeai.app.invocations.fields import (
ConditioningField,
FieldDescriptions,
ImageField,
Input,
InputField,
UIType,
)
from invokeai.app.invocations.image_to_latents import ImageToLatentsInvocation
from invokeai.app.invocations.latent import DenoiseLatentsInvocation, get_scheduler
from invokeai.app.invocations.latents_to_image import LatentsToImageInvocation
from invokeai.app.invocations.model import ModelIdentifierField, UNetField, VAEField
from invokeai.app.invocations.noise import get_noise
from invokeai.app.invocations.primitives import ImageOutput
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.app.util.controlnet_utils import CONTROLNET_MODE_VALUES, CONTROLNET_RESIZE_VALUES, prepare_control_image
from invokeai.backend.lora import LoRAModelRaw
from invokeai.backend.model_patcher import ModelPatcher
from invokeai.backend.stable_diffusion.diffusers_pipeline import ControlNetData, image_resized_to_grid_as_tensor
from invokeai.backend.tiles.tiles import calc_tiles_with_overlap, merge_tiles_with_linear_blending
from invokeai.backend.tiles.utils import Tile
from invokeai.backend.util.devices import TorchDevice
from invokeai.backend.util.hotfixes import ControlNetModel
@invocation(
"tiled_stable_diffusion_refine",
title="Tiled Stable Diffusion Refine",
tags=["upscale", "denoise"],
category="latents",
version="1.0.0",
)
class TiledStableDiffusionRefineInvocation(BaseInvocation):
"""A tiled Stable Diffusion pipeline for refining high resolution images. This invocation is intended to be used to
refine an image after upscaling i.e. it is the second step in a typical "tiled upscaling" workflow.
"""
image: ImageField = InputField(description="Image to be refined.")
positive_conditioning: ConditioningField = InputField(
description=FieldDescriptions.positive_cond, input=Input.Connection
)
negative_conditioning: ConditioningField = InputField(
description=FieldDescriptions.negative_cond, input=Input.Connection
)
# TODO(ryand): Add multiple-of validation.
tile_height: int = InputField(default=512, gt=0, description="Height of the tiles.")
tile_width: int = InputField(default=512, gt=0, description="Width of the tiles.")
tile_overlap: int = InputField(
default=16,
gt=0,
description="Target overlap between adjacent tiles (the last row/column may overlap more than this).",
)
steps: int = InputField(default=18, gt=0, description=FieldDescriptions.steps)
cfg_scale: float | list[float] = InputField(default=6.0, description=FieldDescriptions.cfg_scale, title="CFG Scale")
denoising_start: float = InputField(
default=0.65,
ge=0,
le=1,
description=FieldDescriptions.denoising_start,
)
denoising_end: float = InputField(default=1.0, ge=0, le=1, description=FieldDescriptions.denoising_end)
scheduler: SCHEDULER_NAME_VALUES = InputField(
default="euler",
description=FieldDescriptions.scheduler,
ui_type=UIType.Scheduler,
)
unet: UNetField = InputField(
description=FieldDescriptions.unet,
input=Input.Connection,
title="UNet",
)
cfg_rescale_multiplier: float = InputField(
title="CFG Rescale Multiplier", default=0, ge=0, lt=1, description=FieldDescriptions.cfg_rescale_multiplier
)
vae: VAEField = InputField(
description=FieldDescriptions.vae,
input=Input.Connection,
)
vae_fp32: bool = InputField(
default=DEFAULT_PRECISION == torch.float32, description="Whether to use float32 precision when running the VAE."
)
# HACK(ryand): We probably want to allow the user to control all of the parameters in ControlField. But, we awkwardly
# don't want to use the image field. Figure out how best to handle this.
# TODO(ryand): Currently, there is no ControlNet preprocessor applied to the tile images. In other words, we pretty
# much assume that it is a tile ControlNet. We need to decide how we want to handle this. E.g. find a way to support
# CN preprocessors, raise a clear warning when a non-tile CN model is selected, hardcode the supported CN models,
# etc.
control_model: ModelIdentifierField = InputField(
description=FieldDescriptions.controlnet_model, ui_type=UIType.ControlNetModel
)
control_weight: float = InputField(default=0.6)
@field_validator("cfg_scale")
def ge_one(cls, v: list[float] | float) -> list[float] | float:
"""Validate that all cfg_scale values are >= 1"""
if isinstance(v, list):
for i in v:
if i < 1:
raise ValueError("cfg_scale must be greater than 1")
else:
if v < 1:
raise ValueError("cfg_scale must be greater than 1")
return v
@staticmethod
def crop_latents_to_tile(latents: torch.Tensor, image_tile: Tile) -> torch.Tensor:
"""Crop the latent-space tensor to the area corresponding to the image-space tile.
The tile coordinates must be divisible by the LATENT_SCALE_FACTOR.
"""
for coord in [image_tile.coords.top, image_tile.coords.left, image_tile.coords.right, image_tile.coords.bottom]:
if coord % LATENT_SCALE_FACTOR != 0:
raise ValueError(
f"The tile coordinates must all be divisible by the latent scale factor"
f" ({LATENT_SCALE_FACTOR}). {image_tile.coords=}."
)
assert latents.dim() == 4 # We expect: (batch_size, channels, height, width).
top = image_tile.coords.top // LATENT_SCALE_FACTOR
left = image_tile.coords.left // LATENT_SCALE_FACTOR
bottom = image_tile.coords.bottom // LATENT_SCALE_FACTOR
right = image_tile.coords.right // LATENT_SCALE_FACTOR
return latents[..., top:bottom, left:right]
def run_controlnet(
self,
image: Image.Image,
controlnet_model: ControlNetModel,
weight: float,
do_classifier_free_guidance: bool,
width: int,
height: int,
device: torch.device,
dtype: torch.dtype,
control_mode: CONTROLNET_MODE_VALUES = "balanced",
resize_mode: CONTROLNET_RESIZE_VALUES = "just_resize_simple",
) -> ControlNetData:
control_image = prepare_control_image(
image=image,
do_classifier_free_guidance=do_classifier_free_guidance,
width=width,
height=height,
device=device,
dtype=dtype,
control_mode=control_mode,
resize_mode=resize_mode,
)
return ControlNetData(
model=controlnet_model,
image_tensor=control_image,
weight=weight,
begin_step_percent=0.0,
end_step_percent=1.0,
control_mode=control_mode,
# Any resizing needed should currently be happening in prepare_control_image(), but adding resize_mode to
# ControlNetData in case needed in the future.
resize_mode=resize_mode,
)
@torch.no_grad()
def invoke(self, context: InvocationContext) -> ImageOutput:
# TODO(ryand): Expose the seed parameter.
seed = 0
# Load the input image.
input_image = context.images.get_pil(self.image.image_name)
# Calculate the tile locations to cover the image.
# We have selected this tiling strategy to make it easy to achieve tile coords that are multiples of 8. This
# facilitates conversions between image space and latent space.
# TODO(ryand): Expose these tiling parameters. (Keep in mind the multiple-of constraints on these params.)
tiles = calc_tiles_with_overlap(
image_height=input_image.height,
image_width=input_image.width,
tile_height=self.tile_height,
tile_width=self.tile_width,
overlap=self.tile_overlap,
)
# Convert the input image to a torch.Tensor.
input_image_torch = image_resized_to_grid_as_tensor(input_image.convert("RGB"), multiple_of=LATENT_SCALE_FACTOR)
input_image_torch = input_image_torch.unsqueeze(0) # Add a batch dimension.
# Validate our assumptions about the shape of input_image_torch.
assert input_image_torch.dim() == 4 # We expect: (batch_size, channels, height, width).
assert input_image_torch.shape[:2] == (1, 3)
# Split the input image into tiles in torch.Tensor format.
image_tiles_torch: list[torch.Tensor] = []
for tile in tiles:
image_tile = input_image_torch[
:,
:,
tile.coords.top : tile.coords.bottom,
tile.coords.left : tile.coords.right,
]
image_tiles_torch.append(image_tile)
# Split the input image into tiles in numpy format.
# TODO(ryand): We currently maintain both np.ndarray and torch.Tensor tiles. Ideally, all operations should work
# with torch.Tensor tiles.
input_image_np = np.array(input_image)
image_tiles_np: list[npt.NDArray[np.uint8]] = []
for tile in tiles:
image_tile_np = input_image_np[
tile.coords.top : tile.coords.bottom,
tile.coords.left : tile.coords.right,
:,
]
image_tiles_np.append(image_tile_np)
# VAE-encode each image tile independently.
# TODO(ryand): Is there any advantage to VAE-encoding the entire image before splitting it into tiles? What
# about for decoding?
vae_info = context.models.load(self.vae.vae)
latent_tiles: list[torch.Tensor] = []
for image_tile_torch in image_tiles_torch:
latent_tiles.append(
ImageToLatentsInvocation.vae_encode(
vae_info=vae_info, upcast=self.vae_fp32, tiled=False, image_tensor=image_tile_torch
)
)
# Generate noise with dimensions corresponding to the full image in latent space.
# It is important that the noise tensor is generated at the full image dimension and then tiled, rather than
# generating for each tile independently. This ensures that overlapping regions between tiles use the same
# noise.
assert input_image_torch.shape[2] % LATENT_SCALE_FACTOR == 0
assert input_image_torch.shape[3] % LATENT_SCALE_FACTOR == 0
global_noise = get_noise(
width=input_image_torch.shape[3],
height=input_image_torch.shape[2],
device=TorchDevice.choose_torch_device(),
seed=seed,
downsampling_factor=LATENT_SCALE_FACTOR,
use_cpu=True,
)
# Crop the global noise into tiles.
noise_tiles = [self.crop_latents_to_tile(latents=global_noise, image_tile=t) for t in tiles]
# Prepare an iterator that yields the UNet's LoRA models and their weights.
def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]:
for lora in self.unet.loras:
lora_info = context.models.load(lora.lora)
assert isinstance(lora_info.model, LoRAModelRaw)
yield (lora_info.model, lora.weight)
del lora_info
# Load the UNet model.
unet_info = context.models.load(self.unet.unet)
refined_latent_tiles: list[torch.Tensor] = []
with ExitStack() as exit_stack, unet_info as unet, ModelPatcher.apply_lora_unet(unet, _lora_loader()):
assert isinstance(unet, UNet2DConditionModel)
scheduler = get_scheduler(
context=context,
scheduler_info=self.unet.scheduler,
scheduler_name=self.scheduler,
seed=seed,
)
pipeline = DenoiseLatentsInvocation.create_pipeline(unet=unet, scheduler=scheduler)
# Prepare the prompt conditioning data. The same prompt conditioning is applied to all tiles.
# Assume that all tiles have the same shape.
_, _, latent_height, latent_width = latent_tiles[0].shape
conditioning_data = DenoiseLatentsInvocation.get_conditioning_data(
context=context,
positive_conditioning_field=self.positive_conditioning,
negative_conditioning_field=self.negative_conditioning,
unet=unet,
latent_height=latent_height,
latent_width=latent_width,
cfg_scale=self.cfg_scale,
steps=self.steps,
cfg_rescale_multiplier=self.cfg_rescale_multiplier,
)
# Load the ControlNet model.
# TODO(ryand): Support multiple ControlNet models.
controlnet_model = exit_stack.enter_context(context.models.load(self.control_model))
assert isinstance(controlnet_model, ControlNetModel)
# Denoise (i.e. "refine") each tile independently.
for image_tile_np, latent_tile, noise_tile in zip(image_tiles_np, latent_tiles, noise_tiles, strict=True):
assert latent_tile.shape == noise_tile.shape
# Prepare a PIL Image for ControlNet processing.
# TODO(ryand): It is a bit awkward that we have to prepare both torch.Tensor and PIL.Image versions of
# the tiles. Ideally, the ControlNet code should be able to work with Tensors.
image_tile_pil = Image.fromarray(image_tile_np)
# Run the ControlNet on the image tile.
height, width, _ = image_tile_np.shape
# The height and width must be evenly divisible by LATENT_SCALE_FACTOR. This is enforced earlier, but we
# validate this assumption here.
assert height % LATENT_SCALE_FACTOR == 0
assert width % LATENT_SCALE_FACTOR == 0
controlnet_data = self.run_controlnet(
image=image_tile_pil,
controlnet_model=controlnet_model,
weight=self.control_weight,
do_classifier_free_guidance=True,
width=width,
height=height,
device=controlnet_model.device,
dtype=controlnet_model.dtype,
control_mode="balanced",
resize_mode="just_resize_simple",
)
num_inference_steps, timesteps, init_timestep, scheduler_step_kwargs = (
DenoiseLatentsInvocation.init_scheduler(
scheduler,
device=unet.device,
steps=self.steps,
denoising_start=self.denoising_start,
denoising_end=self.denoising_end,
seed=seed,
)
)
# TODO(ryand): Think about when/if latents/noise should be moved off of the device to save VRAM.
latent_tile = latent_tile.to(device=unet.device, dtype=unet.dtype)
noise_tile = noise_tile.to(device=unet.device, dtype=unet.dtype)
refined_latent_tile = pipeline.latents_from_embeddings(
latents=latent_tile,
timesteps=timesteps,
init_timestep=init_timestep,
noise=noise_tile,
seed=seed,
mask=None,
masked_latents=None,
gradient_mask=None,
num_inference_steps=num_inference_steps,
scheduler_step_kwargs=scheduler_step_kwargs,
conditioning_data=conditioning_data,
control_data=[controlnet_data],
ip_adapter_data=None,
t2i_adapter_data=None,
callback=lambda x: None,
)
refined_latent_tiles.append(refined_latent_tile)
# VAE-decode each refined latent tile independently.
refined_image_tiles: list[Image.Image] = []
for refined_latent_tile in refined_latent_tiles:
refined_image_tile = LatentsToImageInvocation.vae_decode(
context=context,
vae_info=vae_info,
seamless_axes=self.vae.seamless_axes,
latents=refined_latent_tile,
use_fp32=self.vae_fp32,
use_tiling=False,
)
refined_image_tiles.append(refined_image_tile)
# TODO(ryand): I copied this from DenoiseLatentsInvocation. I'm not sure if it's actually important.
TorchDevice.empty_cache()
# Merge the refined image tiles back into a single image.
refined_image_tiles_np = [np.array(t) for t in refined_image_tiles]
merged_image_np = np.zeros(shape=(input_image.height, input_image.width, 3), dtype=np.uint8)
# TODO(ryand): Tune the blend_amount. Should this be exposed as a parameter?
merge_tiles_with_linear_blending(
dst_image=merged_image_np, tiles=tiles, tile_images=refined_image_tiles_np, blend_amount=self.tile_overlap
)
# Save the refined image and return its reference.
merged_image_pil = Image.fromarray(merged_image_np)
image_dto = context.images.save(image=merged_image_pil)
return ImageOutput.build(image_dto)
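One detail worth highlighting from TiledStableDiffusionRefineInvocation above: noise is generated once for the full-image latent grid and only then cropped per tile, so overlapping tile regions see identical noise. A small self-contained sketch of why that matters (tile sizes and the overlap below are hypothetical):

import torch

torch.manual_seed(0)
global_noise = torch.randn(1, 4, 96, 96)  # noise for the whole image in latent space

tile_a = global_noise[..., 0:64, 0:64]    # two tiles overlapping by 32 latent rows/cols
tile_b = global_noise[..., 32:96, 32:96]

# The shared region is identical because both crops come from the same tensor;
# generating noise independently per tile would not guarantee this.
print(torch.equal(tile_a[..., 32:64, 32:64], tile_b[..., 0:32, 0:32]))  # True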

View File

@@ -289,7 +289,7 @@ def prepare_control_image(
width: int,
height: int,
num_channels: int = 3,
device: str = "cuda",
device: str | torch.device = "cuda",
dtype: torch.dtype = torch.float16,
control_mode: CONTROLNET_MODE_VALUES = "balanced",
resize_mode: CONTROLNET_RESIZE_VALUES = "just_resize_simple",
@@ -304,7 +304,7 @@ def prepare_control_image(
num_channels (int, optional): The target number of image channels. This is achieved by converting the input
image to RGB, then naively taking the first `num_channels` channels. The primary use case is converting a
RGB image to a single-channel grayscale image. Raises if `num_channels` cannot be achieved. Defaults to 3.
- device (str, optional): The target device for the output image. Defaults to "cuda".
+ device (str | torch.device, optional): The target device for the output image. Defaults to "cuda".
dtype (_type_, optional): The dtype for the output image. Defaults to torch.float16.
do_classifier_free_guidance (bool, optional): If True, repeat the output image along the batch dimension.
Defaults to True.

View File

@@ -10,7 +10,7 @@ from picklescan.scanner import scan_file_path
import invokeai.backend.util.logging as logger
from invokeai.app.util.misc import uuid_string
from invokeai.backend.model_hash.model_hash import HASHING_ALGORITHMS, ModelHash
- from invokeai.backend.util.util import SilenceWarnings
+ from invokeai.backend.util.silence_warnings import SilenceWarnings
from .config import (
AnyModelConfig,

View File

@@ -11,7 +11,6 @@ import psutil
import torch
import torchvision.transforms as T
from diffusers.models import AutoencoderKL, UNet2DConditionModel
- from diffusers.models.controlnet import ControlNetModel
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import KarrasDiffusionSchedulers
@@ -26,6 +25,7 @@ from invokeai.backend.stable_diffusion.diffusion.shared_invokeai_diffusion impor
from invokeai.backend.stable_diffusion.diffusion.unet_attention_patcher import UNetAttentionPatcher, UNetIPAdapterData
from invokeai.backend.util.attention import auto_detect_slice_size
from invokeai.backend.util.devices import TorchDevice
+ from invokeai.backend.util.hotfixes import ControlNetModel
@dataclass

View File

@@ -1,29 +1,36 @@
"""Context class to silence transformers and diffusers warnings."""
import warnings
- from typing import Any
+ from contextlib import ContextDecorator
- from diffusers import logging as diffusers_logging
+ from diffusers.utils import logging as diffusers_logging
from transformers import logging as transformers_logging
- class SilenceWarnings(object):
- """Use in context to temporarily turn off warnings from transformers & diffusers modules.
+ # Inherit from ContextDecorator to allow using SilenceWarnings as both a context manager and a decorator.
+ class SilenceWarnings(ContextDecorator):
+ """A context manager that disables warnings from transformers & diffusers modules while active.
+ As context manager:
+ ```
+ with SilenceWarnings():
+ # do something
+ ```
+ As decorator:
+ ```
+ @SilenceWarnings()
+ def some_function():
+ # do something
+ ```
"""
- def __init__(self) -> None:
- self.transformers_verbosity = transformers_logging.get_verbosity()
- self.diffusers_verbosity = diffusers_logging.get_verbosity()
def __enter__(self) -> None:
+ self._transformers_verbosity = transformers_logging.get_verbosity()
+ self._diffusers_verbosity = diffusers_logging.get_verbosity()
transformers_logging.set_verbosity_error()
diffusers_logging.set_verbosity_error()
warnings.simplefilter("ignore")
- def __exit__(self, *args: Any) -> None:
- transformers_logging.set_verbosity(self.transformers_verbosity)
- diffusers_logging.set_verbosity(self.diffusers_verbosity)
+ def __exit__(self, *args) -> None:
+ transformers_logging.set_verbosity(self._transformers_verbosity)
+ diffusers_logging.set_verbosity(self._diffusers_verbosity)
warnings.simplefilter("default")
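The switch to contextlib.ContextDecorator above is what allows the same class to be used under a with statement or as a decorator. A minimal generic illustration of the pattern (a toy class, not the InvokeAI SilenceWarnings itself):

from contextlib import ContextDecorator

class Quiet(ContextDecorator):
    def __enter__(self):
        print("lower verbosity")
        return self

    def __exit__(self, *exc):
        print("restore verbosity")
        return False  # do not swallow exceptions

@Quiet()  # equivalent to wrapping the function body in `with Quiet():`
def probe_model():
    print("probing")

probe_model()
# lower verbosity
# probing
# restore verbosity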

View File

@@ -1,12 +1,9 @@
import base64
import io
import os
- import warnings
from pathlib import Path
- from diffusers import logging as diffusers_logging
from PIL import Image
- from transformers import logging as transformers_logging
# actual size of a gig
GIG = 1073741824
@@ -51,21 +48,3 @@ class Chdir(object):
def __exit__(self, *args):
os.chdir(self.original)
- class SilenceWarnings(object):
- """Context manager to temporarily lower verbosity of diffusers & transformers warning messages."""
- def __enter__(self):
- """Set verbosity to error."""
- self.transformers_verbosity = transformers_logging.get_verbosity()
- self.diffusers_verbosity = diffusers_logging.get_verbosity()
- transformers_logging.set_verbosity_error()
- diffusers_logging.set_verbosity_error()
- warnings.simplefilter("ignore")
- def __exit__(self, type, value, traceback):
- """Restore logger verbosity to state before context was entered."""
- transformers_logging.set_verbosity(self.transformers_verbosity)
- diffusers_logging.set_verbosity(self.diffusers_verbosity)
- warnings.simplefilter("default")