Split up latent.py (code reorganization, no functional changes) (#6491)

## Summary

I've started working towards a better tiled upscaling implementation. It is going to require some refactoring of `DenoiseLatentsInvocation`. As a first step, this PR splits up all of the invocations in latent.py into their own files. That file had become a bit of a dumping ground - it should be a bit more manageable to work with now.

This PR just re-organizes the code. There should be no functional changes.

## QA Instructions

I've done some light smoke testing. I'll do some more before merging. The main risk is that I missed a broken import, or some other copy-paste error.

## Checklist

- [x] _The PR has a short but descriptive title, suitable for a changelog_
- [x] _Tests added / updated (if applicable)_: N/A
- [x] _Documentation added / updated (if applicable)_: N/A
2024-08-30 20:32:17 +00:00 · 2024-06-07 12:01:56 -04:00 · 2024-06-07 12:01:56 -04:00 · 0dbec3ad8b
commit 0dbec3ad8b
parent 6d067e56f2 52c0c4a32f
12 changed files with 819 additions and 699 deletions
--- a/invokeai/app/invocations/blend_latents.py
+++ b/invokeai/app/invocations/blend_latents.py
@ -0,0 +1,98 @@
+from typing import Any, Union
+
+import numpy as np
+import numpy.typing as npt
+import torch
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
+from invokeai.app.invocations.fields import FieldDescriptions, Input, InputField, LatentsField
+from invokeai.app.invocations.primitives import LatentsOutput
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.util.devices import TorchDevice
+
+
+@invocation(
+    "lblend",
+    title="Blend Latents",
+    tags=["latents", "blend"],
+    category="latents",
+    version="1.0.3",
+)
+class BlendLatentsInvocation(BaseInvocation):
+    """Blend two latents using a given alpha. Latents must have same size.
+
+    The blend is a spherical linear interpolation (slerp) between `latents_a` and `latents_b`, falling back to plain
+    linear interpolation when the two tensors are nearly parallel or anti-parallel.
+    """
+
+    # First latents tensor (the `t = 0` end of the interpolation); its seed is forwarded to the output.
+    latents_a: LatentsField = InputField(
+        description=FieldDescriptions.latents,
+        input=Input.Connection,
+    )
+    # Second latents tensor (the `t = 1` end of the interpolation).
+    latents_b: LatentsField = InputField(
+        description=FieldDescriptions.latents,
+        input=Input.Connection,
+    )
+    # Interpolation factor passed to slerp as `t`.
+    alpha: float = InputField(default=0.5, description=FieldDescriptions.blend_alpha)
+
+    def invoke(self, context: InvocationContext) -> LatentsOutput:
+        """Load both latents, slerp between them with `self.alpha`, and save the result as a new tensor.
+
+        Raises:
+            Exception: If the two latents tensors do not have identical shapes.
+        """
+        latents_a = context.tensors.load(self.latents_a.latents_name)
+        latents_b = context.tensors.load(self.latents_b.latents_name)
+
+        if latents_a.shape != latents_b.shape:
+            raise Exception("Latents to blend must be the same size.")
+
+        # Captured by `slerp` below: torch results are moved to this device before being returned.
+        device = TorchDevice.choose_torch_device()
+
+        def slerp(
+            t: Union[float, npt.NDArray[Any]],  # FIXME: maybe use np.float32 here?
+            v0: Union[torch.Tensor, npt.NDArray[Any]],
+            v1: Union[torch.Tensor, npt.NDArray[Any]],
+            DOT_THRESHOLD: float = 0.9995,
+        ) -> Union[torch.Tensor, npt.NDArray[Any]]:
+            """
+            Spherical linear interpolation
+            Args:
+                t (float/np.ndarray): Float value between 0.0 and 1.0
+                v0 (np.ndarray/torch.Tensor): Starting vector
+                v1 (np.ndarray/torch.Tensor): Final vector
+                DOT_THRESHOLD (float): Threshold for considering the two vectors as
+                                    collinear. Not recommended to alter this.
+            Returns:
+                v2 (np.ndarray/torch.Tensor): Interpolation vector between v0 and v1. A torch.Tensor (on `device`)
+                                    is returned if either input was not a numpy array.
+            """
+            # Anything that is not already a numpy array is treated as a torch tensor and converted on the CPU.
+            inputs_are_torch = False
+            if not isinstance(v0, np.ndarray):
+                inputs_are_torch = True
+                v0 = v0.detach().cpu().numpy()
+            if not isinstance(v1, np.ndarray):
+                inputs_are_torch = True
+                v1 = v1.detach().cpu().numpy()
+
+            # Cosine of the angle between v0 and v1, with each array treated as one flattened vector.
+            # NOTE(review): if either input has zero norm this divides by zero and `dot` becomes NaN, which then
+            # propagates through the arccos branch below - confirm whether all-zero latents can reach this node.
+            dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
+            if np.abs(dot) > DOT_THRESHOLD:
+                # Nearly parallel/anti-parallel: sin(theta_0) would be close to 0, so use linear interpolation.
+                v2 = (1 - t) * v0 + t * v1
+            else:
+                # Standard slerp: weight v0 and v1 by sin((1 - t) * theta) / sin(theta) and sin(t * theta) / sin(theta).
+                theta_0 = np.arccos(dot)
+                sin_theta_0 = np.sin(theta_0)
+                theta_t = theta_0 * t
+                sin_theta_t = np.sin(theta_t)
+                s0 = np.sin(theta_0 - theta_t) / sin_theta_0
+                s1 = sin_theta_t / sin_theta_0
+                v2 = s0 * v0 + s1 * v1
+
+            if inputs_are_torch:
+                v2_torch: torch.Tensor = torch.from_numpy(v2).to(device)
+                return v2_torch
+            else:
+                assert isinstance(v2, np.ndarray)
+                return v2
+
+        # blend
+        bl = slerp(self.alpha, latents_a, latents_b)
+        assert isinstance(bl, torch.Tensor)
+        blended_latents: torch.Tensor = bl  # for type checking convenience
+
+        # https://discuss.huggingface.co/t/memory-usage-by-later-pipeline-stages/23699
+        blended_latents = blended_latents.to("cpu")
+
+        TorchDevice.empty_cache()
+
+        # The output carries the seed of `latents_a`; the seed of `latents_b` is not used.
+        name = context.tensors.save(tensor=blended_latents)
+        return LatentsOutput.build(latents_name=name, latents=blended_latents, seed=self.latents_a.seed)
--- a/invokeai/app/invocations/create_denoise_mask.py
+++ b/invokeai/app/invocations/create_denoise_mask.py
@ -0,0 +1,80 @@
+from typing import Optional
+
+import torch
+import torchvision.transforms as T
+from PIL import Image
+from torchvision.transforms.functional import resize as tv_resize
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
+from invokeai.app.invocations.denoise_latents import DEFAULT_PRECISION
+from invokeai.app.invocations.fields import FieldDescriptions, ImageField, Input, InputField
+from invokeai.app.invocations.image_to_latents import ImageToLatentsInvocation
+from invokeai.app.invocations.model import VAEField
+from invokeai.app.invocations.primitives import DenoiseMaskOutput
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.stable_diffusion.diffusers_pipeline import image_resized_to_grid_as_tensor
+
+
+@invocation(
+    "create_denoise_mask",
+    title="Create Denoise Mask",
+    tags=["mask", "denoise"],
+    category="latents",
+    version="1.0.2",
+)
+class CreateDenoiseMaskInvocation(BaseInvocation):
+    """Creates mask for denoising model run.
+
+    The mask image is converted to a tensor and saved. If an image is also supplied, the image is masked, encoded with
+    the VAE and saved as the masked latents.
+    """
+
+    # `vae` is only loaded in `invoke` when `image` is provided.
+    vae: VAEField = InputField(description=FieldDescriptions.vae, input=Input.Connection, ui_order=0)
+    image: Optional[ImageField] = InputField(default=None, description="Image which will be masked", ui_order=1)
+    mask: ImageField = InputField(description="The mask to use when pasting", ui_order=2)
+    # `tiled` and `fp32` are forwarded unchanged to `ImageToLatentsInvocation.vae_encode`.
+    tiled: bool = InputField(default=False, description=FieldDescriptions.tiled, ui_order=3)
+    fp32: bool = InputField(
+        default=DEFAULT_PRECISION == "float32",
+        description=FieldDescriptions.fp32,
+        ui_order=4,
+    )
+
+    def prep_mask_tensor(self, mask_image: Image.Image) -> torch.Tensor:
+        """Convert a PIL mask image to a tensor.
+
+        The image is converted to mode "L" if needed, turned into a tensor without normalization, and given a leading
+        dimension if the resulting tensor is 3-dimensional.
+        """
+        if mask_image.mode != "L":
+            mask_image = mask_image.convert("L")
+        mask_tensor: torch.Tensor = image_resized_to_grid_as_tensor(mask_image, normalize=False)
+        if mask_tensor.dim() == 3:
+            mask_tensor = mask_tensor.unsqueeze(0)
+        # if shape is not None:
+        #    mask_tensor = tv_resize(mask_tensor, shape, T.InterpolationMode.BILINEAR)
+        return mask_tensor
+
+    @torch.no_grad()
+    def invoke(self, context: InvocationContext) -> DenoiseMaskOutput:
+        """Save the mask tensor and, when an image is given, the VAE-encoded masked image.
+
+        Returns:
+            A DenoiseMaskOutput with `gradient=False`. `masked_latents_name` is None when no image was provided.
+        """
+        if self.image is not None:
+            image = context.images.get_pil(self.image.image_name)
+            image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
+            if image_tensor.dim() == 3:
+                image_tensor = image_tensor.unsqueeze(0)
+        else:
+            image_tensor = None
+
+        mask = self.prep_mask_tensor(
+            context.images.get_pil(self.mask.image_name),
+        )
+
+        if image_tensor is not None:
+            vae_info = context.models.load(self.vae.vae)
+
+            # Resize the mask to the image's spatial size, then zero every pixel where the mask is below 0.5.
+            img_mask = tv_resize(mask, image_tensor.shape[-2:], T.InterpolationMode.BILINEAR, antialias=False)
+            masked_image = image_tensor * torch.where(img_mask < 0.5, 0.0, 1.0)
+            # TODO:
+            masked_latents = ImageToLatentsInvocation.vae_encode(vae_info, self.fp32, self.tiled, masked_image.clone())
+
+            masked_latents_name = context.tensors.save(tensor=masked_latents)
+        else:
+            masked_latents_name = None
+
+        # The saved mask is the tensor from `prep_mask_tensor`, not the resized `img_mask`.
+        mask_name = context.tensors.save(tensor=mask)
+
+        return DenoiseMaskOutput.build(
+            mask_name=mask_name,
+            masked_latents_name=masked_latents_name,
+            gradient=False,
+        )
--- a/invokeai/app/invocations/create_gradient_mask.py
+++ b/invokeai/app/invocations/create_gradient_mask.py
@ -0,0 +1,138 @@
+from typing import Literal, Optional
+
+import numpy as np
+import torch
+import torchvision.transforms as T
+from PIL import Image, ImageFilter
+from torchvision.transforms.functional import resize as tv_resize
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output
+from invokeai.app.invocations.denoise_latents import DEFAULT_PRECISION
+from invokeai.app.invocations.fields import (
+    DenoiseMaskField,
+    FieldDescriptions,
+    ImageField,
+    Input,
+    InputField,
+    OutputField,
+)
+from invokeai.app.invocations.image_to_latents import ImageToLatentsInvocation
+from invokeai.app.invocations.model import UNetField, VAEField
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.model_manager import LoadedModel
+from invokeai.backend.model_manager.config import MainConfigBase, ModelVariantType
+from invokeai.backend.stable_diffusion.diffusers_pipeline import image_resized_to_grid_as_tensor
+
+
+@invocation_output("gradient_mask_output")
+class GradientMaskOutput(BaseInvocationOutput):
+    """Outputs a denoise mask and an image representing the total gradient of the mask."""
+
+    # Reference to the saved mask tensor (and, optionally, the saved masked latents).
+    denoise_mask: DenoiseMaskField = OutputField(description="Mask for denoise model run")
+    # Reference to the saved binary image derived from the blurred mask.
+    expanded_mask_area: ImageField = OutputField(
+        description="Image representing the total gradient area of the mask. For paste-back purposes."
+    )
+
+
+@invocation(
+    "create_gradient_mask",
+    title="Create Gradient Mask",
+    tags=["mask", "denoise"],
+    category="latents",
+    version="1.1.0",
+)
+class CreateGradientMaskInvocation(BaseInvocation):
+    """Creates mask for denoising model run.
+
+    The input mask is optionally blurred to form a gradient, saved as the denoise mask, and also reduced to a binary
+    "expanded area" image. Masked latents are additionally produced when `unet`, `vae` and `image` are all provided
+    and the UNet's model config has the Inpaint variant.
+    """
+
+    # NOTE(review): the annotation is a non-optional ImageField but the default is None, and the description looks
+    # copied from the `image` field. `invoke` reads `self.mask.image_name` unconditionally - confirm the intent.
+    mask: ImageField = InputField(default=None, description="Image which will be masked", ui_order=1)
+    # A radius of 0 skips blurring entirely (see `invoke`).
+    edge_radius: int = InputField(
+        default=16, ge=0, description="How far to blur/expand the edges of the mask", ui_order=2
+    )
+    coherence_mode: Literal["Gaussian Blur", "Box Blur", "Staged"] = InputField(default="Gaussian Blur", ui_order=3)
+    # Used in `invoke` as `threshold = 1 - minimum_denoise`.
+    minimum_denoise: float = InputField(
+        default=0.0, ge=0, le=1, description="Minimum denoise level for the coherence region", ui_order=4
+    )
+    image: Optional[ImageField] = InputField(
+        default=None,
+        description="OPTIONAL: Only connect for specialized Inpainting models, masked_latents will be generated from the image with the VAE",
+        title="[OPTIONAL] Image",
+        ui_order=6,
+    )
+    unet: Optional[UNetField] = InputField(
+        description="OPTIONAL: If the Unet is a specialized Inpainting model, masked_latents will be generated from the image with the VAE",
+        default=None,
+        input=Input.Connection,
+        title="[OPTIONAL] UNet",
+        ui_order=5,
+    )
+    vae: Optional[VAEField] = InputField(
+        default=None,
+        description="OPTIONAL: Only connect for specialized Inpainting models, masked_latents will be generated from the image with the VAE",
+        title="[OPTIONAL] VAE",
+        input=Input.Connection,
+        ui_order=7,
+    )
+    # `tiled` and `fp32` are forwarded unchanged to `ImageToLatentsInvocation.vae_encode`.
+    tiled: bool = InputField(default=False, description=FieldDescriptions.tiled, ui_order=8)
+    fp32: bool = InputField(
+        default=DEFAULT_PRECISION == "float32",
+        description=FieldDescriptions.fp32,
+        ui_order=9,
+    )
+
+    @torch.no_grad()
+    def invoke(self, context: InvocationContext) -> GradientMaskOutput:
+        """Build the gradient denoise mask, the expanded-area image and (optionally) the masked latents.
+
+        Returns:
+            A GradientMaskOutput whose denoise mask has `gradient=True`. Its `masked_latents_name` is None unless the
+            inpainting branch below ran.
+        """
+        mask_image = context.images.get_pil(self.mask.image_name, mode="L")
+        if self.edge_radius > 0:
+            if self.coherence_mode == "Box Blur":
+                blur_mask = mask_image.filter(ImageFilter.BoxBlur(self.edge_radius))
+            else:  # Gaussian Blur OR Staged
+                # Gaussian Blur uses standard deviation. 1/2 radius is a good approximation
+                blur_mask = mask_image.filter(ImageFilter.GaussianBlur(self.edge_radius / 2))
+
+            blur_tensor: torch.Tensor = image_resized_to_grid_as_tensor(blur_mask, normalize=False)
+
+            # redistribute blur so that the original edges are 0 and blur outwards to 1
+            # (maps 0.5 -> 0 and 1 -> 1; presumably the tensor is in [0, 1] here, so values below 0.5 become
+            # negative - TODO confirm)
+            blur_tensor = (blur_tensor - 0.5) * 2
+
+            threshold = 1 - self.minimum_denoise
+
+            if self.coherence_mode == "Staged":
+                # wherever the blur_tensor is less than fully masked, convert it to threshold
+                blur_tensor = torch.where((blur_tensor < 1) & (blur_tensor > 0), threshold, blur_tensor)
+            else:
+                # wherever the blur_tensor is above threshold but less than 1, drop it to threshold
+                blur_tensor = torch.where((blur_tensor > threshold) & (blur_tensor < 1), threshold, blur_tensor)
+
+        else:
+            # No blur requested: use the mask as-is, without the rescaling or thresholding above.
+            blur_tensor: torch.Tensor = image_resized_to_grid_as_tensor(mask_image, normalize=False)
+
+        # The saved mask gets an extra dimension inserted at index 1; `blur_tensor` itself is left unchanged.
+        mask_name = context.tensors.save(tensor=blur_tensor.unsqueeze(1))
+
+        # compute a [0, 1] mask from the blur_tensor
+        # (1 only where blur_tensor >= 1, 0 everywhere else), then scale to 0/255 for an 8-bit "L" image.
+        expanded_mask = torch.where((blur_tensor < 1), 0, 1)
+        expanded_mask_image = Image.fromarray((expanded_mask.squeeze(0).numpy() * 255).astype(np.uint8), mode="L")
+        expanded_image_dto = context.images.save(expanded_mask_image)
+
+        masked_latents_name = None
+        if self.unet is not None and self.vae is not None and self.image is not None:
+            # all three fields must be present at the same time
+            main_model_config = context.models.get_config(self.unet.unet.key)
+            assert isinstance(main_model_config, MainConfigBase)
+            if main_model_config.variant is ModelVariantType.Inpaint:
+                mask = blur_tensor
+                vae_info: LoadedModel = context.models.load(self.vae.vae)
+                image = context.images.get_pil(self.image.image_name)
+                image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
+                if image_tensor.dim() == 3:
+                    image_tensor = image_tensor.unsqueeze(0)
+                # Resize the mask to the image's spatial size, then zero every pixel where the mask is below 0.5.
+                img_mask = tv_resize(mask, image_tensor.shape[-2:], T.InterpolationMode.BILINEAR, antialias=False)
+                masked_image = image_tensor * torch.where(img_mask < 0.5, 0.0, 1.0)
+                masked_latents = ImageToLatentsInvocation.vae_encode(
+                    vae_info, self.fp32, self.tiled, masked_image.clone()
+                )
+                masked_latents_name = context.tensors.save(tensor=masked_latents)
+
+        return GradientMaskOutput(
+            denoise_mask=DenoiseMaskField(mask_name=mask_name, masked_latents_name=masked_latents_name, gradient=True),
+            expanded_mask_area=ImageField(image_name=expanded_image_dto.image_name),
+        )
--- a/invokeai/app/invocations/crop_latents.py
+++ b/invokeai/app/invocations/crop_latents.py
@ -0,0 +1,61 @@
+from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
+from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR
+from invokeai.app.invocations.fields import FieldDescriptions, Input, InputField, LatentsField
+from invokeai.app.invocations.primitives import LatentsOutput
+from invokeai.app.services.shared.invocation_context import InvocationContext
+
+
+# The Crop Latents node was copied from @skunkworxdark's implementation here:
+# https://github.com/skunkworxdark/XYGrid_nodes/blob/74647fa9c1fa57d317a94bd43ca689af7f0aae5e/images_to_grids.py#L1117C1-L1167C80
+@invocation(
+    "crop_latents",
+    title="Crop Latents",
+    tags=["latents", "crop"],
+    category="latents",
+    version="1.0.2",
+)
+# TODO(ryand): Named `CropLatentsCoreInvocation` to prevent a conflict with custom node `CropLatentsInvocation`.
+# Currently, if the class names conflict then 'GET /openapi.json' fails.
+class CropLatentsCoreInvocation(BaseInvocation):
+    """Crops a latent-space tensor to a box specified in image-space. The box dimensions and coordinates must be
+    divisible by the latent scale factor of 8.
+    """
+
+    latents: LatentsField = InputField(
+        description=FieldDescriptions.latents,
+        input=Input.Connection,
+    )
+    # All four box fields are given in pixels and constrained to multiples of LATENT_SCALE_FACTOR, so the integer
+    # divisions in `invoke` are exact.
+    x: int = InputField(
+        ge=0,
+        multiple_of=LATENT_SCALE_FACTOR,
+        description="The left x coordinate (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
+    )
+    y: int = InputField(
+        ge=0,
+        multiple_of=LATENT_SCALE_FACTOR,
+        description="The top y coordinate (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
+    )
+    width: int = InputField(
+        ge=1,
+        multiple_of=LATENT_SCALE_FACTOR,
+        description="The width (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
+    )
+    height: int = InputField(
+        ge=1,
+        multiple_of=LATENT_SCALE_FACTOR,
+        description="The height (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
+    )
+
+    def invoke(self, context: InvocationContext) -> LatentsOutput:
+        """Slice the last two dimensions of the loaded latents to the requested box and save the result."""
+        latents = context.tensors.load(self.latents.latents_name)
+
+        # Convert the image-space box to latent-space indices.
+        x1 = self.x // LATENT_SCALE_FACTOR
+        y1 = self.y // LATENT_SCALE_FACTOR
+        x2 = x1 + (self.width // LATENT_SCALE_FACTOR)
+        y2 = y1 + (self.height // LATENT_SCALE_FACTOR)
+
+        # The last two dimensions are indexed as (y, x); all leading dimensions are kept whole.
+        # NOTE(review): there is no check that the box lies inside the latents; a box that extends past the edge
+        # would presumably yield a smaller (possibly empty) tensor rather than an error - confirm this is acceptable.
+        cropped_latents = latents[..., y1:y2, x1:x2]
+
+        name = context.tensors.save(tensor=cropped_latents)
+
+        # No seed is passed here, so the input latents' seed is not forwarded to the output.
+        return LatentsOutput.build(latents_name=name, latents=cropped_latents)
--- a/invokeai/app/invocations/denoise_latents.py
+++ b/invokeai/app/invocations/denoise_latents.py
@ -1,32 +1,17 @@
 # Copyright (c) 2023 Kyle Schouviller (https://github.com/kyle0654)
 import inspect
-import math
 from contextlib import ExitStack
-from functools import singledispatchmethod
-from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

-import einops
-import numpy as np
-import numpy.typing as npt
 import torch
 import torchvision
 import torchvision.transforms as T
 from diffusers.configuration_utils import ConfigMixin
-from diffusers.image_processor import VaeImageProcessor
 from diffusers.models.adapter import T2IAdapter
-from diffusers.models.attention_processor import (
-    AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
-    XFormersAttnProcessor,
-)
-from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL
-from diffusers.models.autoencoders.autoencoder_tiny import AutoencoderTiny
 from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
 from diffusers.schedulers.scheduling_dpmsolver_sde import DPMSolverSDEScheduler
 from diffusers.schedulers.scheduling_tcd import TCDScheduler
 from diffusers.schedulers.scheduling_utils import SchedulerMixin as Scheduler
-from PIL import Image, ImageFilter
 from pydantic import field_validator
 from torchvision.transforms.functional import resize as tv_resize
 from transformers import CLIPVisionModelWithProjection
@ -36,24 +21,19 @@ from invokeai.app.invocations.fields import (
    ConditioningField,
    DenoiseMaskField,
    FieldDescriptions,
-    ImageField,
    Input,
    InputField,
    LatentsField,
-    OutputField,
    UIType,
-    WithBoard,
-    WithMetadata,
 )
 from invokeai.app.invocations.ip_adapter import IPAdapterField
-from invokeai.app.invocations.primitives import DenoiseMaskOutput, ImageOutput, LatentsOutput
+from invokeai.app.invocations.primitives import LatentsOutput
 from invokeai.app.invocations.t2i_adapter import T2IAdapterField
 from invokeai.app.services.shared.invocation_context import InvocationContext
 from invokeai.app.util.controlnet_utils import prepare_control_image
 from invokeai.backend.ip_adapter.ip_adapter import IPAdapter
 from invokeai.backend.lora import LoRAModelRaw
-from invokeai.backend.model_manager import BaseModelType, LoadedModel
-from invokeai.backend.model_manager.config import MainConfigBase, ModelVariantType
+from invokeai.backend.model_manager import BaseModelType
 from invokeai.backend.model_patcher import ModelPatcher
 from invokeai.backend.stable_diffusion import PipelineIntermediateState, set_seamless
 from invokeai.backend.stable_diffusion.diffusion.conditioning_data import (
@ -72,221 +52,16 @@ from ...backend.stable_diffusion.diffusers_pipeline import (
    ControlNetData,
    StableDiffusionGeneratorPipeline,
    T2IAdapterData,
-    image_resized_to_grid_as_tensor,
 )
 from ...backend.stable_diffusion.schedulers import SCHEDULER_MAP
 from ...backend.util.devices import TorchDevice
-from .baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output
+from .baseinvocation import BaseInvocation, invocation
 from .controlnet_image_processors import ControlField
-from .model import ModelIdentifierField, UNetField, VAEField
+from .model import ModelIdentifierField, UNetField

 DEFAULT_PRECISION = TorchDevice.choose_torch_dtype()


-@invocation_output("scheduler_output")
-class SchedulerOutput(BaseInvocationOutput):
-    scheduler: SCHEDULER_NAME_VALUES = OutputField(description=FieldDescriptions.scheduler, ui_type=UIType.Scheduler)
-
-
-@invocation(
-    "scheduler",
-    title="Scheduler",
-    tags=["scheduler"],
-    category="latents",
-    version="1.0.0",
-)
-class SchedulerInvocation(BaseInvocation):
-    """Selects a scheduler."""
-
-    scheduler: SCHEDULER_NAME_VALUES = InputField(
-        default="euler",
-        description=FieldDescriptions.scheduler,
-        ui_type=UIType.Scheduler,
-    )
-
-    def invoke(self, context: InvocationContext) -> SchedulerOutput:
-        return SchedulerOutput(scheduler=self.scheduler)
-
-
-@invocation(
-    "create_denoise_mask",
-    title="Create Denoise Mask",
-    tags=["mask", "denoise"],
-    category="latents",
-    version="1.0.2",
-)
-class CreateDenoiseMaskInvocation(BaseInvocation):
-    """Creates mask for denoising model run."""
-
-    vae: VAEField = InputField(description=FieldDescriptions.vae, input=Input.Connection, ui_order=0)
-    image: Optional[ImageField] = InputField(default=None, description="Image which will be masked", ui_order=1)
-    mask: ImageField = InputField(description="The mask to use when pasting", ui_order=2)
-    tiled: bool = InputField(default=False, description=FieldDescriptions.tiled, ui_order=3)
-    fp32: bool = InputField(
-        default=DEFAULT_PRECISION == "float32",
-        description=FieldDescriptions.fp32,
-        ui_order=4,
-    )
-
-    def prep_mask_tensor(self, mask_image: Image.Image) -> torch.Tensor:
-        if mask_image.mode != "L":
-            mask_image = mask_image.convert("L")
-        mask_tensor: torch.Tensor = image_resized_to_grid_as_tensor(mask_image, normalize=False)
-        if mask_tensor.dim() == 3:
-            mask_tensor = mask_tensor.unsqueeze(0)
-        # if shape is not None:
-        #    mask_tensor = tv_resize(mask_tensor, shape, T.InterpolationMode.BILINEAR)
-        return mask_tensor
-
-    @torch.no_grad()
-    def invoke(self, context: InvocationContext) -> DenoiseMaskOutput:
-        if self.image is not None:
-            image = context.images.get_pil(self.image.image_name)
-            image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
-            if image_tensor.dim() == 3:
-                image_tensor = image_tensor.unsqueeze(0)
-        else:
-            image_tensor = None
-
-        mask = self.prep_mask_tensor(
-            context.images.get_pil(self.mask.image_name),
-        )
-
-        if image_tensor is not None:
-            vae_info = context.models.load(self.vae.vae)
-
-            img_mask = tv_resize(mask, image_tensor.shape[-2:], T.InterpolationMode.BILINEAR, antialias=False)
-            masked_image = image_tensor * torch.where(img_mask < 0.5, 0.0, 1.0)
-            # TODO:
-            masked_latents = ImageToLatentsInvocation.vae_encode(vae_info, self.fp32, self.tiled, masked_image.clone())
-
-            masked_latents_name = context.tensors.save(tensor=masked_latents)
-        else:
-            masked_latents_name = None
-
-        mask_name = context.tensors.save(tensor=mask)
-
-        return DenoiseMaskOutput.build(
-            mask_name=mask_name,
-            masked_latents_name=masked_latents_name,
-            gradient=False,
-        )
-
-
-@invocation_output("gradient_mask_output")
-class GradientMaskOutput(BaseInvocationOutput):
-    """Outputs a denoise mask and an image representing the total gradient of the mask."""
-
-    denoise_mask: DenoiseMaskField = OutputField(description="Mask for denoise model run")
-    expanded_mask_area: ImageField = OutputField(
-        description="Image representing the total gradient area of the mask. For paste-back purposes."
-    )
-
-
-@invocation(
-    "create_gradient_mask",
-    title="Create Gradient Mask",
-    tags=["mask", "denoise"],
-    category="latents",
-    version="1.1.0",
-)
-class CreateGradientMaskInvocation(BaseInvocation):
-    """Creates mask for denoising model run."""
-
-    mask: ImageField = InputField(default=None, description="Image which will be masked", ui_order=1)
-    edge_radius: int = InputField(
-        default=16, ge=0, description="How far to blur/expand the edges of the mask", ui_order=2
-    )
-    coherence_mode: Literal["Gaussian Blur", "Box Blur", "Staged"] = InputField(default="Gaussian Blur", ui_order=3)
-    minimum_denoise: float = InputField(
-        default=0.0, ge=0, le=1, description="Minimum denoise level for the coherence region", ui_order=4
-    )
-    image: Optional[ImageField] = InputField(
-        default=None,
-        description="OPTIONAL: Only connect for specialized Inpainting models, masked_latents will be generated from the image with the VAE",
-        title="[OPTIONAL] Image",
-        ui_order=6,
-    )
-    unet: Optional[UNetField] = InputField(
-        description="OPTIONAL: If the Unet is a specialized Inpainting model, masked_latents will be generated from the image with the VAE",
-        default=None,
-        input=Input.Connection,
-        title="[OPTIONAL] UNet",
-        ui_order=5,
-    )
-    vae: Optional[VAEField] = InputField(
-        default=None,
-        description="OPTIONAL: Only connect for specialized Inpainting models, masked_latents will be generated from the image with the VAE",
-        title="[OPTIONAL] VAE",
-        input=Input.Connection,
-        ui_order=7,
-    )
-    tiled: bool = InputField(default=False, description=FieldDescriptions.tiled, ui_order=8)
-    fp32: bool = InputField(
-        default=DEFAULT_PRECISION == "float32",
-        description=FieldDescriptions.fp32,
-        ui_order=9,
-    )
-
-    @torch.no_grad()
-    def invoke(self, context: InvocationContext) -> GradientMaskOutput:
-        mask_image = context.images.get_pil(self.mask.image_name, mode="L")
-        if self.edge_radius > 0:
-            if self.coherence_mode == "Box Blur":
-                blur_mask = mask_image.filter(ImageFilter.BoxBlur(self.edge_radius))
-            else:  # Gaussian Blur OR Staged
-                # Gaussian Blur uses standard deviation. 1/2 radius is a good approximation
-                blur_mask = mask_image.filter(ImageFilter.GaussianBlur(self.edge_radius / 2))
-
-            blur_tensor: torch.Tensor = image_resized_to_grid_as_tensor(blur_mask, normalize=False)
-
-            # redistribute blur so that the original edges are 0 and blur outwards to 1
-            blur_tensor = (blur_tensor - 0.5) * 2
-
-            threshold = 1 - self.minimum_denoise
-
-            if self.coherence_mode == "Staged":
-                # wherever the blur_tensor is less than fully masked, convert it to threshold
-                blur_tensor = torch.where((blur_tensor < 1) & (blur_tensor > 0), threshold, blur_tensor)
-            else:
-                # wherever the blur_tensor is above threshold but less than 1, drop it to threshold
-                blur_tensor = torch.where((blur_tensor > threshold) & (blur_tensor < 1), threshold, blur_tensor)
-
-        else:
-            blur_tensor: torch.Tensor = image_resized_to_grid_as_tensor(mask_image, normalize=False)
-
-        mask_name = context.tensors.save(tensor=blur_tensor.unsqueeze(1))
-
-        # compute a [0, 1] mask from the blur_tensor
-        expanded_mask = torch.where((blur_tensor < 1), 0, 1)
-        expanded_mask_image = Image.fromarray((expanded_mask.squeeze(0).numpy() * 255).astype(np.uint8), mode="L")
-        expanded_image_dto = context.images.save(expanded_mask_image)
-
-        masked_latents_name = None
-        if self.unet is not None and self.vae is not None and self.image is not None:
-            # all three fields must be present at the same time
-            main_model_config = context.models.get_config(self.unet.unet.key)
-            assert isinstance(main_model_config, MainConfigBase)
-            if main_model_config.variant is ModelVariantType.Inpaint:
-                mask = blur_tensor
-                vae_info: LoadedModel = context.models.load(self.vae.vae)
-                image = context.images.get_pil(self.image.image_name)
-                image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
-                if image_tensor.dim() == 3:
-                    image_tensor = image_tensor.unsqueeze(0)
-                img_mask = tv_resize(mask, image_tensor.shape[-2:], T.InterpolationMode.BILINEAR, antialias=False)
-                masked_image = image_tensor * torch.where(img_mask < 0.5, 0.0, 1.0)
-                masked_latents = ImageToLatentsInvocation.vae_encode(
-                    vae_info, self.fp32, self.tiled, masked_image.clone()
-                )
-                masked_latents_name = context.tensors.save(tensor=masked_latents)
-
-        return GradientMaskOutput(
-            denoise_mask=DenoiseMaskField(mask_name=mask_name, masked_latents_name=masked_latents_name, gradient=True),
-            expanded_mask_area=ImageField(image_name=expanded_image_dto.image_name),
-        )
-
-
 def get_scheduler(
    context: InvocationContext,
    scheduler_info: ModelIdentifierField,
@ -1037,469 +812,3 @@ class DenoiseLatentsInvocation(BaseInvocation):

            name = context.tensors.save(tensor=result_latents)
        return LatentsOutput.build(latents_name=name, latents=result_latents, seed=None)
-
-
-@invocation(
-    "l2i",
-    title="Latents to Image",
-    tags=["latents", "image", "vae", "l2i"],
-    category="latents",
-    version="1.2.2",
-)
-class LatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard):
-    """Generates an image from latents."""
-
-    latents: LatentsField = InputField(
-        description=FieldDescriptions.latents,
-        input=Input.Connection,
-    )
-    vae: VAEField = InputField(
-        description=FieldDescriptions.vae,
-        input=Input.Connection,
-    )
-    tiled: bool = InputField(default=False, description=FieldDescriptions.tiled)
-    fp32: bool = InputField(default=DEFAULT_PRECISION == "float32", description=FieldDescriptions.fp32)
-
-    @torch.no_grad()
-    def invoke(self, context: InvocationContext) -> ImageOutput:
-        latents = context.tensors.load(self.latents.latents_name)
-
-        vae_info = context.models.load(self.vae.vae)
-        assert isinstance(vae_info.model, (UNet2DConditionModel, AutoencoderKL, AutoencoderTiny))
-        with set_seamless(vae_info.model, self.vae.seamless_axes), vae_info as vae:
-            assert isinstance(vae, torch.nn.Module)
-            latents = latents.to(vae.device)
-            if self.fp32:
-                vae.to(dtype=torch.float32)
-
-                use_torch_2_0_or_xformers = hasattr(vae.decoder, "mid_block") and isinstance(
-                    vae.decoder.mid_block.attentions[0].processor,
-                    (
-                        AttnProcessor2_0,
-                        XFormersAttnProcessor,
-                        LoRAXFormersAttnProcessor,
-                        LoRAAttnProcessor2_0,
-                    ),
-                )
-                # if xformers or torch_2_0 is used attention block does not need
-                # to be in float32 which can save lots of memory
-                if use_torch_2_0_or_xformers:
-                    vae.post_quant_conv.to(latents.dtype)
-                    vae.decoder.conv_in.to(latents.dtype)
-                    vae.decoder.mid_block.to(latents.dtype)
-                else:
-                    latents = latents.float()
-
-            else:
-                vae.to(dtype=torch.float16)
-                latents = latents.half()
-
-            if self.tiled or context.config.get().force_tiled_decode:
-                vae.enable_tiling()
-            else:
-                vae.disable_tiling()
-
-            # clear memory as vae decode can request a lot
-            TorchDevice.empty_cache()
-
-            with torch.inference_mode():
-                # copied from diffusers pipeline
-                latents = latents / vae.config.scaling_factor
-                image = vae.decode(latents, return_dict=False)[0]
-                image = (image / 2 + 0.5).clamp(0, 1)  # denormalize
-                # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
-                np_image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-
-                image = VaeImageProcessor.numpy_to_pil(np_image)[0]
-
-        TorchDevice.empty_cache()
-
-        image_dto = context.images.save(image=image)
-
-        return ImageOutput.build(image_dto)
-
-
-LATENTS_INTERPOLATION_MODE = Literal["nearest", "linear", "bilinear", "bicubic", "trilinear", "area", "nearest-exact"]
-
-
-@invocation(
-    "lresize",
-    title="Resize Latents",
-    tags=["latents", "resize"],
-    category="latents",
-    version="1.0.2",
-)
-class ResizeLatentsInvocation(BaseInvocation):
-    """Resizes latents to explicit width/height (in pixels). Provided dimensions are floor-divided by 8."""
-
-    latents: LatentsField = InputField(
-        description=FieldDescriptions.latents,
-        input=Input.Connection,
-    )
-    width: int = InputField(
-        ge=64,
-        multiple_of=LATENT_SCALE_FACTOR,
-        description=FieldDescriptions.width,
-    )
-    height: int = InputField(
-        ge=64,
-        multiple_of=LATENT_SCALE_FACTOR,
-        description=FieldDescriptions.width,
-    )
-    mode: LATENTS_INTERPOLATION_MODE = InputField(default="bilinear", description=FieldDescriptions.interp_mode)
-    antialias: bool = InputField(default=False, description=FieldDescriptions.torch_antialias)
-
-    def invoke(self, context: InvocationContext) -> LatentsOutput:
-        latents = context.tensors.load(self.latents.latents_name)
-        device = TorchDevice.choose_torch_device()
-
-        resized_latents = torch.nn.functional.interpolate(
-            latents.to(device),
-            size=(self.height // LATENT_SCALE_FACTOR, self.width // LATENT_SCALE_FACTOR),
-            mode=self.mode,
-            antialias=self.antialias if self.mode in ["bilinear", "bicubic"] else False,
-        )
-
-        # https://discuss.huggingface.co/t/memory-usage-by-later-pipeline-stages/23699
-        resized_latents = resized_latents.to("cpu")
-
-        TorchDevice.empty_cache()
-
-        name = context.tensors.save(tensor=resized_latents)
-        return LatentsOutput.build(latents_name=name, latents=resized_latents, seed=self.latents.seed)
-
-
-@invocation(
-    "lscale",
-    title="Scale Latents",
-    tags=["latents", "resize"],
-    category="latents",
-    version="1.0.2",
-)
-class ScaleLatentsInvocation(BaseInvocation):
-    """Scales latents by a given factor."""
-
-    latents: LatentsField = InputField(
-        description=FieldDescriptions.latents,
-        input=Input.Connection,
-    )
-    scale_factor: float = InputField(gt=0, description=FieldDescriptions.scale_factor)
-    mode: LATENTS_INTERPOLATION_MODE = InputField(default="bilinear", description=FieldDescriptions.interp_mode)
-    antialias: bool = InputField(default=False, description=FieldDescriptions.torch_antialias)
-
-    def invoke(self, context: InvocationContext) -> LatentsOutput:
-        latents = context.tensors.load(self.latents.latents_name)
-
-        device = TorchDevice.choose_torch_device()
-
-        # resizing
-        resized_latents = torch.nn.functional.interpolate(
-            latents.to(device),
-            scale_factor=self.scale_factor,
-            mode=self.mode,
-            antialias=self.antialias if self.mode in ["bilinear", "bicubic"] else False,
-        )
-
-        # https://discuss.huggingface.co/t/memory-usage-by-later-pipeline-stages/23699
-        resized_latents = resized_latents.to("cpu")
-        TorchDevice.empty_cache()
-
-        name = context.tensors.save(tensor=resized_latents)
-        return LatentsOutput.build(latents_name=name, latents=resized_latents, seed=self.latents.seed)
-
-
-@invocation(
-    "i2l",
-    title="Image to Latents",
-    tags=["latents", "image", "vae", "i2l"],
-    category="latents",
-    version="1.0.2",
-)
-class ImageToLatentsInvocation(BaseInvocation):
-    """Encodes an image into latents."""
-
-    image: ImageField = InputField(
-        description="The image to encode",
-    )
-    vae: VAEField = InputField(
-        description=FieldDescriptions.vae,
-        input=Input.Connection,
-    )
-    tiled: bool = InputField(default=False, description=FieldDescriptions.tiled)
-    fp32: bool = InputField(default=DEFAULT_PRECISION == "float32", description=FieldDescriptions.fp32)
-
-    @staticmethod
-    def vae_encode(vae_info: LoadedModel, upcast: bool, tiled: bool, image_tensor: torch.Tensor) -> torch.Tensor:
-        with vae_info as vae:
-            assert isinstance(vae, torch.nn.Module)
-            orig_dtype = vae.dtype
-            if upcast:
-                vae.to(dtype=torch.float32)
-
-                use_torch_2_0_or_xformers = hasattr(vae.decoder, "mid_block") and isinstance(
-                    vae.decoder.mid_block.attentions[0].processor,
-                    (
-                        AttnProcessor2_0,
-                        XFormersAttnProcessor,
-                        LoRAXFormersAttnProcessor,
-                        LoRAAttnProcessor2_0,
-                    ),
-                )
-                # if xformers or torch_2_0 is used attention block does not need
-                # to be in float32 which can save lots of memory
-                if use_torch_2_0_or_xformers:
-                    vae.post_quant_conv.to(orig_dtype)
-                    vae.decoder.conv_in.to(orig_dtype)
-                    vae.decoder.mid_block.to(orig_dtype)
-                # else:
-                #    latents = latents.float()
-
-            else:
-                vae.to(dtype=torch.float16)
-                # latents = latents.half()
-
-            if tiled:
-                vae.enable_tiling()
-            else:
-                vae.disable_tiling()
-
-            # non_noised_latents_from_image
-            image_tensor = image_tensor.to(device=vae.device, dtype=vae.dtype)
-            with torch.inference_mode():
-                latents = ImageToLatentsInvocation._encode_to_tensor(vae, image_tensor)
-
-            latents = vae.config.scaling_factor * latents
-            latents = latents.to(dtype=orig_dtype)
-
-        return latents
-
-    @torch.no_grad()
-    def invoke(self, context: InvocationContext) -> LatentsOutput:
-        image = context.images.get_pil(self.image.image_name)
-
-        vae_info = context.models.load(self.vae.vae)
-
-        image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
-        if image_tensor.dim() == 3:
-            image_tensor = einops.rearrange(image_tensor, "c h w -> 1 c h w")
-
-        latents = self.vae_encode(vae_info, self.fp32, self.tiled, image_tensor)
-
-        latents = latents.to("cpu")
-        name = context.tensors.save(tensor=latents)
-        return LatentsOutput.build(latents_name=name, latents=latents, seed=None)
-
-    @singledispatchmethod
-    @staticmethod
-    def _encode_to_tensor(vae: AutoencoderKL, image_tensor: torch.FloatTensor) -> torch.FloatTensor:
-        assert isinstance(vae, torch.nn.Module)
-        image_tensor_dist = vae.encode(image_tensor).latent_dist
-        latents: torch.Tensor = image_tensor_dist.sample().to(
-            dtype=vae.dtype
-        )  # FIXME: uses torch.randn. make reproducible!
-        return latents
-
-    @_encode_to_tensor.register
-    @staticmethod
-    def _(vae: AutoencoderTiny, image_tensor: torch.FloatTensor) -> torch.FloatTensor:
-        assert isinstance(vae, torch.nn.Module)
-        latents: torch.FloatTensor = vae.encode(image_tensor).latents
-        return latents
-
-
-@invocation(
-    "lblend",
-    title="Blend Latents",
-    tags=["latents", "blend"],
-    category="latents",
-    version="1.0.3",
-)
-class BlendLatentsInvocation(BaseInvocation):
-    """Blend two latents using a given alpha. Latents must have same size."""
-
-    latents_a: LatentsField = InputField(
-        description=FieldDescriptions.latents,
-        input=Input.Connection,
-    )
-    latents_b: LatentsField = InputField(
-        description=FieldDescriptions.latents,
-        input=Input.Connection,
-    )
-    alpha: float = InputField(default=0.5, description=FieldDescriptions.blend_alpha)
-
-    def invoke(self, context: InvocationContext) -> LatentsOutput:
-        latents_a = context.tensors.load(self.latents_a.latents_name)
-        latents_b = context.tensors.load(self.latents_b.latents_name)
-
-        if latents_a.shape != latents_b.shape:
-            raise Exception("Latents to blend must be the same size.")
-
-        device = TorchDevice.choose_torch_device()
-
-        def slerp(
-            t: Union[float, npt.NDArray[Any]],  # FIXME: maybe use np.float32 here?
-            v0: Union[torch.Tensor, npt.NDArray[Any]],
-            v1: Union[torch.Tensor, npt.NDArray[Any]],
-            DOT_THRESHOLD: float = 0.9995,
-        ) -> Union[torch.Tensor, npt.NDArray[Any]]:
-            """
-            Spherical linear interpolation
-            Args:
-                t (float/np.ndarray): Float value between 0.0 and 1.0
-                v0 (np.ndarray): Starting vector
-                v1 (np.ndarray): Final vector
-                DOT_THRESHOLD (float): Threshold for considering the two vectors as
-                                    colineal. Not recommended to alter this.
-            Returns:
-                v2 (np.ndarray): Interpolation vector between v0 and v1
-            """
-            inputs_are_torch = False
-            if not isinstance(v0, np.ndarray):
-                inputs_are_torch = True
-                v0 = v0.detach().cpu().numpy()
-            if not isinstance(v1, np.ndarray):
-                inputs_are_torch = True
-                v1 = v1.detach().cpu().numpy()
-
-            dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
-            if np.abs(dot) > DOT_THRESHOLD:
-                v2 = (1 - t) * v0 + t * v1
-            else:
-                theta_0 = np.arccos(dot)
-                sin_theta_0 = np.sin(theta_0)
-                theta_t = theta_0 * t
-                sin_theta_t = np.sin(theta_t)
-                s0 = np.sin(theta_0 - theta_t) / sin_theta_0
-                s1 = sin_theta_t / sin_theta_0
-                v2 = s0 * v0 + s1 * v1
-
-            if inputs_are_torch:
-                v2_torch: torch.Tensor = torch.from_numpy(v2).to(device)
-                return v2_torch
-            else:
-                assert isinstance(v2, np.ndarray)
-                return v2
-
-        # blend
-        bl = slerp(self.alpha, latents_a, latents_b)
-        assert isinstance(bl, torch.Tensor)
-        blended_latents: torch.Tensor = bl  # for type checking convenience
-
-        # https://discuss.huggingface.co/t/memory-usage-by-later-pipeline-stages/23699
-        blended_latents = blended_latents.to("cpu")
-
-        TorchDevice.empty_cache()
-
-        name = context.tensors.save(tensor=blended_latents)
-        return LatentsOutput.build(latents_name=name, latents=blended_latents, seed=self.latents_a.seed)
-
-
-# The Crop Latents node was copied from @skunkworxdark's implementation here:
-# https://github.com/skunkworxdark/XYGrid_nodes/blob/74647fa9c1fa57d317a94bd43ca689af7f0aae5e/images_to_grids.py#L1117C1-L1167C80
-@invocation(
-    "crop_latents",
-    title="Crop Latents",
-    tags=["latents", "crop"],
-    category="latents",
-    version="1.0.2",
-)
-# TODO(ryand): Named `CropLatentsCoreInvocation` to prevent a conflict with custom node `CropLatentsInvocation`.
-# Currently, if the class names conflict then 'GET /openapi.json' fails.
-class CropLatentsCoreInvocation(BaseInvocation):
-    """Crops a latent-space tensor to a box specified in image-space. The box dimensions and coordinates must be
-    divisible by the latent scale factor of 8.
-    """
-
-    latents: LatentsField = InputField(
-        description=FieldDescriptions.latents,
-        input=Input.Connection,
-    )
-    x: int = InputField(
-        ge=0,
-        multiple_of=LATENT_SCALE_FACTOR,
-        description="The left x coordinate (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
-    )
-    y: int = InputField(
-        ge=0,
-        multiple_of=LATENT_SCALE_FACTOR,
-        description="The top y coordinate (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
-    )
-    width: int = InputField(
-        ge=1,
-        multiple_of=LATENT_SCALE_FACTOR,
-        description="The width (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
-    )
-    height: int = InputField(
-        ge=1,
-        multiple_of=LATENT_SCALE_FACTOR,
-        description="The height (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
-    )
-
-    def invoke(self, context: InvocationContext) -> LatentsOutput:
-        latents = context.tensors.load(self.latents.latents_name)
-
-        x1 = self.x // LATENT_SCALE_FACTOR
-        y1 = self.y // LATENT_SCALE_FACTOR
-        x2 = x1 + (self.width // LATENT_SCALE_FACTOR)
-        y2 = y1 + (self.height // LATENT_SCALE_FACTOR)
-
-        cropped_latents = latents[..., y1:y2, x1:x2]
-
-        name = context.tensors.save(tensor=cropped_latents)
-
-        return LatentsOutput.build(latents_name=name, latents=cropped_latents)
-
-
-@invocation_output("ideal_size_output")
-class IdealSizeOutput(BaseInvocationOutput):
-    """Base class for invocations that output an image"""
-
-    width: int = OutputField(description="The ideal width of the image (in pixels)")
-    height: int = OutputField(description="The ideal height of the image (in pixels)")
-
-
-@invocation(
-    "ideal_size",
-    title="Ideal Size",
-    tags=["latents", "math", "ideal_size"],
-    version="1.0.3",
-)
-class IdealSizeInvocation(BaseInvocation):
-    """Calculates the ideal size for generation to avoid duplication"""
-
-    width: int = InputField(default=1024, description="Final image width")
-    height: int = InputField(default=576, description="Final image height")
-    unet: UNetField = InputField(default=None, description=FieldDescriptions.unet)
-    multiplier: float = InputField(
-        default=1.0,
-        description="Amount to multiply the model's dimensions by when calculating the ideal size (may result in initial generation artifacts if too large)",
-    )
-
-    def trim_to_multiple_of(self, *args: int, multiple_of: int = LATENT_SCALE_FACTOR) -> Tuple[int, ...]:
-        return tuple((x - x % multiple_of) for x in args)
-
-    def invoke(self, context: InvocationContext) -> IdealSizeOutput:
-        unet_config = context.models.get_config(self.unet.unet.key)
-        aspect = self.width / self.height
-        dimension: float = 512
-        if unet_config.base == BaseModelType.StableDiffusion2:
-            dimension = 768
-        elif unet_config.base == BaseModelType.StableDiffusionXL:
-            dimension = 1024
-        dimension = dimension * self.multiplier
-        min_dimension = math.floor(dimension * 0.5)
-        model_area = dimension * dimension  # hardcoded for now since all models are trained on square images
-
-        if aspect > 1.0:
-            init_height = max(min_dimension, math.sqrt(model_area / aspect))
-            init_width = init_height * aspect
-        else:
-            init_width = max(min_dimension, math.sqrt(model_area * aspect))
-            init_height = init_width / aspect
-
-        scaled_width, scaled_height = self.trim_to_multiple_of(
-            math.floor(init_width),
-            math.floor(init_height),
-        )
-
-        return IdealSizeOutput(width=scaled_width, height=scaled_height)
--- a/invokeai/app/invocations/ideal_size.py
+++ b/invokeai/app/invocations/ideal_size.py
@ -0,0 +1,65 @@
+import math
+from typing import Tuple
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output
+from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR
+from invokeai.app.invocations.fields import FieldDescriptions, InputField, OutputField
+from invokeai.app.invocations.model import UNetField
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.model_manager.config import BaseModelType
+
+
+@invocation_output("ideal_size_output")
+class IdealSizeOutput(BaseInvocationOutput):
+    """Output of the Ideal Size invocation: the computed width and height (in pixels)."""
+
+    width: int = OutputField(description="The ideal width of the image (in pixels)")
+    height: int = OutputField(description="The ideal height of the image (in pixels)")
+
+
+@invocation(
+    "ideal_size",
+    title="Ideal Size",
+    tags=["latents", "math", "ideal_size"],
+    version="1.0.3",
+)
+class IdealSizeInvocation(BaseInvocation):
+    """Calculates the ideal size for generation to avoid duplication"""
+
+    # Requested final image size. invoke() only uses the ratio width / height, not the absolute values.
+    width: int = InputField(default=1024, description="Final image width")
+    height: int = InputField(default=576, description="Final image height")
+    # NOTE(review): the default is None, but invoke() reads self.unet.unet.key unconditionally, so an unset
+    # unet fails there. Presumably this field is always connected in practice - confirm.
+    unet: UNetField = InputField(default=None, description=FieldDescriptions.unet)
+    multiplier: float = InputField(
+        default=1.0,
+        description="Amount to multiply the model's dimensions by when calculating the ideal size (may result in "
+        "initial generation artifacts if too large)",
+    )
+
+    def trim_to_multiple_of(self, *args: int, multiple_of: int = LATENT_SCALE_FACTOR) -> Tuple[int, ...]:
+        """Round each value in `args` down to a multiple of `multiple_of` and return the results as a tuple."""
+        return tuple((x - x % multiple_of) for x in args)
+
+    def invoke(self, context: InvocationContext) -> IdealSizeOutput:
+        """Compute a generation size that keeps the requested aspect ratio at about the model's base area.
+
+        The base dimension is 512 unless the UNet's config reports StableDiffusion2 (768) or StableDiffusionXL
+        (1024); it is then scaled by `multiplier`. The shorter side is kept at no less than half of that scaled
+        dimension, and both sides are floored and trimmed down to a multiple of LATENT_SCALE_FACTOR.
+
+        Returns:
+            IdealSizeOutput holding the resulting width and height.
+        """
+        unet_config = context.models.get_config(self.unet.unet.key)
+        # NOTE(review): no lower bound is declared on `height`, so height == 0 raises ZeroDivisionError here.
+        aspect = self.width / self.height
+        dimension: float = 512
+        if unet_config.base == BaseModelType.StableDiffusion2:
+            dimension = 768
+        elif unet_config.base == BaseModelType.StableDiffusionXL:
+            dimension = 1024
+        dimension = dimension * self.multiplier
+        # Lower limit applied to the shorter side below.
+        min_dimension = math.floor(dimension * 0.5)
+        model_area = dimension * dimension  # hardcoded for now since all models are trained on square images
+
+        if aspect > 1.0:
+            # Wider than tall: derive the height from the target area, then the width from the aspect ratio.
+            init_height = max(min_dimension, math.sqrt(model_area / aspect))
+            init_width = init_height * aspect
+        else:
+            # Square or taller than wide: derive the width from the target area, then the height.
+            init_width = max(min_dimension, math.sqrt(model_area * aspect))
+            init_height = init_width / aspect
+
+        scaled_width, scaled_height = self.trim_to_multiple_of(
+            math.floor(init_width),
+            math.floor(init_height),
+        )
+
+        return IdealSizeOutput(width=scaled_width, height=scaled_height)
--- a/invokeai/app/invocations/image_to_latents.py
+++ b/invokeai/app/invocations/image_to_latents.py
@ -0,0 +1,125 @@
+from functools import singledispatchmethod
+
+import einops
+import torch
+from diffusers.models.attention_processor import (
+    AttnProcessor2_0,
+    LoRAAttnProcessor2_0,
+    LoRAXFormersAttnProcessor,
+    XFormersAttnProcessor,
+)
+from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL
+from diffusers.models.autoencoders.autoencoder_tiny import AutoencoderTiny
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
+from invokeai.app.invocations.denoise_latents import DEFAULT_PRECISION
+from invokeai.app.invocations.fields import (
+    FieldDescriptions,
+    ImageField,
+    Input,
+    InputField,
+)
+from invokeai.app.invocations.model import VAEField
+from invokeai.app.invocations.primitives import LatentsOutput
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.model_manager import LoadedModel
+from invokeai.backend.stable_diffusion.diffusers_pipeline import image_resized_to_grid_as_tensor
+
+
+@invocation(
+    "i2l",
+    title="Image to Latents",
+    tags=["latents", "image", "vae", "i2l"],
+    category="latents",
+    version="1.0.2",
+)
+class ImageToLatentsInvocation(BaseInvocation):
+    """Encodes an image into latents."""
+
+    image: ImageField = InputField(
+        description="The image to encode",
+    )
+    vae: VAEField = InputField(
+        description=FieldDescriptions.vae,
+        input=Input.Connection,
+    )
+    tiled: bool = InputField(default=False, description=FieldDescriptions.tiled)
+    # Defaults to True only when DEFAULT_PRECISION is "float32"; invoke() passes it to vae_encode() as `upcast`.
+    fp32: bool = InputField(default=DEFAULT_PRECISION == "float32", description=FieldDescriptions.fp32)
+
+    @staticmethod
+    def vae_encode(vae_info: LoadedModel, upcast: bool, tiled: bool, image_tensor: torch.Tensor) -> torch.Tensor:
+        """Encode `image_tensor` with the VAE held by `vae_info` and return scaled latents.
+
+        Args:
+            vae_info: Loaded model wrapper; entering it as a context manager yields the VAE module.
+            upcast: If True the VAE is cast to float32, otherwise to float16.
+            tiled: Enables VAE tiling when True and disables it when False.
+            image_tensor: Image tensor to encode; it is moved to the VAE's device and dtype first.
+
+        Returns:
+            The encoded latents multiplied by `vae.config.scaling_factor`, cast to the dtype the VAE had on entry.
+        """
+        with vae_info as vae:
+            assert isinstance(vae, torch.nn.Module)
+            # Remember the incoming dtype: it is used for the selective downcast below and for the returned latents.
+            # NOTE(review): the casts applied to the VAE below are not undone before returning - confirm that
+            # callers expect the model to keep the new dtype.
+            orig_dtype = vae.dtype
+            if upcast:
+                vae.to(dtype=torch.float32)
+
+                use_torch_2_0_or_xformers = hasattr(vae.decoder, "mid_block") and isinstance(
+                    vae.decoder.mid_block.attentions[0].processor,
+                    (
+                        AttnProcessor2_0,
+                        XFormersAttnProcessor,
+                        LoRAXFormersAttnProcessor,
+                        LoRAAttnProcessor2_0,
+                    ),
+                )
+                # if xformers or torch_2_0 is used attention block does not need
+                # to be in float32 which can save lots of memory
+                if use_torch_2_0_or_xformers:
+                    vae.post_quant_conv.to(orig_dtype)
+                    vae.decoder.conv_in.to(orig_dtype)
+                    vae.decoder.mid_block.to(orig_dtype)
+                # else:
+                #    latents = latents.float()
+
+            else:
+                vae.to(dtype=torch.float16)
+                # latents = latents.half()
+
+            if tiled:
+                vae.enable_tiling()
+            else:
+                vae.disable_tiling()
+
+            # non_noised_latents_from_image
+            image_tensor = image_tensor.to(device=vae.device, dtype=vae.dtype)
+            with torch.inference_mode():
+                # Dispatches to the encoder implementation matching the VAE's type (see _encode_to_tensor).
+                latents = ImageToLatentsInvocation._encode_to_tensor(vae, image_tensor)
+
+            latents = vae.config.scaling_factor * latents
+            latents = latents.to(dtype=orig_dtype)
+
+        return latents
+
+    @torch.no_grad()
+    def invoke(self, context: InvocationContext) -> LatentsOutput:
+        """Load the input image, encode it with the VAE and save the resulting latents.
+
+        The image is converted to RGB and then to a tensor; a 3-D tensor gets a leading batch axis. The latents
+        are moved to the CPU before being saved, and the returned output carries no seed (seed=None).
+        """
+        image = context.images.get_pil(self.image.image_name)
+
+        vae_info = context.models.load(self.vae.vae)
+
+        image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
+        if image_tensor.dim() == 3:
+            image_tensor = einops.rearrange(image_tensor, "c h w -> 1 c h w")
+
+        latents = self.vae_encode(vae_info, self.fp32, self.tiled, image_tensor)
+
+        latents = latents.to("cpu")
+        name = context.tensors.save(tensor=latents)
+        return LatentsOutput.build(latents_name=name, latents=latents, seed=None)
+
+    # Single-dispatch on the runtime type of `vae`; this first definition is the fallback implementation.
+    @singledispatchmethod
+    @staticmethod
+    def _encode_to_tensor(vae: AutoencoderKL, image_tensor: torch.FloatTensor) -> torch.FloatTensor:
+        """Sample latents from the `latent_dist` returned by `vae.encode()` and cast them to the VAE's dtype."""
+        assert isinstance(vae, torch.nn.Module)
+        image_tensor_dist = vae.encode(image_tensor).latent_dist
+        latents: torch.Tensor = image_tensor_dist.sample().to(
+            dtype=vae.dtype
+        )  # FIXME: uses torch.randn. make reproducible!
+        return latents
+
+    # Registered variant; presumably selected through the AutoencoderTiny annotation on `vae` - confirm.
+    @_encode_to_tensor.register
+    @staticmethod
+    def _(vae: AutoencoderTiny, image_tensor: torch.FloatTensor) -> torch.FloatTensor:
+        """Return the `.latents` attribute of `vae.encode()` directly; nothing is sampled in this variant."""
+        assert isinstance(vae, torch.nn.Module)
+        latents: torch.FloatTensor = vae.encode(image_tensor).latents
+        return latents
--- a/invokeai/app/invocations/latents_to_image.py
+++ b/invokeai/app/invocations/latents_to_image.py
@ -0,0 +1,107 @@
+import torch
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.models.attention_processor import (
+    AttnProcessor2_0,
+    LoRAAttnProcessor2_0,
+    LoRAXFormersAttnProcessor,
+    XFormersAttnProcessor,
+)
+from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL
+from diffusers.models.autoencoders.autoencoder_tiny import AutoencoderTiny
+from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
+from invokeai.app.invocations.denoise_latents import DEFAULT_PRECISION
+from invokeai.app.invocations.fields import (
+    FieldDescriptions,
+    Input,
+    InputField,
+    LatentsField,
+    WithBoard,
+    WithMetadata,
+)
+from invokeai.app.invocations.model import VAEField
+from invokeai.app.invocations.primitives import ImageOutput
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.stable_diffusion import set_seamless
+from invokeai.backend.util.devices import TorchDevice
+
+
+@invocation(
+    "l2i",
+    title="Latents to Image",
+    tags=["latents", "image", "vae", "l2i"],
+    category="latents",
+    version="1.2.2",
+)
+class LatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard):
+    """Generates an image from latents."""
+
+    latents: LatentsField = InputField(
+        description=FieldDescriptions.latents,
+        input=Input.Connection,
+    )
+    vae: VAEField = InputField(
+        description=FieldDescriptions.vae,
+        input=Input.Connection,
+    )
+    tiled: bool = InputField(default=False, description=FieldDescriptions.tiled)
+    # Defaults to True only when DEFAULT_PRECISION is "float32"; selects float32 vs. float16 decoding in invoke().
+    fp32: bool = InputField(default=DEFAULT_PRECISION == "float32", description=FieldDescriptions.fp32)
+
+    @torch.no_grad()
+    def invoke(self, context: InvocationContext) -> ImageOutput:
+        """Decode the stored latents with the VAE, save the resulting image and return it as an ImageOutput.
+
+        Only the first image of the decoded batch is saved.
+        """
+        latents = context.tensors.load(self.latents.latents_name)
+
+        vae_info = context.models.load(self.vae.vae)
+        assert isinstance(vae_info.model, (UNet2DConditionModel, AutoencoderKL, AutoencoderTiny))
+        # set_seamless() wraps the model for the axes configured on the VAE field while the model is in use.
+        with set_seamless(vae_info.model, self.vae.seamless_axes), vae_info as vae:
+            assert isinstance(vae, torch.nn.Module)
+            latents = latents.to(vae.device)
+            if self.fp32:
+                vae.to(dtype=torch.float32)
+
+                use_torch_2_0_or_xformers = hasattr(vae.decoder, "mid_block") and isinstance(
+                    vae.decoder.mid_block.attentions[0].processor,
+                    (
+                        AttnProcessor2_0,
+                        XFormersAttnProcessor,
+                        LoRAXFormersAttnProcessor,
+                        LoRAAttnProcessor2_0,
+                    ),
+                )
+                # if xformers or torch_2_0 is used attention block does not need
+                # to be in float32 which can save lots of memory
+                if use_torch_2_0_or_xformers:
+                    vae.post_quant_conv.to(latents.dtype)
+                    vae.decoder.conv_in.to(latents.dtype)
+                    vae.decoder.mid_block.to(latents.dtype)
+                else:
+                    latents = latents.float()
+
+            else:
+                vae.to(dtype=torch.float16)
+                latents = latents.half()
+
+            # Tiling is used when requested on this node or when the app config sets force_tiled_decode.
+            if self.tiled or context.config.get().force_tiled_decode:
+                vae.enable_tiling()
+            else:
+                vae.disable_tiling()
+
+            # clear memory as vae decode can request a lot
+            TorchDevice.empty_cache()
+
+            with torch.inference_mode():
+                # copied from diffusers pipeline
+                latents = latents / vae.config.scaling_factor
+                image = vae.decode(latents, return_dict=False)[0]
+                image = (image / 2 + 0.5).clamp(0, 1)  # denormalize
+                # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+                # assumes the decoder returns (batch, channels, height, width); permuted to channels-last - TODO confirm
+                np_image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+                # Keep only the first image of the batch.
+                image = VaeImageProcessor.numpy_to_pil(np_image)[0]
+
+        TorchDevice.empty_cache()
+
+        image_dto = context.images.save(image=image)
+
+        return ImageOutput.build(image_dto)
--- a/invokeai/app/invocations/resize_latents.py
+++ b/invokeai/app/invocations/resize_latents.py
@ -0,0 +1,103 @@
+from typing import Literal
+
+import torch
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
+from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR
+from invokeai.app.invocations.fields import (
+    FieldDescriptions,
+    Input,
+    InputField,
+    LatentsField,
+)
+from invokeai.app.invocations.primitives import LatentsOutput
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.util.devices import TorchDevice
+
+LATENTS_INTERPOLATION_MODE = Literal["nearest", "linear", "bilinear", "bicubic", "trilinear", "area", "nearest-exact"]
+
+
+@invocation(
+    "lresize",
+    title="Resize Latents",
+    tags=["latents", "resize"],
+    category="latents",
+    version="1.0.2",
+)
+class ResizeLatentsInvocation(BaseInvocation):
+    """Resizes latents to explicit width/height (in pixels). Provided dimensions are floor-divided by 8."""
+
+    latents: LatentsField = InputField(
+        description=FieldDescriptions.latents,
+        input=Input.Connection,
+    )
+    width: int = InputField(
+        ge=64,
+        multiple_of=LATENT_SCALE_FACTOR,
+        description=FieldDescriptions.width,
+    )
+    height: int = InputField(
+        ge=64,
+        multiple_of=LATENT_SCALE_FACTOR,
+        description=FieldDescriptions.height,
+    )
+    mode: LATENTS_INTERPOLATION_MODE = InputField(default="bilinear", description=FieldDescriptions.interp_mode)
+    antialias: bool = InputField(default=False, description=FieldDescriptions.torch_antialias)
+
+    def invoke(self, context: InvocationContext) -> LatentsOutput:
+        """Interpolate the referenced latents to the requested size and save the result as a new tensor."""
+        latents = context.tensors.load(self.latents.latents_name)
+        device = TorchDevice.choose_torch_device()
+        # Pixel size is floor-divided to latent size; antialias is forced off unless mode is bilinear/bicubic.
+        resized_latents = torch.nn.functional.interpolate(
+            latents.to(device),
+            size=(self.height // LATENT_SCALE_FACTOR, self.width // LATENT_SCALE_FACTOR),
+            mode=self.mode,
+            antialias=self.antialias if self.mode in ["bilinear", "bicubic"] else False,
+        )
+
+        # https://discuss.huggingface.co/t/memory-usage-by-later-pipeline-stages/23699
+        resized_latents = resized_latents.to("cpu")
+        TorchDevice.empty_cache()
+
+        name = context.tensors.save(tensor=resized_latents)
+        return LatentsOutput.build(latents_name=name, latents=resized_latents, seed=self.latents.seed)
+
+
+@invocation("lscale", title="Scale Latents", tags=["latents", "resize"], category="latents", version="1.0.2")
+class ScaleLatentsInvocation(BaseInvocation):
+    """Scales latents by a given factor."""
+
+    # Reference to the latents tensor to rescale; declared as a connection input (`Input.Connection`).
+    latents: LatentsField = InputField(description=FieldDescriptions.latents, input=Input.Connection)
+    # Multiplier handed to torch's `interpolate` as `scale_factor`; constrained with `gt=0`.
+    scale_factor: float = InputField(gt=0, description=FieldDescriptions.scale_factor)
+    # Interpolation mode name, passed to torch's `interpolate` unchanged.
+    mode: LATENTS_INTERPOLATION_MODE = InputField(default="bilinear", description=FieldDescriptions.interp_mode)
+    # Requested antialiasing; only forwarded when `mode` is "bilinear" or "bicubic".
+    antialias: bool = InputField(default=False, description=FieldDescriptions.torch_antialias)
+
+    def invoke(self, context: InvocationContext) -> LatentsOutput:
+        """Load the referenced latents, rescale them by `scale_factor`, and save the result.
+
+        Returns:
+            A `LatentsOutput` built from the saved tensor's name, the tensor itself, and the input's seed.
+        """
+        source = context.tensors.load(self.latents.latents_name)
+        target_device = TorchDevice.choose_torch_device()
+
+        # Any mode other than bilinear/bicubic gets antialias forced off.
+        use_antialias = self.mode in ("bilinear", "bicubic") and self.antialias
+        scaled = torch.nn.functional.interpolate(
+            source.to(target_device),
+            scale_factor=self.scale_factor,
+            mode=self.mode,
+            antialias=use_antialias,
+        )
+
+        # https://discuss.huggingface.co/t/memory-usage-by-later-pipeline-stages/23699
+        scaled = scaled.to("cpu")
+        TorchDevice.empty_cache()
+
+        saved_name = context.tensors.save(tensor=scaled)
+        return LatentsOutput.build(latents_name=saved_name, latents=scaled, seed=self.latents.seed)
--- a/invokeai/app/invocations/scheduler.py
+++ b/invokeai/app/invocations/scheduler.py
@ -0,0 +1,34 @@
+from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output
+from invokeai.app.invocations.constants import SCHEDULER_NAME_VALUES
+from invokeai.app.invocations.fields import (
+    FieldDescriptions,
+    InputField,
+    OutputField,
+    UIType,
+)
+from invokeai.app.services.shared.invocation_context import InvocationContext
+
+
+@invocation_output("scheduler_output")
+class SchedulerOutput(BaseInvocationOutput):  # Invocation output with a single field: the scheduler name.
+    scheduler: SCHEDULER_NAME_VALUES = OutputField(description=FieldDescriptions.scheduler, ui_type=UIType.Scheduler)
+
+
+@invocation("scheduler", title="Scheduler", tags=["scheduler"], category="latents", version="1.0.0")
+class SchedulerInvocation(BaseInvocation):
+    """Selects a scheduler."""
+
+    # Scheduler name to emit; defaults to "euler".
+    scheduler: SCHEDULER_NAME_VALUES = InputField(
+        default="euler",
+        description=FieldDescriptions.scheduler,
+        ui_type=UIType.Scheduler,
+    )
+
+    def invoke(self, context: InvocationContext) -> SchedulerOutput:
+        """Wrap the selected scheduler name in a `SchedulerOutput`.
+
+        `context` is not used; the field value is forwarded unchanged.
+        """
+        selected = self.scheduler
+        return SchedulerOutput(scheduler=selected)
--- a/invokeai/invocation_api/init.py
+++ b/invokeai/invocation_api/init.py
@ -12,6 +12,7 @@ from invokeai.app.invocations.baseinvocation import (
    invocation_output,
 )
 from invokeai.app.invocations.constants import SCHEDULER_NAME_VALUES
+from invokeai.app.invocations.scheduler import SchedulerOutput
 from invokeai.app.invocations.fields import (
    BoardField,
    ColorField,
@ -31,7 +32,6 @@ from invokeai.app.invocations.fields import (
    WithMetadata,
    WithWorkflow,
 )
-from invokeai.app.invocations.latent import SchedulerOutput
 from invokeai.app.invocations.metadata import MetadataItemField, MetadataItemOutput, MetadataOutput
 from invokeai.app.invocations.model import (
    CLIPField,
@ -108,7 +108,7 @@ __all__ = [
    "WithBoard",
    "WithMetadata",
    "WithWorkflow",
-    # invokeai.app.invocations.latent
+    # invokeai.app.invocations.scheduler
    "SchedulerOutput",
    # invokeai.app.invocations.metadata
    "MetadataItemField",
--- a/pyproject.toml
+++ b/pyproject.toml
@ -224,7 +224,7 @@ follow_imports = "skip" # skips type checking of the modules listed below
 module = [
  "invokeai.app.api.routers.models",
  "invokeai.app.invocations.compel",
-  "invokeai.app.invocations.latent",
+  "invokeai.app.invocations.denoise_latents",
  "invokeai.app.services.invocation_stats.invocation_stats_default",
  "invokeai.app.services.model_manager.model_manager_base",
  "invokeai.app.services.model_manager.model_manager_default",