Depth Anything V2 (#6674)

- Replaced the previous manual DepthAnything implementation with the
`transformers` implementation so we can pick up upstream features (a short usage sketch follows this list).
- Plugged the DepthAnything models into Invoke's Model Manager so it handles
loading and caching.
- Added `small_v2` as a new model size option that uses Depth Anything V2; it
is now also the default in the Linear UI.
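
For context, here is a minimal sketch of the new loading path using the `transformers` depth-estimation pipeline directly. The repo ID comes from the `DEPTH_ANYTHING_MODELS` map in the diff below; the input filename is only illustrative, and running this downloads the model from the Hugging Face Hub:

```python
from PIL import Image
from transformers import pipeline

# Depth Anything V2 Small (Apache 2.0 licensed), as registered in DEPTH_ANYTHING_MODELS
pipe = pipeline(task="depth-estimation", model="depth-anything/Depth-Anything-V2-Small-hf")

image = Image.open("input.png")  # illustrative input
depth_map = pipe(image)["depth"]  # the pipeline returns a PIL depth map under the "depth" key
depth_map.save("depth.png")
```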


![opera_TxRhmbFole](https://github.com/user-attachments/assets/2a25abe3-ba0b-4f97-b75a-2ce5fd6246e6)


# Merge

Review and merge.
blessedcoolant 2024-08-07 20:26:58 +05:30 committed by GitHub
commit 6cd40965c4
14 changed files with 200 additions and 796 deletions

View File

@@ -21,6 +21,8 @@ from controlnet_aux import (
from controlnet_aux.util import HWC3, ade_palette
from PIL import Image
from pydantic import BaseModel, Field, field_validator, model_validator
from transformers import pipeline
from transformers.pipelines import DepthEstimationPipeline
from invokeai.app.invocations.baseinvocation import (
BaseInvocation,
@@ -44,13 +46,12 @@ from invokeai.app.invocations.util import validate_begin_end_step, validate_weig
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.app.util.controlnet_utils import CONTROLNET_MODE_VALUES, CONTROLNET_RESIZE_VALUES, heuristic_resize
from invokeai.backend.image_util.canny import get_canny_edges
from invokeai.backend.image_util.depth_anything import DEPTH_ANYTHING_MODELS, DepthAnythingDetector
from invokeai.backend.image_util.depth_anything.depth_anything_pipeline import DepthAnythingPipeline
from invokeai.backend.image_util.dw_openpose import DWPOSE_MODELS, DWOpenposeDetector
from invokeai.backend.image_util.hed import HEDProcessor
from invokeai.backend.image_util.lineart import LineartProcessor
from invokeai.backend.image_util.lineart_anime import LineartAnimeProcessor
from invokeai.backend.image_util.util import np_to_pil, pil_to_np
from invokeai.backend.util.devices import TorchDevice
class ControlField(BaseModel):
@@ -592,7 +593,14 @@ class ColorMapImageProcessorInvocation(ImageProcessorInvocation):
return color_map
DEPTH_ANYTHING_MODEL_SIZES = Literal["large", "base", "small"]
DEPTH_ANYTHING_MODEL_SIZES = Literal["large", "base", "small", "small_v2"]
# The DepthAnything V2 Small model is licensed under Apache 2.0, but the base and large models are not.
DEPTH_ANYTHING_MODELS = {
"large": "LiheYoung/depth-anything-large-hf",
"base": "LiheYoung/depth-anything-base-hf",
"small": "LiheYoung/depth-anything-small-hf",
"small_v2": "depth-anything/Depth-Anything-V2-Small-hf",
}
@invocation(
@@ -600,28 +608,33 @@ DEPTH_ANYTHING_MODEL_SIZES = Literal["large", "base", "small"]
title="Depth Anything Processor",
tags=["controlnet", "depth", "depth anything"],
category="controlnet",
version="1.1.2",
version="1.1.3",
)
class DepthAnythingImageProcessorInvocation(ImageProcessorInvocation):
"""Generates a depth map based on the Depth Anything algorithm"""
model_size: DEPTH_ANYTHING_MODEL_SIZES = InputField(
default="small", description="The size of the depth model to use"
default="small_v2", description="The size of the depth model to use"
)
resolution: int = InputField(default=512, ge=1, description=FieldDescriptions.image_res)
def run_processor(self, image: Image.Image) -> Image.Image:
def loader(model_path: Path):
return DepthAnythingDetector.load_model(
model_path, model_size=self.model_size, device=TorchDevice.choose_torch_device()
)
def load_depth_anything(model_path: Path):
depth_anything_pipeline = pipeline(model=str(model_path), task="depth-estimation", local_files_only=True)
assert isinstance(depth_anything_pipeline, DepthEstimationPipeline)
return DepthAnythingPipeline(depth_anything_pipeline)
with self._context.models.load_remote_model(
source=DEPTH_ANYTHING_MODELS[self.model_size], loader=loader
) as model:
depth_anything_detector = DepthAnythingDetector(model, TorchDevice.choose_torch_device())
processed_image = depth_anything_detector(image=image, resolution=self.resolution)
return processed_image
source=DEPTH_ANYTHING_MODELS[self.model_size], loader=load_depth_anything
) as depth_anything_detector:
assert isinstance(depth_anything_detector, DepthAnythingPipeline)
depth_map = depth_anything_detector.generate_depth(image)
# Resize to the user-specified target resolution, preserving the aspect ratio
new_height = int(image.size[1] * (self.resolution / image.size[0]))
depth_map = depth_map.resize((self.resolution, new_height))
return depth_map
@invocation(

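The resize at the end of `run_processor` pins the output width to the requested `resolution` and scales the height by the source aspect ratio. A small standalone sketch of that calculation (the 1024x768 example is hypothetical):

```python
from PIL import Image

def resize_depth_map(depth_map: Image.Image, original: Image.Image, resolution: int) -> Image.Image:
    # Width is pinned to `resolution`; height follows the original aspect ratio.
    new_height = int(original.size[1] * (resolution / original.size[0]))
    return depth_map.resize((resolution, new_height))

# e.g. a 1024x768 source with resolution=512 produces a 512x384 depth map
```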
View File

@@ -1,90 +0,0 @@
from pathlib import Path
from typing import Literal
import cv2
import numpy as np
import torch
import torch.nn.functional as F
from einops import repeat
from PIL import Image
from torchvision.transforms import Compose
from invokeai.app.services.config.config_default import get_config
from invokeai.backend.image_util.depth_anything.model.dpt import DPT_DINOv2
from invokeai.backend.image_util.depth_anything.utilities.util import NormalizeImage, PrepareForNet, Resize
from invokeai.backend.util.logging import InvokeAILogger
config = get_config()
logger = InvokeAILogger.get_logger(config=config)
DEPTH_ANYTHING_MODELS = {
"large": "https://huggingface.co/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vitl14.pth?download=true",
"base": "https://huggingface.co/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vitb14.pth?download=true",
"small": "https://huggingface.co/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vits14.pth?download=true",
}
transform = Compose(
[
Resize(
width=518,
height=518,
resize_target=False,
keep_aspect_ratio=True,
ensure_multiple_of=14,
resize_method="lower_bound",
image_interpolation_method=cv2.INTER_CUBIC,
),
NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
PrepareForNet(),
]
)
class DepthAnythingDetector:
def __init__(self, model: DPT_DINOv2, device: torch.device) -> None:
self.model = model
self.device = device
@staticmethod
def load_model(
model_path: Path, device: torch.device, model_size: Literal["large", "base", "small"] = "small"
) -> DPT_DINOv2:
match model_size:
case "small":
model = DPT_DINOv2(encoder="vits", features=64, out_channels=[48, 96, 192, 384])
case "base":
model = DPT_DINOv2(encoder="vitb", features=128, out_channels=[96, 192, 384, 768])
case "large":
model = DPT_DINOv2(encoder="vitl", features=256, out_channels=[256, 512, 1024, 1024])
model.load_state_dict(torch.load(model_path.as_posix(), map_location="cpu"))
model.eval()
model.to(device)
return model
def __call__(self, image: Image.Image, resolution: int = 512) -> Image.Image:
if not self.model:
logger.warn("DepthAnything model was not loaded. Returning original image")
return image
np_image = np.array(image, dtype=np.uint8)
np_image = np_image[:, :, ::-1] / 255.0
image_height, image_width = np_image.shape[:2]
np_image = transform({"image": np_image})["image"]
tensor_image = torch.from_numpy(np_image).unsqueeze(0).to(self.device)
with torch.no_grad():
depth = self.model(tensor_image)
depth = F.interpolate(depth[None], (image_height, image_width), mode="bilinear", align_corners=False)[0, 0]
depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
depth_map = repeat(depth, "h w -> h w 3").cpu().numpy().astype(np.uint8)
depth_map = Image.fromarray(depth_map)
new_height = int(image_height * (resolution / image_width))
depth_map = depth_map.resize((resolution, new_height))
return depth_map

View File

@@ -0,0 +1,31 @@
from typing import Optional
import torch
from PIL import Image
from transformers.pipelines import DepthEstimationPipeline
from invokeai.backend.raw_model import RawModel
class DepthAnythingPipeline(RawModel):
"""Custom wrapper for the Depth Estimation pipeline from transformers adding compatibility
for Invoke's Model Management System"""
def __init__(self, pipeline: DepthEstimationPipeline) -> None:
self._pipeline = pipeline
def generate_depth(self, image: Image.Image) -> Image.Image:
depth_map = self._pipeline(image)["depth"]
assert isinstance(depth_map, Image.Image)
return depth_map
def to(self, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None):
if device is not None and device.type not in {"cpu", "cuda"}:
device = None
self._pipeline.model.to(device=device, dtype=dtype)
self._pipeline.device = self._pipeline.model.device
def calc_size(self) -> int:
from invokeai.backend.model_manager.load.model_util import calc_module_size
return calc_module_size(self._pipeline.model)

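A hedged usage sketch of the new `DepthAnythingPipeline` wrapper, mirroring how `run_processor` above wires it up. The local model path is hypothetical; in Invoke it is supplied by the Model Manager via `context.models.load_remote_model()`:

```python
from pathlib import Path

from PIL import Image
from transformers import pipeline
from transformers.pipelines import DepthEstimationPipeline

from invokeai.backend.image_util.depth_anything.depth_anything_pipeline import DepthAnythingPipeline

# Hypothetical local snapshot path; normally resolved by the Model Manager's download cache.
model_path = Path("/path/to/models/Depth-Anything-V2-Small-hf")

hf_pipeline = pipeline(model=str(model_path), task="depth-estimation", local_files_only=True)
assert isinstance(hf_pipeline, DepthEstimationPipeline)

depth_anything = DepthAnythingPipeline(hf_pipeline)
depth_map = depth_anything.generate_depth(Image.open("input.png"))
```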
View File

@@ -1,145 +0,0 @@
import torch.nn as nn
def _make_scratch(in_shape, out_shape, groups=1, expand=False):
scratch = nn.Module()
out_shape1 = out_shape
out_shape2 = out_shape
out_shape3 = out_shape
if len(in_shape) >= 4:
out_shape4 = out_shape
if expand:
out_shape1 = out_shape
out_shape2 = out_shape * 2
out_shape3 = out_shape * 4
if len(in_shape) >= 4:
out_shape4 = out_shape * 8
scratch.layer1_rn = nn.Conv2d(
in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
)
scratch.layer2_rn = nn.Conv2d(
in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
)
scratch.layer3_rn = nn.Conv2d(
in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
)
if len(in_shape) >= 4:
scratch.layer4_rn = nn.Conv2d(
in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
)
return scratch
class ResidualConvUnit(nn.Module):
"""Residual convolution module."""
def __init__(self, features, activation, bn):
"""Init.
Args:
features (int): number of features
"""
super().__init__()
self.bn = bn
self.groups = 1
self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
if self.bn:
self.bn1 = nn.BatchNorm2d(features)
self.bn2 = nn.BatchNorm2d(features)
self.activation = activation
self.skip_add = nn.quantized.FloatFunctional()
def forward(self, x):
"""Forward pass.
Args:
x (tensor): input
Returns:
tensor: output
"""
out = self.activation(x)
out = self.conv1(out)
if self.bn:
out = self.bn1(out)
out = self.activation(out)
out = self.conv2(out)
if self.bn:
out = self.bn2(out)
if self.groups > 1:
out = self.conv_merge(out)
return self.skip_add.add(out, x)
class FeatureFusionBlock(nn.Module):
"""Feature fusion block."""
def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True, size=None):
"""Init.
Args:
features (int): number of features
"""
super(FeatureFusionBlock, self).__init__()
self.deconv = deconv
self.align_corners = align_corners
self.groups = 1
self.expand = expand
out_features = features
if self.expand:
out_features = features // 2
self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
self.resConfUnit2 = ResidualConvUnit(features, activation, bn)
self.skip_add = nn.quantized.FloatFunctional()
self.size = size
def forward(self, *xs, size=None):
"""Forward pass.
Returns:
tensor: output
"""
output = xs[0]
if len(xs) == 2:
res = self.resConfUnit1(xs[1])
output = self.skip_add.add(output, res)
output = self.resConfUnit2(output)
if (size is None) and (self.size is None):
modifier = {"scale_factor": 2}
elif size is None:
modifier = {"size": self.size}
else:
modifier = {"size": size}
output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners)
output = self.out_conv(output)
return output

View File

@@ -1,183 +0,0 @@
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
from invokeai.backend.image_util.depth_anything.model.blocks import FeatureFusionBlock, _make_scratch
torchhub_path = Path(__file__).parent.parent / "torchhub"
def _make_fusion_block(features, use_bn, size=None):
return FeatureFusionBlock(
features,
nn.ReLU(False),
deconv=False,
bn=use_bn,
expand=False,
align_corners=True,
size=size,
)
class DPTHead(nn.Module):
def __init__(self, nclass, in_channels, features, out_channels, use_bn=False, use_clstoken=False):
super(DPTHead, self).__init__()
self.nclass = nclass
self.use_clstoken = use_clstoken
self.projects = nn.ModuleList(
[
nn.Conv2d(
in_channels=in_channels,
out_channels=out_channel,
kernel_size=1,
stride=1,
padding=0,
)
for out_channel in out_channels
]
)
self.resize_layers = nn.ModuleList(
[
nn.ConvTranspose2d(
in_channels=out_channels[0], out_channels=out_channels[0], kernel_size=4, stride=4, padding=0
),
nn.ConvTranspose2d(
in_channels=out_channels[1], out_channels=out_channels[1], kernel_size=2, stride=2, padding=0
),
nn.Identity(),
nn.Conv2d(
in_channels=out_channels[3], out_channels=out_channels[3], kernel_size=3, stride=2, padding=1
),
]
)
if use_clstoken:
self.readout_projects = nn.ModuleList()
for _ in range(len(self.projects)):
self.readout_projects.append(nn.Sequential(nn.Linear(2 * in_channels, in_channels), nn.GELU()))
self.scratch = _make_scratch(
out_channels,
features,
groups=1,
expand=False,
)
self.scratch.stem_transpose = None
self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
head_features_1 = features
head_features_2 = 32
if nclass > 1:
self.scratch.output_conv = nn.Sequential(
nn.Conv2d(head_features_1, head_features_1, kernel_size=3, stride=1, padding=1),
nn.ReLU(True),
nn.Conv2d(head_features_1, nclass, kernel_size=1, stride=1, padding=0),
)
else:
self.scratch.output_conv1 = nn.Conv2d(
head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1
)
self.scratch.output_conv2 = nn.Sequential(
nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
nn.ReLU(True),
nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
nn.ReLU(True),
nn.Identity(),
)
def forward(self, out_features, patch_h, patch_w):
out = []
for i, x in enumerate(out_features):
if self.use_clstoken:
x, cls_token = x[0], x[1]
readout = cls_token.unsqueeze(1).expand_as(x)
x = self.readout_projects[i](torch.cat((x, readout), -1))
else:
x = x[0]
x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))
x = self.projects[i](x)
x = self.resize_layers[i](x)
out.append(x)
layer_1, layer_2, layer_3, layer_4 = out
layer_1_rn = self.scratch.layer1_rn(layer_1)
layer_2_rn = self.scratch.layer2_rn(layer_2)
layer_3_rn = self.scratch.layer3_rn(layer_3)
layer_4_rn = self.scratch.layer4_rn(layer_4)
path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
out = self.scratch.output_conv1(path_1)
out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True)
out = self.scratch.output_conv2(out)
return out
class DPT_DINOv2(nn.Module):
def __init__(
self,
features,
out_channels,
encoder="vitl",
use_bn=False,
use_clstoken=False,
):
super(DPT_DINOv2, self).__init__()
assert encoder in ["vits", "vitb", "vitl"]
# # in case the Internet connection is not stable, please load the DINOv2 locally
# if use_local:
# self.pretrained = torch.hub.load(
# torchhub_path / "facebookresearch_dinov2_main",
# "dinov2_{:}14".format(encoder),
# source="local",
# pretrained=False,
# )
# else:
# self.pretrained = torch.hub.load(
# "facebookresearch/dinov2",
# "dinov2_{:}14".format(encoder),
# )
self.pretrained = torch.hub.load(
"facebookresearch/dinov2",
"dinov2_{:}14".format(encoder),
)
dim = self.pretrained.blocks[0].attn.qkv.in_features
self.depth_head = DPTHead(1, dim, features, out_channels=out_channels, use_bn=use_bn, use_clstoken=use_clstoken)
def forward(self, x):
h, w = x.shape[-2:]
features = self.pretrained.get_intermediate_layers(x, 4, return_class_token=True)
patch_h, patch_w = h // 14, w // 14
depth = self.depth_head(features, patch_h, patch_w)
depth = F.interpolate(depth, size=(h, w), mode="bilinear", align_corners=True)
depth = F.relu(depth)
return depth.squeeze(1)

View File

@@ -1,227 +0,0 @@
import math
import cv2
import numpy as np
import torch
import torch.nn.functional as F
def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
"""Rezise the sample to ensure the given size. Keeps aspect ratio.
Args:
sample (dict): sample
size (tuple): image size
Returns:
tuple: new size
"""
shape = list(sample["disparity"].shape)
if shape[0] >= size[0] and shape[1] >= size[1]:
return sample
scale = [0, 0]
scale[0] = size[0] / shape[0]
scale[1] = size[1] / shape[1]
scale = max(scale)
shape[0] = math.ceil(scale * shape[0])
shape[1] = math.ceil(scale * shape[1])
# resize
sample["image"] = cv2.resize(sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method)
sample["disparity"] = cv2.resize(sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST)
sample["mask"] = cv2.resize(
sample["mask"].astype(np.float32),
tuple(shape[::-1]),
interpolation=cv2.INTER_NEAREST,
)
sample["mask"] = sample["mask"].astype(bool)
return tuple(shape)
class Resize(object):
"""Resize sample to given size (width, height)."""
def __init__(
self,
width,
height,
resize_target=True,
keep_aspect_ratio=False,
ensure_multiple_of=1,
resize_method="lower_bound",
image_interpolation_method=cv2.INTER_AREA,
):
"""Init.
Args:
width (int): desired output width
height (int): desired output height
resize_target (bool, optional):
True: Resize the full sample (image, mask, target).
False: Resize image only.
Defaults to True.
keep_aspect_ratio (bool, optional):
True: Keep the aspect ratio of the input sample.
Output sample might not have the given width and height, and
resize behaviour depends on the parameter 'resize_method'.
Defaults to False.
ensure_multiple_of (int, optional):
Output width and height is constrained to be multiple of this parameter.
Defaults to 1.
resize_method (str, optional):
"lower_bound": Output will be at least as large as the given size.
"upper_bound": Output will be at max as large as the given size. (Output size might be smaller
than given size.)
"minimal": Scale as least as possible. (Output size might be smaller than given size.)
Defaults to "lower_bound".
"""
self.__width = width
self.__height = height
self.__resize_target = resize_target
self.__keep_aspect_ratio = keep_aspect_ratio
self.__multiple_of = ensure_multiple_of
self.__resize_method = resize_method
self.__image_interpolation_method = image_interpolation_method
def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
if max_val is not None and y > max_val:
y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
if y < min_val:
y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
return y
def get_size(self, width, height):
# determine new height and width
scale_height = self.__height / height
scale_width = self.__width / width
if self.__keep_aspect_ratio:
if self.__resize_method == "lower_bound":
# scale such that output size is lower bound
if scale_width > scale_height:
# fit width
scale_height = scale_width
else:
# fit height
scale_width = scale_height
elif self.__resize_method == "upper_bound":
# scale such that output size is upper bound
if scale_width < scale_height:
# fit width
scale_height = scale_width
else:
# fit height
scale_width = scale_height
elif self.__resize_method == "minimal":
# scale as little as possible
if abs(1 - scale_width) < abs(1 - scale_height):
# fit width
scale_height = scale_width
else:
# fit height
scale_width = scale_height
else:
raise ValueError(f"resize_method {self.__resize_method} not implemented")
if self.__resize_method == "lower_bound":
new_height = self.constrain_to_multiple_of(scale_height * height, min_val=self.__height)
new_width = self.constrain_to_multiple_of(scale_width * width, min_val=self.__width)
elif self.__resize_method == "upper_bound":
new_height = self.constrain_to_multiple_of(scale_height * height, max_val=self.__height)
new_width = self.constrain_to_multiple_of(scale_width * width, max_val=self.__width)
elif self.__resize_method == "minimal":
new_height = self.constrain_to_multiple_of(scale_height * height)
new_width = self.constrain_to_multiple_of(scale_width * width)
else:
raise ValueError(f"resize_method {self.__resize_method} not implemented")
return (new_width, new_height)
def __call__(self, sample):
width, height = self.get_size(sample["image"].shape[1], sample["image"].shape[0])
# resize sample
sample["image"] = cv2.resize(
sample["image"],
(width, height),
interpolation=self.__image_interpolation_method,
)
if self.__resize_target:
if "disparity" in sample:
sample["disparity"] = cv2.resize(
sample["disparity"],
(width, height),
interpolation=cv2.INTER_NEAREST,
)
if "depth" in sample:
sample["depth"] = cv2.resize(sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST)
if "semseg_mask" in sample:
# sample["semseg_mask"] = cv2.resize(
# sample["semseg_mask"], (width, height), interpolation=cv2.INTER_NEAREST
# )
sample["semseg_mask"] = F.interpolate(
torch.from_numpy(sample["semseg_mask"]).float()[None, None, ...], (height, width), mode="nearest"
).numpy()[0, 0]
if "mask" in sample:
sample["mask"] = cv2.resize(
sample["mask"].astype(np.float32),
(width, height),
interpolation=cv2.INTER_NEAREST,
)
# sample["mask"] = sample["mask"].astype(bool)
# print(sample['image'].shape, sample['depth'].shape)
return sample
class NormalizeImage(object):
"""Normlize image by given mean and std."""
def __init__(self, mean, std):
self.__mean = mean
self.__std = std
def __call__(self, sample):
sample["image"] = (sample["image"] - self.__mean) / self.__std
return sample
class PrepareForNet(object):
"""Prepare sample for usage as network input."""
def __init__(self):
pass
def __call__(self, sample):
image = np.transpose(sample["image"], (2, 0, 1))
sample["image"] = np.ascontiguousarray(image).astype(np.float32)
if "mask" in sample:
sample["mask"] = sample["mask"].astype(np.float32)
sample["mask"] = np.ascontiguousarray(sample["mask"])
if "depth" in sample:
depth = sample["depth"].astype(np.float32)
sample["depth"] = np.ascontiguousarray(depth)
if "semseg_mask" in sample:
sample["semseg_mask"] = sample["semseg_mask"].astype(np.float32)
sample["semseg_mask"] = np.ascontiguousarray(sample["semseg_mask"])
return sample

View File

@@ -11,6 +11,7 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.schedulers.scheduling_utils import SchedulerMixin
from transformers import CLIPTokenizer
from invokeai.backend.image_util.depth_anything.depth_anything_pipeline import DepthAnythingPipeline
from invokeai.backend.image_util.grounding_dino.grounding_dino_pipeline import GroundingDinoPipeline
from invokeai.backend.image_util.segment_anything.segment_anything_pipeline import SegmentAnythingPipeline
from invokeai.backend.ip_adapter.ip_adapter import IPAdapter
@@ -45,6 +46,7 @@ def calc_model_size_by_data(logger: logging.Logger, model: AnyModel) -> int:
SpandrelImageToImageModel,
GroundingDinoPipeline,
SegmentAnythingPipeline,
DepthAnythingPipeline,
),
):
return model.calc_size()

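Here, `calc_module_size` is expected to estimate the in-memory footprint of the wrapped `transformers` model for the Model Manager's cache accounting. A minimal sketch of that kind of estimate, assuming it sums parameter and buffer storage (an illustration, not the actual helper's implementation):

```python
import torch

def estimate_module_size_bytes(module: torch.nn.Module) -> int:
    """Rough footprint: total bytes held by parameters and buffers."""
    size = sum(p.numel() * p.element_size() for p in module.parameters())
    size += sum(b.numel() * b.element_size() for b in module.buffers())
    return size
```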
View File

@@ -200,6 +200,7 @@
"delete": "Delete",
"depthAnything": "Depth Anything",
"depthAnythingDescription": "Depth map generation using the Depth Anything technique",
"depthAnythingSmallV2": "Small V2",
"depthMidas": "Depth (Midas)",
"depthMidasDescription": "Depth map generation using Midas",
"depthZoe": "Depth (Zoe)",

View File

@@ -42,6 +42,7 @@ const DepthAnythingProcessor = (props: Props) => {
const options: { label: string; value: DepthAnythingModelSize }[] = useMemo(
() => [
{ label: t('controlnet.depthAnythingSmallV2'), value: 'small_v2' },
{ label: t('controlnet.small'), value: 'small' },
{ label: t('controlnet.base'), value: 'base' },
{ label: t('controlnet.large'), value: 'large' },

View File

@@ -94,7 +94,7 @@ export const CONTROLNET_PROCESSORS: ControlNetProcessorsDict = {
buildDefaults: (baseModel?: BaseModelType) => ({
id: 'depth_anything_image_processor',
type: 'depth_anything_image_processor',
model_size: 'small',
model_size: 'small_v2',
resolution: baseModel === 'sdxl' ? 1024 : 512,
}),
},

View File

@@ -84,7 +84,7 @@ export type RequiredDepthAnythingImageProcessorInvocation = O.Required<
'type' | 'model_size' | 'resolution' | 'offload'
>;
const zDepthAnythingModelSize = z.enum(['large', 'base', 'small']);
const zDepthAnythingModelSize = z.enum(['large', 'base', 'small', 'small_v2']);
export type DepthAnythingModelSize = z.infer<typeof zDepthAnythingModelSize>;
export const isDepthAnythingModelSize = (v: unknown): v is DepthAnythingModelSize =>
zDepthAnythingModelSize.safeParse(v).success;

View File

@@ -24,6 +24,7 @@ export const DepthAnythingProcessor = memo(({ onChange, config }: Props) => {
const options: { label: string; value: DepthAnythingModelSize }[] = useMemo(
() => [
{ label: t('controlnet.depthAnythingSmallV2'), value: 'small_v2' },
{ label: t('controlnet.small'), value: 'small' },
{ label: t('controlnet.base'), value: 'base' },
{ label: t('controlnet.large'), value: 'large' },

View File

@@ -36,7 +36,7 @@ const zContentShuffleProcessorConfig = z.object({
});
export type ContentShuffleProcessorConfig = z.infer<typeof zContentShuffleProcessorConfig>;
const zDepthAnythingModelSize = z.enum(['large', 'base', 'small']);
const zDepthAnythingModelSize = z.enum(['large', 'base', 'small', 'small_v2']);
export type DepthAnythingModelSize = z.infer<typeof zDepthAnythingModelSize>;
export const isDepthAnythingModelSize = (v: unknown): v is DepthAnythingModelSize =>
zDepthAnythingModelSize.safeParse(v).success;
@@ -298,7 +298,7 @@ export const CA_PROCESSOR_DATA: CAProcessorsData = {
buildDefaults: () => ({
id: 'depth_anything_image_processor',
type: 'depth_anything_image_processor',
model_size: 'small',
model_size: 'small_v2',
}),
buildNode: (image, config) => ({
...config,

View File

@@ -3679,10 +3679,10 @@ export type components = {
/**
* Model Size
* @description The size of the depth model to use
* @default small
* @default small_v2
* @enum {string}
*/
model_size?: "large" | "base" | "small";
model_size?: "large" | "base" | "small" | "small_v2";
/**
* Resolution
* @description Pixel resolution for output image
@@ -7306,147 +7306,147 @@
project_id: string | null;
};
InvocationOutputMap: {
dynamic_prompt: components["schemas"]["StringCollectionOutput"];
img_ilerp: components["schemas"]["ImageOutput"];
random_range: components["schemas"]["IntegerCollectionOutput"];
ideal_size: components["schemas"]["IdealSizeOutput"];
lblend: components["schemas"]["LatentsOutput"];
tiled_multi_diffusion_denoise_latents: components["schemas"]["LatentsOutput"];
color_correct: components["schemas"]["ImageOutput"];
sdxl_model_loader: components["schemas"]["SDXLModelLoaderOutput"];
img_hue_adjust: components["schemas"]["ImageOutput"];
depth_anything_image_processor: components["schemas"]["ImageOutput"];
lscale: components["schemas"]["LatentsOutput"];
infill_patchmatch: components["schemas"]["ImageOutput"];
mask_edge: components["schemas"]["ImageOutput"];
spandrel_image_to_image: components["schemas"]["ImageOutput"];
round_float: components["schemas"]["FloatOutput"];
color: components["schemas"]["ColorOutput"];
img_channel_offset: components["schemas"]["ImageOutput"];
create_denoise_mask: components["schemas"]["DenoiseMaskOutput"];
string_join_three: components["schemas"]["StringOutput"];
unsharp_mask: components["schemas"]["ImageOutput"];
canvas_paste_back: components["schemas"]["ImageOutput"];
string_collection: components["schemas"]["StringCollectionOutput"];
face_mask_detection: components["schemas"]["FaceMaskOutput"];
color_map_image_processor: components["schemas"]["ImageOutput"];
normalbae_image_processor: components["schemas"]["ImageOutput"];
create_gradient_mask: components["schemas"]["GradientMaskOutput"];
vae_loader: components["schemas"]["VAEOutput"];
integer: components["schemas"]["IntegerOutput"];
image_mask_to_tensor: components["schemas"]["MaskOutput"];
esrgan: components["schemas"]["ImageOutput"];
mlsd_image_processor: components["schemas"]["ImageOutput"];
blank_image: components["schemas"]["ImageOutput"];
zoe_depth_image_processor: components["schemas"]["ImageOutput"];
merge_metadata: components["schemas"]["MetadataOutput"];
infill_rgba: components["schemas"]["ImageOutput"];
mask_from_id: components["schemas"]["ImageOutput"];
string_split: components["schemas"]["String2Output"];
boolean_collection: components["schemas"]["BooleanCollectionOutput"];
img_resize: components["schemas"]["ImageOutput"];
float_collection: components["schemas"]["FloatCollectionOutput"];
img_conv: components["schemas"]["ImageOutput"];
boolean: components["schemas"]["BooleanOutput"];
mul: components["schemas"]["IntegerOutput"];
img_watermark: components["schemas"]["ImageOutput"];
heuristic_resize: components["schemas"]["ImageOutput"];
tile_image_processor: components["schemas"]["ImageOutput"];
face_identifier: components["schemas"]["ImageOutput"];
denoise_latents: components["schemas"]["LatentsOutput"];
img_lerp: components["schemas"]["ImageOutput"];
pair_tile_image: components["schemas"]["PairTileImageOutput"];
add: components["schemas"]["IntegerOutput"];
range_of_size: components["schemas"]["IntegerCollectionOutput"];
img_scale: components["schemas"]["ImageOutput"];
latents_collection: components["schemas"]["LatentsCollectionOutput"];
range: components["schemas"]["IntegerCollectionOutput"];
prompt_from_file: components["schemas"]["StringCollectionOutput"];
sdxl_refiner_model_loader: components["schemas"]["SDXLRefinerModelLoaderOutput"];
canvas_paste_back: components["schemas"]["ImageOutput"];
mul: components["schemas"]["IntegerOutput"];
infill_rgba: components["schemas"]["ImageOutput"];
metadata_item: components["schemas"]["MetadataItemOutput"];
canny_image_processor: components["schemas"]["ImageOutput"];
float_math: components["schemas"]["FloatOutput"];
alpha_mask_to_tensor: components["schemas"]["MaskOutput"];
lineart_anime_image_processor: components["schemas"]["ImageOutput"];
string: components["schemas"]["StringOutput"];
merge_tiles_to_image: components["schemas"]["ImageOutput"];
img_ilerp: components["schemas"]["ImageOutput"];
sdxl_lora_loader: components["schemas"]["SDXLLoRALoaderOutput"];
t2i_adapter: components["schemas"]["T2IAdapterOutput"];
face_mask_detection: components["schemas"]["FaceMaskOutput"];
img_chan: components["schemas"]["ImageOutput"];
img_watermark: components["schemas"]["ImageOutput"];
l2i: components["schemas"]["ImageOutput"];
create_gradient_mask: components["schemas"]["GradientMaskOutput"];
div: components["schemas"]["IntegerOutput"];
i2l: components["schemas"]["LatentsOutput"];
integer: components["schemas"]["IntegerOutput"];
sdxl_refiner_compel_prompt: components["schemas"]["ConditioningOutput"];
spandrel_image_to_image_autoscale: components["schemas"]["ImageOutput"];
iterate: components["schemas"]["IterateInvocationOutput"];
img_resize: components["schemas"]["ImageOutput"];
mask_edge: components["schemas"]["ImageOutput"];
core_metadata: components["schemas"]["MetadataOutput"];
lresize: components["schemas"]["LatentsOutput"];
lblend: components["schemas"]["LatentsOutput"];
model_identifier: components["schemas"]["ModelIdentifierOutput"];
infill_tile: components["schemas"]["ImageOutput"];
controlnet: components["schemas"]["ControlOutput"];
dw_openpose_image_processor: components["schemas"]["ImageOutput"];
img_chan: components["schemas"]["ImageOutput"];
conditioning_collection: components["schemas"]["ConditioningCollectionOutput"];
lresize: components["schemas"]["LatentsOutput"];
sdxl_refiner_compel_prompt: components["schemas"]["ConditioningOutput"];
noise: components["schemas"]["NoiseOutput"];
ip_adapter: components["schemas"]["IPAdapterOutput"];
sdxl_compel_prompt: components["schemas"]["ConditioningOutput"];
sub: components["schemas"]["IntegerOutput"];
seamless: components["schemas"]["SeamlessModeOutput"];
div: components["schemas"]["IntegerOutput"];
img_channel_multiply: components["schemas"]["ImageOutput"];
compel: components["schemas"]["ConditioningOutput"];
freeu: components["schemas"]["UNetOutput"];
conditioning: components["schemas"]["ConditioningOutput"];
rand_int: components["schemas"]["IntegerOutput"];
rectangle_mask: components["schemas"]["MaskOutput"];
string_split_neg: components["schemas"]["StringPosNegOutput"];
collect: components["schemas"]["CollectInvocationOutput"];
float_range: components["schemas"]["FloatCollectionOutput"];
string_join: components["schemas"]["StringOutput"];
metadata_item: components["schemas"]["MetadataItemOutput"];
l2i: components["schemas"]["ImageOutput"];
img_paste: components["schemas"]["ImageOutput"];
calculate_image_tiles_even_split: components["schemas"]["CalculateImageTilesOutput"];
calculate_image_tiles_min_overlap: components["schemas"]["CalculateImageTilesOutput"];
show_image: components["schemas"]["ImageOutput"];
invert_tensor_mask: components["schemas"]["MaskOutput"];
spandrel_image_to_image_autoscale: components["schemas"]["ImageOutput"];
face_off: components["schemas"]["FaceOffOutput"];
image_collection: components["schemas"]["ImageCollectionOutput"];
i2l: components["schemas"]["LatentsOutput"];
mediapipe_face_processor: components["schemas"]["ImageOutput"];
scheduler: components["schemas"]["SchedulerOutput"];
content_shuffle_image_processor: components["schemas"]["ImageOutput"];
lineart_anime_image_processor: components["schemas"]["ImageOutput"];
float_to_int: components["schemas"]["IntegerOutput"];
pidi_image_processor: components["schemas"]["ImageOutput"];
leres_image_processor: components["schemas"]["ImageOutput"];
lora_selector: components["schemas"]["LoRASelectorOutput"];
depth_anything_image_processor: components["schemas"]["ImageOutput"];
img_pad_crop: components["schemas"]["ImageOutput"];
calculate_image_tiles: components["schemas"]["CalculateImageTilesOutput"];
prompt_from_file: components["schemas"]["StringCollectionOutput"];
t2i_adapter: components["schemas"]["T2IAdapterOutput"];
lora_collection_loader: components["schemas"]["LoRALoaderOutput"];
img_crop: components["schemas"]["ImageOutput"];
img_mul: components["schemas"]["ImageOutput"];
sdxl_refiner_model_loader: components["schemas"]["SDXLRefinerModelLoaderOutput"];
tomask: components["schemas"]["ImageOutput"];
tile_to_properties: components["schemas"]["TileToPropertiesOutput"];
string_replace: components["schemas"]["StringOutput"];
merge_tiles_to_image: components["schemas"]["ImageOutput"];
integer_collection: components["schemas"]["IntegerCollectionOutput"];
crop_latents: components["schemas"]["LatentsOutput"];
lineart_image_processor: components["schemas"]["ImageOutput"];
midas_depth_image_processor: components["schemas"]["ImageOutput"];
segment_anything_processor: components["schemas"]["ImageOutput"];
save_image: components["schemas"]["ImageOutput"];
infill_cv2: components["schemas"]["ImageOutput"];
cv_inpaint: components["schemas"]["ImageOutput"];
lora_loader: components["schemas"]["LoRALoaderOutput"];
latents: components["schemas"]["LatentsOutput"];
model_identifier: components["schemas"]["ModelIdentifierOutput"];
step_param_easing: components["schemas"]["FloatCollectionOutput"];
metadata: components["schemas"]["MetadataOutput"];
range: components["schemas"]["IntegerCollectionOutput"];
img_blur: components["schemas"]["ImageOutput"];
core_metadata: components["schemas"]["MetadataOutput"];
mask_combine: components["schemas"]["ImageOutput"];
sdxl_lora_collection_loader: components["schemas"]["SDXLLoRALoaderOutput"];
infill_lama: components["schemas"]["ImageOutput"];
iterate: components["schemas"]["IterateInvocationOutput"];
hed_image_processor: components["schemas"]["ImageOutput"];
image_mask_to_tensor: components["schemas"]["MaskOutput"];
merge_metadata: components["schemas"]["MetadataOutput"];
tile_image_processor: components["schemas"]["ImageOutput"];
mask_from_id: components["schemas"]["ImageOutput"];
pidi_image_processor: components["schemas"]["ImageOutput"];
img_mul: components["schemas"]["ImageOutput"];
cv_inpaint: components["schemas"]["ImageOutput"];
latents: components["schemas"]["LatentsOutput"];
conditioning: components["schemas"]["ConditioningOutput"];
img_channel_multiply: components["schemas"]["ImageOutput"];
seamless: components["schemas"]["SeamlessModeOutput"];
string_collection: components["schemas"]["StringCollectionOutput"];
img_crop: components["schemas"]["ImageOutput"];
lora_selector: components["schemas"]["LoRASelectorOutput"];
infill_patchmatch: components["schemas"]["ImageOutput"];
tiled_multi_diffusion_denoise_latents: components["schemas"]["LatentsOutput"];
img_hue_adjust: components["schemas"]["ImageOutput"];
sub: components["schemas"]["IntegerOutput"];
content_shuffle_image_processor: components["schemas"]["ImageOutput"];
boolean_collection: components["schemas"]["BooleanCollectionOutput"];
img_pad_crop: components["schemas"]["ImageOutput"];
latents_collection: components["schemas"]["LatentsCollectionOutput"];
segment_anything_processor: components["schemas"]["ImageOutput"];
dw_openpose_image_processor: components["schemas"]["ImageOutput"];
calculate_image_tiles: components["schemas"]["CalculateImageTilesOutput"];
tomask: components["schemas"]["ImageOutput"];
img_nsfw: components["schemas"]["ImageOutput"];
main_model_loader: components["schemas"]["ModelLoaderOutput"];
string: components["schemas"]["StringOutput"];
sdxl_lora_loader: components["schemas"]["SDXLLoRALoaderOutput"];
infill_lama: components["schemas"]["ImageOutput"];
midas_depth_image_processor: components["schemas"]["ImageOutput"];
image: components["schemas"]["ImageOutput"];
rand_float: components["schemas"]["FloatOutput"];
float: components["schemas"]["FloatOutput"];
alpha_mask_to_tensor: components["schemas"]["MaskOutput"];
img_blur: components["schemas"]["ImageOutput"];
rectangle_mask: components["schemas"]["MaskOutput"];
lora_loader: components["schemas"]["LoRALoaderOutput"];
mediapipe_face_processor: components["schemas"]["ImageOutput"];
create_denoise_mask: components["schemas"]["DenoiseMaskOutput"];
integer_math: components["schemas"]["IntegerOutput"];
metadata: components["schemas"]["MetadataOutput"];
img_lerp: components["schemas"]["ImageOutput"];
color_correct: components["schemas"]["ImageOutput"];
rand_int: components["schemas"]["IntegerOutput"];
clip_skip: components["schemas"]["CLIPSkipInvocationOutput"];
heuristic_resize: components["schemas"]["ImageOutput"];
float_collection: components["schemas"]["FloatCollectionOutput"];
crop_latents: components["schemas"]["LatentsOutput"];
step_param_easing: components["schemas"]["FloatCollectionOutput"];
calculate_image_tiles_min_overlap: components["schemas"]["CalculateImageTilesOutput"];
noise: components["schemas"]["NoiseOutput"];
sdxl_model_loader: components["schemas"]["SDXLModelLoaderOutput"];
collect: components["schemas"]["CollectInvocationOutput"];
vae_loader: components["schemas"]["VAEOutput"];
color_map_image_processor: components["schemas"]["ImageOutput"];
show_image: components["schemas"]["ImageOutput"];
face_off: components["schemas"]["FaceOffOutput"];
float: components["schemas"]["FloatOutput"];
denoise_latents: components["schemas"]["LatentsOutput"];
string_join_three: components["schemas"]["StringOutput"];
img_scale: components["schemas"]["ImageOutput"];
zoe_depth_image_processor: components["schemas"]["ImageOutput"];
esrgan: components["schemas"]["ImageOutput"];
main_model_loader: components["schemas"]["ModelLoaderOutput"];
infill_cv2: components["schemas"]["ImageOutput"];
mlsd_image_processor: components["schemas"]["ImageOutput"];
boolean: components["schemas"]["BooleanOutput"];
image_collection: components["schemas"]["ImageCollectionOutput"];
random_range: components["schemas"]["IntegerCollectionOutput"];
save_image: components["schemas"]["ImageOutput"];
string_split: components["schemas"]["String2Output"];
tile_to_properties: components["schemas"]["TileToPropertiesOutput"];
float_to_int: components["schemas"]["IntegerOutput"];
sdxl_lora_collection_loader: components["schemas"]["SDXLLoRALoaderOutput"];
conditioning_collection: components["schemas"]["ConditioningCollectionOutput"];
string_join: components["schemas"]["StringOutput"];
range_of_size: components["schemas"]["IntegerCollectionOutput"];
float_range: components["schemas"]["FloatCollectionOutput"];
integer_collection: components["schemas"]["IntegerCollectionOutput"];
scheduler: components["schemas"]["SchedulerOutput"];
ideal_size: components["schemas"]["IdealSizeOutput"];
freeu: components["schemas"]["UNetOutput"];
lineart_image_processor: components["schemas"]["ImageOutput"];
string_split_neg: components["schemas"]["StringPosNegOutput"];
rand_float: components["schemas"]["FloatOutput"];
ip_adapter: components["schemas"]["IPAdapterOutput"];
calculate_image_tiles_even_split: components["schemas"]["CalculateImageTilesOutput"];
pair_tile_image: components["schemas"]["PairTileImageOutput"];
round_float: components["schemas"]["FloatOutput"];
spandrel_image_to_image: components["schemas"]["ImageOutput"];
img_paste: components["schemas"]["ImageOutput"];
compel: components["schemas"]["ConditioningOutput"];
leres_image_processor: components["schemas"]["ImageOutput"];
dynamic_prompt: components["schemas"]["StringCollectionOutput"];
unsharp_mask: components["schemas"]["ImageOutput"];
color: components["schemas"]["ColorOutput"];
img_channel_offset: components["schemas"]["ImageOutput"];
sdxl_compel_prompt: components["schemas"]["ConditioningOutput"];
hed_image_processor: components["schemas"]["ImageOutput"];
invert_tensor_mask: components["schemas"]["MaskOutput"];
string_replace: components["schemas"]["StringOutput"];
blank_image: components["schemas"]["ImageOutput"];
lora_collection_loader: components["schemas"]["LoRALoaderOutput"];
normalbae_image_processor: components["schemas"]["ImageOutput"];
face_identifier: components["schemas"]["ImageOutput"];
img_conv: components["schemas"]["ImageOutput"];
float_math: components["schemas"]["FloatOutput"];
};
/**
* InvocationStartedEvent