diff --git a/invokeai/backend/image_util/depth_anything/v2/dinov2.py b/invokeai/backend/image_util/depth_anything/v2/dinov2.py
index 3fd3be9f3f..da460924f6 100644
--- a/invokeai/backend/image_util/depth_anything/v2/dinov2.py
+++ b/invokeai/backend/image_util/depth_anything/v2/dinov2.py
@@ -7,7 +7,7 @@
 # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
 # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
 
-import logging
+
 import math
 from functools import partial
 from typing import Callable, Sequence, Tuple, Union
@@ -17,11 +17,8 @@ import torch.nn as nn
 import torch.utils.checkpoint
 from torch.nn.init import trunc_normal_
 
-from .dinov2_layers import MemEffAttention, Mlp
-from .dinov2_layers import NestedTensorBlock as Block
-from .dinov2_layers import PatchEmbed, SwiGLUFFNFused
-
-logger = logging.getLogger("dinov2")
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers import MemEffAttention, Mlp, PatchEmbed, SwiGLUFFNFused
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers import NestedTensorBlock as Block
 
 
 def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
@@ -120,13 +117,10 @@ class DinoVisionTransformer(nn.Module):
         dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
 
         if ffn_layer == "mlp":
-            logger.info("using MLP layer as FFN")
             ffn_layer = Mlp
         elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
-            logger.info("using SwiGLU layer as FFN")
             ffn_layer = SwiGLUFFNFused
         elif ffn_layer == "identity":
-            logger.info("using Identity layer as FFN")
 
             def f(*args, **kwargs):
                 return nn.Identity()
@@ -232,13 +226,13 @@ class DinoVisionTransformer(nn.Module):
         return x
 
     def forward_features_list(self, x_list, masks_list):
-        x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
+        x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list, strict=False)]
         for blk in self.blocks:
             x = blk(x)
 
         all_x = x
         output = []
-        for x, masks in zip(all_x, masks_list):
+        for x, masks in zip(all_x, masks_list, strict=False):
             x_norm = self.norm(x)
             output.append(
                 {
@@ -301,7 +295,7 @@ class DinoVisionTransformer(nn.Module):
         n: Union[int, Sequence] = 1,  # Layers or n last layers to take
         reshape: bool = False,
         return_class_token: bool = False,
-        norm=True,
+        norm: bool = True,
     ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
         if self.chunked_blocks:
             outputs = self._get_intermediate_layers_chunked(x, n)
@@ -318,7 +312,7 @@ class DinoVisionTransformer(nn.Module):
                 for out in outputs
             ]
         if return_class_token:
-            return tuple(zip(outputs, class_tokens))
+            return tuple(zip(outputs, class_tokens, strict=False))
         return tuple(outputs)
 
     def forward(self, *args, is_training=False, **kwargs):
diff --git a/invokeai/backend/image_util/depth_anything/v2/dinov2_layers/__init__.py b/invokeai/backend/image_util/depth_anything/v2/dinov2_layers/__init__.py
index b3ee306279..3980163fea 100644
--- a/invokeai/backend/image_util/depth_anything/v2/dinov2_layers/__init__.py
+++ b/invokeai/backend/image_util/depth_anything/v2/dinov2_layers/__init__.py
@@ -4,8 +4,9 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-from .attention import MemEffAttention
-from .block import NestedTensorBlock
-from .mlp import Mlp
-from .patch_embed import PatchEmbed
-from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
+
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.attention import MemEffAttention  # noqa
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.block import NestedTensorBlock  # noqa
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.mlp import Mlp  # noqa
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.patch_embed import PatchEmbed  # noqa
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused  # noqa
diff --git a/invokeai/backend/image_util/depth_anything/v2/dinov2_layers/attention.py b/invokeai/backend/image_util/depth_anything/v2/dinov2_layers/attention.py
index 61d7c1cb94..d2a5e5b96a 100644
--- a/invokeai/backend/image_util/depth_anything/v2/dinov2_layers/attention.py
+++ b/invokeai/backend/image_util/depth_anything/v2/dinov2_layers/attention.py
@@ -8,19 +8,16 @@
 # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
 # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
 
-import logging
+# Referenced from: https://github.com/DepthAnything/Depth-Anything-V2
+
 
 from torch import Tensor, nn
 
-logger = logging.getLogger("dinov2")
-
-
 try:
-    from xformers.ops import fmha, memory_efficient_attention, unbind
+    from xformers.ops import memory_efficient_attention, unbind
 
     XFORMERS_AVAILABLE = True
 except ImportError:
-    logger.warning("xFormers not available")
 
     XFORMERS_AVAILABLE = False
 
diff --git a/invokeai/backend/image_util/depth_anything/v2/dinov2_layers/block.py b/invokeai/backend/image_util/depth_anything/v2/dinov2_layers/block.py
index a218a6918d..825dbfb86f 100644
--- a/invokeai/backend/image_util/depth_anything/v2/dinov2_layers/block.py
+++ b/invokeai/backend/image_util/depth_anything/v2/dinov2_layers/block.py
@@ -8,26 +8,22 @@
 # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
 # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
 
-import logging
+
 from typing import Any, Callable, Dict, List, Tuple
 
 import torch
 from torch import Tensor, nn
 
-from .attention import Attention, MemEffAttention
-from .drop_path import DropPath
-from .layer_scale import LayerScale
-from .mlp import Mlp
-
-logger = logging.getLogger("dinov2")
-
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.attention import Attention, MemEffAttention
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.drop_path import DropPath
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.layer_scale import LayerScale
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.mlp import Mlp
 
 try:
     from xformers.ops import fmha, index_select_cat, scaled_index_add
 
     XFORMERS_AVAILABLE = True
 except ImportError:
-    logger.warning("xFormers not available")
 
     XFORMERS_AVAILABLE = False
 
@@ -157,10 +153,10 @@ def get_attn_bias_and_cat(x_list, branges=None):
     this will perform the index select, cat the tensors, and provide the attn_bias from cache
     """
     batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
-    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
+    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list, strict=False))
     if all_shapes not in attn_bias_cache.keys():
         seqlens = []
-        for b, x in zip(batch_sizes, x_list):
+        for b, x in zip(batch_sizes, x_list, strict=False):
             for _ in range(b):
                 seqlens.append(x.shape[1])
         attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
@@ -194,7 +190,9 @@ def drop_add_residual_stochastic_depth_list(
     residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore
 
     outputs = []
-    for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
+    for x, brange, residual, residual_scale_factor in zip(
+        x_list, branges, residual_list, residual_scale_factors, strict=False
+    ):
         outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
     return outputs
 
diff --git a/invokeai/backend/image_util/depth_anything/v2/dpt.py b/invokeai/backend/image_util/depth_anything/v2/dpt.py
index 7e83f1d146..3427e963fa 100644
--- a/invokeai/backend/image_util/depth_anything/v2/dpt.py
+++ b/invokeai/backend/image_util/depth_anything/v2/dpt.py
@@ -1,12 +1,17 @@
+# Referenced from https://github.com/DepthAnything/Depth-Anything-V2/blob/main/depth_anything_v2/dpt.py
+
+from typing import List, Literal, Optional
+
 import cv2
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torchvision.transforms import Compose
 
-from .dinov2 import DINOv2
-from .utils.blocks import FeatureFusionBlock, _make_scratch
-from .utils.transform import NormalizeImage, PrepareForNet, Resize
+from invokeai.backend.image_util.depth_anything.v2.dinov2 import DINOv2
+from invokeai.backend.image_util.depth_anything.v2.utils.blocks import FeatureFusionBlock, _make_scratch
+from invokeai.backend.image_util.depth_anything.v2.utils.transform import NormalizeImage, PrepareForNet, Resize
 
 
 def _make_fusion_block(features, use_bn, size=None):
@@ -37,10 +42,18 @@ class ConvBlock(nn.Module):
 
 class DPTHead(nn.Module):
     def __init__(
-        self, in_channels, features=256, use_bn=False, out_channels=[256, 512, 1024, 1024], use_clstoken=False
+        self,
+        in_channels: int,
+        features: int = 256,
+        use_bn: bool = False,
+        out_channels: Optional[List[int]] = None,
+        use_clstoken: bool = False,
     ):
         super(DPTHead, self).__init__()
 
+        if out_channels is None:
+            out_channels = [256, 512, 1024, 1024]
+
         self.use_clstoken = use_clstoken
 
         self.projects = nn.ModuleList(
@@ -140,10 +153,18 @@ class DPTHead(nn.Module):
 
 class DepthAnythingV2(nn.Module):
     def __init__(
-        self, encoder="vitl", features=256, out_channels=[256, 512, 1024, 1024], use_bn=False, use_clstoken=False
+        self,
+        encoder: Literal["vits", "vitb", "vitl", "vitg"] = "vitl",
+        features: int = 256,
+        out_channels: Optional[List[int]] = None,
+        use_bn: bool = False,
+        use_clstoken: bool = False,
     ):
         super(DepthAnythingV2, self).__init__()
 
+        if out_channels is None:
+            out_channels = [256, 512, 1024, 1024]
+
         self.intermediate_layer_idx = {
             "vits": [2, 5, 8, 11],
             "vitb": [2, 5, 8, 11],
@@ -158,7 +179,7 @@ class DepthAnythingV2(nn.Module):
             self.pretrained.embed_dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken
         )
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor):
         patch_h, patch_w = x.shape[-2] // 14, x.shape[-1] // 14
 
         features = self.pretrained.get_intermediate_layers(
@@ -171,7 +192,7 @@ class DepthAnythingV2(nn.Module):
         return depth.squeeze(1)
 
     @torch.no_grad()
-    def infer_image(self, raw_image, input_size=518):
+    def infer_image(self, raw_image: np.ndarray, input_size: int = 518):
         image, (h, w) = self.image2tensor(raw_image, input_size)
 
         depth = self.forward(image)
diff --git a/invokeai/backend/image_util/depth_anything/v2/utils/blocks.py b/invokeai/backend/image_util/depth_anything/v2/utils/blocks.py
index b51f045a16..cb1161c62f 100644
--- a/invokeai/backend/image_util/depth_anything/v2/utils/blocks.py
+++ b/invokeai/backend/image_util/depth_anything/v2/utils/blocks.py
@@ -1,3 +1,5 @@
+# Referenced from: https://github.com/DepthAnything/Depth-Anything-V2/blob/main/depth_anything_v2/util/blocks.py
+
 import torch.nn as nn
 
 
@@ -53,7 +55,7 @@ class ResidualConvUnit(nn.Module):
 
         self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
 
-        if self.bn == True:
+        if self.bn:
             self.bn1 = nn.BatchNorm2d(features)
             self.bn2 = nn.BatchNorm2d(features)
 
@@ -73,12 +75,12 @@ class ResidualConvUnit(nn.Module):
 
         out = self.activation(x)
         out = self.conv1(out)
-        if self.bn == True:
+        if self.bn:
             out = self.bn1(out)
 
         out = self.activation(out)
         out = self.conv2(out)
-        if self.bn == True:
+        if self.bn:
             out = self.bn2(out)
 
         if self.groups > 1:
@@ -105,7 +107,7 @@ class FeatureFusionBlock(nn.Module):
 
         self.expand = expand
         out_features = features
-        if self.expand == True:
+        if self.expand:
             out_features = features // 2
 
         self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
diff --git a/invokeai/backend/image_util/depth_anything/v2/utils/transform.py b/invokeai/backend/image_util/depth_anything/v2/utils/transform.py
index 34dfe22df6..933ce4c913 100644
--- a/invokeai/backend/image_util/depth_anything/v2/utils/transform.py
+++ b/invokeai/backend/image_util/depth_anything/v2/utils/transform.py
@@ -1,3 +1,5 @@
+# Referenced from: https://github.com/DepthAnything/Depth-Anything-V2/blob/main/depth_anything_v2/util/transform.py
+
 import cv2
 import numpy as np
 
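For context, a minimal usage sketch of the patched DepthAnythingV2 API; the checkpoint filename, input image path, and device handling below are illustrative assumptions and are not part of this diff.

import cv2
import torch

from invokeai.backend.image_util.depth_anything.v2.dpt import DepthAnythingV2

# Sketch only: constructor defaults (features=256, out_channels=[256, 512, 1024, 1024])
# correspond to the "vitl" encoder; the checkpoint path is a placeholder.
model = DepthAnythingV2(encoder="vitl")
model.load_state_dict(torch.load("depth_anything_v2_vitl.pth", map_location="cpu"))
model.eval()

raw_image = cv2.imread("example.png")  # np.ndarray (H, W, 3), as infer_image() expects
depth = model.infer_image(raw_image, input_size=518)  # per-pixel depth map for the input image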