wip: depth_anything_v2 init lint fixes

blessedcoolant 2024-07-25 14:20:52 +05:30
parent 4f0dfbd34d
commit 4166c756ce
7 changed files with 62 additions and 47 deletions

View File: invokeai/backend/image_util/depth_anything/v2/dinov2.py

@@ -7,7 +7,7 @@
 # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
 # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
-import logging
 import math
 from functools import partial
 from typing import Callable, Sequence, Tuple, Union

@@ -17,11 +17,8 @@ import torch.nn as nn
 import torch.utils.checkpoint
 from torch.nn.init import trunc_normal_
-from .dinov2_layers import MemEffAttention, Mlp
-from .dinov2_layers import NestedTensorBlock as Block
-from .dinov2_layers import PatchEmbed, SwiGLUFFNFused
-logger = logging.getLogger("dinov2")
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers import MemEffAttention, Mlp, PatchEmbed, SwiGLUFFNFused
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers import NestedTensorBlock as Block
 def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:

@@ -120,13 +117,10 @@ class DinoVisionTransformer(nn.Module):
         dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
         if ffn_layer == "mlp":
-            logger.info("using MLP layer as FFN")
             ffn_layer = Mlp
         elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
-            logger.info("using SwiGLU layer as FFN")
             ffn_layer = SwiGLUFFNFused
         elif ffn_layer == "identity":
-            logger.info("using Identity layer as FFN")
             def f(*args, **kwargs):
                 return nn.Identity()

@@ -232,13 +226,13 @@ class DinoVisionTransformer(nn.Module):
         return x
     def forward_features_list(self, x_list, masks_list):
-        x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
+        x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list, strict=False)]
         for blk in self.blocks:
             x = blk(x)
         all_x = x
         output = []
-        for x, masks in zip(all_x, masks_list):
+        for x, masks in zip(all_x, masks_list, strict=False):
             x_norm = self.norm(x)
             output.append(
                 {

@@ -301,7 +295,7 @@ class DinoVisionTransformer(nn.Module):
         n: Union[int, Sequence] = 1,  # Layers or n last layers to take
         reshape: bool = False,
         return_class_token: bool = False,
-        norm=True,
+        norm: bool = True,
     ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
         if self.chunked_blocks:
             outputs = self._get_intermediate_layers_chunked(x, n)

@@ -318,7 +312,7 @@ class DinoVisionTransformer(nn.Module):
                 for out in outputs
             ]
         if return_class_token:
-            return tuple(zip(outputs, class_tokens))
+            return tuple(zip(outputs, class_tokens, strict=False))
         return tuple(outputs)
     def forward(self, *args, is_training=False, **kwargs):
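The recurring `strict=False` argument added to `zip()` in these hunks satisfies the lint rule that flags `zip()` calls without an explicit `strict` flag (e.g. Ruff's B905). A minimal sketch, not part of the commit, of the behavior the flag controls (Python 3.10+):

    # strict=False keeps the old implicit behavior: extra items are silently dropped.
    list(zip([1, 2, 3], ["a", "b"], strict=False))  # [(1, 'a'), (2, 'b')]

    # strict=True raises when the iterables have different lengths.
    try:
        list(zip([1, 2, 3], ["a", "b"], strict=True))
    except ValueError as err:
        print(err)  # "zip() argument 2 is shorter than argument 1"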

View File: invokeai/backend/image_util/depth_anything/v2/dinov2_layers/__init__.py

@@ -4,8 +4,9 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
-from .attention import MemEffAttention
-from .block import NestedTensorBlock
-from .mlp import Mlp
-from .patch_embed import PatchEmbed
-from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.attention import MemEffAttention  # noqa
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.block import NestedTensorBlock  # noqa
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.mlp import Mlp  # noqa
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.patch_embed import PatchEmbed  # noqa
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused  # noqa

View File: invokeai/backend/image_util/depth_anything/v2/dinov2_layers/attention.py

@@ -8,19 +8,16 @@
 # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
 # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
-import logging
+# Referenced from: https://github.com/DepthAnything/Depth-Anything-V2
 from torch import Tensor, nn
-logger = logging.getLogger("dinov2")
 try:
-    from xformers.ops import fmha, memory_efficient_attention, unbind
+    from xformers.ops import memory_efficient_attention, unbind
     XFORMERS_AVAILABLE = True
 except ImportError:
-    logger.warning("xFormers not available")
     XFORMERS_AVAILABLE = False

View File: invokeai/backend/image_util/depth_anything/v2/dinov2_layers/block.py

@@ -8,26 +8,22 @@
 # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
 # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
-import logging
 from typing import Any, Callable, Dict, List, Tuple
 import torch
 from torch import Tensor, nn
-from .attention import Attention, MemEffAttention
-from .drop_path import DropPath
-from .layer_scale import LayerScale
-from .mlp import Mlp
-logger = logging.getLogger("dinov2")
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.attention import Attention, MemEffAttention
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.drop_path import DropPath
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.layer_scale import LayerScale
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.mlp import Mlp
 try:
     from xformers.ops import fmha, index_select_cat, scaled_index_add
     XFORMERS_AVAILABLE = True
 except ImportError:
-    logger.warning("xFormers not available")
     XFORMERS_AVAILABLE = False

@@ -157,10 +153,10 @@ def get_attn_bias_and_cat(x_list, branges=None):
     this will perform the index select, cat the tensors, and provide the attn_bias from cache
     """
     batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
-    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
+    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list, strict=False))
     if all_shapes not in attn_bias_cache.keys():
         seqlens = []
-        for b, x in zip(batch_sizes, x_list):
+        for b, x in zip(batch_sizes, x_list, strict=False):
             for _ in range(b):
                 seqlens.append(x.shape[1])
         attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)

@@ -194,7 +190,9 @@ def drop_add_residual_stochastic_depth_list(
     residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore
     outputs = []
-    for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
+    for x, brange, residual, residual_scale_factor in zip(
+        x_list, branges, residual_list, residual_scale_factors, strict=False
+    ):
         outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
     return outputs

View File: invokeai/backend/image_util/depth_anything/v2/dpt.py

@@ -1,12 +1,17 @@
+# Referenced from https://github.com/DepthAnything/Depth-Anything-V2/blob/main/depth_anything_v2/dpt.py
+from typing import List, Literal, Optional
 import cv2
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torchvision.transforms import Compose
-from .dinov2 import DINOv2
-from .utils.blocks import FeatureFusionBlock, _make_scratch
-from .utils.transform import NormalizeImage, PrepareForNet, Resize
+from invokeai.backend.image_util.depth_anything.v2.dinov2 import DINOv2
+from invokeai.backend.image_util.depth_anything.v2.utils.blocks import FeatureFusionBlock, _make_scratch
+from invokeai.backend.image_util.depth_anything.v2.utils.transform import NormalizeImage, PrepareForNet, Resize
 def _make_fusion_block(features, use_bn, size=None):

@@ -37,10 +42,18 @@ class ConvBlock(nn.Module):
 class DPTHead(nn.Module):
     def __init__(
-        self, in_channels, features=256, use_bn=False, out_channels=[256, 512, 1024, 1024], use_clstoken=False
+        self,
+        in_channels: int,
+        features: int = 256,
+        use_bn: bool = False,
+        out_channels: Optional[List[int]] = None,
+        use_clstoken: bool = False,
     ):
         super(DPTHead, self).__init__()
+        if out_channels is None:
+            out_channels = [256, 512, 1024, 1024]
         self.use_clstoken = use_clstoken
         self.projects = nn.ModuleList(

@@ -140,10 +153,18 @@ class DPTHead(nn.Module):
 class DepthAnythingV2(nn.Module):
     def __init__(
-        self, encoder="vitl", features=256, out_channels=[256, 512, 1024, 1024], use_bn=False, use_clstoken=False
+        self,
+        encoder: Literal["vits", "vitb", "vitl", "vitg"] = "vitl",
+        features: int = 256,
+        out_channels: Optional[List[int]] = None,
+        use_bn: bool = False,
+        use_clstoken: bool = False,
     ):
         super(DepthAnythingV2, self).__init__()
+        if out_channels is None:
+            out_channels = [256, 512, 1024, 1024]
         self.intermediate_layer_idx = {
             "vits": [2, 5, 8, 11],
             "vitb": [2, 5, 8, 11],

@@ -158,7 +179,7 @@ class DepthAnythingV2(nn.Module):
             self.pretrained.embed_dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken
         )
-    def forward(self, x):
+    def forward(self, x: torch.Tensor):
         patch_h, patch_w = x.shape[-2] // 14, x.shape[-1] // 14
         features = self.pretrained.get_intermediate_layers(

@@ -171,7 +192,7 @@ class DepthAnythingV2(nn.Module):
         return depth.squeeze(1)
     @torch.no_grad()
-    def infer_image(self, raw_image, input_size=518):
+    def infer_image(self, raw_image: np.ndarray, input_size: int = 518):
         image, (h, w) = self.image2tensor(raw_image, input_size)
         depth = self.forward(image)
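Replacing the literal list defaults with `out_channels: Optional[List[int]] = None` plus an in-body fallback also clears the mutable-default-argument lint (Ruff B006): a default list is created once at function definition time and shared by every call. A minimal sketch, not part of the commit, of the hazard and the None-sentinel pattern used above:

    # Not from the commit: why a mutable default is replaced by a None sentinel.
    def bad(out_channels=[]):  # the same list object is reused across calls
        out_channels.append(256)
        return out_channels

    def good(out_channels=None):  # a fresh list is built on each call
        if out_channels is None:
            out_channels = []
        out_channels.append(256)
        return out_channels

    print(bad())   # [256]
    print(bad())   # [256, 256]  <- state leaked from the first call
    print(good())  # [256]
    print(good())  # [256]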

View File: invokeai/backend/image_util/depth_anything/v2/utils/blocks.py

@@ -1,3 +1,5 @@
+# Referenced from: https://github.com/DepthAnything/Depth-Anything-V2/blob/main/depth_anything_v2/util/blocks.py
 import torch.nn as nn

@@ -53,7 +55,7 @@ class ResidualConvUnit(nn.Module):
         self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
-        if self.bn == True:
+        if self.bn:
             self.bn1 = nn.BatchNorm2d(features)
             self.bn2 = nn.BatchNorm2d(features)

@@ -73,12 +75,12 @@ class ResidualConvUnit(nn.Module):
         out = self.activation(x)
         out = self.conv1(out)
-        if self.bn == True:
+        if self.bn:
             out = self.bn1(out)
         out = self.activation(out)
         out = self.conv2(out)
-        if self.bn == True:
+        if self.bn:
             out = self.bn2(out)
         if self.groups > 1:

@@ -105,7 +107,7 @@ class FeatureFusionBlock(nn.Module):
         self.expand = expand
         out_features = features
-        if self.expand == True:
+        if self.expand:
             out_features = features // 2
         self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)

View File: invokeai/backend/image_util/depth_anything/v2/utils/transform.py

@@ -1,3 +1,5 @@
+# Referenced from: https://github.com/DepthAnything/Depth-Anything-V2/blob/main/depth_anything_v2/util/transform.py
 import cv2
 import numpy as np