Mirror of https://github.com/invoke-ai/InvokeAI

wip: depth_anything_v2 init lint fixes
Commit 4166c756ce (parent 4f0dfbd34d)

Lint cleanup for the initial Depth Anything V2 port: relative imports become absolute, zip() calls gain an explicit strict=False, mutable default arguments are replaced with None plus a guard, == True comparisons become plain truthiness checks, public signatures pick up type annotations, and the unused dinov2 logging setup is removed.
invokeai/backend/image_util/depth_anything/v2/dinov2.py

@@ -7,7 +7,6 @@
 # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
 # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
 
-import logging
 import math
 from functools import partial
 from typing import Callable, Sequence, Tuple, Union
@@ -17,11 +17,8 @@ import torch.nn as nn
 import torch.utils.checkpoint
 from torch.nn.init import trunc_normal_
 
-from .dinov2_layers import MemEffAttention, Mlp
-from .dinov2_layers import NestedTensorBlock as Block
-from .dinov2_layers import PatchEmbed, SwiGLUFFNFused
-
-logger = logging.getLogger("dinov2")
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers import MemEffAttention, Mlp, PatchEmbed, SwiGLUFFNFused
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers import NestedTensorBlock as Block
 
 
 def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
@@ -120,13 +117,10 @@ class DinoVisionTransformer(nn.Module):
         dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
 
         if ffn_layer == "mlp":
-            logger.info("using MLP layer as FFN")
             ffn_layer = Mlp
         elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
-            logger.info("using SwiGLU layer as FFN")
             ffn_layer = SwiGLUFFNFused
         elif ffn_layer == "identity":
-            logger.info("using Identity layer as FFN")
 
             def f(*args, **kwargs):
                 return nn.Identity()
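The `identity` branch works because the FFN slot holds a factory callable, not a module instance: the transformer later calls it with constructor arguments, so the "no FFN" option must also accept and ignore them. A minimal stand-alone sketch, assuming only that `torch` is installed (`identity_factory` is an illustrative name, not code from this diff):

```python
import torch.nn as nn

# Mirrors the role of f(*args, **kwargs) above: a callable that swallows the
# constructor arguments the caller passes and hands back a pass-through module.
def identity_factory(*args, **kwargs):
    return nn.Identity()

ffn = identity_factory(in_features=768, hidden_features=3072)  # args are ignored
print(ffn)  # Identity()
```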
@@ -232,13 +226,13 @@ class DinoVisionTransformer(nn.Module):
         return x
 
     def forward_features_list(self, x_list, masks_list):
-        x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
+        x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list, strict=False)]
         for blk in self.blocks:
            x = blk(x)
 
         all_x = x
         output = []
-        for x, masks in zip(all_x, masks_list):
+        for x, masks in zip(all_x, masks_list, strict=False):
             x_norm = self.norm(x)
             output.append(
                 {
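Adding `strict=False` is the kind of change ruff's B905 rule (zip without an explicit `strict=` parameter) asks for: it keeps the old truncating behavior while making that choice visible. A stdlib-only sketch of the difference, on Python 3.10+ where the parameter exists:

```python
a = [1, 2, 3]
b = ["x", "y"]

print(list(zip(a, b)))                # [(1, 'x'), (2, 'y')], the extra element is silently dropped
print(list(zip(a, b, strict=False)))  # same result, but the truncation is now explicit

try:
    list(zip(a, b, strict=True))      # the opt-in length check instead
except ValueError as err:
    print(err)                        # zip() argument 2 is shorter than argument 1
```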
@@ -301,7 +295,7 @@ class DinoVisionTransformer(nn.Module):
         n: Union[int, Sequence] = 1,  # Layers or n last layers to take
         reshape: bool = False,
         return_class_token: bool = False,
-        norm=True,
+        norm: bool = True,
     ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
         if self.chunked_blocks:
             outputs = self._get_intermediate_layers_chunked(x, n)
@@ -318,7 +312,7 @@ class DinoVisionTransformer(nn.Module):
                 for out in outputs
             ]
         if return_class_token:
-            return tuple(zip(outputs, class_tokens))
+            return tuple(zip(outputs, class_tokens, strict=False))
         return tuple(outputs)
 
     def forward(self, *args, is_training=False, **kwargs):
invokeai/backend/image_util/depth_anything/v2/dinov2_layers/__init__.py

@@ -4,8 +4,9 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-from .attention import MemEffAttention
-from .block import NestedTensorBlock
-from .mlp import Mlp
-from .patch_embed import PatchEmbed
-from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
+
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.attention import MemEffAttention  # noqa
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.block import NestedTensorBlock  # noqa
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.mlp import Mlp  # noqa
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.patch_embed import PatchEmbed  # noqa
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused  # noqa
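The `# noqa` markers keep the linter from flagging these as unused imports (F401): in an `__init__.py` the names exist purely as re-exports. A hedged sketch of the more explicit alternative, which most linter configurations also accept as a re-export declaration:

```python
# Declaring the package's public surface via __all__ documents the re-exports
# and commonly silences unused-import warnings without per-line noqa comments.
__all__ = [
    "MemEffAttention",
    "NestedTensorBlock",
    "Mlp",
    "PatchEmbed",
    "SwiGLUFFN",
    "SwiGLUFFNFused",
]
```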
invokeai/backend/image_util/depth_anything/v2/dinov2_layers/attention.py

@@ -8,19 +8,16 @@
 # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
 # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
 
-import logging
+# Referenced from: https://github.com/DepthAnything/Depth-Anything-V2
 
 
 from torch import Tensor, nn
 
-logger = logging.getLogger("dinov2")
-
 
 try:
-    from xformers.ops import fmha, memory_efficient_attention, unbind
+    from xformers.ops import memory_efficient_attention, unbind
 
     XFORMERS_AVAILABLE = True
 except ImportError:
-    logger.warning("xFormers not available")
     XFORMERS_AVAILABLE = False
 
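With the logger removed, the import guard is now silent, but the shape of the code is the standard optional-dependency probe: try the import once at module load, record the result in a flag, and branch on the flag at call sites. A minimal runnable sketch, assuming only that xformers may or may not be installed:

```python
# Probe for an optional accelerator package once at import time, then branch
# on the flag later instead of wrapping every use in try/except.
try:
    from xformers.ops import memory_efficient_attention  # noqa: F401

    XFORMERS_AVAILABLE = True
except ImportError:
    XFORMERS_AVAILABLE = False

print("xformers available:", XFORMERS_AVAILABLE)
```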
invokeai/backend/image_util/depth_anything/v2/dinov2_layers/block.py

@@ -8,26 +8,22 @@
 # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
 # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
 
-import logging
 from typing import Any, Callable, Dict, List, Tuple
 
 import torch
 from torch import Tensor, nn
 
-from .attention import Attention, MemEffAttention
-from .drop_path import DropPath
-from .layer_scale import LayerScale
-from .mlp import Mlp
-
-logger = logging.getLogger("dinov2")
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.attention import Attention, MemEffAttention
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.drop_path import DropPath
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.layer_scale import LayerScale
+from invokeai.backend.image_util.depth_anything.v2.dinov2_layers.mlp import Mlp
 
 
 try:
     from xformers.ops import fmha, index_select_cat, scaled_index_add
 
     XFORMERS_AVAILABLE = True
 except ImportError:
-    logger.warning("xFormers not available")
     XFORMERS_AVAILABLE = False
 
@@ -157,10 +153,10 @@ def get_attn_bias_and_cat(x_list, branges=None):
     this will perform the index select, cat the tensors, and provide the attn_bias from cache
     """
     batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
-    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
+    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list, strict=False))
     if all_shapes not in attn_bias_cache.keys():
         seqlens = []
-        for b, x in zip(batch_sizes, x_list):
+        for b, x in zip(batch_sizes, x_list, strict=False):
             for _ in range(b):
                 seqlens.append(x.shape[1])
         attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
@@ -194,7 +190,9 @@ def drop_add_residual_stochastic_depth_list(
     residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore
 
     outputs = []
-    for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
+    for x, brange, residual, residual_scale_factor in zip(
+        x_list, branges, residual_list, residual_scale_factors, strict=False
+    ):
         outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
     return outputs
 
invokeai/backend/image_util/depth_anything/v2/dpt.py

@@ -1,12 +1,17 @@
+# Referenced from https://github.com/DepthAnything/Depth-Anything-V2/blob/main/depth_anything_v2/dpt.py
+
+from typing import List, Literal, Optional
+
 import cv2
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torchvision.transforms import Compose
 
-from .dinov2 import DINOv2
-from .utils.blocks import FeatureFusionBlock, _make_scratch
-from .utils.transform import NormalizeImage, PrepareForNet, Resize
+from invokeai.backend.image_util.depth_anything.v2.dinov2 import DINOv2
+from invokeai.backend.image_util.depth_anything.v2.utils.blocks import FeatureFusionBlock, _make_scratch
+from invokeai.backend.image_util.depth_anything.v2.utils.transform import NormalizeImage, PrepareForNet, Resize
 
 
 def _make_fusion_block(features, use_bn, size=None):
@@ -37,10 +42,18 @@ class ConvBlock(nn.Module):
 
 class DPTHead(nn.Module):
     def __init__(
-        self, in_channels, features=256, use_bn=False, out_channels=[256, 512, 1024, 1024], use_clstoken=False
+        self,
+        in_channels: int,
+        features: int = 256,
+        use_bn: bool = False,
+        out_channels: Optional[List[int]] = None,
+        use_clstoken: bool = False,
     ):
         super(DPTHead, self).__init__()
 
+        if out_channels is None:
+            out_channels = [256, 512, 1024, 1024]
+
         self.use_clstoken = use_clstoken
 
         self.projects = nn.ModuleList(
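Replacing the `out_channels=[256, 512, 1024, 1024]` default with `None` plus a guard fixes the classic mutable-default-argument pitfall (ruff's B006). A stdlib-only sketch of why the original spelling is dangerous (`bad`/`good` are illustrative names):

```python
# The default list is created once, at function definition time, and shared
# by every call that relies on it.
def bad(channels=[256, 512]):
    channels.append(1024)
    return channels

def good(channels=None):
    if channels is None:      # fresh list on every call, the diff's pattern
        channels = [256, 512]
    channels.append(1024)
    return channels

print(bad())   # [256, 512, 1024]
print(bad())   # [256, 512, 1024, 1024], state leaked from the first call
print(good())  # [256, 512, 1024]
print(good())  # [256, 512, 1024], every call is independent
```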
@@ -140,10 +153,18 @@ class DPTHead(nn.Module):
 
 class DepthAnythingV2(nn.Module):
     def __init__(
-        self, encoder="vitl", features=256, out_channels=[256, 512, 1024, 1024], use_bn=False, use_clstoken=False
+        self,
+        encoder: Literal["vits", "vitb", "vitl", "vitg"] = "vitl",
+        features: int = 256,
+        out_channels: Optional[List[int]] = None,
+        use_bn: bool = False,
+        use_clstoken: bool = False,
     ):
         super(DepthAnythingV2, self).__init__()
 
+        if out_channels is None:
+            out_channels = [256, 512, 1024, 1024]
+
         self.intermediate_layer_idx = {
             "vits": [2, 5, 8, 11],
             "vitb": [2, 5, 8, 11],
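The `Literal` annotation documents the four supported DINOv2 variants and lets a static checker reject anything else before runtime. A small sketch using only the `typing` stdlib (`check` and the invalid `"vitx"` are illustrative):

```python
from typing import Literal

Encoder = Literal["vits", "vitb", "vitl", "vitg"]

def check(encoder: Encoder = "vitl") -> str:
    return encoder

check("vitb")  # fine
check("vitx")  # mypy/pyright flag this; at runtime it would still pass silently
```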
@@ -158,7 +179,7 @@ class DepthAnythingV2(nn.Module):
             self.pretrained.embed_dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken
         )
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor):
         patch_h, patch_w = x.shape[-2] // 14, x.shape[-1] // 14
 
         features = self.pretrained.get_intermediate_layers(
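The `// 14` reflects DINOv2's 14x14 patch embedding: the token grid is the input resolution divided by the patch size. A quick pure-Python check of the arithmetic:

```python
# The default inference size of 518 (see infer_image below) divides evenly by 14.
h = w = 518
patch_h, patch_w = h // 14, w // 14
print(patch_h, patch_w)  # 37 37, i.e. a 37x37 grid of patch tokens
```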
@@ -171,7 +192,7 @@ class DepthAnythingV2(nn.Module):
         return depth.squeeze(1)
 
     @torch.no_grad()
-    def infer_image(self, raw_image, input_size=518):
+    def infer_image(self, raw_image: np.ndarray, input_size: int = 518):
         image, (h, w) = self.image2tensor(raw_image, input_size)
 
         depth = self.forward(image)
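A hedged usage sketch of the annotated entry point. The checkpoint filename and image path are illustrative only, and the module path assumes the file lives at `invokeai/backend/image_util/depth_anything/v2/dpt.py`, as the absolute imports in this diff suggest:

```python
import cv2
import torch

from invokeai.backend.image_util.depth_anything.v2.dpt import DepthAnythingV2

model = DepthAnythingV2(encoder="vitl")  # out_channels falls back to [256, 512, 1024, 1024]
model.load_state_dict(torch.load("depth_anything_v2_vitl.pth", map_location="cpu"))  # hypothetical checkpoint
model.eval()

raw = cv2.imread("example.jpg")                 # BGR uint8 array, matching the np.ndarray annotation
depth = model.infer_image(raw, input_size=518)  # depth map at the raw image's resolution
```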
invokeai/backend/image_util/depth_anything/v2/utils/blocks.py

@@ -1,3 +1,5 @@
+# Referenced from: https://github.com/DepthAnything/Depth-Anything-V2/blob/main/depth_anything_v2/util/blocks.py
+
 import torch.nn as nn
 
 
@@ -53,7 +55,7 @@ class ResidualConvUnit(nn.Module):
 
         self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
 
-        if self.bn == True:
+        if self.bn:
             self.bn1 = nn.BatchNorm2d(features)
             self.bn2 = nn.BatchNorm2d(features)
 
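These rewrites clear the `== True` comparison warnings (the E712-style rule): for a boolean flag, `if self.bn:` is the idiomatic equivalent. A short stdlib-only sketch of the distinction:

```python
use_bn = True
assert (use_bn == True) == bool(use_bn)  # identical for real booleans

# For truthy non-bools the two spellings diverge, and truthiness is usually what's meant:
flag = "yes"
print(flag == True)  # False: equality against the bool singleton
print(bool(flag))    # True: truthiness, which is what `if flag:` tests
```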
@@ -73,12 +75,12 @@ class ResidualConvUnit(nn.Module):
 
         out = self.activation(x)
         out = self.conv1(out)
-        if self.bn == True:
+        if self.bn:
             out = self.bn1(out)
 
         out = self.activation(out)
         out = self.conv2(out)
-        if self.bn == True:
+        if self.bn:
             out = self.bn2(out)
 
         if self.groups > 1:
@@ -105,7 +107,7 @@ class FeatureFusionBlock(nn.Module):
 
         self.expand = expand
         out_features = features
-        if self.expand == True:
+        if self.expand:
             out_features = features // 2
 
         self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
invokeai/backend/image_util/depth_anything/v2/utils/transform.py

@@ -1,3 +1,5 @@
+# Referenced from: https://github.com/DepthAnything/Depth-Anything-V2/blob/main/depth_anything_v2/util/transform.py
+
 import cv2
 import numpy as np
 