feat: adaptation of Lineart Anime processor

Adapted from https://github.com/huggingface/controlnet_aux
2024-08-30 20:32:17 +00:00 · 2024-03-21 20:06:14 +11:00
parent c7f8fe4d5e
commit c36d12a50f
2 changed files with 207 additions and 4 deletions
--- a/invokeai/app/invocations/controlnet_image_processors.py
+++ b/invokeai/app/invocations/controlnet_image_processors.py
@ -9,7 +9,6 @@ import numpy as np
 from controlnet_aux import (
    ContentShuffleDetector,
    LeresDetector,
-    LineartAnimeDetector,
    MediapipeFaceDetector,
    MidasDetector,
    MLSDdetector,
@ -41,6 +40,7 @@ from invokeai.backend.image_util.depth_anything import DepthAnythingDetector
 from invokeai.backend.image_util.dw_openpose import DWOpenposeDetector
 from invokeai.backend.image_util.hed import HEDProcessor
 from invokeai.backend.image_util.lineart import LineartProcessor
+from invokeai.backend.image_util.lineart_anime import LineartAnimeProcessor

 from .baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output

@ -264,9 +264,9 @@ class LineartAnimeImageProcessorInvocation(ImageProcessorInvocation):
    detect_resolution: int = InputField(default=512, ge=0, description=FieldDescriptions.detect_res)
    image_resolution: int = InputField(default=512, ge=0, description=FieldDescriptions.image_res)

-    def run_processor(self, image):
-        processor = LineartAnimeDetector.from_pretrained("lllyasviel/Annotators")
-        processed_image = processor(
+    def run_processor(self, image: Image.Image) -> Image.Image:
+        processor = LineartAnimeProcessor()
+        processed_image = processor.run(
            image,
            detect_resolution=self.detect_resolution,
            image_resolution=self.image_resolution,
--- a/invokeai/backend/image_util/lineart_anime.py
+++ b/invokeai/backend/image_util/lineart_anime.py
@ -0,0 +1,203 @@
+"""Adapted from https://github.com/huggingface/controlnet_aux (Apache-2.0 license)."""
+
+import functools
+from typing import Optional
+
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+from einops import rearrange
+from huggingface_hub import hf_hub_download
+from PIL import Image
+
+from invokeai.backend.image_util.util import (
+    fit_image_to_resolution,
+    normalize_image_channel_count,
+    np_to_pil,
+    pil_to_np,
+)
+
+
+class UnetGenerator(nn.Module):
+    """U-Net generator assembled from nested skip-connection blocks."""
+
+    def __init__(
+        self,
+        input_nc: int,
+        output_nc: int,
+        num_downs: int,
+        ngf: int = 64,
+        norm_layer=nn.BatchNorm2d,
+        use_dropout: bool = False,
+    ):
+        """Build the generator from the bottleneck outwards.
+
+        Parameters:
+            input_nc (int)     -- number of channels in the input images
+            output_nc (int)    -- number of channels in the output images
+            num_downs (int)    -- number of downsamplings in the UNet (never fewer than 5 blocks); e.g.
+                                  with num_downs == 7 a 128x128 image becomes 1x1 at the bottleneck
+            ngf (int)          -- number of filters produced by the outermost downsampling conv
+            norm_layer         -- normalization layer
+            use_dropout (bool) -- enable dropout in the intermediate ngf * 8 blocks
+
+        Each new block wraps the previously built one as its submodule.
+        """
+        super().__init__()
+        widest = ngf * 8
+        # Bottleneck: the only block created without a submodule.
+        block = UnetSkipConnectionBlock(widest, widest, norm_layer=norm_layer, innermost=True)
+        # num_downs - 5 full-width intermediate blocks; use_dropout is forwarded to these only.
+        for _ in range(num_downs - 5):
+            block = UnetSkipConnectionBlock(
+                widest, widest, submodule=block, norm_layer=norm_layer, use_dropout=use_dropout
+            )
+        # Narrow the filter count step by step: ngf * 8 -> ngf * 4 -> ngf * 2 -> ngf.
+        for factor in (4, 2, 1):
+            block = UnetSkipConnectionBlock(
+                ngf * factor, ngf * factor * 2, submodule=block, norm_layer=norm_layer
+            )
+        # Outermost block: takes input_nc channels in and produces output_nc channels.
+        self.model = UnetSkipConnectionBlock(
+            output_nc, ngf, input_nc=input_nc, submodule=block, norm_layer=norm_layer, outermost=True
+        )
+
+    def forward(self, input):
+        """Run the nested blocks on `input` and return the result."""
+        return self.model(input)
+
+
+class UnetSkipConnectionBlock(nn.Module):
+    """Defines the Unet submodule with skip connection.
+    X -------------------identity----------------------
+    |-- downsampling -- |submodule| -- upsampling --|
+    """
+
+    def __init__(
+        self,
+        outer_nc: int,
+        inner_nc: int,
+        input_nc: Optional[int] = None,
+        submodule=None,
+        outermost: bool = False,
+        innermost: bool = False,
+        norm_layer=nn.BatchNorm2d,
+        use_dropout: bool = False,
+    ):
+        """Construct a Unet submodule with skip connections.
+        Parameters:
+            outer_nc (int) -- the number of filters in the outer conv layer
+            inner_nc (int) -- the number of filters in the inner conv layer
+            input_nc (int) -- the number of channels in input images/features (defaults to outer_nc)
+            submodule (UnetSkipConnectionBlock) -- previously defined submodules
+            outermost (bool)    -- if this module is the outermost module
+            innermost (bool)    -- if this module is the innermost module
+            norm_layer          -- normalization layer class, or a functools.partial wrapping one
+            use_dropout (bool)  -- if use dropout layers (only honoured by intermediate blocks)
+        """
+        super().__init__()
+        self.outermost = outermost
+        # Conv layers get a bias term only when the normalization layer is InstanceNorm2d.
+        # isinstance (not an exact type comparison) so that subclasses of partial are unwrapped too.
+        norm_cls = norm_layer.func if isinstance(norm_layer, functools.partial) else norm_layer
+        use_bias = norm_cls == nn.InstanceNorm2d
+        if input_nc is None:
+            input_nc = outer_nc
+        downconv = nn.Conv2d(input_nc, inner_nc, kernel_size=4, stride=2, padding=1, bias=use_bias)
+        downrelu = nn.LeakyReLU(0.2, True)
+        downnorm = norm_layer(inner_nc)
+        uprelu = nn.ReLU(True)
+        upnorm = norm_layer(outer_nc)
+
+        if outermost:
+            # No normalization; output passes through Tanh. inner_nc * 2: the submodule concatenates its input.
+            upconv = nn.ConvTranspose2d(inner_nc * 2, outer_nc, kernel_size=4, stride=2, padding=1)
+            down = [downconv]
+            up = [uprelu, upconv, nn.Tanh()]
+            model = down + [submodule] + up
+        elif innermost:
+            # No submodule, hence no concatenation: the upconv takes inner_nc channels.
+            upconv = nn.ConvTranspose2d(inner_nc, outer_nc, kernel_size=4, stride=2, padding=1, bias=use_bias)
+            down = [downrelu, downconv]
+            up = [uprelu, upconv, upnorm]
+            model = down + up
+        else:
+            upconv = nn.ConvTranspose2d(inner_nc * 2, outer_nc, kernel_size=4, stride=2, padding=1, bias=use_bias)
+            down = [downrelu, downconv, downnorm]
+            up = [uprelu, upconv, upnorm]
+            tail = [nn.Dropout(0.5)] if use_dropout else []
+            model = down + [submodule] + up + tail
+
+        self.model = nn.Sequential(*model)
+
+    def forward(self, x):
+        """Apply the block; every block but the outermost concatenates its input to its output on dim 1."""
+        if self.outermost:
+            return self.model(x)
+        # add skip connections
+        return torch.cat([x, self.model(x)], 1)
+
+
+class LineartAnimeProcessor:
+    """Processes an image to detect lineart."""
+
+    def __init__(self):
+        """Fetch `netG.pth` from the `lllyasviel/Annotators` hub repo and build the generator in eval mode."""
+        model_path = hf_hub_download("lllyasviel/Annotators", "netG.pth")
+        norm_layer = functools.partial(nn.InstanceNorm2d, affine=False, track_running_stats=False)
+        self.model = UnetGenerator(3, 1, 8, 64, norm_layer=norm_layer, use_dropout=False)
+        # Load onto the CPU explicitly: tensors saved from a CUDA device otherwise fail to deserialize on a
+        # CUDA-less host. Use `to()` afterwards to move the model.
+        # NOTE(review): torch.load unpickles a downloaded file; consider weights_only=True if it loads cleanly.
+        ckpt = torch.load(model_path, map_location="cpu")
+        # Strip "module." from the checkpoint keys, mutating the loaded dict in place.
+        for key in list(ckpt.keys()):
+            if "module." in key:
+                ckpt[key.replace("module.", "")] = ckpt.pop(key)
+        self.model.load_state_dict(ckpt)
+        self.model.eval()
+
+    def to(self, device: torch.device):
+        """Move the model to `device` and return this processor so calls can be chained."""
+        self.model.to(device)
+        return self
+
+    def run(self, input_image: Image.Image, detect_resolution: int = 512, image_resolution: int = 512) -> Image.Image:
+        """Processes an image to detect lineart.
+
+        Args:
+            input_image: The input image.
+            detect_resolution: The resolution to use for detection.
+            image_resolution: The resolution to use for the output image.
+
+        Returns:
+            The detected lineart, inverted (255 - value) relative to the raw model output.
+        """
+        device = next(self.model.parameters()).device
+        np_image = normalize_image_channel_count(pil_to_np(input_image))
+        np_image = fit_image_to_resolution(np_image, detect_resolution)
+
+        # The generator halves the resolution 8 times, so both sides are rounded up to a multiple of 256.
+        height, width, _ = np_image.shape
+        aligned_h = 256 * int(np.ceil(float(height) / 256.0))
+        aligned_w = 256 * int(np.ceil(float(width) / 256.0))
+        img = cv2.resize(np_image, (aligned_w, aligned_h), interpolation=cv2.INTER_CUBIC)
+        with torch.no_grad():
+            image_feed = torch.from_numpy(img).float().to(device)
+            image_feed = image_feed / 127.5 - 1.0  # maps 0..255 to -1..1
+            image_feed = rearrange(image_feed, "h w c -> 1 c h w")
+            # Map the Tanh output from -1..1 back to 0..255.
+            line = self.model(image_feed)[0, 0] * 127.5 + 127.5
+            line = line.cpu().numpy()
+
+        # Undo the upscaling resize; cubic interpolation can overshoot, hence the clip before the uint8 cast.
+        line = cv2.resize(line, (width, height), interpolation=cv2.INTER_CUBIC)
+        detected_map = normalize_image_channel_count(line.clip(0, 255).astype(np.uint8))
+
+        # The resized image is only used for its shape: the target size of the output map.
+        out_h, out_w, _ = fit_image_to_resolution(np_image, image_resolution).shape
+        detected_map = cv2.resize(detected_map, (out_w, out_h), interpolation=cv2.INTER_LINEAR)
+        detected_map = 255 - detected_map
+
+        return np_to_pil(detected_map)