mirror of https://github.com/invoke-ai/InvokeAI
Add support for T2I-Adapter in node workflows (#4612)
* Bump diffusers to 0.21.2.
* Add T2IAdapterInvocation boilerplate.
* Add T2I-Adapter model to model-management.
* (minor) Tidy prepare_control_image(...).
* Add logic to run the T2I-Adapter models at the start of the DenoiseLatentsInvocation.
* Add logic for applying T2I-Adapter weights and accumulating (see the sketch below).
* Add T2IAdapter to MODEL_CLASSES map.
* yarn typegen
* Add model probes for T2I-Adapter models.
* Add all of the frontend boilerplate required to use T2I-Adapter in the nodes editor.
* Add T2IAdapterModel.convert_if_required(...).
* Fix errors in T2I-Adapter input image sizing logic.
* Fix bug with handling of multiple T2I-Adapters.
* black / flake8
* Fix typo
* yarn build
* Add num_channels param to prepare_control_image(...).
* Link to upstream diffusers bugfix PR that currently requires a workaround.
* feat: Add Color Map Preprocessor (needed for the color T2I-Adapter).
* feat: Add Color Map Preprocessor to Linear UI
* Revert "feat: Add Color Map Preprocessor". This reverts commit a1119a00bf.
* Revert "feat: Add Color Map Preprocessor to Linear UI". This reverts commit bd8a9b82d8.
* Fix T2I-Adapter field rendering in workflow editor.
* yarn build, yarn typegen

---------

Co-authored-by: blessedcoolant <54517381+blessedcoolant@users.noreply.github.com>
Co-authored-by: psychedelicious <4822129+psychedelicious@users.noreply.github.com>
@@ -265,22 +265,41 @@ def np_img_resize(np_img: np.ndarray, resize_mode: str, h: int, w: int, device:
 def prepare_control_image(
-    # image used to be Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor, List[torch.Tensor]]
-    # but now should be able to assume that image is a single PIL.Image, which simplifies things
     image: Image,
-    # FIXME: need to fix hardwiring of width and height, change to basing on latents dimensions?
-    # latents_to_match_resolution, # TorchTensor of shape (batch_size, 3, height, width)
-    width=512,  # should be 8 * latent.shape[3]
-    height=512,  # should be 8 * latent.shape[2]
-    # batch_size=1, # currently no batching
-    # num_images_per_prompt=1, # currently only single image
+    width: int,
+    height: int,
+    num_channels: int = 3,
     device="cuda",
     dtype=torch.float16,
     do_classifier_free_guidance=True,
     control_mode="balanced",
     resize_mode="just_resize_simple",
 ):
-    # FIXME: implement "crop_resize_simple" and "fill_resize_simple", or pull them out
+    """Pre-process images for ControlNets or T2I-Adapters.
+
+    Args:
+        image (Image): The PIL image to pre-process.
+        width (int): The target width in pixels.
+        height (int): The target height in pixels.
+        num_channels (int, optional): The target number of image channels. This is achieved by converting the input
+            image to RGB, then naively taking the first `num_channels` channels. The primary use case is converting a
+            RGB image to a single-channel grayscale image. Raises if `num_channels` cannot be achieved. Defaults to 3.
+        device (str, optional): The target device for the output image. Defaults to "cuda".
+        dtype (_type_, optional): The dtype for the output image. Defaults to torch.float16.
+        do_classifier_free_guidance (bool, optional): If True, repeat the output image along the batch dimension.
+            Defaults to True.
+        control_mode (str, optional): Defaults to "balanced".
+        resize_mode (str, optional): Defaults to "just_resize_simple".
+
+    Raises:
+        NotImplementedError: If resize_mode == "crop_resize_simple".
+        NotImplementedError: If resize_mode == "fill_resize_simple".
+        ValueError: If `resize_mode` is not recognized.
+        ValueError: If `num_channels` is out of range.
+
+    Returns:
+        torch.Tensor: The pre-processed input tensor.
+    """
     if (
         resize_mode == "just_resize_simple"
         or resize_mode == "crop_resize_simple"
@@ -289,10 +308,10 @@ def prepare_control_image(
         image = image.convert("RGB")
         if resize_mode == "just_resize_simple":
             image = image.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
-        elif resize_mode == "crop_resize_simple":  # not yet implemented
-            pass
-        elif resize_mode == "fill_resize_simple":  # not yet implemented
-            pass
+        elif resize_mode == "crop_resize_simple":
+            raise NotImplementedError(f"prepare_control_image is not implemented for resize_mode='{resize_mode}'.")
+        elif resize_mode == "fill_resize_simple":
+            raise NotImplementedError(f"prepare_control_image is not implemented for resize_mode='{resize_mode}'.")
         nimage = np.array(image)
         nimage = nimage[None, :]
         nimage = np.concatenate([nimage], axis=0)
@@ -313,9 +332,11 @@ def prepare_control_image(
             device=device,
         )
     else:
-        pass
-        print("ERROR: invalid resize_mode ==> ", resize_mode)
-        exit(1)
+        raise ValueError(f"Unsupported resize_mode: '{resize_mode}'.")
+
+    if timage.shape[1] < num_channels or num_channels <= 0:
+        raise ValueError(f"Cannot achieve the target of num_channels={num_channels}.")
+    timage = timage[:, :num_channels, :, :]
 
     timage = timage.to(device=device, dtype=dtype)
     cfg_injection = control_mode == "more_control" or control_mode == "unbalanced"
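To see the new signature in context, here is a hypothetical call; this is a sketch, not canonical usage. The import path, file name, and sizes are assumptions for illustration. With num_channels=1 the function converts the image to RGB and keeps only the first channel, which suits adapters expecting grayscale input; per the docstring, do_classifier_free_guidance=True with the default "balanced" mode repeats the output along the batch dimension.

    import torch
    from PIL import Image

    # Assumed import path; adjust to wherever prepare_control_image is defined.
    from invokeai.app.util.controlnet_utils import prepare_control_image

    image = Image.open("sketch_map.png")  # placeholder control image

    control = prepare_control_image(
        image=image,
        width=512,       # 8 * latent width
        height=512,      # 8 * latent height
        num_channels=1,  # keep only the first channel after the RGB conversion
        device="cuda",
        dtype=torch.float16,
        do_classifier_free_guidance=True,
        control_mode="balanced",
        resize_mode="just_resize_simple",
    )
    print(control.shape)  # expected torch.Size([2, 1, 512, 512]): batch repeated for CFG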