import numpy as np import torch from PIL import Image from invokeai.app.invocations.baseinvocation import ( BaseInvocation, InvocationContext, invocation, ) from invokeai.app.invocations.fields import ( BoundingBoxField, ColorField, ImageField, InputField, TensorField, WithBoard, WithMetadata, ) from invokeai.app.invocations.primitives import BoundingBoxOutput, ImageOutput, MaskOutput from invokeai.backend.image_util.util import pil_to_np @invocation( "rectangle_mask", title="Create Rectangle Mask", tags=["conditioning"], category="conditioning", version="1.0.1", ) class RectangleMaskInvocation(BaseInvocation, WithMetadata): """Create a rectangular mask.""" width: int = InputField(description="The width of the entire mask.") height: int = InputField(description="The height of the entire mask.") x_left: int = InputField(description="The left x-coordinate of the rectangular masked region (inclusive).") y_top: int = InputField(description="The top y-coordinate of the rectangular masked region (inclusive).") rectangle_width: int = InputField(description="The width of the rectangular masked region.") rectangle_height: int = InputField(description="The height of the rectangular masked region.") def invoke(self, context: InvocationContext) -> MaskOutput: mask = torch.zeros((1, self.height, self.width), dtype=torch.bool) mask[:, self.y_top : self.y_top + self.rectangle_height, self.x_left : self.x_left + self.rectangle_width] = ( True ) mask_tensor_name = context.tensors.save(mask) return MaskOutput( mask=TensorField(tensor_name=mask_tensor_name), width=self.width, height=self.height, ) @invocation( "alpha_mask_to_tensor", title="Alpha Mask to Tensor", tags=["conditioning"], category="conditioning", version="1.0.0", ) class AlphaMaskToTensorInvocation(BaseInvocation): """Convert a mask image to a tensor. Opaque regions are 1 and transparent regions are 0.""" image: ImageField = InputField(description="The mask image to convert.") invert: bool = InputField(default=False, description="Whether to invert the mask.") def invoke(self, context: InvocationContext) -> MaskOutput: image = context.images.get_pil(self.image.image_name, mode="RGBA") mask = torch.zeros((1, image.height, image.width), dtype=torch.bool) if self.invert: mask[0] = torch.tensor(np.array(image)[:, :, 3] == 0, dtype=torch.bool) else: mask[0] = torch.tensor(np.array(image)[:, :, 3] > 0, dtype=torch.bool) return MaskOutput( mask=TensorField(tensor_name=context.tensors.save(mask)), height=mask.shape[1], width=mask.shape[2], ) @invocation( "invert_tensor_mask", title="Invert Tensor Mask", tags=["conditioning"], category="conditioning", version="1.1.0", ) class InvertTensorMaskInvocation(BaseInvocation): """Inverts a tensor mask.""" mask: TensorField = InputField(description="The tensor mask to convert.") def invoke(self, context: InvocationContext) -> MaskOutput: mask = context.tensors.load(self.mask.tensor_name) # Verify dtype and shape. assert mask.dtype == torch.bool assert mask.dim() in [2, 3] # Unsqueeze the channel dimension if it is missing. The MaskOutput type expects a single channel. if mask.dim() == 2: mask = mask.unsqueeze(0) inverted = ~mask return MaskOutput( mask=TensorField(tensor_name=context.tensors.save(inverted)), height=inverted.shape[1], width=inverted.shape[2], ) @invocation( "image_mask_to_tensor", title="Image Mask to Tensor", tags=["conditioning"], category="conditioning", version="1.0.0", ) class ImageMaskToTensorInvocation(BaseInvocation, WithMetadata): """Convert a mask image to a tensor. Converts the image to grayscale and uses thresholding at the specified value.""" image: ImageField = InputField(description="The mask image to convert.") cutoff: int = InputField(ge=0, le=255, description="Cutoff (<)", default=128) invert: bool = InputField(default=False, description="Whether to invert the mask.") def invoke(self, context: InvocationContext) -> MaskOutput: image = context.images.get_pil(self.image.image_name, mode="L") mask = torch.zeros((1, image.height, image.width), dtype=torch.bool) if self.invert: mask[0] = torch.tensor(np.array(image)[:, :] >= self.cutoff, dtype=torch.bool) else: mask[0] = torch.tensor(np.array(image)[:, :] < self.cutoff, dtype=torch.bool) return MaskOutput( mask=TensorField(tensor_name=context.tensors.save(mask)), height=mask.shape[1], width=mask.shape[2], ) @invocation( "tensor_mask_to_image", title="Tensor Mask to Image", tags=["mask"], category="mask", version="1.1.0", ) class MaskTensorToImageInvocation(BaseInvocation, WithMetadata, WithBoard): """Convert a mask tensor to an image.""" mask: TensorField = InputField(description="The mask tensor to convert.") def invoke(self, context: InvocationContext) -> ImageOutput: mask = context.tensors.load(self.mask.tensor_name) # Squeeze the channel dimension if it exists. if mask.dim() == 3: mask = mask.squeeze(0) # Ensure that the mask is binary. if mask.dtype != torch.bool: mask = mask > 0.5 mask_np = (mask.float() * 255).byte().cpu().numpy() mask_pil = Image.fromarray(mask_np, mode="L") image_dto = context.images.save(image=mask_pil) return ImageOutput.build(image_dto) @invocation( "apply_tensor_mask_to_image", title="Apply Tensor Mask to Image", tags=["mask"], category="mask", version="1.0.0", ) class ApplyMaskTensorToImageInvocation(BaseInvocation, WithMetadata, WithBoard): """Applies a tensor mask to an image. The image is converted to RGBA and the mask is applied to the alpha channel.""" mask: TensorField = InputField(description="The mask tensor to apply.") image: ImageField = InputField(description="The image to apply the mask to.") invert: bool = InputField(default=False, description="Whether to invert the mask.") def invoke(self, context: InvocationContext) -> ImageOutput: image = context.images.get_pil(self.image.image_name, mode="RGBA") mask = context.tensors.load(self.mask.tensor_name) # Squeeze the channel dimension if it exists. if mask.dim() == 3: mask = mask.squeeze(0) # Ensure that the mask is binary. if mask.dtype != torch.bool: mask = mask > 0.5 mask_np = (mask.float() * 255).byte().cpu().numpy().astype(np.uint8) if self.invert: mask_np = 255 - mask_np # Apply the mask only to the alpha channel where the original alpha is non-zero. This preserves the original # image's transparency - else the transparent regions would end up as opaque black. # Separate the image into R, G, B, and A channels image_np = pil_to_np(image) r, g, b, a = np.split(image_np, 4, axis=-1) # Apply the mask to the alpha channel new_alpha = np.where(a.squeeze() > 0, mask_np, a.squeeze()) # Stack the RGB channels with the modified alpha masked_image_np = np.dstack([r.squeeze(), g.squeeze(), b.squeeze(), new_alpha]) # Convert back to an image (RGBA) masked_image = Image.fromarray(masked_image_np.astype(np.uint8), "RGBA") image_dto = context.images.save(image=masked_image) return ImageOutput.build(image_dto) WHITE = ColorField(r=255, g=255, b=255, a=255) @invocation( "get_image_mask_bounding_box", title="Get Image Mask Bounding Box", tags=["mask"], category="mask", version="1.0.0", ) class GetMaskBoundingBoxInvocation(BaseInvocation): """Gets the bounding box of the given mask image.""" mask: ImageField = InputField(description="The mask to crop.") margin: int = InputField(default=0, description="Margin to add to the bounding box.") mask_color: ColorField = InputField(default=WHITE, description="Color of the mask in the image.") def invoke(self, context: InvocationContext) -> BoundingBoxOutput: mask = context.images.get_pil(self.mask.image_name, mode="RGBA") mask_np = np.array(mask) # Convert mask_color to RGBA tuple mask_color_rgb = self.mask_color.tuple() # Find the bounding box of the mask color y, x = np.where(np.all(mask_np == mask_color_rgb, axis=-1)) if len(x) == 0 or len(y) == 0: # No pixels found with the given color return BoundingBoxOutput(bounding_box=BoundingBoxField(x_min=0, y_min=0, x_max=0, y_max=0)) left, upper, right, lower = x.min(), y.min(), x.max(), y.max() # Add the margin left = max(0, left - self.margin) upper = max(0, upper - self.margin) right = min(mask_np.shape[1], right + self.margin) lower = min(mask_np.shape[0], lower + self.margin) bounding_box = BoundingBoxField(x_min=left, y_min=upper, x_max=right, y_max=lower) return BoundingBoxOutput(bounding_box=bounding_box)