From d742479810f692a0f0dd4015b5efc01abfa3f670 Mon Sep 17 00:00:00 2001
From: Ryan Dick
Date: Fri, 17 Nov 2023 18:36:28 -0500
Subject: [PATCH] Add nodes for tile splitting and merging. The main motivation for these nodes is for use in tiled upscaling workflows.

---
 invokeai/app/invocations/tiles.py  | 169 +++++++++++++++++++++++++++++
 invokeai/backend/tiles/__init__.py |   0
 invokeai/backend/tiles/tiles.py    | 157 +++++++++++++++++++++++++++
 invokeai/backend/tiles/utils.py    |  40 +++++++
 4 files changed, 366 insertions(+)
 create mode 100644 invokeai/app/invocations/tiles.py
 create mode 100644 invokeai/backend/tiles/__init__.py
 create mode 100644 invokeai/backend/tiles/tiles.py
 create mode 100644 invokeai/backend/tiles/utils.py

diff --git a/invokeai/app/invocations/tiles.py b/invokeai/app/invocations/tiles.py
new file mode 100644
index 0000000000..acc87a7864
--- /dev/null
+++ b/invokeai/app/invocations/tiles.py
@@ -0,0 +1,169 @@
+import numpy as np
+from PIL import Image
+from pydantic import BaseModel
+
+from invokeai.app.invocations.baseinvocation import (
+    BaseInvocation,
+    BaseInvocationOutput,
+    InputField,
+    InvocationContext,
+    OutputField,
+    WithMetadata,
+    WithWorkflow,
+    invocation,
+    invocation_output,
+)
+from invokeai.app.invocations.primitives import ImageField, ImageOutput
+from invokeai.app.services.image_records.image_records_common import ImageCategory, ResourceOrigin
+from invokeai.backend.tiles.tiles import calc_tiles_with_overlap, merge_tiles_with_linear_blending
+from invokeai.backend.tiles.utils import Tile
+
+# TODO(ryand): Is this important?
+_DIMENSION_MULTIPLE_OF = 8
+
+
+class TileWithImage(BaseModel):
+    """A tile paired with the image that holds the tile's content."""
+
+    tile: Tile
+    image: ImageField
+
+
+@invocation_output("calc_tiles_output")
+class CalcTilesOutput(BaseInvocationOutput):
+    # TODO(ryand): Add description from FieldDescriptions.
+    tiles: list[Tile] = OutputField(description="")
+
+
+@invocation("calculate_tiles", title="Calculate Tiles", tags=["tiles"], category="tiles", version="1.0.0")
+class CalcTiles(BaseInvocation):
+    """Calculate the coordinates and overlaps of the tiles that cover an image of the given shape."""
+
+    # Inputs
+    image_height: int = InputField(ge=1)
+    image_width: int = InputField(ge=1)
+    tile_height: int = InputField(ge=1, multiple_of=_DIMENSION_MULTIPLE_OF, default=576)
+    tile_width: int = InputField(ge=1, multiple_of=_DIMENSION_MULTIPLE_OF, default=576)
+    overlap: int = InputField(ge=0, multiple_of=_DIMENSION_MULTIPLE_OF, default=64)
+
+    def invoke(self, context: InvocationContext) -> CalcTilesOutput:
+        tiles = calc_tiles_with_overlap(
+            image_height=self.image_height,
+            image_width=self.image_width,
+            tile_height=self.tile_height,
+            tile_width=self.tile_width,
+            overlap=self.overlap,
+        )
+        return CalcTilesOutput(tiles=tiles)
+
+
+@invocation_output("tile_to_properties_output")
+class TileToPropertiesOutput(BaseInvocationOutput):
+    # TODO(ryand): Add descriptions.
+    coords_top: int = OutputField(description="")
+    coords_bottom: int = OutputField(description="")
+    coords_left: int = OutputField(description="")
+    coords_right: int = OutputField(description="")
+
+    overlap_top: int = OutputField(description="")
+    overlap_bottom: int = OutputField(description="")
+    overlap_left: int = OutputField(description="")
+    overlap_right: int = OutputField(description="")
+
+
+@invocation("tile_to_properties", title="Tile to Properties", tags=["tiles"], category="tiles", version="1.0.0")
+class TileToProperties(BaseInvocation):
+    """Split a Tile into its individual properties."""
+
+    tile: Tile = InputField()
+
+    def invoke(self, context: InvocationContext) -> TileToPropertiesOutput:
+        return TileToPropertiesOutput(
+            coords_top=self.tile.coords.top,
+            coords_bottom=self.tile.coords.bottom,
+            coords_left=self.tile.coords.left,
+            coords_right=self.tile.coords.right,
+            overlap_top=self.tile.overlap.top,
+            overlap_bottom=self.tile.overlap.bottom,
+            overlap_left=self.tile.overlap.left,
+            overlap_right=self.tile.overlap.right,
+        )
+
+
+# HACK(ryand): The only reason that PairTileImage is needed is because the iterate/collect nodes don't preserve order.
+# Can this be fixed?
+
+
+@invocation_output("pair_tile_image_output")
+class PairTileImageOutput(BaseInvocationOutput):
+    tile_with_image: TileWithImage = OutputField(description="")
+
+
+@invocation("pair_tile_image", title="Pair Tile with Image", tags=["tiles"], category="tiles", version="1.0.0")
+class PairTileImage(BaseInvocation):
+    """Pair an image with the tile that describes its position in the parent image."""
+
+    image: ImageField = InputField()
+    tile: Tile = InputField()
+
+    def invoke(self, context: InvocationContext) -> PairTileImageOutput:
+        return PairTileImageOutput(
+            tile_with_image=TileWithImage(
+                tile=self.tile,
+                image=self.image,
+            )
+        )
+
+
+@invocation("merge_tiles_to_image", title="Merge Tiles To Image", tags=["tiles"], category="tiles", version="1.0.0")
+class MergeTilesToImage(BaseInvocation, WithMetadata, WithWorkflow):
+    """Merge multiple tile images into a single image, linearly blending the overlapping regions."""
+
+    # Inputs
+    image_height: int = InputField(ge=1)
+    image_width: int = InputField(ge=1)
+    tiles_with_images: list[TileWithImage] = InputField()
+    blend_amount: int = InputField(ge=0)
+
+    def invoke(self, context: InvocationContext) -> ImageOutput:
+        if not self.tiles_with_images:
+            raise ValueError("At least one tile is required to merge tiles into an image.")
+
+        images = [twi.image for twi in self.tiles_with_images]
+        tiles = [twi.tile for twi in self.tiles_with_images]
+
+        # Get all tile images for processing.
+        # TODO(ryand): It pains me that we spend time PNG decoding each tile from disk when they almost certainly
+        # existed in memory at an earlier point in the graph.
+        tile_np_images: list[np.ndarray] = []
+        for image in images:
+            pil_image = context.services.images.get_pil_image(image.image_name)
+            pil_image = pil_image.convert("RGB")
+            tile_np_images.append(np.array(pil_image))
+
+        # Prepare the output image buffer.
+        # Check the first tile to determine how many image channels are expected in the output.
+        channels = tile_np_images[0].shape[-1]
+        dtype = tile_np_images[0].dtype
+        np_image = np.zeros(shape=(self.image_height, self.image_width, channels), dtype=dtype)
+
+        merge_tiles_with_linear_blending(
+            dst_image=np_image, tiles=tiles, tile_images=tile_np_images, blend_amount=self.blend_amount
+        )
+        pil_image = Image.fromarray(np_image)
+
+        image_dto = context.services.images.create(
+            image=pil_image,
+            image_origin=ResourceOrigin.INTERNAL,
+            image_category=ImageCategory.GENERAL,
+            node_id=self.id,
+            session_id=context.graph_execution_state_id,
+            is_intermediate=self.is_intermediate,
+            metadata=self.metadata,
+            workflow=self.workflow,
+        )
+        return ImageOutput(
+            image=ImageField(image_name=image_dto.image_name),
+            width=image_dto.width,
+            height=image_dto.height,
+        )
diff --git a/invokeai/backend/tiles/__init__.py b/invokeai/backend/tiles/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/invokeai/backend/tiles/tiles.py b/invokeai/backend/tiles/tiles.py
new file mode 100644
index 0000000000..566381d1ff
--- /dev/null
+++ b/invokeai/backend/tiles/tiles.py
@@ -0,0 +1,157 @@
+import math
+
+import numpy as np
+
+from invokeai.backend.tiles.utils import TBLR, Tile, paste
+
+# TODO(ryand)
+# Test the following:
+# - Tile too big in x, y
+# - Overlap too big in x, y
+# - Single tile fits
+# - Multiple tiles fit perfectly
+# - Not evenly divisible by tile size(with overlap)
+
+
+def calc_tiles_with_overlap(
+    image_height: int, image_width: int, tile_height: int, tile_width: int, overlap: int = 0
+) -> list[Tile]:
+    """Calculate the tile coordinates for a given image shape under a simple tiling scheme with overlaps.
+
+    Args:
+        image_height (int): The image height in px.
+        image_width (int): The image width in px.
+        tile_height (int): The tile height in px. All tiles will have this height.
+        tile_width (int): The tile width in px. All tiles will have this width.
+        overlap (int, optional): The target overlap between adjacent tiles. If the tiles do not evenly cover the image
+            shape, then the last row/column of tiles will overlap more than this. Defaults to 0.
+
+    Returns:
+        list[Tile]: A list of tiles that cover the image shape. Ordered from left-to-right, top-to-bottom.
+
+    Raises:
+        AssertionError: If the tile is larger than the image, or if the overlap is not smaller than the tile size.
+    """
+    assert image_height >= tile_height, "tile_height must not exceed image_height."
+    assert image_width >= tile_width, "tile_width must not exceed image_width."
+    assert overlap < tile_height, "overlap must be smaller than tile_height."
+    assert overlap < tile_width, "overlap must be smaller than tile_width."
+
+    non_overlap_per_tile_height = tile_height - overlap
+    non_overlap_per_tile_width = tile_width - overlap
+
+    num_tiles_y = math.ceil((image_height - overlap) / non_overlap_per_tile_height)
+    num_tiles_x = math.ceil((image_width - overlap) / non_overlap_per_tile_width)
+
+    # Calculate tile coordinates and overlaps.
+    tiles: list[Tile] = []
+    for tile_idx_y in range(num_tiles_y):
+        for tile_idx_x in range(num_tiles_x):
+            tile = Tile(
+                coords=TBLR(
+                    top=tile_idx_y * non_overlap_per_tile_height,
+                    bottom=tile_idx_y * non_overlap_per_tile_height + tile_height,
+                    left=tile_idx_x * non_overlap_per_tile_width,
+                    right=tile_idx_x * non_overlap_per_tile_width + tile_width,
+                ),
+                overlap=TBLR(
+                    # The first/last rows and columns touch an image edge, so they have no neighbor on that side.
+                    top=0 if tile_idx_y == 0 else overlap,
+                    bottom=0 if tile_idx_y == num_tiles_y - 1 else overlap,
+                    left=0 if tile_idx_x == 0 else overlap,
+                    right=0 if tile_idx_x == num_tiles_x - 1 else overlap,
+                ),
+            )
+
+            if tile.coords.bottom > image_height:
+                # If this tile would go off the bottom of the image, shift it so that it is aligned with the bottom
+                # of the image.
+                tile.coords.bottom = image_height
+                tile.coords.top = image_height - tile_height
+                # Note that this could result in a large overlap between this tile and the one above it.
+                top_neighbor_bottom = (tile_idx_y - 1) * non_overlap_per_tile_height + tile_height
+                tile.overlap.top = top_neighbor_bottom - tile.coords.top
+
+            if tile.coords.right > image_width:
+                # If this tile would go off the right edge of the image, shift it so that it is aligned with the
+                # right edge of the image.
+                tile.coords.right = image_width
+                tile.coords.left = image_width - tile_width
+                # Note that this could result in a large overlap between this tile and the one to its left.
+                left_neighbor_right = (tile_idx_x - 1) * non_overlap_per_tile_width + tile_width
+                tile.overlap.left = left_neighbor_right - tile.coords.left
+
+            tiles.append(tile)
+
+    return tiles
+
+
+# TODO(ryand):
+# - Test with blend_amount=0
+# - Test tiles that go off of the dst_image.
+# - Test mismatched tiles and tile_images lengths.
+# - Test mismatched
+
+
+def merge_tiles_with_linear_blending(
+    dst_image: np.ndarray, tiles: list[Tile], tile_images: list[np.ndarray], blend_amount: int
+):
+    """Merge a set of image tiles into `dst_image` with linear blending between the tiles.
+
+    We expect every tile edge to either:
+    1) have an overlap of 0, because it is aligned with the image edge, or
+    2) have an overlap >= blend_amount.
+    If neither of these conditions are satisfied, we raise an exception.
+
+    The linear blending is centered at the halfway point of the overlap between adjacent tiles.
+
+    Args:
+        dst_image (np.ndarray): The destination image, which is modified in-place. Shape: (H, W, C).
+        tiles (list[Tile]): The list of tiles describing the locations of the respective `tile_images`.
+        tile_images (list[np.ndarray]): The tile images to merge into `dst_image`.
+        blend_amount (int): The amount of blending (in px) between adjacent overlapping tiles.
+    """
+    # Sort tiles and images first by left x coordinate, then by top y coordinate. During tile processing, we want to
+    # iterate over tiles left-to-right, top-to-bottom.
+    tiles_and_images = list(zip(tiles, tile_images, strict=True))
+    tiles_and_images = sorted(tiles_and_images, key=lambda x: x[0].coords.left)
+    tiles_and_images = sorted(tiles_and_images, key=lambda x: x[0].coords.top)
+
+    # Prepare 1D linear gradients for blending.
+    gradient_left_x = np.linspace(start=0.0, stop=1.0, num=blend_amount)
+    gradient_top_y = np.linspace(start=0.0, stop=1.0, num=blend_amount)
+    # Convert shape: (blend_amount, ) -> (blend_amount, 1). The extra dimension enables the gradient to be applied
+    # to a 2D image via broadcasting. Note that no additional dimension is needed on gradient_left_x for
+    # broadcasting to work correctly.
+    gradient_top_y = np.expand_dims(gradient_top_y, axis=1)
+
+    for tile, tile_image in tiles_and_images:
+        # We expect tiles to be written left-to-right, top-to-bottom. We construct a mask that applies linear blending
+        # to the top and to the left of the current tile. The inverse linear blending is automatically applied to the
+        # bottom/right of the tiles that have already been pasted by the paste(...) operation.
+        tile_height, tile_width, _ = tile_image.shape
+        mask = np.ones(shape=(tile_height, tile_width), dtype=np.float64)
+        # Top blending:
+        if tile.overlap.top > 0:
+            assert tile.overlap.top >= blend_amount, "A non-zero top overlap must be >= blend_amount."
+            # Center the blending gradient in the middle of the overlap.
+            blend_start_top = tile.overlap.top // 2 - blend_amount // 2
+            # The region above the blending region is masked completely.
+            mask[:blend_start_top, :] = 0.0
+            # Apply the blend gradient to the mask. Note that we use `*=` rather than `=` to achieve more natural
+            # behavior on the corners where vertical and horizontal blending gradients overlap.
+            mask[blend_start_top : blend_start_top + blend_amount, :] *= gradient_top_y
+            # HACK(ryand): For debugging
+            # tile_image[blend_start_top : blend_start_top + blend_amount, :] = 0
+
+        # Left blending:
+        # (See comments under 'top blending' for an explanation of the logic.)
+        if tile.overlap.left > 0:
+            assert tile.overlap.left >= blend_amount, "A non-zero left overlap must be >= blend_amount."
+            blend_start_left = tile.overlap.left // 2 - blend_amount // 2
+            mask[:, :blend_start_left] = 0.0
+            mask[:, blend_start_left : blend_start_left + blend_amount] *= gradient_left_x
+            # HACK(ryand): For debugging
+            # tile_image[:, blend_start_left : blend_start_left + blend_amount] = 0
+
+        paste(dst_image=dst_image, src_image=tile_image, box=tile.coords, mask=mask)
diff --git a/invokeai/backend/tiles/utils.py b/invokeai/backend/tiles/utils.py
new file mode 100644
index 0000000000..cf8e926aa5
--- /dev/null
+++ b/invokeai/backend/tiles/utils.py
@@ -0,0 +1,40 @@
+from typing import Optional
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+
+class TBLR(BaseModel):
+    """A set of top/bottom/left/right integer values, used for both tile coordinates and tile overlaps."""
+
+    top: int
+    bottom: int
+    left: int
+    right: int
+
+
+class Tile(BaseModel):
+    """A rectangular region of a parent image, along with its overlap with adjacent tiles."""
+
+    coords: TBLR = Field(description="The coordinates of this tile relative to its parent image.")
+    overlap: TBLR = Field(description="The amount of overlap with adjacent tiles on each side of this tile.")
+
+
+def paste(dst_image: np.ndarray, src_image: np.ndarray, box: TBLR, mask: Optional[np.ndarray] = None):
+    """Paste a source image into a destination image.
+
+    Args:
+        dst_image (np.ndarray): The destination image to paste into. Modified in-place. Shape: (H, W, C).
+        src_image (np.ndarray): The source image to paste. Shape: (H, W, C). H and W must be compatible with 'box'.
+        box (TBLR): Box defining the region in the 'dst_image' where 'src_image' will be pasted.
+        mask (Optional[np.ndarray]): A mask that defines the blending between 'src_image' and 'dst_image'.
+            Range: [0.0, 1.0], Shape: (H, W). The output is calculated per-pixel according to
+            `src * mask + dst * (1 - mask)`.
+    """
+
+    if mask is None:
+        dst_image[box.top : box.bottom, box.left : box.right] = src_image
+    else:
+        mask = np.expand_dims(mask, -1)
+        dst_image_box = dst_image[box.top : box.bottom, box.left : box.right]
+        dst_image[box.top : box.bottom, box.left : box.right] = src_image * mask + dst_image_box * (1.0 - mask)