Add nodes for tile splitting and merging. The main motivation for these nodes is for use in tiled upscaling workflows.

2024-08-30 20:32:17 +00:00 · 2023-11-17 18:36:28 -05:00 · 2023-11-17 18:36:28 -05:00 · d742479810
commit d742479810
parent 77933a0a85
4 changed files with 353 additions and 0 deletions
--- a/invokeai/app/invocations/tiles.py
+++ b/invokeai/app/invocations/tiles.py
@ -0,0 +1,162 @@
+import numpy as np
+from PIL import Image
+from pydantic import BaseModel
+
+from invokeai.app.invocations.baseinvocation import (
+    BaseInvocation,
+    BaseInvocationOutput,
+    InputField,
+    InvocationContext,
+    OutputField,
+    WithMetadata,
+    WithWorkflow,
+    invocation,
+    invocation_output,
+)
+from invokeai.app.invocations.primitives import ImageField, ImageOutput
+from invokeai.app.services.image_records.image_records_common import ImageCategory, ResourceOrigin
+from invokeai.backend.tiles.tiles import calc_tiles, merge_tiles_with_linear_blending
+from invokeai.backend.tiles.utils import Tile
+
+# Granularity (in px) applied to the tile height, tile width and overlap inputs of the CalcTiles node through the
+# `multiple_of` field constraint.
+# TODO(ryand): Is this important?
+_DIMENSION_MULTIPLE_OF = 8
+
+
+class TileWithImage(BaseModel):
+    """A tile description bundled with the image that belongs to that tile."""
+
+    # Location/overlap description of the tile.
+    tile: Tile
+    # Reference to the image associated with `tile`.
+    image: ImageField
+
+
+@invocation_output("calc_tiles_output")
+class CalcTilesOutput(BaseInvocationOutput):
+    """Output of the tile calculation node: the list of calculated tiles."""
+
+    # TODO(ryand): Move this description into FieldDescriptions.
+    tiles: list[Tile] = OutputField(description="The tile coordinates that cover a particular image shape.")
+
+
+@invocation("calculate_tiles", title="Calculate Tiles", tags=["tiles"], category="tiles", version="1.0.0")
+class CalcTiles(BaseInvocation):
+    """Calculate the coordinates and overlaps of tiles that cover a target image shape."""
+
+    # Inputs
+    image_height: int = InputField(ge=1, description="The image height, in pixels, to calculate tiles for.")
+    image_width: int = InputField(ge=1, description="The image width, in pixels, to calculate tiles for.")
+    tile_height: int = InputField(
+        ge=1, multiple_of=_DIMENSION_MULTIPLE_OF, default=576, description="The tile height, in pixels."
+    )
+    tile_width: int = InputField(
+        ge=1, multiple_of=_DIMENSION_MULTIPLE_OF, default=576, description="The tile width, in pixels."
+    )
+    overlap: int = InputField(
+        ge=0,
+        multiple_of=_DIMENSION_MULTIPLE_OF,
+        default=64,
+        description="The target overlap, in pixels, between adjacent tiles.",
+    )
+
+    def invoke(self, context: InvocationContext) -> CalcTilesOutput:
+        # This node is a thin wrapper: all of the tiling logic lives in the backend helper.
+        tiles = calc_tiles(
+            image_height=self.image_height,
+            image_width=self.image_width,
+            tile_height=self.tile_height,
+            tile_width=self.tile_width,
+            overlap=self.overlap,
+        )
+        return CalcTilesOutput(tiles=tiles)
+
+
+@invocation_output("tile_to_properties_output")
+class TileToPropertiesOutput(BaseInvocationOutput):
+    """The individual coordinate and overlap values of a single Tile."""
+
+    coords_top: int = OutputField(description="Top coordinate of the tile relative to its parent image.")
+    coords_bottom: int = OutputField(description="Bottom coordinate of the tile relative to its parent image.")
+    coords_left: int = OutputField(description="Left coordinate of the tile relative to its parent image.")
+    coords_right: int = OutputField(description="Right coordinate of the tile relative to its parent image.")
+
+    overlap_top: int = OutputField(description="Overlap between this tile and its top neighbor.")
+    overlap_bottom: int = OutputField(description="Overlap between this tile and its bottom neighbor.")
+    overlap_left: int = OutputField(description="Overlap between this tile and its left neighbor.")
+    overlap_right: int = OutputField(description="Overlap between this tile and its right neighbor.")
+
+
+# The title/tags/category/version metadata mirrors the other tile nodes in this module.
+@invocation("tile_to_properties", title="Tile to Properties", tags=["tiles"], category="tiles", version="1.0.0")
+class TileToProperties(BaseInvocation):
+    """Split a Tile into its individual properties."""
+
+    tile: Tile = InputField(description="The tile to split into properties.")
+
+    def invoke(self, context: InvocationContext) -> TileToPropertiesOutput:
+        # Flatten the nested coords/overlap boxes into one scalar output per value.
+        return TileToPropertiesOutput(
+            coords_top=self.tile.coords.top,
+            coords_bottom=self.tile.coords.bottom,
+            coords_left=self.tile.coords.left,
+            coords_right=self.tile.coords.right,
+            overlap_top=self.tile.overlap.top,
+            overlap_bottom=self.tile.overlap.bottom,
+            overlap_left=self.tile.overlap.left,
+            overlap_right=self.tile.overlap.right,
+        )
+
+
+# HACK(ryand): The only reason that PairTileImage is needed is because the iterate/collect nodes don't preserve order.
+# Can this be fixed?
+
+
+@invocation_output("pair_tile_image_output")
+class PairTileImageOutput(BaseInvocationOutput):
+    """Output of the tile/image pairing node."""
+
+    tile_with_image: TileWithImage = OutputField(description="A tile description with its corresponding image.")
+
+
+@invocation("pair_tile_image", title="Pair Tile with Image", tags=["tiles"], category="tiles", version="1.0.0")
+class PairTileImage(BaseInvocation):
+    """Bundle an image together with the tile it belongs to."""
+
+    image: ImageField = InputField()
+    tile: Tile = InputField()
+
+    def invoke(self, context: InvocationContext) -> PairTileImageOutput:
+        # Wrap both inputs in a single object so the association travels through the graph as one value.
+        paired = TileWithImage(tile=self.tile, image=self.image)
+        return PairTileImageOutput(tile_with_image=paired)
+
+
+@invocation("merge_tiles_to_image", title="Merge Tiles To Image", tags=["tiles"], category="tiles", version="1.0.0")
+class MergeTilesToImage(BaseInvocation, WithMetadata, WithWorkflow):
+    """Merge multiple tile images into a single image, linearly blending the overlapping regions."""
+
+    # Inputs
+    image_height: int = InputField(ge=1, description="The height of the merged output image, in pixels.")
+    image_width: int = InputField(ge=1, description="The width of the merged output image, in pixels.")
+    tiles_with_images: list[TileWithImage] = InputField(description="A list of tile images with tile properties.")
+    blend_amount: int = InputField(
+        ge=0,
+        description="The amount to blend adjacent tiles, in pixels. Must be <= the overlap between adjacent tiles.",
+    )
+
+    def invoke(self, context: InvocationContext) -> ImageOutput:
+        """Load every tile image, blend the tiles into one image buffer and save the result.
+
+        Raises:
+            ValueError: If `tiles_with_images` is empty.
+        """
+        if not self.tiles_with_images:
+            # The channel count and dtype of the output buffer are read from the first tile, so at least one tile
+            # is required. Fail with a clear message instead of an IndexError further down.
+            raise ValueError("tiles_with_images must contain at least one tile.")
+
+        tiles = [twi.tile for twi in self.tiles_with_images]
+
+        # Get all tile images for processing.
+        # TODO(ryand): It pains me that we spend time PNG decoding each tile from disk when they almost certainly
+        # existed in memory at an earlier point in the graph.
+        tile_np_images: list[np.ndarray] = []
+        for twi in self.tiles_with_images:
+            pil_image = context.services.images.get_pil_image(twi.image.image_name)
+            # Every tile is converted to RGB before merging.
+            tile_np_images.append(np.array(pil_image.convert("RGB")))
+
+        # Prepare the output image buffer.
+        # Check the first tile to determine how many image channels are expected in the output.
+        channels = tile_np_images[0].shape[-1]
+        dtype = tile_np_images[0].dtype
+        np_image = np.zeros(shape=(self.image_height, self.image_width, channels), dtype=dtype)
+
+        # The merge writes into `np_image`; its return value is not used.
+        merge_tiles_with_linear_blending(
+            dst_image=np_image, tiles=tiles, tile_images=tile_np_images, blend_amount=self.blend_amount
+        )
+        pil_image = Image.fromarray(np_image)
+
+        image_dto = context.services.images.create(
+            image=pil_image,
+            image_origin=ResourceOrigin.INTERNAL,
+            image_category=ImageCategory.GENERAL,
+            node_id=self.id,
+            session_id=context.graph_execution_state_id,
+            is_intermediate=self.is_intermediate,
+            metadata=self.metadata,
+            workflow=self.workflow,
+        )
+        return ImageOutput(
+            image=ImageField(image_name=image_dto.image_name),
+            width=image_dto.width,
+            height=image_dto.height,
+        )
--- a/invokeai/backend/tiles/init.py
+++ b/invokeai/backend/tiles/init.py
--- a/invokeai/backend/tiles/tiles.py
+++ b/invokeai/backend/tiles/tiles.py
@ -0,0 +1,155 @@
+import math
+
+import numpy as np
+
+from invokeai.backend.tiles.utils import TBLR, Tile, paste
+
+# TODO(ryand)
+# Test the following:
+# - Tile too big in x, y
+# - Overlap too big in x, y
+# - Single tile fits
+# - Multiple tiles fit perfectly
+# - Not evenly divisible by tile size(with overlap)
+
+
+def _calc_axis_tiles(image_len: int, tile_len: int, overlap: int) -> list[tuple[int, int, int]]:
+    """Lay out equally sized tiles along a single image axis.
+
+    Args:
+        image_len (int): The image size along this axis in px.
+        tile_len (int): The tile size along this axis in px.
+        overlap (int): The target overlap between adjacent tiles in px.
+
+    Returns:
+        list[tuple[int, int, int]]: One `(start, overlap_before, overlap_after)` entry per tile, ordered by `start`.
+    """
+    stride = tile_len - overlap
+    num_tiles = math.ceil((image_len - overlap) / stride)
+
+    # A tile that would run past the end of the image is shifted back so that it ends exactly at the image edge.
+    # Only the last tile along the axis can be affected by this.
+    starts = [min(idx * stride, image_len - tile_len) for idx in range(num_tiles)]
+
+    axis_tiles: list[tuple[int, int, int]] = []
+    for idx, start in enumerate(starts):
+        # Overlaps are measured against the actual neighbors, so both sides of a shared edge always agree and tiles
+        # touching the image border report 0.
+        overlap_before = starts[idx - 1] + tile_len - start if idx > 0 else 0
+        overlap_after = start + tile_len - starts[idx + 1] if idx < num_tiles - 1 else 0
+        axis_tiles.append((start, overlap_before, overlap_after))
+    return axis_tiles
+
+
+def calc_tiles_with_overlap(
+    image_height: int, image_width: int, tile_height: int, tile_width: int, overlap: int = 0
+) -> list[Tile]:
+    """Calculate the tile coordinates for a given image shape under a simple tiling scheme with overlaps.
+
+    Args:
+        image_height (int): The image height in px.
+        image_width (int): The image width in px.
+        tile_height (int): The tile height in px. All tiles will have this height.
+        tile_width (int): The tile width in px. All tiles will have this width.
+        overlap (int, optional): The target overlap between adjacent tiles. If the tiles do not evenly cover the image
+            shape, then the last row/column of tiles will overlap more than this. Defaults to 0.
+
+    Returns:
+        list[Tile]: A list of tiles that cover the image shape. Ordered from left-to-right, top-to-bottom. Each tile's
+            overlap values are the actual overlaps with its neighbors, and are 0 on sides that touch the image edge.
+
+    Raises:
+        ValueError: If a tile is larger than the image, or if `overlap` is negative or not smaller than both tile
+            dimensions.
+    """
+    # Explicit exceptions are used instead of `assert` so that the checks are not stripped when running with -O.
+    if tile_height > image_height or tile_width > image_width:
+        raise ValueError(
+            f"Tile size ({tile_height}x{tile_width}) must not exceed the image size ({image_height}x{image_width})."
+        )
+    if not 0 <= overlap < min(tile_height, tile_width):
+        raise ValueError(
+            f"Overlap ({overlap}) must be >= 0 and smaller than the tile size ({tile_height}x{tile_width})."
+        )
+
+    # The two axes are independent, so each one is laid out separately and the results are combined.
+    rows = _calc_axis_tiles(image_len=image_height, tile_len=tile_height, overlap=overlap)
+    cols = _calc_axis_tiles(image_len=image_width, tile_len=tile_width, overlap=overlap)
+
+    # Rows form the outer loop so that the tiles are emitted left-to-right, top-to-bottom.
+    return [
+        Tile(
+            coords=TBLR(top=top, bottom=top + tile_height, left=left, right=left + tile_width),
+            overlap=TBLR(top=overlap_top, bottom=overlap_bottom, left=overlap_left, right=overlap_right),
+        )
+        for top, overlap_top, overlap_bottom in rows
+        for left, overlap_left, overlap_right in cols
+    ]
+
+
+# Alias for callers that import this function under its shorter name: invokeai/app/invocations/tiles.py imports
+# `calc_tiles` from this module.
+calc_tiles = calc_tiles_with_overlap
+
+
+# TODO(ryand):
+# - Test with blend_amount=0
+# - Test tiles that go off of the dst_image.
+# - Test mismatched tiles and tile_images lengths.
+# - Test mismatched
+
+
+def merge_tiles_with_linear_blending(
+    dst_image: np.ndarray, tiles: list[Tile], tile_images: list[np.ndarray], blend_amount: int
+):
+    """Merge a set of image tiles into `dst_image` with linear blending between the tiles.
+
+    Every tile edge is expected to either:
+    1) have an overlap of 0, because it is aligned with the image edge, or
+    2) have an overlap >= blend_amount.
+    A top or left overlap that is non-zero but smaller than `blend_amount` trips an assertion.
+
+    The linear blending is centered at the halfway point of the overlap between adjacent tiles.
+
+    Args:
+        dst_image (np.ndarray): The destination image, written to via `paste(...)`. Shape: (H, W, C).
+        tiles (list[Tile]): The list of tiles describing the locations of the respective `tile_images`.
+        tile_images (list[np.ndarray]): The tile images to merge into `dst_image`. Must have the same length as
+            `tiles`.
+        blend_amount (int): The amount of blending (in px) between adjacent overlapping tiles.
+    """
+    # Process tiles top-to-bottom and, within a row, left-to-right. `sorted` is stable, so a single sort on the
+    # (top, left) pair yields the same order as sorting by left and then by top.
+    ordered_pairs = sorted(
+        zip(tiles, tile_images, strict=True),
+        key=lambda pair: (pair[0].coords.top, pair[0].coords.left),
+    )
+
+    # One 0 -> 1 ramp serves both directions. The row form, shape (blend_amount,), broadcasts across mask columns;
+    # the column form, shape (blend_amount, 1), broadcasts across mask rows.
+    ramp_row = np.linspace(0.0, 1.0, blend_amount)
+    ramp_col = np.linspace(0.0, 1.0, blend_amount)[:, np.newaxis]
+
+    for tile, tile_image in ordered_pairs:
+        # Tiles are written left-to-right, top-to-bottom, so only the top and left edges of the current tile need a
+        # blend mask. The inverse blend on the bottom/right of the tiles that were pasted earlier falls out of the
+        # paste(...) operation itself.
+        rows, cols, _ = tile_image.shape
+        mask = np.ones((rows, cols), dtype=np.float64)
+
+        overlap_top = tile.overlap.top
+        if overlap_top > 0:
+            assert overlap_top >= blend_amount
+            # Center the ramp in the middle of the overlap region.
+            ramp_top = overlap_top // 2 - blend_amount // 2
+            # Everything above the ramp is fully masked out.
+            mask[:ramp_top, :] = 0.0
+            # Multiply (rather than assign) so that corners where the vertical and horizontal ramps intersect blend
+            # naturally.
+            mask[ramp_top : ramp_top + blend_amount, :] *= ramp_col
+
+        overlap_left = tile.overlap.left
+        if overlap_left > 0:
+            # Same logic as the top edge, applied along the horizontal axis.
+            assert overlap_left >= blend_amount
+            ramp_left = overlap_left // 2 - blend_amount // 2
+            mask[:, :ramp_left] = 0.0
+            mask[:, ramp_left : ramp_left + blend_amount] *= ramp_row
+
+        paste(dst_image=dst_image, src_image=tile_image, box=tile.coords, mask=mask)
--- a/invokeai/backend/tiles/utils.py
+++ b/invokeai/backend/tiles/utils.py
@ -0,0 +1,36 @@
+from typing import Optional
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+
+class TBLR(BaseModel):
+    """Four integer values keyed by side: top, bottom, left and right.
+
+    Used by `Tile` both for box coordinates and for per-side overlap amounts.
+    """
+
+    top: int
+    bottom: int
+    left: int
+    right: int
+
+
+class Tile(BaseModel):
+    """A tile of a parent image: where it is located, and how much it overlaps its neighbors on each side."""
+
+    coords: TBLR = Field(description="The coordinates of this tile relative to its parent image.")
+    overlap: TBLR = Field(description="The amount of overlap with adjacent tiles on each side of this tile.")
+
+
+def paste(dst_image: np.ndarray, src_image: np.ndarray, box: TBLR, mask: Optional[np.ndarray] = None):
+    """Paste a source image into a region of a destination image, optionally blending through a mask.
+
+    `dst_image` is modified in place and nothing is returned.
+
+    Args:
+        dst_image (np.ndarray): The destination image to paste into. Shape: (H, W, C).
+        src_image (np.ndarray): The source image to paste. Shape: (H, W, C). H and W must be compatible with 'box'.
+        box (TBLR): Box defining the region in the 'dst_image' where 'src_image' will be pasted.
+        mask (Optional[np.ndarray]): A mask that defines the blending between 'src_image' and 'dst_image'.
+            Range: [0.0, 1.0], Shape: (H, W). The output is calculated per-pixel according to
+            `src * mask + dst * (1 - mask)`.
+    """
+    row_span = slice(box.top, box.bottom)
+    col_span = slice(box.left, box.right)
+
+    if mask is None:
+        # No blending requested: overwrite the target region outright.
+        dst_image[row_span, col_span] = src_image
+        return
+
+    # Add a trailing axis so that the (H, W) mask broadcasts across the channel dimension.
+    alpha = np.expand_dims(mask, -1)
+    dst_region = dst_image[row_span, col_span]
+    dst_image[row_span, col_span] = src_image * alpha + dst_region * (1.0 - alpha)