InvokeAI/invokeai/backend/generator/txt2img2img.py

"""
invokeai.backend.generator.txt2img inherits from invokeai.backend.generator
"""

import math
from typing import Callable, Optional

import torch
from diffusers.utils.logging import get_verbosity, set_verbosity, set_verbosity_error

from ..models import PostprocessingSettings
from .base import Generator
from .diffusers_pipeline import (
    ConditioningData,
    StableDiffusionGeneratorPipeline,
    trim_to_multiple_of,
)


class Txt2Img2Img(Generator):
    def __init__(self, model, precision):
        super().__init__(model, precision)
        self.init_latent = None  # for get_noise()

    def get_make_image(
        self,
        prompt: str,
        sampler,
        steps: int,
        cfg_scale: float,
        ddim_eta,
        conditioning,
        width: int,
        height: int,
        strength: float,
        step_callback: Optional[Callable] = None,
        threshold=0.0,
        warmup=0.2,
        perlin=0.0,
        h_symmetry_time_pct=None,
        v_symmetry_time_pct=None,
        attention_maps_callback=None,
        **kwargs,
    ):
        """
        Returns a function returning an image derived from the prompt and the initial image
        Return value depends on the seed at the time you call it
        kwargs are 'width' and 'height'
        """
        self.perlin = perlin

        # noinspection PyTypeChecker
        pipeline: StableDiffusionGeneratorPipeline = self.model
        pipeline.scheduler = sampler

        uc, c, extra_conditioning_info = conditioning
        conditioning_data = ConditioningData(
            uc,
            c,
            cfg_scale,
            extra_conditioning_info,
            postprocessing_settings=PostprocessingSettings(
                threshold=threshold,
                warmup=0.2,
                h_symmetry_time_pct=h_symmetry_time_pct,
                v_symmetry_time_pct=v_symmetry_time_pct,
            ),
        ).add_scheduler_args_if_applicable(pipeline.scheduler, eta=ddim_eta)

        def make_image(x_T):
            first_pass_latent_output, _ = pipeline.latents_from_embeddings(
                latents=torch.zeros_like(x_T),
                num_inference_steps=steps,
                conditioning_data=conditioning_data,
                noise=x_T,
                callback=step_callback,
            )

            # Get our initial generation width and height directly from the latent output so
            # the message below is accurate.
            init_width = first_pass_latent_output.size()[3] * self.downsampling_factor
            init_height = first_pass_latent_output.size()[2] * self.downsampling_factor
            print(
                f"\n>> Interpolating from {init_width}x{init_height} to {width}x{height} using DDIM sampling"
            )

            # resizing
            resized_latents = torch.nn.functional.interpolate(
                first_pass_latent_output,
                size=(
                    height // self.downsampling_factor,
                    width // self.downsampling_factor,
                ),
                mode="bilinear",
            )

            # Free up memory from the last generation.
            clear_cuda_cache = kwargs["clear_cuda_cache"] or None
            if clear_cuda_cache is not None:
                clear_cuda_cache()

            second_pass_noise = self.get_noise_like(
                resized_latents, override_perlin=True
            )

            # Clear symmetry for the second pass
            from dataclasses import replace

            new_postprocessing_settings = replace(
                conditioning_data.postprocessing_settings, h_symmetry_time_pct=None
            )
            new_postprocessing_settings = replace(
                new_postprocessing_settings, v_symmetry_time_pct=None
            )
            new_conditioning_data = replace(
                conditioning_data, postprocessing_settings=new_postprocessing_settings
            )

            verbosity = get_verbosity()
            set_verbosity_error()
            pipeline_output = pipeline.img2img_from_latents_and_embeddings(
                resized_latents,
                num_inference_steps=steps,
                conditioning_data=new_conditioning_data,
                strength=strength,
                noise=second_pass_noise,
                callback=step_callback,
            )
            set_verbosity(verbosity)

            if (
                pipeline_output.attention_map_saver is not None
                and attention_maps_callback is not None
            ):
                attention_maps_callback(pipeline_output.attention_map_saver)

            return pipeline.numpy_to_pil(pipeline_output.images)[0]

        # FIXME: do we really need something entirely different for the inpainting model?

        # in the case of the inpainting model being loaded, the trick of
        # providing an interpolated latent doesn't work, so we transiently
        # create a 512x512 PIL image, upscale it, and run the inpainting
        # over it in img2img mode. Because the inpaing model is so conservative
        # it doesn't change the image (much)

        return make_image

    def get_noise_like(self, like: torch.Tensor, override_perlin: bool = False):
        device = like.device
        if device.type == "mps":
            x = torch.randn_like(like, device="cpu", dtype=self.torch_dtype()).to(
                device
            )
        else:
            x = torch.randn_like(like, device=device, dtype=self.torch_dtype())
        if self.perlin > 0.0 and override_perlin == False:
            shape = like.shape
            x = (1 - self.perlin) * x + self.perlin * self.get_perlin_noise(
                shape[3], shape[2]
            )
        return x

    # returns a tensor filled with random numbers from a normal distribution
    def get_noise(self, width, height, scale=True):
        # print(f"Get noise: {width}x{height}")
        if scale:
            # Scale the input width and height for the initial generation
            # Make their area equivalent to the model's resolution area (e.g. 512*512 = 262144),
            # while keeping the minimum dimension at least 0.5 * resolution (e.g. 512*0.5 = 256)

            aspect = width / height
            dimension = self.model.unet.config.sample_size * self.model.vae_scale_factor
            min_dimension = math.floor(dimension * 0.5)
            model_area = (
                dimension * dimension
            )  # hardcoded for now since all models are trained on square images

            if aspect > 1.0:
                init_height = max(min_dimension, math.sqrt(model_area / aspect))
                init_width = init_height * aspect
            else:
                init_width = max(min_dimension, math.sqrt(model_area * aspect))
                init_height = init_width / aspect

            scaled_width, scaled_height = trim_to_multiple_of(
                math.floor(init_width), math.floor(init_height)
            )

        else:
            scaled_width = width
            scaled_height = height

        device = self.model.device
        channels = self.latent_channels
        if channels == 9:
            channels = 4  # we don't really want noise for all the mask channels
        shape = (
            1,
            channels,
            scaled_height // self.downsampling_factor,
            scaled_width // self.downsampling_factor,
        )
        if self.use_mps_noise or device.type == "mps":
            tensor = torch.empty(size=shape, device="cpu")
            tensor = self.get_noise_like(like=tensor).to(device)
        else:
            tensor = torch.empty(size=shape, device=device)
            tensor = self.get_noise_like(like=tensor)
        return tensor
all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`"""`
add more missing files 2023-02-28 05:37:13 +00:00			`invokeai.backend.generator.txt2img inherits from invokeai.backend.generator`
all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`"""`
add more missing files 2023-02-28 05:37:13 +00:00
			`import math`
			`from typing import Callable, Optional`

			`import torch`
			`from diffusers.utils.logging import get_verbosity, set_verbosity, set_verbosity_error`

			`from ..models import PostprocessingSettings`
all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`from .base import Generator`
			`from .diffusers_pipeline import (`
			`ConditioningData,`
			`StableDiffusionGeneratorPipeline,`
			`trim_to_multiple_of,`
			`)`
add more missing files 2023-02-28 05:37:13 +00:00

			`class Txt2Img2Img(Generator):`
			`def __init__(self, model, precision):`
			`super().__init__(model, precision)`
all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`self.init_latent = None # for get_noise()`

			`def get_make_image(`
			`self,`
			`prompt: str,`
			`sampler,`
			`steps: int,`
			`cfg_scale: float,`
			`ddim_eta,`
			`conditioning,`
			`width: int,`
			`height: int,`
			`strength: float,`
			`step_callback: Optional[Callable] = None,`
			`threshold=0.0,`
			`warmup=0.2,`
			`perlin=0.0,`
			`h_symmetry_time_pct=None,`
			`v_symmetry_time_pct=None,`
			`attention_maps_callback=None,`
			`**kwargs,`
			`):`
add more missing files 2023-02-28 05:37:13 +00:00			`"""`
			`Returns a function returning an image derived from the prompt and the initial image`
			`Return value depends on the seed at the time you call it`
			`kwargs are 'width' and 'height'`
			`"""`
			`self.perlin = perlin`

			`# noinspection PyTypeChecker`
			`pipeline: StableDiffusionGeneratorPipeline = self.model`
			`pipeline.scheduler = sampler`

			`uc, c, extra_conditioning_info = conditioning`
all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`conditioning_data = ConditioningData(`
			`uc,`
			`c,`
			`cfg_scale,`
			`extra_conditioning_info,`
			`postprocessing_settings=PostprocessingSettings(`
			`threshold=threshold,`
			`warmup=0.2,`
			`h_symmetry_time_pct=h_symmetry_time_pct,`
			`v_symmetry_time_pct=v_symmetry_time_pct,`
			`),`
			`).add_scheduler_args_if_applicable(pipeline.scheduler, eta=ddim_eta)`
add more missing files 2023-02-28 05:37:13 +00:00
			`def make_image(x_T):`
			`first_pass_latent_output, _ = pipeline.latents_from_embeddings(`
			`latents=torch.zeros_like(x_T),`
			`num_inference_steps=steps,`
			`conditioning_data=conditioning_data,`
			`noise=x_T,`
			`callback=step_callback,`
			`)`

			`# Get our initial generation width and height directly from the latent output so`
			`# the message below is accurate.`
			`init_width = first_pass_latent_output.size()[3] * self.downsampling_factor`
			`init_height = first_pass_latent_output.size()[2] * self.downsampling_factor`
			`print(`
all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`f"\n>> Interpolating from {init_width}x{init_height} to {width}x{height} using DDIM sampling"`
			`)`
add more missing files 2023-02-28 05:37:13 +00:00
			`# resizing`
			`resized_latents = torch.nn.functional.interpolate(`
			`first_pass_latent_output,`
all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`size=(`
			`height // self.downsampling_factor,`
			`width // self.downsampling_factor,`
			`),`
			`mode="bilinear",`
add more missing files 2023-02-28 05:37:13 +00:00			`)`

			`# Free up memory from the last generation.`
all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`clear_cuda_cache = kwargs["clear_cuda_cache"] or None`
add more missing files 2023-02-28 05:37:13 +00:00			`if clear_cuda_cache is not None:`
			`clear_cuda_cache()`

all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`second_pass_noise = self.get_noise_like(`
			`resized_latents, override_perlin=True`
			`)`
add more missing files 2023-02-28 05:37:13 +00:00
			`# Clear symmetry for the second pass`
			`from dataclasses import replace`
all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00
			`new_postprocessing_settings = replace(`
			`conditioning_data.postprocessing_settings, h_symmetry_time_pct=None`
			`)`
			`new_postprocessing_settings = replace(`
			`new_postprocessing_settings, v_symmetry_time_pct=None`
			`)`
			`new_conditioning_data = replace(`
			`conditioning_data, postprocessing_settings=new_postprocessing_settings`
			`)`
add more missing files 2023-02-28 05:37:13 +00:00
			`verbosity = get_verbosity()`
			`set_verbosity_error()`
			`pipeline_output = pipeline.img2img_from_latents_and_embeddings(`
			`resized_latents,`
			`num_inference_steps=steps,`
			`conditioning_data=new_conditioning_data,`
			`strength=strength,`
			`noise=second_pass_noise,`
all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`callback=step_callback,`
			`)`
add more missing files 2023-02-28 05:37:13 +00:00			`set_verbosity(verbosity)`

all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`if (`
			`pipeline_output.attention_map_saver is not None`
			`and attention_maps_callback is not None`
			`):`
add more missing files 2023-02-28 05:37:13 +00:00			`attention_maps_callback(pipeline_output.attention_map_saver)`

			`return pipeline.numpy_to_pil(pipeline_output.images)[0]`

			`# FIXME: do we really need something entirely different for the inpainting model?`

			`# in the case of the inpainting model being loaded, the trick of`
			`# providing an interpolated latent doesn't work, so we transiently`
			`# create a 512x512 PIL image, upscale it, and run the inpainting`
			`# over it in img2img mode. Because the inpaing model is so conservative`
			`# it doesn't change the image (much)`

			`return make_image`

all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`def get_noise_like(self, like: torch.Tensor, override_perlin: bool = False):`
add more missing files 2023-02-28 05:37:13 +00:00			`device = like.device`
all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`if device.type == "mps":`
			`x = torch.randn_like(like, device="cpu", dtype=self.torch_dtype()).to(`
			`device`
			`)`
add more missing files 2023-02-28 05:37:13 +00:00			`else:`
			`x = torch.randn_like(like, device=device, dtype=self.torch_dtype())`
			`if self.perlin > 0.0 and override_perlin == False:`
			`shape = like.shape`
all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`x = (1 - self.perlin) * x + self.perlin * self.get_perlin_noise(`
			`shape[3], shape[2]`
			`)`
add more missing files 2023-02-28 05:37:13 +00:00			`return x`

			`# returns a tensor filled with random numbers from a normal distribution`
all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`def get_noise(self, width, height, scale=True):`
add more missing files 2023-02-28 05:37:13 +00:00			`# print(f"Get noise: {width}x{height}")`
			`if scale:`
			`# Scale the input width and height for the initial generation`
			`# Make their area equivalent to the model's resolution area (e.g. 512*512 = 262144),`
			`# while keeping the minimum dimension at least 0.5 * resolution (e.g. 512*0.5 = 256)`

			`aspect = width / height`
			`dimension = self.model.unet.config.sample_size * self.model.vae_scale_factor`
			`min_dimension = math.floor(dimension * 0.5)`
all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`model_area = (`
			`dimension * dimension`
			`) # hardcoded for now since all models are trained on square images`
add more missing files 2023-02-28 05:37:13 +00:00
			`if aspect > 1.0:`
			`init_height = max(min_dimension, math.sqrt(model_area / aspect))`
			`init_width = init_height * aspect`
			`else:`
			`init_width = max(min_dimension, math.sqrt(model_area * aspect))`
			`init_height = init_width / aspect`

all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`scaled_width, scaled_height = trim_to_multiple_of(`
			`math.floor(init_width), math.floor(init_height)`
			`)`
add more missing files 2023-02-28 05:37:13 +00:00
			`else:`
			`scaled_width = width`
			`scaled_height = height`

			`device = self.model.device`
			`channels = self.latent_channels`
			`if channels == 9:`
			`channels = 4 # we don't really want noise for all the mask channels`
all vestiges of ldm.invoke removed 2023-03-03 06:02:00 +00:00			`shape = (`
			`1,`
			`channels,`
			`scaled_height // self.downsampling_factor,`
			`scaled_width // self.downsampling_factor,`
			`)`
			`if self.use_mps_noise or device.type == "mps":`
			`tensor = torch.empty(size=shape, device="cpu")`
add more missing files 2023-02-28 05:37:13 +00:00			`tensor = self.get_noise_like(like=tensor).to(device)`
			`else:`
			`tensor = torch.empty(size=shape, device=device)`
			`tensor = self.get_noise_like(like=tensor)`
			`return tensor`