InvokeAI/invokeai/backend/ip_adapter/utils.py

import inspect
import warnings
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import PIL.Image
import torch
import torch.nn.functional as F
from diffusers.utils import is_compiled_module
from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
from diffusers.models import ControlNetModel
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput


def is_torch2_available():
    """Tell whether ``torch.nn.functional`` provides ``scaled_dot_product_attention``.

    Judging by the function name, the presence of that attribute is used as a
    stand-in for "a PyTorch 2.x build is installed".

    Returns:
        bool: ``True`` if the attribute exists on ``F``, ``False`` otherwise.
    """
    sdpa_attr = "scaled_dot_product_attention"
    return hasattr(F, sdpa_attr)


@torch.no_grad()
def generate(
    self,
    prompt: Optional[Union[str, List[str]]] = None,
    image: Optional[
        Union[
            torch.FloatTensor,
            PIL.Image.Image,
            np.ndarray,
            List[torch.FloatTensor],
            List[PIL.Image.Image],
            List[np.ndarray],
        ]
    ] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 50,
    guidance_scale: float = 7.5,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
    callback_steps: int = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
    guess_mode: bool = False,
    control_guidance_start: Union[float, List[float]] = 0.0,
    control_guidance_end: Union[float, List[float]] = 1.0,
):
    r"""
    Run ControlNet-guided generation on the pipeline passed as `self`.

    This is a free function that takes the pipeline as its first argument. It reads `self.controlnet`,
    `self.unet`, `self.vae`, `self.scheduler`, `self.image_processor` and calls the pipeline helpers
    `check_inputs`, `_encode_prompt`, `prepare_image`, `prepare_latents`, `prepare_extra_step_kwargs`,
    `progress_bar` and `run_safety_checker`, so `self` must provide all of them.

    Inside the denoising loop the ControlNet is conditioned on `prompt_embeds[:, :77, :]` only, while the UNet
    receives the full `prompt_embeds`. NOTE(review): 77 is presumably the text-token length, with any extra
    tokens appended after it -- confirm against the caller that builds `prompt_embeds`.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
            instead.
        image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
                `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
            The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
            the type is specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
            also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If
            height and/or width are passed, `image` is resized according to them. If multiple ControlNets are
            specified in init, images must be passed as a list such that each element of the list can be correctly
            batched for input to a single controlnet.
        height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
            The height in pixels of the generated image.
        width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
            The width in pixels of the generated image.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference.
        guidance_scale (`float`, *optional*, defaults to 7.5):
            Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
            `guidance_scale` is defined as `w` of equation 2. of [Imagen
            Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
            1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
            usually at the expense of lower image quality.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            not greater than `1`).
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
            [`schedulers.DDIMScheduler`], will be ignored for others.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
            to make generation deterministic.
        latents (`torch.FloatTensor`, *optional*):
            Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
            generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
            tensor will be generated by sampling using the supplied random `generator`.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generated image. Choose between
            [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. With `"latent"` the
            VAE decode and the safety checker are skipped and the final latents are handed to
            `self.image_processor.postprocess` instead.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
            plain tuple.
        callback (`Callable`, *optional*):
            A function that will be called every `callback_steps` steps during inference. The function will be
            called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which the `callback` function will be called. If not specified, the callback will be
            called at every step.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
            `self.processor` in
            [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            Its `"scale"` entry, when present, is also forwarded to `_encode_prompt` as `lora_scale`.
        controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
            The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
            to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
            corresponding scale as a list.
        guess_mode (`bool`, *optional*, defaults to `False`):
            In this mode, the ControlNet encoder will try best to recognize the content of the input image even if
            you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended. Forced on when the
            controlnet config sets `global_pool_conditions`.
        control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
            The percentage of total steps at which the controlnet starts applying.
        control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
            The percentage of total steps at which the controlnet stops applying.

    Returns:
        [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
        [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
        When returning a tuple, the first element is a list with the generated images, and the second element is a
        list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
        (nsfw) content, according to the `safety_checker`. The second element is `None` when `output_type` is
        `"latent"`.

    Raises:
        AssertionError: If the (unwrapped) controlnet is neither a `ControlNetModel` nor a
            `MultiControlNetModel` when the control image is prepared.
    """
    # Unwrap a compiled controlnet so the isinstance checks below see the underlying module.
    # The forward call in the denoising loop still goes through `self.controlnet`.
    controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet

    # align format for control guidance: both values end up as lists of equal length
    if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
        control_guidance_start = len(control_guidance_end) * [control_guidance_start]
    elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
        control_guidance_end = len(control_guidance_start) * [control_guidance_end]
    elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
        mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
        control_guidance_start, control_guidance_end = mult * [control_guidance_start], mult * [
            control_guidance_end
        ]

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt,
        image,
        callback_steps,
        negative_prompt,
        prompt_embeds,
        negative_prompt_embeds,
        controlnet_conditioning_scale,
        control_guidance_start,
        control_guidance_end,
    )

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        # No usable prompt: the batch size comes from the pre-computed embeddings.
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device
    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # A single float scale is broadcast to one entry per sub-controlnet.
    if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
        controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)

    global_pool_conditions = (
        controlnet.config.global_pool_conditions
        if isinstance(controlnet, ControlNetModel)
        else controlnet.nets[0].config.global_pool_conditions
    )
    guess_mode = guess_mode or global_pool_conditions

    # 3. Encode input prompt
    text_encoder_lora_scale = (
        cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
    )
    prompt_embeds = self._encode_prompt(
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        lora_scale=text_encoder_lora_scale,
    )

    # 4. Prepare image
    if isinstance(controlnet, ControlNetModel):
        image = self.prepare_image(
            image=image,
            width=width,
            height=height,
            batch_size=batch_size * num_images_per_prompt,
            num_images_per_prompt=num_images_per_prompt,
            device=device,
            dtype=controlnet.dtype,
            do_classifier_free_guidance=do_classifier_free_guidance,
            guess_mode=guess_mode,
        )
        # The prepared control image fixes the output resolution.
        height, width = image.shape[-2:]
    elif isinstance(controlnet, MultiControlNetModel):
        images = []

        for image_ in image:
            image_ = self.prepare_image(
                image=image_,
                width=width,
                height=height,
                batch_size=batch_size * num_images_per_prompt,
                num_images_per_prompt=num_images_per_prompt,
                device=device,
                dtype=controlnet.dtype,
                do_classifier_free_guidance=do_classifier_free_guidance,
                guess_mode=guess_mode,
            )

            images.append(image_)

        image = images
        height, width = image[0].shape[-2:]
    else:
        # Raised explicitly (rather than `assert False`) so the guard survives `python -O`.
        raise AssertionError(f"Unsupported controlnet type: {type(controlnet)!r}")

    # 5. Prepare timesteps
    self.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps = self.scheduler.timesteps

    # 6. Prepare latent variables
    num_channels_latents = self.unet.config.in_channels
    latents = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
    )

    # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 7.1 Create tensor stating which controlnets to keep
    # For each step the entry is 1.0 while the step lies inside the [start, end] fraction window
    # of a controlnet and 0.0 otherwise; a single ControlNetModel gets a scalar instead of a list.
    controlnet_keep = []
    for i in range(len(timesteps)):
        keeps = [
            1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
            for s, e in zip(control_guidance_start, control_guidance_end)
        ]
        controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)

    # 8. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # controlnet(s) inference
            # The controlnet only ever sees the first 77 tokens of the prompt embeddings.
            if guess_mode and do_classifier_free_guidance:
                # Infer ControlNet only for the conditional batch.
                control_model_input = latents
                control_model_input = self.scheduler.scale_model_input(control_model_input, t)
                controlnet_prompt_embeds = prompt_embeds[:, :77, :].chunk(2)[1]
            else:
                control_model_input = latent_model_input
                controlnet_prompt_embeds = prompt_embeds[:, :77, :]

            # Zero out the conditioning scale on steps where a controlnet is switched off.
            if isinstance(controlnet_keep[i], list):
                cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
            else:
                controlnet_cond_scale = controlnet_conditioning_scale
                if isinstance(controlnet_cond_scale, list):
                    controlnet_cond_scale = controlnet_cond_scale[0]
                cond_scale = controlnet_cond_scale * controlnet_keep[i]

            down_block_res_samples, mid_block_res_sample = self.controlnet(
                control_model_input,
                t,
                encoder_hidden_states=controlnet_prompt_embeds,
                controlnet_cond=image,
                conditioning_scale=cond_scale,
                guess_mode=guess_mode,
                return_dict=False,
            )

            if guess_mode and do_classifier_free_guidance:
                # Inferred ControlNet only for the conditional batch.
                # To apply the output of ControlNet to both the unconditional and conditional batches,
                # add 0 to the unconditional batch to keep it unchanged.
                down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
                mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])

            # predict the noise residual (the UNet gets the full, unsliced prompt embeddings)
            noise_pred = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=prompt_embeds,
                cross_attention_kwargs=cross_attention_kwargs,
                down_block_additional_residuals=down_block_res_samples,
                mid_block_additional_residual=mid_block_res_sample,
                return_dict=False,
            )[0]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)

    # If we do sequential model offloading, let's offload unet and controlnet
    # manually for max memory savings
    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
        self.unet.to("cpu")
        self.controlnet.to("cpu")
        torch.cuda.empty_cache()

    if output_type != "latent":
        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
    else:
        image = latents
        has_nsfw_concept = None

    # Images flagged by the safety checker are not denormalized.
    if has_nsfw_concept is None:
        do_denormalize = [True] * image.shape[0]
    else:
        do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

    image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

    # Offload last model to CPU
    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
        self.final_offload_hook.offload()

    if not return_dict:
        return (image, has_nsfw_concept)

    return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
Core ip_adapter files from https://github.com/tencent-ailab/IP-Adapter, copied into InvokeAI since the IP-Adapter repo is not a package. Is there a better way to do this for non-packaged Python code while still keeping the InvokeAI install easy? 2023-08-29 07:51:55 +00:00			`import inspect`
			`import warnings`
			`from typing import Any, Callable, Dict, List, Optional, Tuple, Union`

			`import numpy as np`
			`import PIL.Image`
			`import torch`
			`import torch.nn.functional as F`
			`from diffusers.utils import is_compiled_module`
			`from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel`
			`from diffusers.models import ControlNetModel`
			`from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput`



			`def is_torch2_available():`
			`return hasattr(F, "scaled_dot_product_attention")`


			`@torch.no_grad()`
			`def generate(`
			`self,`
			`prompt: Union[str, List[str]] = None,`
			`image: Union[`
			`torch.FloatTensor,`
			`PIL.Image.Image,`
			`np.ndarray,`
			`List[torch.FloatTensor],`
			`List[PIL.Image.Image],`
			`List[np.ndarray],`
			`] = None,`
			`height: Optional[int] = None,`
			`width: Optional[int] = None,`
			`num_inference_steps: int = 50,`
			`guidance_scale: float = 7.5,`
			`negative_prompt: Optional[Union[str, List[str]]] = None,`
			`num_images_per_prompt: Optional[int] = 1,`
			`eta: float = 0.0,`
			`generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,`
			`latents: Optional[torch.FloatTensor] = None,`
			`prompt_embeds: Optional[torch.FloatTensor] = None,`
			`negative_prompt_embeds: Optional[torch.FloatTensor] = None,`
			`output_type: Optional[str] = "pil",`
			`return_dict: bool = True,`
			`callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,`
			`callback_steps: int = 1,`
			`cross_attention_kwargs: Optional[Dict[str, Any]] = None,`
			`controlnet_conditioning_scale: Union[float, List[float]] = 1.0,`
			`guess_mode: bool = False,`
			`control_guidance_start: Union[float, List[float]] = 0.0,`
			`control_guidance_end: Union[float, List[float]] = 1.0,`
			`):`
			`r"""`
			`Function invoked when calling the pipeline for generation.`

			`Args:`
			prompt (`str` or `List[str]`, optional):
			The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
			`instead.`
			image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
			`List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
			`The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If`
			the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
			also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If
			height and/or width are passed, `image` is resized according to them. If multiple ControlNets are
			`specified in init, images must be passed as a list such that each element of the list can be correctly`
			`batched for input to a single controlnet.`
			height (`int`, optional, defaults to self.unet.config.sample_size * self.vae_scale_factor):
			`The height in pixels of the generated image.`
			width (`int`, optional, defaults to self.unet.config.sample_size * self.vae_scale_factor):
			`The width in pixels of the generated image.`
			num_inference_steps (`int`, optional, defaults to 50):
			`The number of denoising steps. More denoising steps usually lead to a higher quality image at the`
			`expense of slower inference.`
			guidance_scale (`float`, optional, defaults to 7.5):
			`Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).`
			`guidance_scale` is defined as `w` of equation 2. of [Imagen
			Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
			1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
			`usually at the expense of lower image quality.`
			negative_prompt (`str` or `List[str]`, optional):
			`The prompt or prompts not to guide the image generation. If not defined, one has to pass`
			`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
			less than `1`).
			num_images_per_prompt (`int`, optional, defaults to 1):
			`The number of images to generate per prompt.`
			eta (`float`, optional, defaults to 0.0):
			`Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to`
			[`schedulers.DDIMScheduler`], will be ignored for others.
			generator (`torch.Generator` or `List[torch.Generator]`, optional):
			`One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)`
			`to make generation deterministic.`
			latents (`torch.FloatTensor`, optional):
			`Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image`
			`generation. Can be used to tweak the same generation with different prompts. If not provided, a latents`
			tensor will ge generated by sampling using the supplied random `generator`.
			prompt_embeds (`torch.FloatTensor`, optional):
			`Pre-generated text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not`
			provided, text embeddings will be generated from `prompt` input argument.
			negative_prompt_embeds (`torch.FloatTensor`, optional):
			`Pre-generated negative text embeddings. Can be used to easily tweak text inputs, e.g. prompt`
			weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
			`argument.`
			output_type (`str`, optional, defaults to `"pil"`):
			`The output format of the generate image. Choose between`
			[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
			return_dict (`bool`, optional, defaults to `True`):
			Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
			`plain tuple.`
			callback (`Callable`, optional):
			A function that will be called every `callback_steps` steps during inference. The function will be
			called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
			callback_steps (`int`, optional, defaults to 1):
			The frequency at which the `callback` function will be called. If not specified, the callback will be
			`called at every step.`
			cross_attention_kwargs (`dict`, optional):
			A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
			`self.processor` in
			`[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).`
			controlnet_conditioning_scale (`float` or `List[float]`, optional, defaults to 1.0):
			The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
			`to the residual in the original unet. If multiple ControlNets are specified in init, you can set the`
			`corresponding scale as a list.`
			guess_mode (`bool`, optional, defaults to `False`):
			`In this mode, the ControlNet encoder will try best to recognize the content of the input image even if`
			you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended.
			control_guidance_start (`float` or `List[float]`, optional, defaults to 0.0):
			`The percentage of total steps at which the controlnet starts applying.`
			control_guidance_end (`float` or `List[float]`, optional, defaults to 1.0):
			`The percentage of total steps at which the controlnet stops applying.`

			`Examples:`

			`Returns:`
			[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
			[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
			`When returning a tuple, the first element is a list with the generated images, and the second element is a`
			list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
			(nsfw) content, according to the `safety_checker`.
			`"""`
			`controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet`

			`# align format for control guidance`
			`if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):`
			`control_guidance_start = len(control_guidance_end) * [control_guidance_start]`
			`elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):`
			`control_guidance_end = len(control_guidance_start) * [control_guidance_end]`
			`elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):`
			`mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1`
			`control_guidance_start, control_guidance_end = mult * [control_guidance_start], mult * [`
			`control_guidance_end`
			`]`

			`# 1. Check inputs. Raise error if not correct`
			`self.check_inputs(`
			`prompt,`
			`image,`
			`callback_steps,`
			`negative_prompt,`
			`prompt_embeds,`
			`negative_prompt_embeds,`
			`controlnet_conditioning_scale,`
			`control_guidance_start,`
			`control_guidance_end,`
			`)`

			`# 2. Define call parameters`
			`if prompt is not None and isinstance(prompt, str):`
			`batch_size = 1`
			`elif prompt is not None and isinstance(prompt, list):`
			`batch_size = len(prompt)`
			`else:`
			`batch_size = prompt_embeds.shape[0]`

			`device = self._execution_device`
			# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
			# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
			`# corresponds to doing no classifier free guidance.`
			`do_classifier_free_guidance = guidance_scale > 1.0`

			`if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):`
			`controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)`

			`global_pool_conditions = (`
			`controlnet.config.global_pool_conditions`
			`if isinstance(controlnet, ControlNetModel)`
			`else controlnet.nets[0].config.global_pool_conditions`
			`)`
			`guess_mode = guess_mode or global_pool_conditions`

			`# 3. Encode input prompt`
			`text_encoder_lora_scale = (`
			`cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None`
			`)`
			`prompt_embeds = self._encode_prompt(`
			`prompt,`
			`device,`
			`num_images_per_prompt,`
			`do_classifier_free_guidance,`
			`negative_prompt,`
			`prompt_embeds=prompt_embeds,`
			`negative_prompt_embeds=negative_prompt_embeds,`
			`lora_scale=text_encoder_lora_scale,`
			`)`

			`# 4. Prepare image`
			`if isinstance(controlnet, ControlNetModel):`
			`image = self.prepare_image(`
			`image=image,`
			`width=width,`
			`height=height,`
			`batch_size=batch_size * num_images_per_prompt,`
			`num_images_per_prompt=num_images_per_prompt,`
			`device=device,`
			`dtype=controlnet.dtype,`
			`do_classifier_free_guidance=do_classifier_free_guidance,`
			`guess_mode=guess_mode,`
			`)`
			`height, width = image.shape[-2:]`
			`elif isinstance(controlnet, MultiControlNetModel):`
			`images = []`

			`for image_ in image:`
			`image_ = self.prepare_image(`
			`image=image_,`
			`width=width,`
			`height=height,`
			`batch_size=batch_size * num_images_per_prompt,`
			`num_images_per_prompt=num_images_per_prompt,`
			`device=device,`
			`dtype=controlnet.dtype,`
			`do_classifier_free_guidance=do_classifier_free_guidance,`
			`guess_mode=guess_mode,`
			`)`

			`images.append(image_)`

			`image = images`
			`height, width = image[0].shape[-2:]`
			`else:`
			`assert False`

			`# 5. Prepare timesteps`
			`self.scheduler.set_timesteps(num_inference_steps, device=device)`
			`timesteps = self.scheduler.timesteps`

			`# 6. Prepare latent variables`
			`num_channels_latents = self.unet.config.in_channels`
			`latents = self.prepare_latents(`
			`batch_size * num_images_per_prompt,`
			`num_channels_latents,`
			`height,`
			`width,`
			`prompt_embeds.dtype,`
			`device,`
			`generator,`
			`latents,`
			`)`

			`# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline`
			`extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)`

			`# 7.1 Create tensor stating which controlnets to keep`
			`controlnet_keep = []`
			`for i in range(len(timesteps)):`
			`keeps = [`
			`1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)`
			`for s, e in zip(control_guidance_start, control_guidance_end)`
			`]`
			`controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)`

			`# 8. Denoising loop`
			`num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order`
			`with self.progress_bar(total=num_inference_steps) as progress_bar:`
			`for i, t in enumerate(timesteps):`
			`# expand the latents if we are doing classifier free guidance`
			`latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents`
			`latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)`

			`# controlnet(s) inference`
			`if guess_mode and do_classifier_free_guidance:`
			`# Infer ControlNet only for the conditional batch.`
			`control_model_input = latents`
			`control_model_input = self.scheduler.scale_model_input(control_model_input, t)`
			`controlnet_prompt_embeds = prompt_embeds[:, :77, :].chunk(2)[1]`
			`else:`
			`control_model_input = latent_model_input`
			`controlnet_prompt_embeds = prompt_embeds[:, :77, :]`

			`if isinstance(controlnet_keep[i], list):`
			`cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]`
			`else:`
			`controlnet_cond_scale = controlnet_conditioning_scale`
			`if isinstance(controlnet_cond_scale, list):`
			`controlnet_cond_scale = controlnet_cond_scale[0]`
			`cond_scale = controlnet_cond_scale * controlnet_keep[i]`

			`down_block_res_samples, mid_block_res_sample = self.controlnet(`
			`control_model_input,`
			`t,`
			`encoder_hidden_states=controlnet_prompt_embeds,`
			`controlnet_cond=image,`
			`conditioning_scale=cond_scale,`
			`guess_mode=guess_mode,`
			`return_dict=False,`
			`)`

			`if guess_mode and do_classifier_free_guidance:`
			`# Infered ControlNet only for the conditional batch.`
			`# To apply the output of ControlNet to both the unconditional and conditional batches,`
			`# add 0 to the unconditional batch to keep it unchanged.`
			`down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]`
			`mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])`

			`# predict the noise residual`
			`noise_pred = self.unet(`
			`latent_model_input,`
			`t,`
			`encoder_hidden_states=prompt_embeds,`
			`cross_attention_kwargs=cross_attention_kwargs,`
			`down_block_additional_residuals=down_block_res_samples,`
			`mid_block_additional_residual=mid_block_res_sample,`
			`return_dict=False,`
			`)[0]`

			`# perform guidance`
			`if do_classifier_free_guidance:`
			`noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)`
			`noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)`

			`# compute the previous noisy sample x_t -> x_t-1`
			`latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]`

			`# call the callback, if provided`
			`if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):`
			`progress_bar.update()`
			`if callback is not None and i % callback_steps == 0:`
			`callback(i, t, latents)`

			`# If we do sequential model offloading, let's offload unet and controlnet`
			`# manually for max memory savings`
			`if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:`
			`self.unet.to("cpu")`
			`self.controlnet.to("cpu")`
			`torch.cuda.empty_cache()`

			`if not output_type == "latent":`
			`image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]`
			`image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)`
			`else:`
			`image = latents`
			`has_nsfw_concept = None`

			`if has_nsfw_concept is None:`
			`do_denormalize = [True] * image.shape[0]`
			`else:`
			`do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]`

			`image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)`

			`# Offload last model to CPU`
			`if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:`
			`self.final_offload_hook.offload()`

			`if not return_dict:`
			`return (image, has_nsfw_concept)`

			`return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)`