InvokeAI/invokeai/backend/stable_diffusion/denoise_context.py

from __future__ import annotations

from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Type, Union

import torch
from diffusers import UNet2DConditionModel
from diffusers.schedulers.scheduling_utils import SchedulerMixin, SchedulerOutput

if TYPE_CHECKING:
    from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningMode, TextConditioningData


@dataclass
class UNetKwargs:
    """Arguments that will be passed to the UNet model.

    `sample`, `timestep` and `encoder_hidden_states` are required; every other field is optional and
    defaults to None.

    NOTE(review): the field names appear to mirror the keyword parameters of diffusers'
    `UNet2DConditionModel.forward` - confirm against the installed diffusers version.
    """

    # Required arguments.
    sample: torch.Tensor
    timestep: Union[torch.Tensor, float, int]
    encoder_hidden_states: torch.Tensor

    # Optional arguments.
    class_labels: Optional[torch.Tensor] = None
    timestep_cond: Optional[torch.Tensor] = None
    attention_mask: Optional[torch.Tensor] = None
    cross_attention_kwargs: Optional[Dict[str, Any]] = None
    added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None
    # Annotated as variable-length tuples: `Tuple[X]` would describe a tuple of exactly one element.
    down_block_additional_residuals: Optional[Tuple[torch.Tensor, ...]] = None
    mid_block_additional_residual: Optional[torch.Tensor] = None
    down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor, ...]] = None
    encoder_attention_mask: Optional[torch.Tensor] = None
    # NOTE(review): `return_dict` is left commented out, so it is not a field of this dataclass -
    # presumably the caller controls it; confirm.
    # return_dict: bool = True


@dataclass
class DenoiseInputs:
    """Initial variables passed to denoise.

    These values are supposed to stay unchanged once the object has been created. Note that the
    dataclass is not declared frozen, so this is a convention rather than something that is enforced.
    All fields are required (none has a default value).
    """

    # The latent-space image to denoise.
    # Shape: [batch, channels, latent_height, latent_width]
    # - If we are inpainting, this is the initial latent image before noise has been added.
    # - If we are generating a new image, this should be initialized to zeros.
    # - In some cases, this may be a partially-noised latent image (e.g. when running the SDXL refiner).
    orig_latents: torch.Tensor

    # Keyword arguments forwarded to the scheduler.step() method.
    scheduler_step_kwargs: dict[str, Any]

    # Text conditioning data.
    conditioning_data: TextConditioningData

    # Noise used for two purposes:
    # 1. Used by the scheduler to noise the initial `latents` before denoising.
    # 2. Used to noise the `masked_latents` when inpainting.
    # `noise` should be None if the `latents` tensor has already been noised.
    # Shape: [1 or batch, channels, latent_height, latent_width]
    noise: Optional[torch.Tensor]

    # The seed used to generate the noise for the denoising process.
    # HACK(ryand): seed is only used in a particular case when `noise` is None, but we need to re-generate the
    # same noise used earlier in the pipeline. This should really be handled in a clearer way.
    seed: int

    # The timestep schedule for the denoising process.
    timesteps: torch.Tensor

    # The first timestep in the schedule. This is used to determine the initial noise level, so it
    # should be populated if you want noise applied *even* if `timesteps` is empty.
    init_timestep: torch.Tensor

    # Class of the attention processor that is used.
    attention_processor_cls: Type[Any]


@dataclass
class DenoiseContext:
    """Context holding all variables used during denoise.

    Only `inputs` and `scheduler` are required. Every other field defaults to None (or an empty dict for
    `extra`) and is expected to be populated at the callback stage named in its comment.
    """

    # Initial variables passed to denoise. Supposed to be unchanged.
    inputs: DenoiseInputs

    # Scheduler which is used to apply noise predictions.
    scheduler: SchedulerMixin

    # UNet model.
    unet: Optional[UNet2DConditionModel] = None

    # Current state of the latent-space image in the denoising process.
    # None until `pre_denoise_loop` callback.
    # Shape: [batch, channels, latent_height, latent_width]
    latents: Optional[torch.Tensor] = None

    # Current denoising step index.
    # None until `pre_step` callback.
    step_index: Optional[int] = None

    # Current denoising step timestep.
    # None until `pre_step` callback.
    timestep: Optional[torch.Tensor] = None

    # Arguments which will be passed to the UNet model.
    # Available in `pre_unet`/`post_unet` callbacks, otherwise will be None.
    unet_kwargs: Optional[UNetKwargs] = None

    # SchedulerOutput class returned from the step function (normally, generated by the scheduler).
    # Supposed to be used only in `post_step` callback, otherwise can be None.
    step_output: Optional[SchedulerOutput] = None

    # Scaled version of `latents`, which will be passed to unet_kwargs initialization.
    # Available in events inside a step (between `pre_step` and `post_step`).
    # Shape: [batch, channels, latent_height, latent_width]
    latent_model_input: Optional[torch.Tensor] = None

    # [TMP] Defines on which conditionings the current unet call will be run.
    # Available in `pre_unet`/`post_unet` callbacks, otherwise will be None.
    conditioning_mode: Optional[ConditioningMode] = None

    # [TMP] Noise predictions from negative conditioning.
    # Available in `apply_cfg` and `post_apply_cfg` callbacks, otherwise will be None.
    # Shape: [batch, channels, latent_height, latent_width]
    negative_noise_pred: Optional[torch.Tensor] = None

    # [TMP] Noise predictions from positive conditioning.
    # Available in `apply_cfg` and `post_apply_cfg` callbacks, otherwise will be None.
    # Shape: [batch, channels, latent_height, latent_width]
    positive_noise_pred: Optional[torch.Tensor] = None

    # Combined noise prediction from passed conditionings.
    # Available in `apply_cfg` and `post_apply_cfg` callbacks, otherwise will be None.
    # Shape: [batch, channels, latent_height, latent_width]
    noise_pred: Optional[torch.Tensor] = None

    # Dictionary for extensions to pass extra info about the denoise process to other extensions.
    # Built with `default_factory`, so each context instance gets its own dict.
    extra: dict = field(default_factory=dict)
Base code from draft PR 2024-07-12 17:31:26 +00:00			`from __future__ import annotations`

			`from dataclasses import dataclass, field`
Change attention processor apply logic 2024-07-16 17:03:29 +00:00			`from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Type, Union`
Base code from draft PR 2024-07-12 17:31:26 +00:00
			`import torch`
			`from diffusers import UNet2DConditionModel`
			`from diffusers.schedulers.scheduling_utils import SchedulerMixin, SchedulerOutput`

			`if TYPE_CHECKING:`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00			`from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningMode, TextConditioningData`
Base code from draft PR 2024-07-12 17:31:26 +00:00

			`@dataclass`
			`class UNetKwargs:`
			`sample: torch.Tensor`
			`timestep: Union[torch.Tensor, float, int]`
			`encoder_hidden_states: torch.Tensor`

			`class_labels: Optional[torch.Tensor] = None`
			`timestep_cond: Optional[torch.Tensor] = None`
			`attention_mask: Optional[torch.Tensor] = None`
			`cross_attention_kwargs: Optional[Dict[str, Any]] = None`
			`added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None`
			`down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None`
			`mid_block_additional_residual: Optional[torch.Tensor] = None`
			`down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None`
			`encoder_attention_mask: Optional[torch.Tensor] = None`
			`# return_dict: bool = True`


			`@dataclass`
Separate inputs in denoise context 2024-07-16 16:30:29 +00:00			`class DenoiseInputs:`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00			`"""Initial variables passed to denoise. Supposed to be unchanged."""`
Added some comments 2024-07-16 19:52:44 +00:00
Comments, a bit refactor 2024-07-17 01:20:31 +00:00			`# The latent-space image to denoise.`
			`# Shape: [batch, channels, latent_height, latent_width]`
			`# - If we are inpainting, this is the initial latent image before noise has been added.`
			`# - If we are generating a new image, this should be initialized to zeros.`
			`# - In some cases, this may be a partially-noised latent image (e.g. when running the SDXL refiner).`
Separate inputs in denoise context 2024-07-16 16:30:29 +00:00			`orig_latents: torch.Tensor`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00
			`# kwargs forwarded to the scheduler.step() method.`
Base code from draft PR 2024-07-12 17:31:26 +00:00			`scheduler_step_kwargs: dict[str, Any]`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00
			`# Text conditionging data.`
Base code from draft PR 2024-07-12 17:31:26 +00:00			`conditioning_data: TextConditioningData`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00
			`# Noise used for two purposes:`
			# 1. Used by the scheduler to noise the initial `latents` before denoising.
			# 2. Used to noise the `masked_latents` when inpainting.
			# `noise` should be None if the `latents` tensor has already been noised.
			`# Shape: [1 or batch, channels, latent_height, latent_width]`
Base code from draft PR 2024-07-12 17:31:26 +00:00			`noise: Optional[torch.Tensor]`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00
			`# The seed used to generate the noise for the denoising process.`
			# HACK(ryand): seed is only used in a particular case when `noise` is None, but we need to re-generate the
			`# same noise used earlier in the pipeline. This should really be handled in a clearer way.`
Base code from draft PR 2024-07-12 17:31:26 +00:00			`seed: int`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00
			`# The timestep schedule for the denoising process.`
Base code from draft PR 2024-07-12 17:31:26 +00:00			`timesteps: torch.Tensor`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00
			`# The first timestep in the schedule. This is used to determine the initial noise level, so`
			`# should be populated if you want noise applied even if timesteps is empty.`
Base code from draft PR 2024-07-12 17:31:26 +00:00			`init_timestep: torch.Tensor`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00
			`# Class of attention processor that is used.`
Change attention processor apply logic 2024-07-16 17:03:29 +00:00			`attention_processor_cls: Type[Any]`
Base code from draft PR 2024-07-12 17:31:26 +00:00
Separate inputs in denoise context 2024-07-16 16:30:29 +00:00
			`@dataclass`
			`class DenoiseContext:`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00			`"""Context with all variables in denoise"""`
Added some comments 2024-07-16 19:52:44 +00:00
Comments, a bit refactor 2024-07-17 01:20:31 +00:00			`# Initial variables passed to denoise. Supposed to be unchanged.`
Separate inputs in denoise context 2024-07-16 16:30:29 +00:00			`inputs: DenoiseInputs`

Comments, a bit refactor 2024-07-17 01:20:31 +00:00			`# Scheduler which used to apply noise predictions.`
Base code from draft PR 2024-07-12 17:31:26 +00:00			`scheduler: SchedulerMixin`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00
			`# UNet model.`
Base code from draft PR 2024-07-12 17:31:26 +00:00			`unet: Optional[UNet2DConditionModel] = None`

Comments, a bit refactor 2024-07-17 01:20:31 +00:00			`# Current state of latent-space image in denoising process.`
			# None until `pre_denoise_loop` callback.
			`# Shape: [batch, channels, latent_height, latent_width]`
Separate inputs in denoise context 2024-07-16 16:30:29 +00:00			`latents: Optional[torch.Tensor] = None`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00
			`# Current denoising step index.`
			# None until `pre_step` callback.
Base code from draft PR 2024-07-12 17:31:26 +00:00			`step_index: Optional[int] = None`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00
			`# Current denoising step timestep.`
			# None until `pre_step` callback.
Base code from draft PR 2024-07-12 17:31:26 +00:00			`timestep: Optional[torch.Tensor] = None`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00
			`# Arguments which will be passed to UNet model.`
			# Available in `pre_unet`/`post_unet` callbacks, otherwise will be None.
Base code from draft PR 2024-07-12 17:31:26 +00:00			`unet_kwargs: Optional[UNetKwargs] = None`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00
			`# SchedulerOutput class returned from step function(normally, generated by scheduler).`
			# Supposed to be used only in `post_step` callback, otherwise can be None.
Base code from draft PR 2024-07-12 17:31:26 +00:00			`step_output: Optional[SchedulerOutput] = None`

Comments, a bit refactor 2024-07-17 01:20:31 +00:00			# Scaled version of `latents`, which will be passed to unet_kwargs initialization.
			# Available in events inside step(between `pre_step` and `post_stop`).
			`# Shape: [batch, channels, latent_height, latent_width]`
Base code from draft PR 2024-07-12 17:31:26 +00:00			`latent_model_input: Optional[torch.Tensor] = None`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00
			`# [TMP] Defines on which conditionings current unet call will be runned.`
			# Available in `pre_unet`/`post_unet` callbacks, otherwise will be None.
			`conditioning_mode: Optional[ConditioningMode] = None`

			`# [TMP] Noise predictions from negative conditioning.`
			# Available in `apply_cfg` and `post_apply_cfg` callbacks, otherwise will be None.
			`# Shape: [batch, channels, latent_height, latent_width]`
Base code from draft PR 2024-07-12 17:31:26 +00:00			`negative_noise_pred: Optional[torch.Tensor] = None`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00
			`# [TMP] Noise predictions from positive conditioning.`
			# Available in `apply_cfg` and `post_apply_cfg` callbacks, otherwise will be None.
			`# Shape: [batch, channels, latent_height, latent_width]`
Base code from draft PR 2024-07-12 17:31:26 +00:00			`positive_noise_pred: Optional[torch.Tensor] = None`
Comments, a bit refactor 2024-07-17 01:20:31 +00:00
			`# Combined noise prediction from passed conditionings.`
			# Available in `apply_cfg` and `post_apply_cfg` callbacks, otherwise will be None.
			`# Shape: [batch, channels, latent_height, latent_width]`
Base code from draft PR 2024-07-12 17:31:26 +00:00			`noise_pred: Optional[torch.Tensor] = None`

Comments, a bit refactor 2024-07-17 01:20:31 +00:00			`# Dictionary for extensions to pass extra info about denoise process to other extensions.`
Base code from draft PR 2024-07-12 17:31:26 +00:00			`extra: dict = field(default_factory=dict)`