Fix handling of 0-step denoising process (#6544)

## Summary

https://github.com/invoke-ai/InvokeAI/pull/6522 introduced a change in behavior in cases where start/end were set such that there are 0 timesteps. This PR reverts that change.

cc @StAlKeR7779

## QA Instructions

Run with euler, 5 steps, start: 0.0, end: 0.05. I ran this test before #6522, after #6522, and on this branch. This branch restores the behavior to pre-#6522, i.e. noise is injected even if no denoising steps are applied.

## Checklist

- [x] _The PR has a short but descriptive title, suitable for a changelog_
- [x] _Tests added / updated (if applicable)_
- [x] _Documentation added / updated (if applicable)_
2024-08-30 20:32:17 +00:00 · 2024-06-26 13:01:58 -04:00
parent dc23bebebf 9a3b8c6fcb
commit f76282a5ff
4 changed files with 15 additions and 11 deletions
--- a/invokeai/app/invocations/denoise_latents.py
+++ b/invokeai/app/invocations/denoise_latents.py
@ -625,6 +625,7 @@ class DenoiseLatentsInvocation(BaseInvocation):
        t_start_idx *= scheduler.order
        t_end_idx *= scheduler.order

+        init_timestep = timesteps[t_start_idx : t_start_idx + 1]
        timesteps = timesteps[t_start_idx : t_start_idx + t_end_idx]

        scheduler_step_kwargs: Dict[str, Any] = {}
@ -647,7 +648,7 @@ class DenoiseLatentsInvocation(BaseInvocation):
        if isinstance(scheduler, TCDScheduler):
            scheduler_step_kwargs.update({"eta": 1.0})

-        return timesteps, scheduler_step_kwargs
+        return timesteps, init_timestep, scheduler_step_kwargs

    def prep_inpaint_mask(
        self, context: InvocationContext, latents: torch.Tensor
@ -813,7 +814,7 @@ class DenoiseLatentsInvocation(BaseInvocation):
                dtype=unet.dtype,
            )

-            timesteps, scheduler_step_kwargs = self.init_scheduler(
+            timesteps, init_timestep, scheduler_step_kwargs = self.init_scheduler(
                scheduler,
                device=unet.device,
                steps=self.steps,
@ -825,6 +826,7 @@ class DenoiseLatentsInvocation(BaseInvocation):
            result_latents = pipeline.latents_from_embeddings(
                latents=latents,
                timesteps=timesteps,
+                init_timestep=init_timestep,
                noise=noise,
                seed=seed,
                mask=mask,
--- a/invokeai/app/invocations/tiled_multi_diffusion_denoise_latents.py
+++ b/invokeai/app/invocations/tiled_multi_diffusion_denoise_latents.py
@ -252,7 +252,7 @@ class TiledMultiDiffusionDenoiseLatents(BaseInvocation):
                    )
                )

-            timesteps, scheduler_step_kwargs = DenoiseLatentsInvocation.init_scheduler(
+            timesteps, init_timestep, scheduler_step_kwargs = DenoiseLatentsInvocation.init_scheduler(
                scheduler,
                device=unet.device,
                steps=self.steps,
@ -269,6 +269,7 @@ class TiledMultiDiffusionDenoiseLatents(BaseInvocation):
                scheduler_step_kwargs=scheduler_step_kwargs,
                noise=noise,
                timesteps=timesteps,
+                init_timestep=init_timestep,
                callback=step_callback,
            )

--- a/invokeai/backend/stable_diffusion/diffusers_pipeline.py
+++ b/invokeai/backend/stable_diffusion/diffusers_pipeline.py
@ -273,6 +273,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
        noise: Optional[torch.Tensor],
        seed: int,
        timesteps: torch.Tensor,
+        init_timestep: torch.Tensor,
        callback: Callable[[PipelineIntermediateState], None],
        control_data: list[ControlNetData] | None = None,
        ip_adapter_data: Optional[list[IPAdapterData]] = None,
@ -298,6 +299,8 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
                HACK(ryand): seed is only used in a particular case when `noise` is None, but we need to re-generate the
                same noise used earlier in the pipeline. This should really be handled in a clearer way.
            timesteps: The timestep schedule for the denoising process.
+            init_timestep: The first timestep in the schedule. This is used to determine the initial noise level, so
+                should be populated if you want noise applied *even* if timesteps is empty.
            callback: A callback function that is called to report progress during the denoising process.
            control_data: ControlNet data.
            ip_adapter_data: IP-Adapter data.
@ -312,17 +315,16 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
                SD UNet model.
            is_gradient_mask: A flag indicating whether `mask` is a gradient mask or not.
        """
-        if timesteps.shape[0] == 0:
+        if init_timestep.shape[0] == 0:
            return latents

        orig_latents = latents.clone()
+
        batch_size = latents.shape[0]
+        batched_init_timestep = init_timestep.expand(batch_size)

        # noise can be None if the latents have already been noised (e.g. when running the SDXL refiner).
        if noise is not None:
-            # batched_init_timestep should have shape (batch_size, 1).
-            batched_init_timestep = timesteps[0:1].expand(batch_size)
-
            # TODO(ryand): I'm pretty sure we should be applying init_noise_sigma in cases where we are starting with
            # full noise. Investigate the history of why this got commented out.
            # latents = noise * self.scheduler.init_noise_sigma # it's like in t2l according to diffusers
--- a/invokeai/backend/stable_diffusion/multi_diffusion_pipeline.py
+++ b/invokeai/backend/stable_diffusion/multi_diffusion_pipeline.py
@ -44,20 +44,19 @@ class MultiDiffusionPipeline(StableDiffusionGeneratorPipeline):
        scheduler_step_kwargs: dict[str, Any],
        noise: Optional[torch.Tensor],
        timesteps: torch.Tensor,
+        init_timestep: torch.Tensor,
        callback: Callable[[PipelineIntermediateState], None],
    ) -> torch.Tensor:
        self._check_regional_prompting(multi_diffusion_conditioning)

-        if timesteps.shape[0] == 0:
+        if init_timestep.shape[0] == 0:
            return latents

        batch_size, _, latent_height, latent_width = latents.shape
+        batched_init_timestep = init_timestep.expand(batch_size)

        # noise can be None if the latents have already been noised (e.g. when running the SDXL refiner).
        if noise is not None:
-            # batched_init_timestep should have shape (batch_size, 1).
-            batched_init_timestep = timesteps[0:1].expand(batch_size)
-
            # TODO(ryand): I'm pretty sure we should be applying init_noise_sigma in cases where we are starting with
            # full noise. Investigate the history of why this got commented out.
            # latents = noise * self.scheduler.init_noise_sigma # it's like in t2l according to diffusers