From ee1b3157ce772f9350462f3afbd2953cbba9ca8d Mon Sep 17 00:00:00 2001
From: Ryan Dick
Date: Wed, 28 Feb 2024 13:49:02 -0500
Subject: [PATCH] Split ip_adapter_conditioning out from ConditioningData.

---
 invokeai/app/invocations/latent.py          |  8 +-------
 .../stable_diffusion/diffusers_pipeline.py  | 14 ++++++++++----
 .../diffusion/conditioning_data.py          |  2 --
 .../diffusion/shared_invokeai_diffusion.py  | 18 ++++++++++++------
 4 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/invokeai/app/invocations/latent.py b/invokeai/app/invocations/latent.py
index 1511c1ba15..7fc389e12f 100644
--- a/invokeai/app/invocations/latent.py
+++ b/invokeai/app/invocations/latent.py
@@ -487,7 +487,6 @@ class DenoiseLatentsInvocation(BaseInvocation):
         self,
         context: InvocationContext,
         ip_adapter: Optional[Union[IPAdapterField, list[IPAdapterField]]],
-        conditioning_data: ConditioningData,
         exit_stack: ExitStack,
     ) -> Optional[list[IPAdapterData]]:
         """If IP-Adapter is enabled, then this function loads the requisite models, and adds the image prompt embeddings
@@ -504,7 +503,6 @@ class DenoiseLatentsInvocation(BaseInvocation):
             return None

         ip_adapter_data_list = []
-        conditioning_data.ip_adapter_conditioning = []
         for single_ip_adapter in ip_adapter:
             ip_adapter_model: Union[IPAdapter, IPAdapterPlus] = exit_stack.enter_context(
                 context.services.model_manager.get_model(
@@ -537,16 +535,13 @@ class DenoiseLatentsInvocation(BaseInvocation):
                 single_ipa_images, image_encoder_model
             )

-            conditioning_data.ip_adapter_conditioning.append(
-                IPAdapterConditioningInfo(image_prompt_embeds, uncond_image_prompt_embeds)
-            )
-
             ip_adapter_data_list.append(
                 IPAdapterData(
                     ip_adapter_model=ip_adapter_model,
                     weight=single_ip_adapter.weight,
                     begin_step_percent=single_ip_adapter.begin_step_percent,
                     end_step_percent=single_ip_adapter.end_step_percent,
+                    ip_adapter_conditioning=IPAdapterConditioningInfo(image_prompt_embeds, uncond_image_prompt_embeds),
                 )
             )

@@ -780,7 +775,6 @@ class DenoiseLatentsInvocation(BaseInvocation):
             ip_adapter_data = self.prep_ip_adapter_data(
                 context=context,
                 ip_adapter=self.ip_adapter,
-                conditioning_data=conditioning_data,
                 exit_stack=exit_stack,
             )

diff --git a/invokeai/backend/stable_diffusion/diffusers_pipeline.py b/invokeai/backend/stable_diffusion/diffusers_pipeline.py
index 3b54136469..3e50530598 100644
--- a/invokeai/backend/stable_diffusion/diffusers_pipeline.py
+++ b/invokeai/backend/stable_diffusion/diffusers_pipeline.py
@@ -24,7 +24,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 from invokeai.app.services.config import InvokeAIAppConfig
 from invokeai.backend.ip_adapter.ip_adapter import IPAdapter
 from invokeai.backend.ip_adapter.unet_patcher import UNetPatcher
-from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningData
+from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningData, IPAdapterConditioningInfo
 from invokeai.backend.stable_diffusion.diffusion.regional_prompt_attention import apply_regional_prompt_attn
 from invokeai.backend.stable_diffusion.diffusion.shared_invokeai_diffusion import InvokeAIDiffuserComponent

@@ -165,10 +165,11 @@ class ControlNetData:


 @dataclass
 class IPAdapterData:
-    ip_adapter_model: IPAdapter = Field(default=None)
-    # TODO: change to polymorphic so can do different weights per step (once implemented...)
+    ip_adapter_model: IPAdapter
+    ip_adapter_conditioning: IPAdapterConditioningInfo
+
+    # Either a single weight applied to all steps, or a list of weights for each step.
     weight: Union[float, List[float]] = Field(default=1.0)
-    # weight: float = Field(default=1.0)
     begin_step_percent: float = Field(default=0.0)
     end_step_percent: float = Field(default=1.0)
@@ -564,12 +565,17 @@
             down_intrablock_additional_residuals = accum_adapter_state

+        ip_adapter_conditioning = None
+        if ip_adapter_data is not None:
+            ip_adapter_conditioning = [ipa.ip_adapter_conditioning for ipa in ip_adapter_data]
+
         uc_noise_pred, c_noise_pred = self.invokeai_diffuser.do_unet_step(
             sample=latent_model_input,
             timestep=t,  # TODO: debug how handled batched and non batched timesteps
             step_index=step_index,
             total_step_count=total_step_count,
             conditioning_data=conditioning_data,
+            ip_adapter_conditioning=ip_adapter_conditioning,
             down_block_additional_residuals=down_block_additional_residuals,  # for ControlNet
             mid_block_additional_residual=mid_block_additional_residual,  # for ControlNet
             down_intrablock_additional_residuals=down_intrablock_additional_residuals,  # for T2I-Adapter
diff --git a/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py b/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py
index 267a17e917..98ad3dcfdc 100644
--- a/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py
+++ b/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py
@@ -71,5 +71,3 @@ class ConditioningData:
     ref [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf)
     """
     guidance_rescale_multiplier: float = 0
-
-    ip_adapter_conditioning: Optional[list[IPAdapterConditioningInfo]] = None
diff --git a/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py b/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py
index 3c1d19b602..5251197cb1 100644
--- a/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py
+++ b/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py
@@ -14,6 +14,7 @@ from invokeai.backend.stable_diffusion.diffusion.conditioning_data import (
     BasicConditioningInfo,
     ConditioningData,
     ExtraConditioningInfo,
+    IPAdapterConditioningInfo,
     SDXLConditioningInfo,
 )
 from invokeai.backend.stable_diffusion.diffusion.regional_prompt_attention import Range, RegionalPromptData
@@ -329,6 +330,7 @@ class InvokeAIDiffuserComponent:
         sample: torch.Tensor,
         timestep: torch.Tensor,
         conditioning_data: ConditioningData,
+        ip_adapter_conditioning: Optional[list[IPAdapterConditioningInfo]],
         step_index: int,
         total_step_count: int,
         down_block_additional_residuals: Optional[torch.Tensor] = None,  # for ControlNet
@@ -353,6 +355,7 @@ class InvokeAIDiffuserComponent:
                 x=sample,
                 sigma=timestep,
                 conditioning_data=conditioning_data,
+                ip_adapter_conditioning=ip_adapter_conditioning,
                 cross_attention_control_types_to_do=cross_attention_control_types_to_do,
                 down_block_additional_residuals=down_block_additional_residuals,
                 mid_block_additional_residual=mid_block_additional_residual,
@@ -366,6 +369,7 @@ class InvokeAIDiffuserComponent:
                 x=sample,
                 sigma=timestep,
                 conditioning_data=conditioning_data,
+                ip_adapter_conditioning=ip_adapter_conditioning,
                 down_block_additional_residuals=down_block_additional_residuals,
                 mid_block_additional_residual=mid_block_additional_residual,
                 down_intrablock_additional_residuals=down_intrablock_additional_residuals,
@@ -425,6 +429,7 @@ class InvokeAIDiffuserComponent:
         x,
         sigma,
         conditioning_data: ConditioningData,
+        ip_adapter_conditioning: Optional[list[IPAdapterConditioningInfo]],
         down_block_additional_residuals: Optional[torch.Tensor] = None,  # for ControlNet
         mid_block_additional_residual: Optional[torch.Tensor] = None,  # for ControlNet
         down_intrablock_additional_residuals: Optional[torch.Tensor] = None,  # for T2I-Adapter
@@ -483,14 +488,14 @@ class InvokeAIDiffuserComponent:
             }

         # TODO(ryand): Figure out interactions between regional prompting and IP-Adapter conditioning.
-        if conditioning_data.ip_adapter_conditioning is not None:
+        if ip_adapter_conditioning is not None:
             # Note that we 'stack' to produce tensors of shape (batch_size, num_ip_images, seq_len, token_len).
             cross_attention_kwargs = {
                 "ip_adapter_image_prompt_embeds": [
                     torch.stack(
                         [ipa_conditioning.uncond_image_prompt_embeds, ipa_conditioning.cond_image_prompt_embeds]
                     )
-                    for ipa_conditioning in conditioning_data.ip_adapter_conditioning
+                    for ipa_conditioning in ip_adapter_conditioning
                 ]
             }

@@ -527,6 +532,7 @@ class InvokeAIDiffuserComponent:
         x: torch.Tensor,
         sigma,
         conditioning_data: ConditioningData,
+        ip_adapter_conditioning: Optional[list[IPAdapterConditioningInfo]],
         cross_attention_control_types_to_do: list[CrossAttentionType],
         down_block_additional_residuals: Optional[torch.Tensor] = None,  # for ControlNet
         mid_block_additional_residual: Optional[torch.Tensor] = None,  # for ControlNet
@@ -581,12 +587,12 @@ class InvokeAIDiffuserComponent:

         cross_attention_kwargs = None
         # Prepare IP-Adapter cross-attention kwargs for the unconditioned pass.
-        if conditioning_data.ip_adapter_conditioning is not None:
+        if ip_adapter_conditioning is not None:
             # Note that we 'unsqueeze' to produce tensors of shape (batch_size=1, num_ip_images, seq_len, token_len).
             cross_attention_kwargs = {
                 "ip_adapter_image_prompt_embeds": [
                     torch.unsqueeze(ipa_conditioning.uncond_image_prompt_embeds, dim=0)
-                    for ipa_conditioning in conditioning_data.ip_adapter_conditioning
+                    for ipa_conditioning in ip_adapter_conditioning
                 ]
             }

@@ -622,12 +628,12 @@ class InvokeAIDiffuserComponent:

         cross_attention_kwargs = None
         # Prepare IP-Adapter cross-attention kwargs for the conditioned pass.
-        if conditioning_data.ip_adapter_conditioning is not None:
+        if ip_adapter_conditioning is not None:
             # Note that we 'unsqueeze' to produce tensors of shape (batch_size=1, num_ip_images, seq_len, token_len).
             cross_attention_kwargs = {
                 "ip_adapter_image_prompt_embeds": [
                     torch.unsqueeze(ipa_conditioning.cond_image_prompt_embeds, dim=0)
-                    for ipa_conditioning in conditioning_data.ip_adapter_conditioning
+                    for ipa_conditioning in ip_adapter_conditioning
                 ]
             }
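
For reviewers, a minimal, self-contained sketch of the data flow after this change: each IPAdapterData now carries its own IPAdapterConditioningInfo, the pipeline gathers those into a list just before the UNet step, and the diffuser stacks the uncond/cond embeds into cross_attention_kwargs. The dataclass below is only a stand-in mirroring the fields of invokeai's IPAdapterConditioningInfo, and the zero tensors and their shapes are illustrative placeholders, not real image-encoder output.

from dataclasses import dataclass

import torch


# Stand-in for invokeai's IPAdapterConditioningInfo (same field names as the
# accesses in _apply_standard_conditioning); shapes here are placeholders.
@dataclass
class IPAdapterConditioningInfo:
    cond_image_prompt_embeds: torch.Tensor
    uncond_image_prompt_embeds: torch.Tensor


# One entry per active IP-Adapter, as collected from IPAdapterData in step():
#   ip_adapter_conditioning = [ipa.ip_adapter_conditioning for ipa in ip_adapter_data]
ip_adapter_conditioning = [
    IPAdapterConditioningInfo(
        cond_image_prompt_embeds=torch.zeros(4, 768),
        uncond_image_prompt_embeds=torch.zeros(4, 768),
    )
]

# The diffuser stacks uncond/cond per adapter, exactly as in the patched
# _apply_standard_conditioning, so each list entry gains a leading dim of 2.
cross_attention_kwargs = {
    "ip_adapter_image_prompt_embeds": [
        torch.stack([c.uncond_image_prompt_embeds, c.cond_image_prompt_embeds])
        for c in ip_adapter_conditioning
    ]
}

print(cross_attention_kwargs["ip_adapter_image_prompt_embeds"][0].shape)  # torch.Size([2, 4, 768])

Keeping the per-adapter embeds on IPAdapterData means ConditioningData stays focused on text conditioning, and prep_ip_adapter_data no longer mutates a shared object as a side effect.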