diff --git a/invokeai/app/invocations/latent.py b/invokeai/app/invocations/latent.py
index 70233b8f67..fba661671d 100644
--- a/invokeai/app/invocations/latent.py
+++ b/invokeai/app/invocations/latent.py
@@ -488,7 +488,6 @@ class DenoiseLatentsInvocation(BaseInvocation):
         self,
         context: InvocationContext,
         ip_adapter: Optional[Union[IPAdapterField, list[IPAdapterField]]],
-        conditioning_data: ConditioningData,
         exit_stack: ExitStack,
     ) -> Optional[list[IPAdapterData]]:
         """If IP-Adapter is enabled, then this function loads the requisite models, and adds the image prompt embeddings
@@ -505,7 +504,6 @@ class DenoiseLatentsInvocation(BaseInvocation):
             return None
         ip_adapter_data_list = []
-        conditioning_data.ip_adapter_conditioning = []
         for single_ip_adapter in ip_adapter:
             ip_adapter_model: Union[IPAdapter, IPAdapterPlus] = exit_stack.enter_context(
                 context.models.load(single_ip_adapter.ip_adapter_model)
             )
@@ -528,16 +526,13 @@ class DenoiseLatentsInvocation(BaseInvocation):
                 single_ipa_images, image_encoder_model
             )
 
-            conditioning_data.ip_adapter_conditioning.append(
-                IPAdapterConditioningInfo(image_prompt_embeds, uncond_image_prompt_embeds)
-            )
-
             ip_adapter_data_list.append(
                 IPAdapterData(
                     ip_adapter_model=ip_adapter_model,
                     weight=single_ip_adapter.weight,
                     begin_step_percent=single_ip_adapter.begin_step_percent,
                     end_step_percent=single_ip_adapter.end_step_percent,
+                    ip_adapter_conditioning=IPAdapterConditioningInfo(image_prompt_embeds, uncond_image_prompt_embeds),
                 )
             )
 
@@ -772,7 +767,6 @@ class DenoiseLatentsInvocation(BaseInvocation):
                 ip_adapter_data = self.prep_ip_adapter_data(
                     context=context,
                     ip_adapter=self.ip_adapter,
-                    conditioning_data=conditioning_data,
                     exit_stack=exit_stack,
                 )
 
diff --git a/invokeai/backend/stable_diffusion/diffusers_pipeline.py b/invokeai/backend/stable_diffusion/diffusers_pipeline.py
index 80fe1a9c40..53b1ef5313 100644
--- a/invokeai/backend/stable_diffusion/diffusers_pipeline.py
+++ b/invokeai/backend/stable_diffusion/diffusers_pipeline.py
@@ -23,7 +23,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 from invokeai.app.services.config.config_default import get_config
 from invokeai.backend.ip_adapter.ip_adapter import IPAdapter
 from invokeai.backend.ip_adapter.unet_patcher import UNetPatcher
-from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningData
+from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningData, IPAdapterConditioningInfo
 from invokeai.backend.stable_diffusion.diffusion.shared_invokeai_diffusion import InvokeAIDiffuserComponent
 from invokeai.backend.util.attention import auto_detect_slice_size
 from invokeai.backend.util.devices import normalize_device
@@ -151,10 +151,11 @@ class ControlNetData:
 
 @dataclass
 class IPAdapterData:
-    ip_adapter_model: IPAdapter = Field(default=None)
-    # TODO: change to polymorphic so can do different weights per step (once implemented...)
+    ip_adapter_model: IPAdapter
+    ip_adapter_conditioning: IPAdapterConditioningInfo
+
+    # Either a single weight applied to all steps, or a list of weights for each step.
     weight: Union[float, List[float]] = Field(default=1.0)
-    # weight: float = Field(default=1.0)
     begin_step_percent: float = Field(default=0.0)
     end_step_percent: float = Field(default=1.0)
 
@@ -549,12 +550,17 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
 
             down_intrablock_additional_residuals = accum_adapter_state
 
+        ip_adapter_conditioning = None
+        if ip_adapter_data is not None:
+            ip_adapter_conditioning = [ipa.ip_adapter_conditioning for ipa in ip_adapter_data]
+
         uc_noise_pred, c_noise_pred = self.invokeai_diffuser.do_unet_step(
             sample=latent_model_input,
             timestep=t,  # TODO: debug how handled batched and non batched timesteps
             step_index=step_index,
             total_step_count=total_step_count,
             conditioning_data=conditioning_data,
+            ip_adapter_conditioning=ip_adapter_conditioning,
             down_block_additional_residuals=down_block_additional_residuals,  # for ControlNet
             mid_block_additional_residual=mid_block_additional_residual,  # for ControlNet
             down_intrablock_additional_residuals=down_intrablock_additional_residuals,  # for T2I-Adapter
diff --git a/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py b/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py
index 597905481a..b00c56120d 100644
--- a/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py
+++ b/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py
@@ -69,5 +69,3 @@ class ConditioningData:
     ref [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf)
     """
     guidance_rescale_multiplier: float = 0
-
-    ip_adapter_conditioning: Optional[list[IPAdapterConditioningInfo]] = None
diff --git a/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py b/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py
index f55876623c..657351e6c6 100644
--- a/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py
+++ b/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py
@@ -12,6 +12,7 @@ from invokeai.app.services.config.config_default import get_config
 from invokeai.backend.stable_diffusion.diffusion.conditioning_data import (
     ConditioningData,
     ExtraConditioningInfo,
+    IPAdapterConditioningInfo,
     SDXLConditioningInfo,
 )
 
@@ -199,6 +200,7 @@ class InvokeAIDiffuserComponent:
         sample: torch.Tensor,
         timestep: torch.Tensor,
         conditioning_data: ConditioningData,
+        ip_adapter_conditioning: Optional[list[IPAdapterConditioningInfo]],
         step_index: int,
         total_step_count: int,
         down_block_additional_residuals: Optional[torch.Tensor] = None,  # for ControlNet
@@ -223,6 +225,7 @@ class InvokeAIDiffuserComponent:
                 x=sample,
                 sigma=timestep,
                 conditioning_data=conditioning_data,
+                ip_adapter_conditioning=ip_adapter_conditioning,
                 cross_attention_control_types_to_do=cross_attention_control_types_to_do,
                 down_block_additional_residuals=down_block_additional_residuals,
                 mid_block_additional_residual=mid_block_additional_residual,
@@ -236,6 +239,7 @@ class InvokeAIDiffuserComponent:
                 x=sample,
                 sigma=timestep,
                 conditioning_data=conditioning_data,
+                ip_adapter_conditioning=ip_adapter_conditioning,
                 down_block_additional_residuals=down_block_additional_residuals,
                 mid_block_additional_residual=mid_block_additional_residual,
                 down_intrablock_additional_residuals=down_intrablock_additional_residuals,
@@ -297,6 +301,7 @@ class InvokeAIDiffuserComponent:
         x,
         sigma,
         conditioning_data: ConditioningData,
+        ip_adapter_conditioning: Optional[list[IPAdapterConditioningInfo]],
         down_block_additional_residuals: Optional[torch.Tensor] = None,  # for ControlNet
         mid_block_additional_residual: Optional[torch.Tensor] = None,  # for ControlNet
         down_intrablock_additional_residuals: Optional[torch.Tensor] = None,  # for T2I-Adapter
@@ -308,14 +313,14 @@ class InvokeAIDiffuserComponent:
         sigma_twice = torch.cat([sigma] * 2)
 
         cross_attention_kwargs = None
-        if conditioning_data.ip_adapter_conditioning is not None:
+        if ip_adapter_conditioning is not None:
             # Note that we 'stack' to produce tensors of shape (batch_size, num_ip_images, seq_len, token_len).
             cross_attention_kwargs = {
                 "ip_adapter_image_prompt_embeds": [
                     torch.stack(
                         [ipa_conditioning.uncond_image_prompt_embeds, ipa_conditioning.cond_image_prompt_embeds]
                     )
-                    for ipa_conditioning in conditioning_data.ip_adapter_conditioning
+                    for ipa_conditioning in ip_adapter_conditioning
                 ]
             }
 
@@ -361,6 +366,7 @@ class InvokeAIDiffuserComponent:
         x: torch.Tensor,
         sigma,
         conditioning_data: ConditioningData,
+        ip_adapter_conditioning: Optional[list[IPAdapterConditioningInfo]],
         cross_attention_control_types_to_do: list[CrossAttentionType],
         down_block_additional_residuals: Optional[torch.Tensor] = None,  # for ControlNet
         mid_block_additional_residual: Optional[torch.Tensor] = None,  # for ControlNet
@@ -411,12 +417,12 @@ class InvokeAIDiffuserComponent:
 
         cross_attention_kwargs = None
         # Prepare IP-Adapter cross-attention kwargs for the unconditioned pass.
-        if conditioning_data.ip_adapter_conditioning is not None:
+        if ip_adapter_conditioning is not None:
             # Note that we 'unsqueeze' to produce tensors of shape (batch_size=1, num_ip_images, seq_len, token_len).
             cross_attention_kwargs = {
                 "ip_adapter_image_prompt_embeds": [
                     torch.unsqueeze(ipa_conditioning.uncond_image_prompt_embeds, dim=0)
-                    for ipa_conditioning in conditioning_data.ip_adapter_conditioning
+                    for ipa_conditioning in ip_adapter_conditioning
                 ]
             }
 
@@ -452,12 +458,12 @@ class InvokeAIDiffuserComponent:
 
         cross_attention_kwargs = None
         # Prepare IP-Adapter cross-attention kwargs for the conditioned pass.
-        if conditioning_data.ip_adapter_conditioning is not None:
+        if ip_adapter_conditioning is not None:
             # Note that we 'unsqueeze' to produce tensors of shape (batch_size=1, num_ip_images, seq_len, token_len).
             cross_attention_kwargs = {
                 "ip_adapter_image_prompt_embeds": [
                     torch.unsqueeze(ipa_conditioning.cond_image_prompt_embeds, dim=0)
-                    for ipa_conditioning in conditioning_data.ip_adapter_conditioning
+                    for ipa_conditioning in ip_adapter_conditioning
                 ]
             }
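
Reviewer note: a minimal sketch of the data flow after this refactor, assuming the variables from the hunks above (ip_adapter_model, single_ip_adapter, image_prompt_embeds, uncond_image_prompt_embeds) are in scope; it is not a verbatim excerpt of the patched code. The per-adapter IPAdapterConditioningInfo now travels on IPAdapterData instead of ConditioningData, and the pipeline collects those infos into the new ip_adapter_conditioning argument of do_unet_step().

```python
# Sketch only: illustrates the post-refactor flow shown in the hunks above.
from invokeai.backend.stable_diffusion.diffusers_pipeline import IPAdapterData
from invokeai.backend.stable_diffusion.diffusion.conditioning_data import IPAdapterConditioningInfo

# 1) Each IPAdapterData now carries its own image-prompt conditioning
#    (previously appended to conditioning_data.ip_adapter_conditioning).
ip_adapter_data_list = [
    IPAdapterData(
        ip_adapter_model=ip_adapter_model,
        weight=single_ip_adapter.weight,
        begin_step_percent=single_ip_adapter.begin_step_percent,
        end_step_percent=single_ip_adapter.end_step_percent,
        ip_adapter_conditioning=IPAdapterConditioningInfo(image_prompt_embeds, uncond_image_prompt_embeds),
    )
]

# 2) The pipeline gathers the per-adapter conditioning and hands it to
#    do_unet_step() as a separate argument rather than reading it off
#    ConditioningData.
ip_adapter_conditioning = None
if ip_adapter_data_list is not None:
    ip_adapter_conditioning = [ipa.ip_adapter_conditioning for ipa in ip_adapter_data_list]
```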