Split ip_adapter_conditioning out from ConditioningData.

Author: Ryan Dick
Date:   2024-02-28 13:49:02 -05:00
Commit: ee1b3157ce (parent: e7ec13f209)

4 changed files with 23 additions and 19 deletions
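
In short: IPAdapterConditioningInfo no longer lives on the shared ConditioningData object; each IPAdapterData entry now carries its own conditioning, and the pipeline derives the per-adapter list at the call site. A minimal sketch of the resulting data flow, with stand-in types; `gather_ip_adapter_conditioning` is an illustrative helper mirroring the new call-site logic, not a function from this commit:

```python
from dataclasses import dataclass
from typing import List, Optional, Union

import torch


@dataclass
class IPAdapterConditioningInfo:
    cond_image_prompt_embeds: torch.Tensor  # (num_ip_images, seq_len, token_len)
    uncond_image_prompt_embeds: torch.Tensor  # same shape as cond_image_prompt_embeds


@dataclass
class IPAdapterData:
    ip_adapter_model: object  # stand-in for the real IPAdapter model type
    ip_adapter_conditioning: IPAdapterConditioningInfo  # new: conditioning travels with the adapter
    weight: Union[float, List[float]] = 1.0
    begin_step_percent: float = 0.0
    end_step_percent: float = 1.0


def gather_ip_adapter_conditioning(
    ip_adapter_data: Optional[List[IPAdapterData]],
) -> Optional[List[IPAdapterConditioningInfo]]:
    """Collect the per-adapter conditioning list, as the denoising step now does."""
    if ip_adapter_data is None:
        return None
    return [ipa.ip_adapter_conditioning for ipa in ip_adapter_data]
```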

diff --git a/invokeai/app/invocations/latent.py b/invokeai/app/invocations/latent.py

@@ -487,7 +487,6 @@ class DenoiseLatentsInvocation(BaseInvocation):
         self,
         context: InvocationContext,
         ip_adapter: Optional[Union[IPAdapterField, list[IPAdapterField]]],
-        conditioning_data: ConditioningData,
         exit_stack: ExitStack,
     ) -> Optional[list[IPAdapterData]]:
         """If IP-Adapter is enabled, then this function loads the requisite models, and adds the image prompt embeddings
@@ -504,7 +503,6 @@ class DenoiseLatentsInvocation(BaseInvocation):
             return None

         ip_adapter_data_list = []
-        conditioning_data.ip_adapter_conditioning = []
         for single_ip_adapter in ip_adapter:
             ip_adapter_model: Union[IPAdapter, IPAdapterPlus] = exit_stack.enter_context(
                 context.services.model_manager.get_model(
@@ -537,16 +535,13 @@ class DenoiseLatentsInvocation(BaseInvocation):
             image_prompt_embeds, uncond_image_prompt_embeds = ip_adapter_model.get_image_embeds(
                 single_ipa_images, image_encoder_model
             )
-            conditioning_data.ip_adapter_conditioning.append(
-                IPAdapterConditioningInfo(image_prompt_embeds, uncond_image_prompt_embeds)
-            )
-
             ip_adapter_data_list.append(
                 IPAdapterData(
                     ip_adapter_model=ip_adapter_model,
                     weight=single_ip_adapter.weight,
                     begin_step_percent=single_ip_adapter.begin_step_percent,
                     end_step_percent=single_ip_adapter.end_step_percent,
+                    ip_adapter_conditioning=IPAdapterConditioningInfo(image_prompt_embeds, uncond_image_prompt_embeds),
                 )
             )
@@ -780,7 +775,6 @@ class DenoiseLatentsInvocation(BaseInvocation):
            ip_adapter_data = self.prep_ip_adapter_data(
                context=context,
                ip_adapter=self.ip_adapter,
-                conditioning_data=conditioning_data,
                exit_stack=exit_stack,
            )

diff --git a/invokeai/backend/stable_diffusion/diffusers_pipeline.py b/invokeai/backend/stable_diffusion/diffusers_pipeline.py

@@ -24,7 +24,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 from invokeai.app.services.config import InvokeAIAppConfig
 from invokeai.backend.ip_adapter.ip_adapter import IPAdapter
 from invokeai.backend.ip_adapter.unet_patcher import UNetPatcher
-from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningData
+from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningData, IPAdapterConditioningInfo
 from invokeai.backend.stable_diffusion.diffusion.regional_prompt_attention import apply_regional_prompt_attn
 from invokeai.backend.stable_diffusion.diffusion.shared_invokeai_diffusion import InvokeAIDiffuserComponent
@@ -165,10 +165,11 @@ class ControlNetData:
 @dataclass
 class IPAdapterData:
-    ip_adapter_model: IPAdapter = Field(default=None)
-    # TODO: change to polymorphic so can do different weights per step (once implemented...)
+    ip_adapter_model: IPAdapter
+    ip_adapter_conditioning: IPAdapterConditioningInfo
+
+    # Either a single weight applied to all steps, or a list of weights for each step.
     weight: Union[float, List[float]] = Field(default=1.0)
-    # weight: float = Field(default=1.0)
     begin_step_percent: float = Field(default=0.0)
     end_step_percent: float = Field(default=1.0)
@@ -564,12 +565,17 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
                 down_intrablock_additional_residuals = accum_adapter_state

+        ip_adapter_conditioning = None
+        if ip_adapter_data is not None:
+            ip_adapter_conditioning = [ipa.ip_adapter_conditioning for ipa in ip_adapter_data]
+
         uc_noise_pred, c_noise_pred = self.invokeai_diffuser.do_unet_step(
             sample=latent_model_input,
             timestep=t,  # TODO: debug how handled batched and non batched timesteps
             step_index=step_index,
             total_step_count=total_step_count,
             conditioning_data=conditioning_data,
+            ip_adapter_conditioning=ip_adapter_conditioning,
             down_block_additional_residuals=down_block_additional_residuals,  # for ControlNet
             mid_block_additional_residual=mid_block_additional_residual,  # for ControlNet
             down_intrablock_additional_residuals=down_intrablock_additional_residuals,  # for T2I-Adapter

diff --git a/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py b/invokeai/backend/stable_diffusion/diffusion/conditioning_data.py

@@ -71,5 +71,3 @@ class ConditioningData:
     ref [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf)
     """
     guidance_rescale_multiplier: float = 0
-
-    ip_adapter_conditioning: Optional[list[IPAdapterConditioningInfo]] = None

diff --git a/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py b/invokeai/backend/stable_diffusion/diffusion/shared_invokeai_diffusion.py

@@ -14,6 +14,7 @@ from invokeai.backend.stable_diffusion.diffusion.conditioning_data import (
     BasicConditioningInfo,
     ConditioningData,
     ExtraConditioningInfo,
+    IPAdapterConditioningInfo,
     SDXLConditioningInfo,
 )
 from invokeai.backend.stable_diffusion.diffusion.regional_prompt_attention import Range, RegionalPromptData
@@ -329,6 +330,7 @@ class InvokeAIDiffuserComponent:
         sample: torch.Tensor,
         timestep: torch.Tensor,
         conditioning_data: ConditioningData,
+        ip_adapter_conditioning: Optional[list[IPAdapterConditioningInfo]],
         step_index: int,
         total_step_count: int,
         down_block_additional_residuals: Optional[torch.Tensor] = None,  # for ControlNet
@@ -353,6 +355,7 @@ class InvokeAIDiffuserComponent:
                 x=sample,
                 sigma=timestep,
                 conditioning_data=conditioning_data,
+                ip_adapter_conditioning=ip_adapter_conditioning,
                 cross_attention_control_types_to_do=cross_attention_control_types_to_do,
                 down_block_additional_residuals=down_block_additional_residuals,
                 mid_block_additional_residual=mid_block_additional_residual,
@@ -366,6 +369,7 @@ class InvokeAIDiffuserComponent:
                 x=sample,
                 sigma=timestep,
                 conditioning_data=conditioning_data,
+                ip_adapter_conditioning=ip_adapter_conditioning,
                 down_block_additional_residuals=down_block_additional_residuals,
                 mid_block_additional_residual=mid_block_additional_residual,
                 down_intrablock_additional_residuals=down_intrablock_additional_residuals,
@@ -425,6 +429,7 @@ class InvokeAIDiffuserComponent:
         x,
         sigma,
         conditioning_data: ConditioningData,
+        ip_adapter_conditioning: Optional[list[IPAdapterConditioningInfo]],
         down_block_additional_residuals: Optional[torch.Tensor] = None,  # for ControlNet
         mid_block_additional_residual: Optional[torch.Tensor] = None,  # for ControlNet
         down_intrablock_additional_residuals: Optional[torch.Tensor] = None,  # for T2I-Adapter
@@ -483,14 +488,14 @@ class InvokeAIDiffuserComponent:
             }

         # TODO(ryand): Figure out interactions between regional prompting and IP-Adapter conditioning.
-        if conditioning_data.ip_adapter_conditioning is not None:
+        if ip_adapter_conditioning is not None:
             # Note that we 'stack' to produce tensors of shape (batch_size, num_ip_images, seq_len, token_len).
             cross_attention_kwargs = {
                 "ip_adapter_image_prompt_embeds": [
                     torch.stack(
                         [ipa_conditioning.uncond_image_prompt_embeds, ipa_conditioning.cond_image_prompt_embeds]
                     )
-                    for ipa_conditioning in conditioning_data.ip_adapter_conditioning
+                    for ipa_conditioning in ip_adapter_conditioning
                 ]
             }
@@ -527,6 +532,7 @@ class InvokeAIDiffuserComponent:
         x: torch.Tensor,
         sigma,
         conditioning_data: ConditioningData,
+        ip_adapter_conditioning: Optional[list[IPAdapterConditioningInfo]],
         cross_attention_control_types_to_do: list[CrossAttentionType],
         down_block_additional_residuals: Optional[torch.Tensor] = None,  # for ControlNet
         mid_block_additional_residual: Optional[torch.Tensor] = None,  # for ControlNet
@@ -581,12 +587,12 @@ class InvokeAIDiffuserComponent:
         cross_attention_kwargs = None

         # Prepare IP-Adapter cross-attention kwargs for the unconditioned pass.
-        if conditioning_data.ip_adapter_conditioning is not None:
+        if ip_adapter_conditioning is not None:
             # Note that we 'unsqueeze' to produce tensors of shape (batch_size=1, num_ip_images, seq_len, token_len).
             cross_attention_kwargs = {
                 "ip_adapter_image_prompt_embeds": [
                     torch.unsqueeze(ipa_conditioning.uncond_image_prompt_embeds, dim=0)
-                    for ipa_conditioning in conditioning_data.ip_adapter_conditioning
+                    for ipa_conditioning in ip_adapter_conditioning
                 ]
             }
@@ -622,12 +628,12 @@ class InvokeAIDiffuserComponent:
         cross_attention_kwargs = None

         # Prepare IP-Adapter cross-attention kwargs for the conditioned pass.
-        if conditioning_data.ip_adapter_conditioning is not None:
+        if ip_adapter_conditioning is not None:
             # Note that we 'unsqueeze' to produce tensors of shape (batch_size=1, num_ip_images, seq_len, token_len).
             cross_attention_kwargs = {
                 "ip_adapter_image_prompt_embeds": [
                     torch.unsqueeze(ipa_conditioning.cond_image_prompt_embeds, dim=0)
-                    for ipa_conditioning in conditioning_data.ip_adapter_conditioning
+                    for ipa_conditioning in ip_adapter_conditioning
                 ]
             }
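
As a footnote to the 'stack' and 'unsqueeze' comments above, a standalone sketch of the tensor shapes involved; the sizes are illustrative, not taken from the commit:

```python
import torch

num_ip_images, seq_len, token_len = 1, 4, 768  # illustrative sizes

cond_image_prompt_embeds = torch.randn(num_ip_images, seq_len, token_len)
uncond_image_prompt_embeds = torch.randn(num_ip_images, seq_len, token_len)

# Combined CFG pass: stacking uncond and cond adds a batch dimension of 2,
# i.e. (batch_size, num_ip_images, seq_len, token_len).
stacked = torch.stack([uncond_image_prompt_embeds, cond_image_prompt_embeds])
assert stacked.shape == (2, num_ip_images, seq_len, token_len)

# Separate uncond/cond passes: unsqueezing adds a batch dimension of 1,
# i.e. (batch_size=1, num_ip_images, seq_len, token_len).
unsqueezed = torch.unsqueeze(uncond_image_prompt_embeds, dim=0)
assert unsqueezed.shape == (1, num_ip_images, seq_len, token_len)
```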