from enum import Enum
from math import ceil
from typing import Callable, Optional

import torch


class InvokeAIDiffuserComponent:

    class StructuredConditioning:
        def __init__(self, edited_conditioning: Optional[torch.Tensor] = None, edit_opcodes: Optional[list[tuple]] = None):
            """
            :param edited_conditioning: if doing cross-attention control, the edited conditioning (1 x 77 x 768)
            :param edit_opcodes: if doing cross-attention control, opcodes from a SequenceMatcher describing how to map
                original conditioning tokens to edited conditioning tokens
            """
            # TODO: migrate conditioning and unconditioning here, too
            #self.conditioning = conditioning
            #self.unconditioning = unconditioning
            self.edited_conditioning = edited_conditioning
            self.edit_opcodes = edit_opcodes

        @property
        def wants_cross_attention_control(self):
            return self.edited_conditioning is not None

    '''
    The aim of this component is to provide a single place for code that can be applied identically to
    all InvokeAI diffusion procedures.

    At the moment it includes the following features:
    * Cross Attention Control ("prompt2prompt")
    '''
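
    # Minimal usage sketch (hypothetical wiring; the actual sampler integration lives elsewhere in InvokeAI):
    #
    #   component = InvokeAIDiffuserComponent(unet, lambda x, sigma, cond: unet.forward(x, sigma, cond))
    #   cond_info = InvokeAIDiffuserComponent.StructuredConditioning(edited_conditioning=edited, edit_opcodes=opcodes)
    #   component.setup_cross_attention_control(cond_info)
    #   next_x = component.do_diffusion_step(x, sigma, unconditioning, conditioning, guidance_scale)
    #   component.remove_cross_attention_control()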

    def __init__(self, model, model_forward_callback: Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]):
        """
        :param model: the unet model to pass through to cross attention control
        :param model_forward_callback: a lambda with arguments (x, sigma, conditioning_to_apply). will be called
            repeatedly. most likely, this should simply call model.forward(x, sigma, conditioning)
        """
        self.model = model
        self.model_forward_callback = model_forward_callback
        # no cross-attention control is active until setup_cross_attention_control() is called
        self.conditioning = None

    def setup_cross_attention_control(self, conditioning: StructuredConditioning):
        self.conditioning = conditioning
        CrossAttentionControl.setup_cross_attention_control(self.model, conditioning.edited_conditioning, conditioning.edit_opcodes)

    def remove_cross_attention_control(self):
        self.conditioning = None
        CrossAttentionControl.remove_cross_attention_control(self.model)

    @property
    def edited_conditioning(self):
        if self.conditioning is None:
            return None
        else:
            return self.conditioning.edited_conditioning

    def do_diffusion_step(self, x: torch.Tensor, sigma: torch.Tensor,
                          unconditioning: torch.Tensor, conditioning: torch.Tensor,
                          unconditional_guidance_scale: float):
        """
        :param x: current latents
        :param sigma: aka t, passed to the internal model to control how much denoising will occur
        :param unconditioning: [B x 77 x 768] embeddings for unconditioned output
        :param conditioning: [B x 77 x 768] embeddings for conditioned output
        :param unconditional_guidance_scale: aka CFG scale, controls how much effect the conditioning tensor has
        :return: the new latents after applying the model to x using unconditioning and CFG-scaled conditioning.
        """
        CrossAttentionControl.clear_requests(self.model)

        if self.edited_conditioning is None:
            # faster batched path
            x_twice = torch.cat([x] * 2)
            sigma_twice = torch.cat([sigma] * 2)
            both_conditionings = torch.cat([unconditioning, conditioning])
            unconditioned_next_x, conditioned_next_x = self.model_forward_callback(x_twice, sigma_twice, both_conditionings).chunk(2)
        else:
            # slower non-batched path (20% slower on mac MPS)
            # We are only interested in using attention maps for conditioned_next_x, but batching them with generation of
            # unconditioned_next_x causes attention maps to *also* be saved for the unconditioned_next_x.
            # This messes up their application later, due to mismatched shape of dim 0 (seems to be 16 for batched vs. 8)
            # (For the batched invocation the `wrangler` function gets an attention tensor with shape[0]=16,
            # representing batched uncond + cond, but then when it comes to applying the saved attention, the
            # wrangler gets an attention tensor which only has shape[0]=8, representing just self.edited_conditioning.)
            # todo: give CrossAttentionControl's `wrangler` function more info so it can work with a batched call as well.
            unconditioned_next_x = self.model_forward_callback(x, sigma, unconditioning)

            # process x using the original prompt, saving the attention maps
            CrossAttentionControl.request_save_attention_maps(self.model)
            _ = self.model_forward_callback(x, sigma, conditioning)
            CrossAttentionControl.clear_requests(self.model)

            # process x again, using the saved attention maps to control where self.edited_conditioning will be applied
            CrossAttentionControl.request_apply_saved_attention_maps(self.model)
            conditioned_next_x = self.model_forward_callback(x, sigma, self.edited_conditioning)
            CrossAttentionControl.clear_requests(self.model)

        # to scale how much effect conditioning has, calculate the change it makes and then scale that
        scaled_delta = (conditioned_next_x - unconditioned_next_x) * unconditional_guidance_scale
        combined_next_x = unconditioned_next_x + scaled_delta

        return combined_next_x

    # todo: make this work
    @classmethod
    def apply_conjunction(cls, x, t, forward_func, uc, c_or_weighted_c_list, global_guidance_scale):
        x_in = torch.cat([x] * 2)
        t_in = torch.cat([t] * 2)  # aka sigmas

        deltas = None
        uncond_latents = None
        weighted_cond_list = c_or_weighted_c_list if type(c_or_weighted_c_list) is list else [(c_or_weighted_c_list, 1)]

        # below is fugly omg
        num_actual_conditionings = len(c_or_weighted_c_list)
        conditionings = [uc] + [c for c, weight in weighted_cond_list]
        weights = [1] + [weight for c, weight in weighted_cond_list]
        chunk_count = ceil(len(conditionings) / 2)
        for chunk_index in range(chunk_count):
            offset = chunk_index * 2
            chunk_size = min(2, len(conditionings) - offset)

            if chunk_size == 1:
                c_in = conditionings[offset]
                latents_a = forward_func(x_in[:-1], t_in[:-1], c_in)
                latents_b = None
            else:
                c_in = torch.cat(conditionings[offset:offset + 2])
                latents_a, latents_b = forward_func(x_in, t_in, c_in).chunk(2)

            # first chunk is guaranteed to be 2 entries: uncond_latents + first conditioning
            if chunk_index == 0:
                uncond_latents = latents_a
                deltas = latents_b - uncond_latents
            else:
                deltas = torch.cat((deltas, latents_a - uncond_latents))
                if latents_b is not None:
                    deltas = torch.cat((deltas, latents_b - uncond_latents))

        # merge the weighted deltas together into a single merged delta
        per_delta_weights = torch.tensor(weights[1:], dtype=deltas.dtype, device=deltas.device)
        normalize = False
        if normalize:
            per_delta_weights /= torch.sum(per_delta_weights)
        reshaped_weights = per_delta_weights.reshape(per_delta_weights.shape + (1, 1, 1))
        deltas_merged = torch.sum(deltas * reshaped_weights, dim=0, keepdim=True)

        # old_return_value = super().forward(x, sigma, uncond, cond, cond_scale)
        # assert(0 == len(torch.nonzero(old_return_value - (uncond_latents + deltas_merged * cond_scale))))

        return uncond_latents + deltas_merged * global_guidance_scale


# adapted from bloc97's CrossAttentionControl colab
# https://github.com/bloc97/CrossAttentionControl


class CrossAttentionControl:

    @classmethod
    def remove_cross_attention_control(cls, model):
        cls.remove_attention_function(model)

    @classmethod
    def setup_cross_attention_control(cls, model,
                                      substitute_conditioning: torch.Tensor,
                                      edit_opcodes: list):
        """
        Inject attention parameters and functions into the passed-in model to enable cross attention editing.

        :param model: The unet model to inject into.
        :param substitute_conditioning: The "edited" conditioning vector, [B x 77 x 768]
        :param edit_opcodes: Opcodes from difflib.SequenceMatcher describing how the base
            conditionings map to the "edited" conditionings.
        :return:
        """

        # adapted from init_attention_edit
        device = substitute_conditioning.device

        # urgh. should this be hardcoded?
        max_length = 77
        # mask=1 means use base prompt attention, mask=0 means use edited prompt attention
        mask = torch.zeros(max_length)
        indices_target = torch.arange(max_length, dtype=torch.long)
        indices = torch.zeros(max_length, dtype=torch.long)
        for name, a0, a1, b0, b1 in edit_opcodes:
            if b0 < max_length:
                if name == "equal":  # or (name == "replace" and a1 - a0 == b1 - b0):
                    # these tokens have not been edited
                    indices[b0:b1] = indices_target[a0:a1]
                    mask[b0:b1] = 1

        for m in cls.get_attention_modules(model, cls.AttentionType.SELF):
            m.last_attn_slice_mask = None
            m.last_attn_slice_indices = None

        for m in cls.get_attention_modules(model, cls.AttentionType.TOKENS):
            m.last_attn_slice_mask = mask.to(device)
            m.last_attn_slice_indices = indices.to(device)

        cls.inject_attention_function(model)

    class AttentionType(Enum):
        SELF = 1
        TOKENS = 2

    @classmethod
    def get_attention_modules(cls, model, which: AttentionType):
        which_attn = "attn1" if which is cls.AttentionType.SELF else "attn2"
        return [module for name, module in model.named_modules() if
                type(module).__name__ == "CrossAttention" and which_attn in name]

    @classmethod
    def clear_requests(cls, model):
        self_attention_modules = cls.get_attention_modules(model, cls.AttentionType.SELF)
        tokens_attention_modules = cls.get_attention_modules(model, cls.AttentionType.TOKENS)
        for m in self_attention_modules + tokens_attention_modules:
            m.save_last_attn_slice = False
            m.use_last_attn_slice = False

    @classmethod
    def request_save_attention_maps(cls, model):
        self_attention_modules = cls.get_attention_modules(model, cls.AttentionType.SELF)
        tokens_attention_modules = cls.get_attention_modules(model, cls.AttentionType.TOKENS)
        for m in self_attention_modules + tokens_attention_modules:
            # clear out the saved slice in case the outermost dim changes
            m.last_attn_slice = None
            m.save_last_attn_slice = True

    @classmethod
    def request_apply_saved_attention_maps(cls, model):
        self_attention_modules = cls.get_attention_modules(model, cls.AttentionType.SELF)
        tokens_attention_modules = cls.get_attention_modules(model, cls.AttentionType.TOKENS)
        for m in self_attention_modules + tokens_attention_modules:
            m.use_last_attn_slice = True

    @classmethod
    def inject_attention_function(cls, unet):
        # ORIGINAL SOURCE CODE: https://github.com/huggingface/diffusers/blob/91ddd2a25b848df0fa1262d4f1cd98c7ccb87750/src/diffusers/models/attention.py#L276
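        # Note (added for clarity): the wrangler below is installed on each CrossAttention module via
        # set_attention_slice_wrangler() and is invoked with the module itself as `self`.
        # `suggested_attention_slice` is the attention slice the module is about to apply;
        # `dim`/`offset`/`slice_size` identify which slice of the full attention tensor is being processed
        # (dim is None for an unsliced call). Whatever tensor is returned is what the module actually uses.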

        def attention_slice_wrangler(self, attention_scores, suggested_attention_slice, dim, offset, slice_size):

            #print("in wrangler with suggested_attention_slice shape", suggested_attention_slice.shape, "dim", dim)

            attn_slice = suggested_attention_slice
            if dim is not None:
                start = offset
                end = start + slice_size
                #print(f"in wrangler, sliced dim {dim} {start}-{end}, use_last_attn_slice is {self.use_last_attn_slice}, save_last_attn_slice is {self.save_last_attn_slice}")
            #else:
            #    print(f"in wrangler, whole, use_last_attn_slice is {self.use_last_attn_slice}, save_last_attn_slice is {self.save_last_attn_slice}")

            if self.use_last_attn_slice:
                this_attn_slice = attn_slice
                if self.last_attn_slice_mask is not None:
                    # indices and mask operate on dim=2, no need to slice
                    base_attn_slice_full = torch.index_select(self.last_attn_slice, -1, self.last_attn_slice_indices)
                    base_attn_slice_mask = self.last_attn_slice_mask
                    if dim is None:
                        base_attn_slice = base_attn_slice_full
                        #print("using whole base slice of shape", base_attn_slice.shape, "from complete shape", base_attn_slice_full.shape)
                    elif dim == 0:
                        base_attn_slice = base_attn_slice_full[start:end]
                        #print("using base dim 0 slice of shape", base_attn_slice.shape, "from complete shape", base_attn_slice_full.shape)
                    elif dim == 1:
                        base_attn_slice = base_attn_slice_full[:, start:end]
                        #print("using base dim 1 slice of shape", base_attn_slice.shape, "from complete shape", base_attn_slice_full.shape)

                    # keep the current attention for edited tokens (mask==0) and re-use the saved base-prompt
                    # attention for unedited tokens (mask==1)
                    attn_slice = this_attn_slice * (1 - base_attn_slice_mask) + \
                                 base_attn_slice * base_attn_slice_mask
                else:
                    if dim is None:
                        attn_slice = self.last_attn_slice
                        #print("took whole slice of shape", attn_slice.shape, "from complete shape", self.last_attn_slice.shape)
                    elif dim == 0:
                        attn_slice = self.last_attn_slice[start:end]
                        #print("took dim 0 slice of shape", attn_slice.shape, "from complete shape", self.last_attn_slice.shape)
                    elif dim == 1:
                        attn_slice = self.last_attn_slice[:, start:end]
                        #print("took dim 1 slice of shape", attn_slice.shape, "from complete shape", self.last_attn_slice.shape)

            if self.save_last_attn_slice:
                if dim is None:
                    self.last_attn_slice = attn_slice
                elif dim == 0:
                    # dynamically grow last_attn_slice if needed
                    if self.last_attn_slice is None:
                        self.last_attn_slice = attn_slice
                        #print("no last_attn_slice: shape now", self.last_attn_slice.shape)
                    elif self.last_attn_slice.shape[0] == start:
                        self.last_attn_slice = torch.cat([self.last_attn_slice, attn_slice], dim=0)
                        assert(self.last_attn_slice.shape[0] == end)
                        #print("last_attn_slice too small, appended dim 0 shape", attn_slice.shape, ", shape now", self.last_attn_slice.shape)
                    else:
                        # no need to grow
                        self.last_attn_slice[start:end] = attn_slice
                        #print("last_attn_slice shape is fine, setting dim 0 shape", attn_slice.shape, ", shape now", self.last_attn_slice.shape)

                elif dim == 1:
                    # dynamically grow last_attn_slice if needed
                    if self.last_attn_slice is None:
                        self.last_attn_slice = attn_slice
                    elif self.last_attn_slice.shape[1] == start:
                        self.last_attn_slice = torch.cat([self.last_attn_slice, attn_slice], dim=1)
                        assert(self.last_attn_slice.shape[1] == end)
                    else:
                        # no need to grow
                        self.last_attn_slice[:, start:end] = attn_slice

            if self.use_last_attn_weights and self.last_attn_slice_weights is not None:
                if dim is None:
                    weights = self.last_attn_slice_weights
                elif dim == 0:
                    weights = self.last_attn_slice_weights[start:end]
                elif dim == 1:
                    weights = self.last_attn_slice_weights[:, start:end]
                attn_slice = attn_slice * weights

            return attn_slice

        for name, module in unet.named_modules():
            module_name = type(module).__name__
            if module_name == "CrossAttention":
                module.last_attn_slice = None
                module.last_attn_slice_indices = None
                module.last_attn_slice_mask = None
                # initialise weights too, so the wrangler's use_last_attn_weights branch cannot hit an
                # AttributeError if it is enabled before any weights have been stored
                module.last_attn_slice_weights = None
                module.use_last_attn_weights = False
                module.use_last_attn_slice = False
                module.save_last_attn_slice = False
                module.set_attention_slice_wrangler(attention_slice_wrangler)

    @classmethod
    def remove_attention_function(cls, unet):
        for name, module in unet.named_modules():
            module_name = type(module).__name__
            if module_name == "CrossAttention":
                module.set_attention_slice_wrangler(None)


# original code below

# Functions supporting Cross-Attention Control
# Copied from https://github.com/bloc97/CrossAttentionControl

from difflib import SequenceMatcher

import torch


def prompt_token(prompt, index, clip_tokenizer):
    tokens = clip_tokenizer(prompt,
                            padding='max_length',
                            max_length=clip_tokenizer.model_max_length,
                            truncation=True,
                            return_tensors='pt',
                            return_overflowing_tokens=True
                            ).input_ids[0]
    return clip_tokenizer.decode(tokens[index:index + 1])
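
# Example (added for illustration; assumes a Hugging Face transformers CLIPTokenizer, which is one possible
# `clip_tokenizer`):
#   from transformers import CLIPTokenizer
#   tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
#   prompt_token("a cat sitting on a car", 2, tokenizer)   # decodes the single token at position 2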


def use_last_tokens_attention(unet, use=True):
    for name, module in unet.named_modules():
        module_name = type(module).__name__
        if module_name == 'CrossAttention' and 'attn2' in name:
            module.use_last_attn_slice = use


def use_last_tokens_attention_weights(unet, use=True):
    for name, module in unet.named_modules():
        module_name = type(module).__name__
        if module_name == 'CrossAttention' and 'attn2' in name:
            module.use_last_attn_weights = use


def use_last_self_attention(unet, use=True):
    for name, module in unet.named_modules():
        module_name = type(module).__name__
        if module_name == 'CrossAttention' and 'attn1' in name:
            module.use_last_attn_slice = use


def save_last_tokens_attention(unet, save=True):
    for name, module in unet.named_modules():
        module_name = type(module).__name__
        if module_name == 'CrossAttention' and 'attn2' in name:
            module.save_last_attn_slice = save


def save_last_self_attention(unet, save=True):
    for name, module in unet.named_modules():
        module_name = type(module).__name__
        if module_name == 'CrossAttention' and 'attn1' in name:
            module.save_last_attn_slice = save