import enum from typing import Optional import torch # adapted from bloc97's CrossAttentionControl colab # https://github.com/bloc97/CrossAttentionControl class CrossAttentionControl: class Arguments: def __init__(self, edited_conditioning: torch.Tensor, edit_opcodes: list[tuple], edit_options: dict): """ :param edited_conditioning: if doing cross-attention control, the edited conditioning [1 x 77 x 768] :param edit_opcodes: if doing cross-attention control, a list of difflib.SequenceMatcher-like opcodes describing how to map original conditioning tokens to edited conditioning tokens (only the 'equal' opcode is required) :param edit_options: if doing cross-attention control, per-edit options. there should be 1 item in edit_options for each item in edit_opcodes. """ # todo: rewrite this to take embedding fragments rather than a single edited_conditioning vector self.edited_conditioning = edited_conditioning self.edit_opcodes = edit_opcodes if edited_conditioning is not None: assert len(edit_opcodes) == len(edit_options), \ "there must be 1 edit_options dict for each edit_opcodes tuple" non_none_edit_options = [x for x in edit_options if x is not None] assert len(non_none_edit_options)>0, "missing edit_options" if len(non_none_edit_options)>1: print('warning: cross-attention control options are not working properly for >1 edit') self.edit_options = non_none_edit_options[0] class Context: class Action(enum.Enum): NONE = 0 SAVE = 1, APPLY = 2 def __init__(self, arguments: 'CrossAttentionControl.Arguments', step_count: int): """ :param arguments: Arguments for the cross-attention control process :param step_count: The absolute total number of steps of diffusion (for img2img this is likely larger than the number of steps that will actually run) """ self.arguments = arguments self.step_count = step_count self.self_cross_attention_module_identifiers = [] self.tokens_cross_attention_module_identifiers = [] self.saved_cross_attention_maps = {} self.clear_requests(cleanup=True) def register_cross_attention_modules(self, model): for name,module in CrossAttentionControl.get_attention_modules(model, CrossAttentionControl.CrossAttentionType.SELF): self.self_cross_attention_module_identifiers.append(name) for name,module in CrossAttentionControl.get_attention_modules(model, CrossAttentionControl.CrossAttentionType.TOKENS): self.tokens_cross_attention_module_identifiers.append(name) def request_save_attention_maps(self, cross_attention_type: 'CrossAttentionControl.CrossAttentionType'): if cross_attention_type == CrossAttentionControl.CrossAttentionType.SELF: self.self_cross_attention_action = CrossAttentionControl.Context.Action.SAVE else: self.tokens_cross_attention_action = CrossAttentionControl.Context.Action.SAVE def request_apply_saved_attention_maps(self, cross_attention_type: 'CrossAttentionControl.CrossAttentionType'): if cross_attention_type == CrossAttentionControl.CrossAttentionType.SELF: self.self_cross_attention_action = CrossAttentionControl.Context.Action.APPLY else: self.tokens_cross_attention_action = CrossAttentionControl.Context.Action.APPLY def is_tokens_cross_attention(self, module_identifier) -> bool: return module_identifier in self.tokens_cross_attention_module_identifiers def get_should_save_maps(self, module_identifier: str) -> bool: if module_identifier in self.self_cross_attention_module_identifiers: return self.self_cross_attention_action == CrossAttentionControl.Context.Action.SAVE elif module_identifier in self.tokens_cross_attention_module_identifiers: return self.tokens_cross_attention_action == CrossAttentionControl.Context.Action.SAVE return False def get_should_apply_saved_maps(self, module_identifier: str) -> bool: if module_identifier in self.self_cross_attention_module_identifiers: return self.self_cross_attention_action == CrossAttentionControl.Context.Action.APPLY elif module_identifier in self.tokens_cross_attention_module_identifiers: return self.tokens_cross_attention_action == CrossAttentionControl.Context.Action.APPLY return False def get_active_cross_attention_control_types_for_step(self, percent_through:float=None)\ -> list['CrossAttentionControl.CrossAttentionType']: """ Should cross-attention control be applied on the given step? :param percent_through: How far through the step sequence are we (0.0=pure noise, 1.0=completely denoised image). Expected range 0.0..<1.0. :return: A list of attention types that cross-attention control should be performed for on the given step. May be []. """ if percent_through is None: return [CrossAttentionControl.CrossAttentionType.SELF, CrossAttentionControl.CrossAttentionType.TOKENS] opts = self.arguments.edit_options to_control = [] if opts['s_start'] <= percent_through and percent_through < opts['s_end']: to_control.append(CrossAttentionControl.CrossAttentionType.SELF) if opts['t_start'] <= percent_through and percent_through < opts['t_end']: to_control.append(CrossAttentionControl.CrossAttentionType.TOKENS) return to_control def save_slice(self, identifier: str, slice: torch.Tensor, dim: Optional[int], offset: int, slice_size: Optional[int]): if identifier not in self.saved_cross_attention_maps: self.saved_cross_attention_maps[identifier] = { 'dim': dim, 'slice_size': slice_size, 'slices': {offset or 0: slice} } else: self.saved_cross_attention_maps[identifier]['slices'][offset or 0] = slice def get_slice(self, identifier: str, requested_dim: Optional[int], requested_offset: int, slice_size: int): saved_attention_dict = self.saved_cross_attention_maps[identifier] if requested_dim is None: if saved_attention_dict['dim'] is not None: raise RuntimeError(f"dim mismatch: expected dim=None, have {saved_attention_dict['dim']}") return saved_attention_dict['slices'][0] if saved_attention_dict['dim'] == requested_dim: if slice_size != saved_attention_dict['slice_size']: raise RuntimeError( f"slice_size mismatch: expected slice_size={slice_size}, have {saved_attention_dict['slice_size']}") return saved_attention_dict['slices'][requested_offset] if saved_attention_dict['dim'] == None: whole_saved_attention = saved_attention_dict['slices'][0] if requested_dim == 0: return whole_saved_attention[requested_offset:requested_offset + slice_size] elif requested_dim == 1: return whole_saved_attention[:, requested_offset:requested_offset + slice_size] raise RuntimeError(f"Cannot convert dim {saved_attention_dict['dim']} to requested dim {requested_dim}") def get_slicing_strategy(self, identifier: str) -> Optional[tuple[int, int]]: saved_attention = self.saved_cross_attention_maps.get(identifier, None) if saved_attention is None: return None, None return saved_attention['dim'], saved_attention['slice_size'] def clear_requests(self, cleanup=True): self.tokens_cross_attention_action = CrossAttentionControl.Context.Action.NONE self.self_cross_attention_action = CrossAttentionControl.Context.Action.NONE if cleanup: self.saved_cross_attention_maps = {} def offload_saved_attention_slices_to_cpu(self): for key, map_dict in self.saved_cross_attention_maps.items(): for offset, slice in map_dict['slices'].items(): map_dict[offset] = slice.to('cpu') @classmethod def remove_cross_attention_control(cls, model): cls.remove_attention_function(model) @classmethod def setup_cross_attention_control(cls, model, context: Context): """ Inject attention parameters and functions into the passed in model to enable cross attention editing. :param model: The unet model to inject into. :param cross_attention_control_args: Arugments passeed to the CrossAttentionControl implementations :return: None """ # adapted from init_attention_edit device = context.arguments.edited_conditioning.device # urgh. should this be hardcoded? max_length = 77 # mask=1 means use base prompt attention, mask=0 means use edited prompt attention mask = torch.zeros(max_length) indices_target = torch.arange(max_length, dtype=torch.long) indices = torch.zeros(max_length, dtype=torch.long) for name, a0, a1, b0, b1 in context.arguments.edit_opcodes: if b0 < max_length: if name == "equal":# or (name == "replace" and a1 - a0 == b1 - b0): # these tokens have not been edited indices[b0:b1] = indices_target[a0:a1] mask[b0:b1] = 1 context.register_cross_attention_modules(model) context.cross_attention_mask = mask.to(device) context.cross_attention_index_map = indices.to(device) cls.inject_attention_function(model, context) class CrossAttentionType(enum.Enum): SELF = 1 TOKENS = 2 @classmethod def get_attention_modules(cls, model, which: CrossAttentionType): which_attn = "attn1" if which is cls.CrossAttentionType.SELF else "attn2" return [(name,module) for name, module in model.named_modules() if type(module).__name__ == "CrossAttention" and which_attn in name] @classmethod def inject_attention_function(cls, unet, context: 'CrossAttentionControl.Context'): # ORIGINAL SOURCE CODE: https://github.com/huggingface/diffusers/blob/91ddd2a25b848df0fa1262d4f1cd98c7ccb87750/src/diffusers/models/attention.py#L276 def attention_slice_wrangler(module, suggested_attention_slice:torch.Tensor, dim, offset, slice_size): #memory_usage = suggested_attention_slice.element_size() * suggested_attention_slice.nelement() attention_slice = suggested_attention_slice if context.get_should_save_maps(module.identifier): #print(module.identifier, "saving suggested_attention_slice of shape", # suggested_attention_slice.shape, "dim", dim, "offset", offset) slice_to_save = attention_slice.to('cpu') if dim is not None else attention_slice context.save_slice(module.identifier, slice_to_save, dim=dim, offset=offset, slice_size=slice_size) elif context.get_should_apply_saved_maps(module.identifier): #print(module.identifier, "applying saved attention slice for dim", dim, "offset", offset) saved_attention_slice = context.get_slice(module.identifier, dim, offset, slice_size) # slice may have been offloaded to CPU saved_attention_slice = saved_attention_slice.to(suggested_attention_slice.device) if context.is_tokens_cross_attention(module.identifier): index_map = context.cross_attention_index_map remapped_saved_attention_slice = torch.index_select(saved_attention_slice, -1, index_map) this_attention_slice = suggested_attention_slice mask = context.cross_attention_mask saved_mask = mask this_mask = 1 - mask attention_slice = remapped_saved_attention_slice * saved_mask + \ this_attention_slice * this_mask else: # just use everything attention_slice = saved_attention_slice return attention_slice for name, module in unet.named_modules(): module_name = type(module).__name__ if module_name == "CrossAttention": module.identifier = name module.set_attention_slice_wrangler(attention_slice_wrangler) module.set_slicing_strategy_getter(lambda module, module_identifier=name: \ context.get_slicing_strategy(module_identifier)) @classmethod def remove_attention_function(cls, unet): # clear wrangler callback for name, module in unet.named_modules(): module_name = type(module).__name__ if module_name == "CrossAttention": module.set_attention_slice_wrangler(None) module.set_slicing_strategy_getter(None)