From d2c55dc0110dc561a16b8b3aad436a7ad2a4a112 Mon Sep 17 00:00:00 2001
From: Damian Stewart
Date: Sun, 30 Jul 2023 14:20:59 +0200
Subject: [PATCH 1/7] enable .and() syntax and long prompts

---
 invokeai/app/invocations/compel.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/invokeai/app/invocations/compel.py b/invokeai/app/invocations/compel.py
index fb29e01628..3795fea8fd 100644
--- a/invokeai/app/invocations/compel.py
+++ b/invokeai/app/invocations/compel.py
@@ -127,16 +127,15 @@ class CompelInvocation(BaseInvocation):
             text_encoder=text_encoder,
             textual_inversion_manager=ti_manager,
             dtype_for_device_getter=torch_dtype,
-            truncate_long_prompts=True,
+            truncate_long_prompts=False,
         )
 
         conjunction = Compel.parse_prompt_string(self.prompt)
-        prompt: Union[FlattenedPrompt, Blend] = conjunction.prompts[0]
 
         if context.services.configuration.log_tokenization:
-            log_tokenization_for_prompt_object(prompt, tokenizer)
+            log_tokenization_for_prompt_object(conjunction, tokenizer)
 
-        c, options = compel.build_conditioning_tensor_for_prompt_object(prompt)
+        c, options = compel.build_conditioning_tensor_for_conjunction(conjunction)
 
         ec = InvokeAIDiffuserComponent.ExtraConditioningInfo(
             tokens_count_including_eos_bos=get_max_token_count(tokenizer, conjunction),
@@ -289,7 +288,7 @@ class SDXLPromptInvocationBase:
             text_encoder=text_encoder,
             textual_inversion_manager=ti_manager,
             dtype_for_device_getter=torch_dtype,
-            truncate_long_prompts=True,  # TODO:
+            truncate_long_prompts=False,  # TODO:
             returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,  # TODO: clip skip
             requires_pooled=True,
         )
@@ -298,8 +297,7 @@ class SDXLPromptInvocationBase:
 
         if context.services.configuration.log_tokenization:
             # TODO: better logging for and syntax
-            for prompt_obj in conjunction.prompts:
-                log_tokenization_for_prompt_object(prompt_obj, tokenizer)
+            log_tokenization_for_conjunction(conjunction, tokenizer)
 
         # TODO: ask for optimizations? to not run text_encoder twice
         c, options = compel.build_conditioning_tensor_for_conjunction(conjunction)
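
Note: compel's `.and()` syntax splits a prompt into sub-prompts that are encoded
separately and then combined, and `Compel.parse_prompt_string` wraps the result in a
`Conjunction`. The sketch below shows the code path this patch switches to; the
checkpoint name and bare-bones wiring are illustrative assumptions, not InvokeAI's
actual setup.

    # Hypothetical, minimal usage of the conjunction API (not InvokeAI code).
    from compel import Compel
    from transformers import CLIPTextModel, CLIPTokenizer

    repo = "runwayml/stable-diffusion-v1-5"  # assumed example checkpoint
    tokenizer = CLIPTokenizer.from_pretrained(repo, subfolder="tokenizer")
    text_encoder = CLIPTextModel.from_pretrained(repo, subfolder="text_encoder")

    # truncate_long_prompts=False is the behavior change made above: prompts
    # longer than the 77-token CLIP window are encoded in chunks instead of
    # being silently cut off.
    compel = Compel(tokenizer=tokenizer, text_encoder=text_encoder, truncate_long_prompts=False)

    # Each sub-prompt of the .and() is encoded on its own, then combined.
    conjunction = Compel.parse_prompt_string('("a sprawling castle", "dramatic sunset lighting").and()')
    c, options = compel.build_conditioning_tensor_for_conjunction(conjunction)
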
From b65c9ad61209f089dcfb80ead84c36e9256c1ccf Mon Sep 17 00:00:00 2001
From: Sergey Borisov
Date: Mon, 28 Aug 2023 04:50:58 +0300
Subject: [PATCH 2/7] Add monkeypatch for xformers to align unaligned attention_mask

---
 invokeai/backend/util/hotfixes.py | 44 +++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/invokeai/backend/util/hotfixes.py b/invokeai/backend/util/hotfixes.py
index 3d7f278f86..cf97d494d7 100644
--- a/invokeai/backend/util/hotfixes.py
+++ b/invokeai/backend/util/hotfixes.py
@@ -761,3 +761,47 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
 
 diffusers.ControlNetModel = ControlNetModel
 diffusers.models.controlnet.ControlNetModel = ControlNetModel
+
+try:
+    import xformers
+    xformers_available = True
+except:
+    xformers_available = False
+
+
+if xformers_available:
+    # TODO: remove when fixed in diffusers
+    _xformers_memory_efficient_attention = xformers.ops.memory_efficient_attention
+    def new_memory_efficient_attention(
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attn_bias = None,
+        p: float = 0.0,
+        scale: Optional[float] = None,
+        *,
+        op = None,
+    ):
+        # diffusers not align shape to 8, which is required by xformers
+        if attn_bias is not None and type(attn_bias) is torch.Tensor:
+            orig_size = attn_bias.shape[-1]
+            new_size = ((orig_size + 7) // 8) * 8
+            aligned_attn_bias = torch.zeros(
+                (attn_bias.shape[0], attn_bias.shape[1], new_size),
+                device=attn_bias.device,
+                dtype=attn_bias.dtype,
+            )
+            aligned_attn_bias[:,:,:orig_size] = attn_bias
+            attn_bias = aligned_attn_bias[:,:,:orig_size]
+
+        return _xformers_memory_efficient_attention(
+            query=query,
+            key=key,
+            value=value,
+            attn_bias=attn_bias,
+            p=p,
+            scale=scale,
+            op=op,
+        )
+
+    xformers.ops.memory_efficient_attention = new_memory_efficient_attention

From 2bf747caf6a5ea5481984e2545c73af955c1b54c Mon Sep 17 00:00:00 2001
From: Sergey Borisov
Date: Mon, 28 Aug 2023 18:36:27 +0300
Subject: [PATCH 3/7] Blackify

---
 invokeai/backend/util/hotfixes.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/invokeai/backend/util/hotfixes.py b/invokeai/backend/util/hotfixes.py
index cf97d494d7..161a35eb52 100644
--- a/invokeai/backend/util/hotfixes.py
+++ b/invokeai/backend/util/hotfixes.py
@@ -764,6 +764,7 @@ diffusers.models.controlnet.ControlNetModel = ControlNetModel
 
 try:
     import xformers
+
     xformers_available = True
 except:
     xformers_available = False
@@ -772,27 +773,28 @@ except:
 
 
 if xformers_available:
     # TODO: remove when fixed in diffusers
     _xformers_memory_efficient_attention = xformers.ops.memory_efficient_attention
+
     def new_memory_efficient_attention(
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
-        attn_bias = None,
+        attn_bias=None,
         p: float = 0.0,
         scale: Optional[float] = None,
         *,
-        op = None,
+        op=None,
     ):
         # diffusers not align shape to 8, which is required by xformers
         if attn_bias is not None and type(attn_bias) is torch.Tensor:
             orig_size = attn_bias.shape[-1]
             new_size = ((orig_size + 7) // 8) * 8
-            aligned_attn_bias = torch.zeros(
+            aligned_attn_bias = torch.zeros(
                 (attn_bias.shape[0], attn_bias.shape[1], new_size),
                 device=attn_bias.device,
                 dtype=attn_bias.dtype,
             )
-            aligned_attn_bias[:,:,:orig_size] = attn_bias
-            attn_bias = aligned_attn_bias[:,:,:orig_size]
+            aligned_attn_bias[:, :, :orig_size] = attn_bias
+            attn_bias = aligned_attn_bias[:, :, :orig_size]
 
         return _xformers_memory_efficient_attention(
             query=query,
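
Note on the hotfix above: the final slice back to `orig_size` looks like a no-op, but
it is the point of the exercise. xformers (at the versions current here) rejects an
`attn_bias` whose last dimension is not backed by 8-aligned memory, and the suggested
workaround is exactly this: allocate a padded tensor and slice a view out of it. The
view has the original logical shape, while the underlying storage, and therefore the
strides, stay aligned. A standalone illustration with an arbitrary 77-token mask:

    import torch

    mask = torch.zeros(1, 77, 77)      # last dim 77 is not a multiple of 8
    aligned = torch.zeros(1, 77, 80)   # 80 = ((77 + 7) // 8) * 8
    aligned[:, :, :77] = mask
    view = aligned[:, :, :77]          # same logical shape as `mask`
    print(mask.shape == view.shape)    # True
    print(mask.stride())               # (5929, 77, 1)
    print(view.stride())               # (6160, 80, 1): rows sit on 8-aligned storage
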
             query=query,

From 4196c669a0fc9d4bf1d2160657af7a697b2db23c Mon Sep 17 00:00:00 2001
From: blessedcoolant <54517381+blessedcoolant@users.noreply.github.com>
Date: Tue, 29 Aug 2023 12:57:26 +1200
Subject: [PATCH 4/7] chore: black / flake lint errors

---
 invokeai/backend/util/hotfixes.py | 33 ++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/invokeai/backend/util/hotfixes.py b/invokeai/backend/util/hotfixes.py
index 34aefdd827..983d0b7601 100644
--- a/invokeai/backend/util/hotfixes.py
+++ b/invokeai/backend/util/hotfixes.py
@@ -38,7 +38,8 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
             Whether to flip the sin to cos in the time embedding.
         freq_shift (`int`, defaults to 0):
             The frequency shift to apply to the time embedding.
-        down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+        down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", \
+            "CrossAttnDownBlock2D", "DownBlock2D")`):
             The tuple of downsample blocks to use.
         only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
         block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
@@ -140,7 +141,9 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
         # If `num_attention_heads` is not defined (which is the case for most models)
         # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
         # The reason for this behavior is to correct for incorrectly named variables that were introduced
-        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # when this library was created...
+        # The incorrect naming was only discovered much ...
+        # later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
         # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
         # which is why we correct for the naming here.
         num_attention_heads = num_attention_heads or attention_head_dim
@@ -148,17 +151,20 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
         # Check inputs
         if len(block_out_channels) != len(down_block_types):
             raise ValueError(
-                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. \
+                `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
             )
 
         if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
             raise ValueError(
-                f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
+                f"Must provide the same number of `only_cross_attention` as `down_block_types`. \
+                `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
             )
 
         if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
             raise ValueError(
-                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. \
+                `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
             )
 
         if isinstance(transformer_layers_per_block, int):
@@ -195,7 +201,8 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
             self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
         elif encoder_hid_dim_type == "text_image_proj":
             # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
-            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension ...
+            # for the currently only use
             # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)`
             self.encoder_hid_proj = TextImageProjection(
                 text_embed_dim=encoder_hid_dim,
@@ -243,8 +250,10 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
                 text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
             )
         elif addition_embed_type == "text_image":
-            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
-            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`.
+            # To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension...
+            # for the currently only use
             # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)`
             self.add_embedding = TextImageTimeEmbedding(
                 text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
@@ -666,12 +675,14 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
         elif self.config.addition_embed_type == "text_time":
             if "text_embeds" not in added_cond_kwargs:
                 raise ValueError(
-                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which \
+                        requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
                 )
             text_embeds = added_cond_kwargs.get("text_embeds")
             if "time_ids" not in added_cond_kwargs:
                 raise ValueError(
-                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which \
+                        requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
                 )
             time_ids = added_cond_kwargs.get("time_ids")
             time_embeds = self.add_time_proj(time_ids.flatten())
@@ -774,7 +785,7 @@
 
 try:
     import xformers
 
     xformers_available = True
-except:
+except Exception:
     xformers_available = False
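
One caveat about the wrapping style introduced above: a backslash continuation inside
an f-string keeps the next line's leading indentation as part of the string, so these
reformatted error messages now carry embedded runs of spaces. A small demonstration
with a shortened, hypothetical message; implicit concatenation of adjacent literals is
the usual way to wrap without this artifact:

    channels = [320]  # illustrative value

    # Backslash continuation inside the literal: the indent survives in the message.
    wrapped = f"Must provide the same number of `block_out_channels`. \
        `block_out_channels`: {channels}."
    assert "  " in wrapped  # embedded run of spaces

    # Implicit concatenation keeps the message clean.
    clean = "Must provide the same number of `block_out_channels`. " f"`block_out_channels`: {channels}."
    assert "  " not in clean
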
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. \ + `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." ) if isinstance(transformer_layers_per_block, int): @@ -195,7 +201,8 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin): self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) elif encoder_hid_dim_type == "text_image_proj": # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much - # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # they are set to `cross_attention_dim` here as this is exactly the required dimension ... + # for the currently only use # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)` self.encoder_hid_proj = TextImageProjection( text_embed_dim=encoder_hid_dim, @@ -243,8 +250,10 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin): text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads ) elif addition_embed_type == "text_image": - # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much - # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. + # To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension... + # for the currently only use # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)` self.add_embedding = TextImageTimeEmbedding( text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim @@ -666,12 +675,14 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin): elif self.config.addition_embed_type == "text_time": if "text_embeds" not in added_cond_kwargs: raise ValueError( - f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which \ + requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" ) text_embeds = added_cond_kwargs.get("text_embeds") if "time_ids" not in added_cond_kwargs: raise ValueError( - f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which \ + requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" ) time_ids = added_cond_kwargs.get("time_ids") time_embeds = self.add_time_proj(time_ids.flatten()) @@ -774,7 +785,7 @@ try: import xformers xformers_available = True -except: +except Exception: xformers_available = False From 68dc3c6cb41a492dd1292451a59b0ec832324ca0 Mon Sep 17 00:00:00 2001 From: blessedcoolant <54517381+blessedcoolant@users.noreply.github.com> Date: Tue, 29 Aug 2023 12:58:59 +1200 Subject: [PATCH 5/7] feat: Upgrade compel to 2.0.2 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9aef66a35f..129538264d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ "albumentations", 
"click", "clip_anytorch", # replacing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip", - "compel~=2.0.0", + "compel~=2.0.2", "controlnet-aux>=0.0.6", "timm==0.6.13", # needed to override timm latest in controlnet_aux, see https://github.com/isl-org/ZoeDepth/issues/26 "datasets", From cfee8d9804fee37a350f55447e4204b5d9d8f576 Mon Sep 17 00:00:00 2001 From: blessedcoolant <54517381+blessedcoolant@users.noreply.github.com> Date: Tue, 29 Aug 2023 13:09:30 +1200 Subject: [PATCH 6/7] chore: seamless print statement cleanup --- invokeai/backend/model_management/seamless.py | 1 - 1 file changed, 1 deletion(-) diff --git a/invokeai/backend/model_management/seamless.py b/invokeai/backend/model_management/seamless.py index 54885769ad..7138f2e123 100644 --- a/invokeai/backend/model_management/seamless.py +++ b/invokeai/backend/model_management/seamless.py @@ -71,7 +71,6 @@ def set_seamless(model: Union[UNet2DConditionModel, AutoencoderKL], seamless_axe """ if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)): - print(f"applied - {m_name}") m.asymmetric_padding_mode = {} m.asymmetric_padding = {} m.asymmetric_padding_mode["x"] = "circular" if ("x" in seamless_axes) else "constant" From 121396f8440517b913f4dce0a763a99c002c9f42 Mon Sep 17 00:00:00 2001 From: Sergey Borisov Date: Tue, 29 Aug 2023 17:07:33 +0300 Subject: [PATCH 7/7] Fix tokenization log for sd models --- invokeai/app/invocations/compel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/invokeai/app/invocations/compel.py b/invokeai/app/invocations/compel.py index 8a4cadc139..e128792d70 100644 --- a/invokeai/app/invocations/compel.py +++ b/invokeai/app/invocations/compel.py @@ -122,7 +122,7 @@ class CompelInvocation(BaseInvocation): conjunction = Compel.parse_prompt_string(self.prompt) if context.services.configuration.log_tokenization: - log_tokenization_for_prompt_object(conjunction, tokenizer) + log_tokenization_for_conjunction(conjunction, tokenizer) c, options = compel.build_conditioning_tensor_for_conjunction(conjunction)