Add support for SDXL textual inversion/embeddings (#5213)

## What type of PR is this? (check all applicable)

- [ ] Refactor
- [ ] Feature
- [X] Bug Fix
- [ ] Optimization
- [ ] Documentation Update
- [ ] Community Node Submission

## Have you discussed this change with the InvokeAI team?

- [X] Yes
- [ ] No, because:

## Have you updated all relevant documentation?

- [X] Yes
- [ ] No

## Description

This adds support for at least some of the SDXL embeddings currently available on Civitai. The embeddings I have tested include:

- https://civitai.com/models/154898/marblingtixl?modelVersionId=173668
- https://civitai.com/models/148131?modelVersionId=167640
- https://civitai.com/models/123485/hannah-ferguson-or-sdxl-or-comfyui-only-or-embedding?modelVersionId=134674 (said to be "comfyui only")
- https://civitai.com/models/185938/kendall-jenner-sdxl-embedding?modelVersionId=208785

I am _not entirely sure_ that I have implemented support in the most elegant way. The issue is that these embeddings have two weight tensors, `clip_g` and `clip_l`, which correspond to `text_encoder_2` and `text_encoder`, respectively, in the main model. When the patcher calls the ModelPatcher's `apply_ti()` method, I simply check the dimensions of the incoming text encoder and choose the weights that match the dimensions of the encoder.

While writing this, I also ran into a possible issue with the Compel library's `get_pooled_embeddings()` call. It pads the input token list to the model's max token length and then calls the TI manager to add the additional tokens from the embedding. However, this ends up making the input token list longer than the max length, and CLIPTextEncoder crashes with a tensor size mismatch. I worked around this behavior by making the TI manager's `expand_textual_inversion_token_ids_if_necessary()` method remove the excess pads at the end of the token list.

Also note that I have made similar changes to `apply_ti()` in the ONNXModelPatcher, but haven't tested them yet.
## Related Tickets & Documents

- Related Issue #
- Closes #4401

## QA Instructions, Screenshots, Recordings

## Added/updated tests?

- [ ] Yes
- [X] No : We need to create tests for model patching...

## [optional] Are there any post deployment tasks we need to perform?
2024-08-30 20:32:17 +00:00 · 2023-12-01 09:17:01 -05:00
parent a8ef4e5be8 5a3f1f2b22
commit 0539a64569
2 changed files with 50 additions and 13 deletions
--- a/invokeai/backend/model_management/lora.py
+++ b/invokeai/backend/model_management/lora.py
@ -192,20 +192,33 @@ class ModelPatcher:
                    trigger += f"-!pad-{i}"
                return f"<{trigger}>"

+            def _get_ti_embedding(model_embeddings, ti):
+                # for SDXL models, select the embedding that matches the text encoder's dimensions
+                if ti.embedding_2 is not None:
+                    return (
+                        ti.embedding_2
+                        if ti.embedding_2.shape[1] == model_embeddings.weight.data[0].shape[0]
+                        else ti.embedding
+                    )
+                else:
+                    return ti.embedding
+
            # modify tokenizer
            new_tokens_added = 0
            for ti_name, ti in ti_list:
-                for i in range(ti.embedding.shape[0]):
+                ti_embedding = _get_ti_embedding(text_encoder.get_input_embeddings(), ti)
+
+                for i in range(ti_embedding.shape[0]):
                    new_tokens_added += ti_tokenizer.add_tokens(_get_trigger(ti_name, i))

            # modify text_encoder
            text_encoder.resize_token_embeddings(init_tokens_count + new_tokens_added, pad_to_multiple_of)
            model_embeddings = text_encoder.get_input_embeddings()

-            for ti_name, ti in ti_list:
+            for ti_name, _ in ti_list:
                ti_tokens = []
-                for i in range(ti.embedding.shape[0]):
-                    embedding = ti.embedding[i]
+                for i in range(ti_embedding.shape[0]):
+                    embedding = ti_embedding[i]
                    trigger = _get_trigger(ti_name, i)

                    token_id = ti_tokenizer.convert_tokens_to_ids(trigger)
@ -273,6 +286,7 @@ class ModelPatcher:

 class TextualInversionModel:
    embedding: torch.Tensor  # [n, 768]|[n, 1280]
+    embedding_2: Optional[torch.Tensor] = None  # [n, 768]|[n, 1280]   - for SDXL models

    @classmethod
    def from_checkpoint(
@ -296,8 +310,8 @@ class TextualInversionModel:
        if "string_to_param" in state_dict:
            if len(state_dict["string_to_param"]) > 1:
                print(
-                    f'Warn: Embedding "{file_path.name}" contains multiple tokens, which is not supported. The first'
-                    " token will be used."
+                    f'Warn: Embedding "{file_path.name}" contains multiple tokens, which is not supported. The first',
+                    " token will be used.",
                )

            result.embedding = next(iter(state_dict["string_to_param"].values()))
@ -306,6 +320,11 @@ class TextualInversionModel:
        elif "emb_params" in state_dict:
            result.embedding = state_dict["emb_params"]

+        # v5(sdxl safetensors file)
+        elif "clip_g" in state_dict and "clip_l" in state_dict:
+            result.embedding = state_dict["clip_g"]
+            result.embedding_2 = state_dict["clip_l"]
+
        # v4(diffusers bin files)
        else:
            result.embedding = next(iter(state_dict.values()))
@ -342,6 +361,13 @@ class TextualInversionManager(BaseTextualInversionManager):
            if token_id in self.pad_tokens:
                new_token_ids.extend(self.pad_tokens[token_id])

+        # Do not exceed the max model input size
+        # The -2 here compensates for compel.embeddings_provider.get_token_ids(),
+        # which first removes and then adds back the start and end tokens.
+        max_length = list(self.tokenizer.max_model_input_sizes.values())[0] - 2
+        if len(new_token_ids) > max_length:
+            new_token_ids = new_token_ids[0:max_length]
+
        return new_token_ids


@ -490,24 +516,31 @@ class ONNXModelPatcher:
                    trigger += f"-!pad-{i}"
                return f"<{trigger}>"

+            # modify text_encoder
+            orig_embeddings = text_encoder.tensors["text_model.embeddings.token_embedding.weight"]
+
            # modify tokenizer
            new_tokens_added = 0
            for ti_name, ti in ti_list:
-                for i in range(ti.embedding.shape[0]):
-                    new_tokens_added += ti_tokenizer.add_tokens(_get_trigger(ti_name, i))
+                if ti.embedding_2 is not None:
+                    ti_embedding = (
+                        ti.embedding_2 if ti.embedding_2.shape[1] == orig_embeddings.shape[0] else ti.embedding
+                    )
+                else:
+                    ti_embedding = ti.embedding

-            # modify text_encoder
-            orig_embeddings = text_encoder.tensors["text_model.embeddings.token_embedding.weight"]
+                for i in range(ti_embedding.shape[0]):
+                    new_tokens_added += ti_tokenizer.add_tokens(_get_trigger(ti_name, i))

            embeddings = np.concatenate(
                (np.copy(orig_embeddings), np.zeros((new_tokens_added, orig_embeddings.shape[1]))),
                axis=0,
            )

-            for ti_name, ti in ti_list:
+            for ti_name, _ in ti_list:
                ti_tokens = []
-                for i in range(ti.embedding.shape[0]):
-                    embedding = ti.embedding[i].detach().numpy()
+                for i in range(ti_embedding.shape[0]):
+                    embedding = ti_embedding[i].detach().numpy()
                    trigger = _get_trigger(ti_name, i)

                    token_id = ti_tokenizer.convert_tokens_to_ids(trigger)
--- a/invokeai/backend/model_management/model_probe.py
+++ b/invokeai/backend/model_management/model_probe.py
@ -373,12 +373,16 @@ class TextualInversionCheckpointProbe(CheckpointProbeBase):
            token_dim = list(checkpoint["string_to_param"].values())[0].shape[-1]
        elif "emb_params" in checkpoint:
            token_dim = checkpoint["emb_params"].shape[-1]
+        elif "clip_g" in checkpoint:
+            token_dim = checkpoint["clip_g"].shape[-1]
        else:
            token_dim = list(checkpoint.values())[0].shape[0]
        if token_dim == 768:
            return BaseModelType.StableDiffusion1
        elif token_dim == 1024:
            return BaseModelType.StableDiffusion2
+        elif token_dim == 1280:
+            return BaseModelType.StableDiffusionXL
        else:
            return None