From 0719a46372bde45bd0b60d251030e8a47b0d994b Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Fri, 1 Dec 2023 01:28:28 -0500
Subject: [PATCH 1/3] add support for SDXL textual inversion/embeddings

---
 invokeai/backend/model_management/lora.py    | 52 +++++++++++++++----
 .../backend/model_management/model_probe.py  |  4 ++
 2 files changed, 45 insertions(+), 11 deletions(-)

diff --git a/invokeai/backend/model_management/lora.py b/invokeai/backend/model_management/lora.py
index 4389cacacc..f9c40f8386 100644
--- a/invokeai/backend/model_management/lora.py
+++ b/invokeai/backend/model_management/lora.py
@@ -192,10 +192,19 @@ class ModelPatcher:
                     trigger += f"-!pad-{i}"
                 return f"<{trigger}>"
 
+            def _get_ti_embedding(model_embeddings, ti):
+                # for SDXL models, select the embedding that matches the text encoder's dimensions
+                if ti.embedding_2 is not None:
+                    return ti.embedding_2 if ti.embedding_2.shape[1] == model_embeddings.weight.data[0].shape[0] else ti.embedding
+                else:
+                    return ti.embedding
+
             # modify tokenizer
             new_tokens_added = 0
             for ti_name, ti in ti_list:
-                for i in range(ti.embedding.shape[0]):
+                ti_embedding = _get_ti_embedding(text_encoder.get_input_embeddings(), ti)
+
+                for i in range(ti_embedding.shape[0]):
                     new_tokens_added += ti_tokenizer.add_tokens(_get_trigger(ti_name, i))
 
             # modify text_encoder
@@ -203,9 +212,10 @@
             model_embeddings = text_encoder.get_input_embeddings()
 
             for ti_name, ti in ti_list:
+
                 ti_tokens = []
-                for i in range(ti.embedding.shape[0]):
-                    embedding = ti.embedding[i]
+                for i in range(ti_embedding.shape[0]):
+                    embedding = ti_embedding[i]
                     trigger = _get_trigger(ti_name, i)
 
                     token_id = ti_tokenizer.convert_tokens_to_ids(trigger)
@@ -272,7 +282,8 @@ class ModelPatcher:
 
 
 class TextualInversionModel:
-    embedding: torch.Tensor  # [n, 768]|[n, 1280]
+    embedding: torch.Tensor  # [n, 768]|[n, 1280]
+    embedding_2: Optional[torch.Tensor] = None  # [n, 768]|[n, 1280] - for SDXL models
 
     @classmethod
     def from_checkpoint(
@@ -296,7 +307,7 @@ class TextualInversionModel:
         if "string_to_param" in state_dict:
             if len(state_dict["string_to_param"]) > 1:
                 print(
-                    f'Warn: Embedding "{file_path.name}" contains multiple tokens, which is not supported. The first'
+                    f'Warn: Embedding "{file_path.name}" contains multiple tokens, which is not supported. The first',
                     " token will be used."
                 )
 
             result.embedding = next(iter(state_dict["string_to_param"].values()))
 
@@ -306,6 +317,11 @@
         elif "emb_params" in state_dict:
             result.embedding = state_dict["emb_params"]
 
+        # v5(sdxl safetensors file)
+        elif "clip_g" in state_dict and "clip_l" in state_dict:
+            result.embedding = state_dict["clip_g"]
+            result.embedding_2 = state_dict["clip_l"]
+
         # v4(diffusers bin files)
         else:
            result.embedding = next(iter(state_dict.values()))
@@ -316,6 +332,7 @@
 
         if not isinstance(result.embedding, torch.Tensor):
             raise ValueError(f"Invalid embeddings file: {file_path.name}")
+
         return result
 
 
@@ -342,6 +359,13 @@
             if token_id in self.pad_tokens:
                 new_token_ids.extend(self.pad_tokens[token_id])
 
+        # Do not exceed the max model input size
+        # The -2 here is compensating for compel.embeddings_provider.get_token_ids(),
+        # which first removes and then adds back the start and end tokens.
+        max_length = list(self.tokenizer.max_model_input_sizes.values())[0] - 2
+        if len(new_token_ids) > max_length:
+            new_token_ids = new_token_ids[0:max_length]
+
         return new_token_ids
 
 
@@ -490,14 +514,20 @@ class ONNXModelPatcher:
                     trigger += f"-!pad-{i}"
                 return f"<{trigger}>"
 
+            # modify text_encoder
+            orig_embeddings = text_encoder.tensors["text_model.embeddings.token_embedding.weight"]
+
             # modify tokenizer
             new_tokens_added = 0
             for ti_name, ti in ti_list:
-                for i in range(ti.embedding.shape[0]):
-                    new_tokens_added += ti_tokenizer.add_tokens(_get_trigger(ti_name, i))
-
-            # modify text_encoder
-            orig_embeddings = text_encoder.tensors["text_model.embeddings.token_embedding.weight"]
+
+                if ti.embedding_2 is not None:
+                    ti_embedding = ti.embedding_2 if ti.embedding_2.shape[1] == orig_embeddings.shape[0] else ti.embedding
+                else:
+                    ti_embedding = ti.embedding
+
+                for i in range(ti_embedding.shape[0]):
+                    new_tokens_added += ti_tokenizer.add_tokens(_get_trigger(ti_name, i))
 
             embeddings = np.concatenate(
                 (np.copy(orig_embeddings), np.zeros((new_tokens_added, orig_embeddings.shape[1]))),
@@ -506,8 +536,8 @@
                 axis=0,
             )
 
             for ti_name, ti in ti_list:
                 ti_tokens = []
-                for i in range(ti.embedding.shape[0]):
-                    embedding = ti.embedding[i].detach().numpy()
+                for i in range(ti_embedding.shape[0]):
+                    embedding = ti_embedding[i].detach().numpy()
                     trigger = _get_trigger(ti_name, i)
 
                     token_id = ti_tokenizer.convert_tokens_to_ids(trigger)
diff --git a/invokeai/backend/model_management/model_probe.py b/invokeai/backend/model_management/model_probe.py
index aebe30f116..af4f3f2a62 100644
--- a/invokeai/backend/model_management/model_probe.py
+++ b/invokeai/backend/model_management/model_probe.py
@@ -373,12 +373,16 @@ class TextualInversionCheckpointProbe(CheckpointProbeBase):
             token_dim = list(checkpoint["string_to_param"].values())[0].shape[-1]
         elif "emb_params" in checkpoint:
             token_dim = checkpoint["emb_params"].shape[-1]
+        elif "clip_g" in checkpoint:
+            token_dim = checkpoint["clip_g"].shape[-1]
         else:
             token_dim = list(checkpoint.values())[0].shape[0]
 
         if token_dim == 768:
             return BaseModelType.StableDiffusion1
         elif token_dim == 1024:
             return BaseModelType.StableDiffusion2
+        elif token_dim == 1280:
+            return BaseModelType.StableDiffusionXL
         else:
             return None

From f95ce1870c03381999c637b46c991e47dd4aa520 Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Fri, 1 Dec 2023 01:46:12 -0500
Subject: [PATCH 2/3] fix ruff format check

---
 invokeai/backend/model_management/lora.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/invokeai/backend/model_management/lora.py b/invokeai/backend/model_management/lora.py
index f9c40f8386..acd1f6bab6 100644
--- a/invokeai/backend/model_management/lora.py
+++ b/invokeai/backend/model_management/lora.py
@@ -195,7 +195,11 @@ class ModelPatcher:
             def _get_ti_embedding(model_embeddings, ti):
                 # for SDXL models, select the embedding that matches the text encoder's dimensions
                 if ti.embedding_2 is not None:
-                    return ti.embedding_2 if ti.embedding_2.shape[1] == model_embeddings.weight.data[0].shape[0] else ti.embedding
+                    return (
+                        ti.embedding_2
+                        if ti.embedding_2.shape[1] == model_embeddings.weight.data[0].shape[0]
+                        else ti.embedding
+                    )
                 else:
                     return ti.embedding
 
@@ -212,7 +216,6 @@ class ModelPatcher:
             model_embeddings = text_encoder.get_input_embeddings()
 
             for ti_name, ti in ti_list:
-
                 ti_tokens = []
                 for i in range(ti_embedding.shape[0]):
                     embedding = ti_embedding[i]
@@ -282,7 +285,7 @@ class ModelPatcher:
 
 
 class TextualInversionModel:
-    embedding: torch.Tensor  # [n, 768]|[n, 1280]
+    embedding: torch.Tensor  # [n, 768]|[n, 1280]
     embedding_2: Optional[torch.Tensor] = None  # [n, 768]|[n, 1280] - for SDXL models
 
     @classmethod
@@ -308,7 +311,7 @@ class TextualInversionModel:
             if len(state_dict["string_to_param"]) > 1:
                 print(
                     f'Warn: Embedding "{file_path.name}" contains multiple tokens, which is not supported. The first',
-                    " token will be used."
+                    " token will be used.",
                 )
 
             result.embedding = next(iter(state_dict["string_to_param"].values()))
@@ -319,8 +322,8 @@
 
         # v5(sdxl safetensors file)
         elif "clip_g" in state_dict and "clip_l" in state_dict:
-            result.embedding = state_dict["clip_g"]
-            result.embedding_2 = state_dict["clip_l"]
+            result.embedding = state_dict["clip_g"]
+            result.embedding_2 = state_dict["clip_l"]
 
         # v4(diffusers bin files)
         else:
@@ -332,7 +335,6 @@ class TextualInversionModel:
 
         if not isinstance(result.embedding, torch.Tensor):
             raise ValueError(f"Invalid embeddings file: {file_path.name}")
-
         return result
 
 
@@ -520,9 +522,10 @@ class ONNXModelPatcher:
             # modify tokenizer
             new_tokens_added = 0
             for ti_name, ti in ti_list:
-
                 if ti.embedding_2 is not None:
-                    ti_embedding = ti.embedding_2 if ti.embedding_2.shape[1] == orig_embeddings.shape[0] else ti.embedding
+                    ti_embedding = (
+                        ti.embedding_2 if ti.embedding_2.shape[1] == orig_embeddings.shape[0] else ti.embedding
+                    )
                 else:
                     ti_embedding = ti.embedding

From 5a3f1f2b2228d6f8d4f09f95fec93e738d89f5d8 Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Fri, 1 Dec 2023 01:59:26 -0500
Subject: [PATCH 3/3] fix ruff github format errors

---
 invokeai/backend/model_management/lora.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/invokeai/backend/model_management/lora.py b/invokeai/backend/model_management/lora.py
index acd1f6bab6..3d2136659f 100644
--- a/invokeai/backend/model_management/lora.py
+++ b/invokeai/backend/model_management/lora.py
@@ -215,7 +215,7 @@ class ModelPatcher:
             text_encoder.resize_token_embeddings(init_tokens_count + new_tokens_added, pad_to_multiple_of)
             model_embeddings = text_encoder.get_input_embeddings()
 
-            for ti_name, ti in ti_list:
+            for ti_name, _ in ti_list:
                 ti_tokens = []
                 for i in range(ti_embedding.shape[0]):
                     embedding = ti_embedding[i]
@@ -537,7 +537,7 @@ class ONNXModelPatcher:
                 axis=0,
             )
 
-            for ti_name, ti in ti_list:
+            for ti_name, _ in ti_list:
                 ti_tokens = []
                 for i in range(ti_embedding.shape[0]):
                     embedding = ti_embedding[i].detach().numpy()
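
The three patches above reduce to two rules: an SDXL textual inversion file carries two embeddings ("clip_g", 1280 columns wide, and "clip_l", 768 columns wide), and the patcher uses whichever one matches the width of the text encoder currently being patched, while the probe maps the embedding width to a base model type (768 for SD 1.x, 1024 for SD 2.x, 1280 for SDXL). The standalone sketch below illustrates those two rules outside the InvokeAI codebase; the function names, the string labels returned by probe_base_model, and the fake state dict are illustrative assumptions, not code from the patches.

# Illustrative sketch only -- not part of the patch series above.
from typing import Optional, Tuple

import torch


def load_ti_embeddings(state_dict: dict) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
    """Mirror the key conventions checked by TextualInversionModel.from_checkpoint."""
    if "clip_g" in state_dict and "clip_l" in state_dict:  # SDXL safetensors file (v5)
        return state_dict["clip_g"], state_dict["clip_l"]
    if "emb_params" in state_dict:  # older safetensors files with a single tensor
        return state_dict["emb_params"], None
    # diffusers .bin file (v4): a single tensor value
    return next(iter(state_dict.values())), None


def select_for_encoder(
    embedding: torch.Tensor, embedding_2: Optional[torch.Tensor], encoder_width: int
) -> torch.Tensor:
    """Pick the embedding whose last dimension matches the text encoder's hidden size."""
    if embedding_2 is not None and embedding_2.shape[1] == encoder_width:
        return embedding_2
    return embedding


def probe_base_model(token_dim: int) -> Optional[str]:
    """Same width-to-base-model mapping as the probe; the labels here are made up."""
    return {768: "sd-1", 1024: "sd-2", 1280: "sdxl"}.get(token_dim)


if __name__ == "__main__":
    # Fake SDXL-style checkpoint: one token, clip_g is 1280-wide, clip_l is 768-wide.
    fake = {"clip_g": torch.zeros(1, 1280), "clip_l": torch.zeros(1, 768)}
    emb, emb2 = load_ti_embeddings(fake)
    print(select_for_encoder(emb, emb2, 768).shape)   # torch.Size([1, 768])  -> CLIP-L encoder
    print(select_for_encoder(emb, emb2, 1280).shape)  # torch.Size([1, 1280]) -> CLIP-G encoder
    print(probe_base_model(emb.shape[-1]))            # sdxl

Run as-is, the sketch prints the 768-wide tensor for the CLIP-L encoder, the 1280-wide tensor for the CLIP-G encoder, and "sdxl" for the probed base type, which is the same selection behavior the patched _get_ti_embedding and TextualInversionCheckpointProbe implement.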