mirror of
https://github.com/invoke-ai/InvokeAI
synced 2024-08-30 20:32:17 +00:00
Add support for SDXL textual inversion/embeddings (#5213)
## What type of PR is this? (check all applicable) - [ ] Refactor - [ ] Feature - [X] Bug Fix - [ ] Optimization - [ ] Documentation Update - [ ] Community Node Submission ## Have you discussed this change with the InvokeAI team? - [X] Yes - [ ] No, because: ## Have you updated all relevant documentation? - [X] Yes - [ ] No ## Description This adds support for at least some of the SDXL embeddings currently available on Civitai. The embeddings I have tested include: - https://civitai.com/models/154898/marblingtixl?modelVersionId=173668 - https://civitai.com/models/148131?modelVersionId=167640 - https://civitai.com/models/123485/hannah-ferguson-or-sdxl-or-comfyui-only-or-embedding?modelVersionId=134674 (said to be "comfyui only") - https://civitai.com/models/185938/kendall-jenner-sdxl-embedding?modelVersionId=208785 I am _not entirely sure_ that I have implemented support in the most elegant way. The issue is that these embeddings have two weight tensors, `clip_g` and `clip_l`, which correspond to `text_encoder` and `text_encoder_2` in the main model. When the patcher calls the ModelPatcher's `apply_ti()` method, I simply check the dimensions of the incoming text encoder and choose the weights that match the dimensions of the encoder. While writing this, I also ran into a possible issue with the Compel library's `get_pooled_embeddings()` call. It pads the input token list to the model's max token length and then calls the TI manager to add the additional tokens from the embedding. However, this ends up making the input token list longer than the max length, and CLIPTextEncoder crashes with a tensor size mismatch. I worked around this behavior by making the TI manager's `expand_textual_inversion_token_ids_if_necessary()` method remove the excess pads at the end of the token list. Also note that I have made similar changes to `apply_ti()` in the ONNXModelPatcher, but haven't tested them yet. 
## Related Tickets & Documents <!-- For pull requests that relate or close an issue, please include them below. For example having the text: "closes #1234" would connect the current pull request to issue 1234. And when we merge the pull request, Github will automatically close the issue. --> - Related Issue # - Closes #4401 ## QA Instructions, Screenshots, Recordings <!-- Please provide steps on how to test changes, any hardware or software specifications as well as any other pertinent information. --> ## Added/updated tests? - [ ] Yes - [X] No : We need to create tests for model patching... ## [optional] Are there any post deployment tasks we need to perform?
This commit is contained in:
commit
0539a64569
@ -192,20 +192,33 @@ class ModelPatcher:
|
||||
trigger += f"-!pad-{i}"
|
||||
return f"<{trigger}>"
|
||||
|
||||
def _get_ti_embedding(model_embeddings, ti):
|
||||
# for SDXL models, select the embedding that matches the text encoder's dimensions
|
||||
if ti.embedding_2 is not None:
|
||||
return (
|
||||
ti.embedding_2
|
||||
if ti.embedding_2.shape[1] == model_embeddings.weight.data[0].shape[0]
|
||||
else ti.embedding
|
||||
)
|
||||
else:
|
||||
return ti.embedding
|
||||
|
||||
# modify tokenizer
|
||||
new_tokens_added = 0
|
||||
for ti_name, ti in ti_list:
|
||||
for i in range(ti.embedding.shape[0]):
|
||||
ti_embedding = _get_ti_embedding(text_encoder.get_input_embeddings(), ti)
|
||||
|
||||
for i in range(ti_embedding.shape[0]):
|
||||
new_tokens_added += ti_tokenizer.add_tokens(_get_trigger(ti_name, i))
|
||||
|
||||
# modify text_encoder
|
||||
text_encoder.resize_token_embeddings(init_tokens_count + new_tokens_added, pad_to_multiple_of)
|
||||
model_embeddings = text_encoder.get_input_embeddings()
|
||||
|
||||
for ti_name, ti in ti_list:
|
||||
for ti_name, _ in ti_list:
|
||||
ti_tokens = []
|
||||
for i in range(ti.embedding.shape[0]):
|
||||
embedding = ti.embedding[i]
|
||||
for i in range(ti_embedding.shape[0]):
|
||||
embedding = ti_embedding[i]
|
||||
trigger = _get_trigger(ti_name, i)
|
||||
|
||||
token_id = ti_tokenizer.convert_tokens_to_ids(trigger)
|
||||
@ -273,6 +286,7 @@ class ModelPatcher:
|
||||
|
||||
class TextualInversionModel:
|
||||
embedding: torch.Tensor # [n, 768]|[n, 1280]
|
||||
embedding_2: Optional[torch.Tensor] = None # [n, 768]|[n, 1280] - for SDXL models
|
||||
|
||||
@classmethod
|
||||
def from_checkpoint(
|
||||
@ -296,8 +310,8 @@ class TextualInversionModel:
|
||||
if "string_to_param" in state_dict:
|
||||
if len(state_dict["string_to_param"]) > 1:
|
||||
print(
|
||||
f'Warn: Embedding "{file_path.name}" contains multiple tokens, which is not supported. The first'
|
||||
" token will be used."
|
||||
f'Warn: Embedding "{file_path.name}" contains multiple tokens, which is not supported. The first',
|
||||
" token will be used.",
|
||||
)
|
||||
|
||||
result.embedding = next(iter(state_dict["string_to_param"].values()))
|
||||
@ -306,6 +320,11 @@ class TextualInversionModel:
|
||||
elif "emb_params" in state_dict:
|
||||
result.embedding = state_dict["emb_params"]
|
||||
|
||||
# v5(sdxl safetensors file)
|
||||
elif "clip_g" in state_dict and "clip_l" in state_dict:
|
||||
result.embedding = state_dict["clip_g"]
|
||||
result.embedding_2 = state_dict["clip_l"]
|
||||
|
||||
# v4(diffusers bin files)
|
||||
else:
|
||||
result.embedding = next(iter(state_dict.values()))
|
||||
@ -342,6 +361,13 @@ class TextualInversionManager(BaseTextualInversionManager):
|
||||
if token_id in self.pad_tokens:
|
||||
new_token_ids.extend(self.pad_tokens[token_id])
|
||||
|
||||
# Do not exceed the max model input size
|
||||
# The -2 here compensates for compel.embeddings_provider.get_token_ids(),
|
||||
# which first removes and then adds back the start and end tokens.
|
||||
max_length = list(self.tokenizer.max_model_input_sizes.values())[0] - 2
|
||||
if len(new_token_ids) > max_length:
|
||||
new_token_ids = new_token_ids[0:max_length]
|
||||
|
||||
return new_token_ids
|
||||
|
||||
|
||||
@ -490,24 +516,31 @@ class ONNXModelPatcher:
|
||||
trigger += f"-!pad-{i}"
|
||||
return f"<{trigger}>"
|
||||
|
||||
# modify text_encoder
|
||||
orig_embeddings = text_encoder.tensors["text_model.embeddings.token_embedding.weight"]
|
||||
|
||||
# modify tokenizer
|
||||
new_tokens_added = 0
|
||||
for ti_name, ti in ti_list:
|
||||
for i in range(ti.embedding.shape[0]):
|
||||
new_tokens_added += ti_tokenizer.add_tokens(_get_trigger(ti_name, i))
|
||||
if ti.embedding_2 is not None:
|
||||
ti_embedding = (
|
||||
ti.embedding_2 if ti.embedding_2.shape[1] == orig_embeddings.shape[0] else ti.embedding
|
||||
)
|
||||
else:
|
||||
ti_embedding = ti.embedding
|
||||
|
||||
# modify text_encoder
|
||||
orig_embeddings = text_encoder.tensors["text_model.embeddings.token_embedding.weight"]
|
||||
for i in range(ti_embedding.shape[0]):
|
||||
new_tokens_added += ti_tokenizer.add_tokens(_get_trigger(ti_name, i))
|
||||
|
||||
embeddings = np.concatenate(
|
||||
(np.copy(orig_embeddings), np.zeros((new_tokens_added, orig_embeddings.shape[1]))),
|
||||
axis=0,
|
||||
)
|
||||
|
||||
for ti_name, ti in ti_list:
|
||||
for ti_name, _ in ti_list:
|
||||
ti_tokens = []
|
||||
for i in range(ti.embedding.shape[0]):
|
||||
embedding = ti.embedding[i].detach().numpy()
|
||||
for i in range(ti_embedding.shape[0]):
|
||||
embedding = ti_embedding[i].detach().numpy()
|
||||
trigger = _get_trigger(ti_name, i)
|
||||
|
||||
token_id = ti_tokenizer.convert_tokens_to_ids(trigger)
|
||||
|
@ -373,12 +373,16 @@ class TextualInversionCheckpointProbe(CheckpointProbeBase):
|
||||
token_dim = list(checkpoint["string_to_param"].values())[0].shape[-1]
|
||||
elif "emb_params" in checkpoint:
|
||||
token_dim = checkpoint["emb_params"].shape[-1]
|
||||
elif "clip_g" in checkpoint:
|
||||
token_dim = checkpoint["clip_g"].shape[-1]
|
||||
else:
|
||||
token_dim = list(checkpoint.values())[0].shape[0]
|
||||
if token_dim == 768:
|
||||
return BaseModelType.StableDiffusion1
|
||||
elif token_dim == 1024:
|
||||
return BaseModelType.StableDiffusion2
|
||||
elif token_dim == 1280:
|
||||
return BaseModelType.StableDiffusionXL
|
||||
else:
|
||||
return None
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user