Add support for SDXL textual inversion/embeddings (#5213)

## What type of PR is this? (check all applicable)

- [ ] Refactor
- [ ] Feature
- [X] Bug Fix
- [ ] Optimization
- [ ] Documentation Update
- [ ] Community Node Submission


## Have you discussed this change with the InvokeAI team?
- [X] Yes
- [ ] No, because:

      
## Have you updated all relevant documentation?
- [X] Yes
- [ ] No


## Description

This adds support for at least some of the SDXL embeddings currently
available on Civitai. The embeddings I have tested include:

- https://civitai.com/models/154898/marblingtixl?modelVersionId=173668
- https://civitai.com/models/148131?modelVersionId=167640
-
https://civitai.com/models/123485/hannah-ferguson-or-sdxl-or-comfyui-only-or-embedding?modelVersionId=134674
(said to be "comfyui only")
-
https://civitai.com/models/185938/kendall-jenner-sdxl-embedding?modelVersionId=208785

I am _not entirely sure_ that I have implemented support in the most
elegant way. The issue is that these embeddings have two weight tensors,
`clip_g` and `clip_l`, which correspond to `text_encoder` and
`text_encoder_2` in the main model. When the patcher calls the
ModelPatcher's `apply_ti()` method, I simply check the dimensions of the
incoming text encoder and choose the weights that match the dimensions
of the encoder.

While writing this, I also ran into a possible issue with the Compel
library's `get_pooled_embeddings()` call. It pads the input token list
to the model's max token length and then calls the TI manager to add the
additional tokens from the embedding. However, this ends up making the
input token list longer than the max length, and CLIPTextEncoder crashes
with a tensor size mismatch. I worked around this behavior by making the
TI manager's `expand_textual_inversion_token_ids_if_necessary()` method
remove the excess pads at the end of the token list.

Also note that I have made similar changes to `apply_ti()` in the
ONNXModelPatcher, but haven't tested them yet.

## Related Tickets & Documents

<!--
For pull requests that relate or close an issue, please include them
below. 

For example having the text: "closes #1234" would connect the current
pull
request to issue 1234.  And when we merge the pull request, Github will
automatically close the issue.
-->

- Related Issue #
- Closes #4401 

## QA Instructions, Screenshots, Recordings

<!-- 
Please provide steps on how to test changes, any hardware or 
software specifications as well as any other pertinent information. 
-->

## Added/updated tests?

- [ ] Yes
- [X] No : We need to create tests for model patching...

## [optional] Are there any post deployment tasks we need to perform?
This commit is contained in:
Lincoln Stein 2023-12-01 09:17:01 -05:00 committed by GitHub
commit 0539a64569
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 50 additions and 13 deletions

View File

@ -192,20 +192,33 @@ class ModelPatcher:
trigger += f"-!pad-{i}"
return f"<{trigger}>"
def _get_ti_embedding(model_embeddings, ti):
# for SDXL models, select the embedding that matches the text encoder's dimensions
if ti.embedding_2 is not None:
return (
ti.embedding_2
if ti.embedding_2.shape[1] == model_embeddings.weight.data[0].shape[0]
else ti.embedding
)
else:
return ti.embedding
# modify tokenizer
new_tokens_added = 0
for ti_name, ti in ti_list:
for i in range(ti.embedding.shape[0]):
ti_embedding = _get_ti_embedding(text_encoder.get_input_embeddings(), ti)
for i in range(ti_embedding.shape[0]):
new_tokens_added += ti_tokenizer.add_tokens(_get_trigger(ti_name, i))
# modify text_encoder
text_encoder.resize_token_embeddings(init_tokens_count + new_tokens_added, pad_to_multiple_of)
model_embeddings = text_encoder.get_input_embeddings()
for ti_name, ti in ti_list:
for ti_name, _ in ti_list:
ti_tokens = []
for i in range(ti.embedding.shape[0]):
embedding = ti.embedding[i]
for i in range(ti_embedding.shape[0]):
embedding = ti_embedding[i]
trigger = _get_trigger(ti_name, i)
token_id = ti_tokenizer.convert_tokens_to_ids(trigger)
@ -273,6 +286,7 @@ class ModelPatcher:
class TextualInversionModel:
embedding: torch.Tensor # [n, 768]|[n, 1280]
embedding_2: Optional[torch.Tensor] = None # [n, 768]|[n, 1280] - for SDXL models
@classmethod
def from_checkpoint(
@ -296,8 +310,8 @@ class TextualInversionModel:
if "string_to_param" in state_dict:
if len(state_dict["string_to_param"]) > 1:
print(
f'Warn: Embedding "{file_path.name}" contains multiple tokens, which is not supported. The first'
" token will be used."
f'Warn: Embedding "{file_path.name}" contains multiple tokens, which is not supported. The first',
" token will be used.",
)
result.embedding = next(iter(state_dict["string_to_param"].values()))
@ -306,6 +320,11 @@ class TextualInversionModel:
elif "emb_params" in state_dict:
result.embedding = state_dict["emb_params"]
# v5(sdxl safetensors file)
elif "clip_g" in state_dict and "clip_l" in state_dict:
result.embedding = state_dict["clip_g"]
result.embedding_2 = state_dict["clip_l"]
# v4(diffusers bin files)
else:
result.embedding = next(iter(state_dict.values()))
@ -342,6 +361,13 @@ class TextualInversionManager(BaseTextualInversionManager):
if token_id in self.pad_tokens:
new_token_ids.extend(self.pad_tokens[token_id])
# Do not exceed the max model input size
# The -2 here is compensating for compensate compel.embeddings_provider.get_token_ids(),
# which first removes and then adds back the start and end tokens.
max_length = list(self.tokenizer.max_model_input_sizes.values())[0] - 2
if len(new_token_ids) > max_length:
new_token_ids = new_token_ids[0:max_length]
return new_token_ids
@ -490,24 +516,31 @@ class ONNXModelPatcher:
trigger += f"-!pad-{i}"
return f"<{trigger}>"
# modify text_encoder
orig_embeddings = text_encoder.tensors["text_model.embeddings.token_embedding.weight"]
# modify tokenizer
new_tokens_added = 0
for ti_name, ti in ti_list:
for i in range(ti.embedding.shape[0]):
new_tokens_added += ti_tokenizer.add_tokens(_get_trigger(ti_name, i))
if ti.embedding_2 is not None:
ti_embedding = (
ti.embedding_2 if ti.embedding_2.shape[1] == orig_embeddings.shape[0] else ti.embedding
)
else:
ti_embedding = ti.embedding
# modify text_encoder
orig_embeddings = text_encoder.tensors["text_model.embeddings.token_embedding.weight"]
for i in range(ti_embedding.shape[0]):
new_tokens_added += ti_tokenizer.add_tokens(_get_trigger(ti_name, i))
embeddings = np.concatenate(
(np.copy(orig_embeddings), np.zeros((new_tokens_added, orig_embeddings.shape[1]))),
axis=0,
)
for ti_name, ti in ti_list:
for ti_name, _ in ti_list:
ti_tokens = []
for i in range(ti.embedding.shape[0]):
embedding = ti.embedding[i].detach().numpy()
for i in range(ti_embedding.shape[0]):
embedding = ti_embedding[i].detach().numpy()
trigger = _get_trigger(ti_name, i)
token_id = ti_tokenizer.convert_tokens_to_ids(trigger)

View File

@ -373,12 +373,16 @@ class TextualInversionCheckpointProbe(CheckpointProbeBase):
token_dim = list(checkpoint["string_to_param"].values())[0].shape[-1]
elif "emb_params" in checkpoint:
token_dim = checkpoint["emb_params"].shape[-1]
elif "clip_g" in checkpoint:
token_dim = checkpoint["clip_g"].shape[-1]
else:
token_dim = list(checkpoint.values())[0].shape[0]
if token_dim == 768:
return BaseModelType.StableDiffusion1
elif token_dim == 1024:
return BaseModelType.StableDiffusion2
elif token_dim == 1280:
return BaseModelType.StableDiffusionXL
else:
return None