InvokeAI/invokeai/backend/textual_inversion.py

"""Textual Inversion wrapper class."""

from pathlib import Path
from typing import Optional, Union

import torch
from compel.embeddings_provider import BaseTextualInversionManager
from safetensors.torch import load_file
from transformers import CLIPTokenizer
from typing_extensions import Self

from .raw_model import RawModel


class TextualInversionModelRaw(RawModel):
    """Holder for the embedding tensor(s) read from a textual inversion file.

    `embedding` is always populated by `from_checkpoint()`. `embedding_2` is only populated for files that carry
    both a `clip_g` and a `clip_l` entry (SDXL).
    """

    embedding: torch.Tensor  # [n, 768]|[n, 1280]
    embedding_2: Optional[torch.Tensor] = None  # [n, 768]|[n, 1280]   - for SDXL models

    @classmethod
    def from_checkpoint(
        cls,
        file_path: Union[str, Path],
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ) -> Self:
        """Load a textual inversion embedding from a checkpoint file.

        Files with a `.safetensors` suffix are read with safetensors; everything else is read with `torch.load`.
        Tensors are always loaded onto the CPU.

        Args:
            file_path: Location of the embedding file.
            device: Currently unused; accepted for interface compatibility.
            dtype: Currently unused; accepted for interface compatibility.

        Returns:
            A new instance with `embedding` (and, for SDXL files, `embedding_2`) set.

        Raises:
            ValueError: If the file holds no embedding, or if the value selected from a diffusers-style file is
                not a tensor.
        """
        file_path = Path(file_path)

        result = cls()  # TODO: apply `device` / `dtype` to the loaded tensors.

        if file_path.suffix == ".safetensors":
            state_dict = load_file(file_path.absolute().as_posix(), device="cpu")
        else:
            # NOTE(security): torch.load unpickles the file, which can execute arbitrary code. Only load embedding
            # files from trusted sources (or prefer the .safetensors format).
            state_dict = torch.load(file_path, map_location="cpu")

        # both v1 and v2 format embeddings
        # difference mostly in metadata
        if "string_to_param" in state_dict:
            string_to_param = state_dict["string_to_param"]
            if len(string_to_param) == 0:
                # Fail with a descriptive error rather than leaking a bare StopIteration from next().
                raise ValueError(f"Invalid embeddings file: {file_path.name}")
            if len(string_to_param) > 1:
                print(
                    f'Warn: Embedding "{file_path.name}" contains multiple tokens, which is not supported. The first'
                    " token will be used."
                )

            result.embedding = next(iter(string_to_param.values()))

        # v3 (easynegative)
        elif "emb_params" in state_dict:
            result.embedding = state_dict["emb_params"]

        # v5(sdxl safetensors file)
        elif "clip_g" in state_dict and "clip_l" in state_dict:
            result.embedding = state_dict["clip_g"]
            result.embedding_2 = state_dict["clip_l"]

        # v4(diffusers bin files)
        else:
            # The first value in the file is taken as the embedding. The `None` default makes an empty file fall
            # through to the ValueError below.
            embedding = next(iter(state_dict.values()), None)

            # Validate before touching tensor attributes so that a bad file yields a ValueError, not an
            # AttributeError.
            if not isinstance(embedding, torch.Tensor):
                raise ValueError(f"Invalid embeddings file: {file_path.name}")

            # A single-vector embedding may be stored as a 1-D tensor; normalize it to shape [1, dim].
            if len(embedding.shape) == 1:
                embedding = embedding.unsqueeze(0)

            result.embedding = embedding

        return result


class TextualInversionManager(BaseTextualInversionManager):
    """TextualInversionManager implements the BaseTextualInversionManager ABC from the compel library."""

    def __init__(self, tokenizer: CLIPTokenizer):
        # Maps a TI trigger token id to the additional token ids that must follow it in the prompt. This class
        # only reads the mapping; it is populated from outside.
        self.pad_tokens: dict[int, list[int]] = {}
        self.tokenizer = tokenizer

    def expand_textual_inversion_token_ids_if_necessary(self, token_ids: list[int]) -> list[int]:
        """Given a list of tokens ids, expand any TI tokens to their corresponding pad tokens.

        For example, suppose we have a `<ti_dog>` TI with 4 vectors that was added to the tokenizer with the following
        mapping of tokens to token_ids:
        ```
        <ti_dog>: 49408
        <ti_dog-!pad-1>: 49409
        <ti_dog-!pad-2>: 49410
        <ti_dog-!pad-3>: 49411
        ```
        `self.pad_tokens` would be set to `{49408: [49409, 49410, 49411]}` (the trigger token itself is not repeated
        in the value, because it is already present in the input).
        This function is responsible for expanding `49408` in the token_ids list to `[49408, 49409, 49410, 49411]`.

        Args:
            token_ids: Token ids of the prompt, without the BOS and EOS tokens.

        Returns:
            The token ids with every TI trigger token followed by its pad tokens, truncated to
            `tokenizer.model_max_length - 2` entries. The input list is returned unchanged when there is nothing to
            expand.

        Raises:
            ValueError: If `token_ids` starts with the BOS token or ends with the EOS token.
        """
        # Short circuit if there are no pad tokens to save a little time. An empty prompt also has nothing to
        # expand, and must be handled here so that the first/last element checks below cannot raise IndexError.
        if not self.pad_tokens or not token_ids:
            return token_ids

        # This function assumes that compel has not included the BOS and EOS tokens in the token_ids list. We verify
        # this assumption here.
        if token_ids[0] == self.tokenizer.bos_token_id:
            raise ValueError("token_ids must not start with bos_token_id")
        if token_ids[-1] == self.tokenizer.eos_token_id:
            raise ValueError("token_ids must not end with eos_token_id")

        # Expand any TI tokens to their corresponding pad tokens.
        new_token_ids: list[int] = []
        for token_id in token_ids:
            new_token_ids.append(token_id)
            if token_id in self.pad_tokens:
                new_token_ids.extend(self.pad_tokens[token_id])

        # Do not exceed the max model input size. The -2 here is compensating for
        # compel.embeddings_provider.get_token_ids(), which first removes and then adds back the start and end tokens.
        max_length = self.tokenizer.model_max_length - 2
        if len(new_token_ids) > max_length:
            # HACK: If TI token expansion causes us to exceed the max text encoder input length, we silently discard
            # tokens. Token expansion should happen in a way that is compatible with compel's default handling of long
            # prompts.
            new_token_ids = new_token_ids[:max_length]

        return new_token_ids
BREAKING CHANGES: invocations now require model key, not base/type/name - Implement new model loader and modify invocations and embeddings - Finish implementation loaders for all models currently supported by InvokeAI. - Move lora, textual_inversion, and model patching support into backend/embeddings. - Restore support for model cache statistics collection (a little ugly, needs work). - Fixed up invocations that load and patch models. - Move seamless and silencewarnings utils into better location 2024-02-06 03:56:32 +00:00			`"""Textual Inversion wrapper class."""`

			`from pathlib import Path`
Add docs to TextualInversionManager and improve types. No changes to functionality. 2024-05-27 14:32:49 +00:00			`from typing import Optional, Union`
BREAKING CHANGES: invocations now require model key, not base/type/name - Implement new model loader and modify invocations and embeddings - Finish implementation loaders for all models currently supported by InvokeAI. - Move lora, textual_inversion, and model patching support into backend/embeddings. - Restore support for model cache statistics collection (a little ugly, needs work). - Fixed up invocations that load and patch models. - Move seamless and silencewarnings utils into better location 2024-02-06 03:56:32 +00:00
			`import torch`
			`from compel.embeddings_provider import BaseTextualInversionManager`
			`from safetensors.torch import load_file`
			`from transformers import CLIPTokenizer`
			`from typing_extensions import Self`
final tidying before marking PR as ready for review - Replace AnyModelLoader with ModelLoaderRegistry - Fix type check errors in multiple files - Remove apparently unneeded `get_model_config_enum()` method from model manager - Remove last vestiges of old model manager - Updated tests and documentation resolve conflict with seamless.py 2024-02-18 06:27:42 +00:00
Tidy names and locations of modules - Rename old "model_management" directory to "model_management_OLD" in order to catch dangling references to original model manager. - Caught and fixed most dangling references (still checking) - Rename lora, textual_inversion and model_patcher modules - Introduce a RawModel base class to simplfy the Union returned by the model loaders. - Tidy up the model manager 2-related tests. Add useful fixtures, and a finalizer to the queue and installer fixtures that will stop the services and release threads. 2024-02-17 16:45:32 +00:00			`from .raw_model import RawModel`
BREAKING CHANGES: invocations now require model key, not base/type/name - Implement new model loader and modify invocations and embeddings - Finish implementation loaders for all models currently supported by InvokeAI. - Move lora, textual_inversion, and model patching support into backend/embeddings. - Restore support for model cache statistics collection (a little ugly, needs work). - Fixed up invocations that load and patch models. - Move seamless and silencewarnings utils into better location 2024-02-06 03:56:32 +00:00
final tidying before marking PR as ready for review - Replace AnyModelLoader with ModelLoaderRegistry - Fix type check errors in multiple files - Remove apparently unneeded `get_model_config_enum()` method from model manager - Remove last vestiges of old model manager - Updated tests and documentation resolve conflict with seamless.py 2024-02-18 06:27:42 +00:00
Tidy names and locations of modules - Rename old "model_management" directory to "model_management_OLD" in order to catch dangling references to original model manager. - Caught and fixed most dangling references (still checking) - Rename lora, textual_inversion and model_patcher modules - Introduce a RawModel base class to simplfy the Union returned by the model loaders. - Tidy up the model manager 2-related tests. Add useful fixtures, and a finalizer to the queue and installer fixtures that will stop the services and release threads. 2024-02-17 16:45:32 +00:00			`class TextualInversionModelRaw(RawModel):`
BREAKING CHANGES: invocations now require model key, not base/type/name - Implement new model loader and modify invocations and embeddings - Finish implementation loaders for all models currently supported by InvokeAI. - Move lora, textual_inversion, and model patching support into backend/embeddings. - Restore support for model cache statistics collection (a little ugly, needs work). - Fixed up invocations that load and patch models. - Move seamless and silencewarnings utils into better location 2024-02-06 03:56:32 +00:00			`embedding: torch.Tensor # [n, 768]\|[n, 1280]`
			`embedding_2: Optional[torch.Tensor] = None # [n, 768]\|[n, 1280] - for SDXL models`

			`@classmethod`
			`def from_checkpoint(`
			`cls,`
			`file_path: Union[str, Path],`
			`device: Optional[torch.device] = None,`
			`dtype: Optional[torch.dtype] = None,`
			`) -> Self:`
			`if not isinstance(file_path, Path):`
			`file_path = Path(file_path)`

			`result = cls() # TODO:`

			`if file_path.suffix == ".safetensors":`
			`state_dict = load_file(file_path.absolute().as_posix(), device="cpu")`
			`else:`
			`state_dict = torch.load(file_path, map_location="cpu")`

			`# both v1 and v2 format embeddings`
			`# difference mostly in metadata`
			`if "string_to_param" in state_dict:`
			`if len(state_dict["string_to_param"]) > 1:`
			`print(`
			`f'Warn: Embedding "{file_path.name}" contains multiple tokens, which is not supported. The first',`
			`" token will be used.",`
			`)`

			`result.embedding = next(iter(state_dict["string_to_param"].values()))`

			`# v3 (easynegative)`
			`elif "emb_params" in state_dict:`
			`result.embedding = state_dict["emb_params"]`

			`# v5(sdxl safetensors file)`
			`elif "clip_g" in state_dict and "clip_l" in state_dict:`
			`result.embedding = state_dict["clip_g"]`
			`result.embedding_2 = state_dict["clip_l"]`

			`# v4(diffusers bin files)`
			`else:`
			`result.embedding = next(iter(state_dict.values()))`

			`if len(result.embedding.shape) == 1:`
			`result.embedding = result.embedding.unsqueeze(0)`

			`if not isinstance(result.embedding, torch.Tensor):`
			`raise ValueError(f"Invalid embeddings file: {file_path.name}")`

			`return result`


Add docs to TextualInversionManager and improve types. No changes to functionality. 2024-05-27 14:32:49 +00:00			`class TextualInversionManager(BaseTextualInversionManager):`
			`"""TextualInversionManager implements the BaseTextualInversionManager ABC from the compel library."""`
BREAKING CHANGES: invocations now require model key, not base/type/name - Implement new model loader and modify invocations and embeddings - Finish implementation loaders for all models currently supported by InvokeAI. - Move lora, textual_inversion, and model patching support into backend/embeddings. - Restore support for model cache statistics collection (a little ugly, needs work). - Fixed up invocations that load and patch models. - Move seamless and silencewarnings utils into better location 2024-02-06 03:56:32 +00:00
			`def __init__(self, tokenizer: CLIPTokenizer):`
Add docs to TextualInversionManager and improve types. No changes to functionality. 2024-05-27 14:32:49 +00:00			`self.pad_tokens: dict[int, list[int]] = {}`
BREAKING CHANGES: invocations now require model key, not base/type/name - Implement new model loader and modify invocations and embeddings - Finish implementation loaders for all models currently supported by InvokeAI. - Move lora, textual_inversion, and model patching support into backend/embeddings. - Restore support for model cache statistics collection (a little ugly, needs work). - Fixed up invocations that load and patch models. - Move seamless and silencewarnings utils into better location 2024-02-06 03:56:32 +00:00			`self.tokenizer = tokenizer`

			`def expand_textual_inversion_token_ids_if_necessary(self, token_ids: list[int]) -> list[int]:`
Add docs to TextualInversionManager and improve types. No changes to functionality. 2024-05-27 14:32:49 +00:00			`"""Given a list of tokens ids, expand any TI tokens to their corresponding pad tokens.`

			For example, suppose we have a `<ti_dog>` TI with 4 vectors that was added to the tokenizer with the following
			`mapping of tokens to token_ids:`
			```
			`<ti_dog>: 49408`
			`<ti_dog-!pad-1>: 49409`
			`<ti_dog-!pad-2>: 49410`
			`<ti_dog-!pad-3>: 49411`
			```
			`self.pad_tokens` would be set to `{49408: [49408, 49409, 49410, 49411]}`.
			This function is responsible for expanding `49408` in the token_ids list to `[49408, 49409, 49410, 49411]`.
			`"""`
			`# Short circuit if there are no pad tokens to save a little time.`
BREAKING CHANGES: invocations now require model key, not base/type/name - Implement new model loader and modify invocations and embeddings - Finish implementation loaders for all models currently supported by InvokeAI. - Move lora, textual_inversion, and model patching support into backend/embeddings. - Restore support for model cache statistics collection (a little ugly, needs work). - Fixed up invocations that load and patch models. - Move seamless and silencewarnings utils into better location 2024-02-06 03:56:32 +00:00			`if len(self.pad_tokens) == 0:`
			`return token_ids`

Add docs to TextualInversionManager and improve types. No changes to functionality. 2024-05-27 14:32:49 +00:00			`# This function assumes that compel has not included the BOS and EOS tokens in the token_ids list. We verify`
			`# this assumption here.`
BREAKING CHANGES: invocations now require model key, not base/type/name - Implement new model loader and modify invocations and embeddings - Finish implementation loaders for all models currently supported by InvokeAI. - Move lora, textual_inversion, and model patching support into backend/embeddings. - Restore support for model cache statistics collection (a little ugly, needs work). - Fixed up invocations that load and patch models. - Move seamless and silencewarnings utils into better location 2024-02-06 03:56:32 +00:00			`if token_ids[0] == self.tokenizer.bos_token_id:`
			`raise ValueError("token_ids must not start with bos_token_id")`
			`if token_ids[-1] == self.tokenizer.eos_token_id:`
			`raise ValueError("token_ids must not end with eos_token_id")`

Add docs to TextualInversionManager and improve types. No changes to functionality. 2024-05-27 14:32:49 +00:00			`# Expand any TI tokens to their corresponding pad tokens.`
			`new_token_ids: list[int] = []`
BREAKING CHANGES: invocations now require model key, not base/type/name - Implement new model loader and modify invocations and embeddings - Finish implementation loaders for all models currently supported by InvokeAI. - Move lora, textual_inversion, and model patching support into backend/embeddings. - Restore support for model cache statistics collection (a little ugly, needs work). - Fixed up invocations that load and patch models. - Move seamless and silencewarnings utils into better location 2024-02-06 03:56:32 +00:00			`for token_id in token_ids:`
			`new_token_ids.append(token_id)`
			`if token_id in self.pad_tokens:`
			`new_token_ids.extend(self.pad_tokens[token_id])`

Add docs to TextualInversionManager and improve types. No changes to functionality. 2024-05-27 14:32:49 +00:00			`# Do not exceed the max model input size. The -2 here is compensating for`
			`# compel.embeddings_provider.get_token_ids(), which first removes and then adds back the start and end tokens.`
Update TextualInversionManager for compatibility with the latest transformers release. See https://github.com/invoke-ai/InvokeAI/issues/6445. 2024-05-27 14:35:02 +00:00			`max_length = self.tokenizer.model_max_length - 2`
BREAKING CHANGES: invocations now require model key, not base/type/name - Implement new model loader and modify invocations and embeddings - Finish implementation loaders for all models currently supported by InvokeAI. - Move lora, textual_inversion, and model patching support into backend/embeddings. - Restore support for model cache statistics collection (a little ugly, needs work). - Fixed up invocations that load and patch models. - Move seamless and silencewarnings utils into better location 2024-02-06 03:56:32 +00:00			`if len(new_token_ids) > max_length:`
Add a callout about the hackiness of dropping tokens in the TextualInversionManager. 2024-05-27 14:53:12 +00:00			`# HACK: If TI token expansion causes us to exceed the max text encoder input length, we silently discard`
			`# tokens. Token expansion should happen in a way that is compatible with compel's default handling of long`
			`# prompts.`
BREAKING CHANGES: invocations now require model key, not base/type/name - Implement new model loader and modify invocations and embeddings - Finish implementation loaders for all models currently supported by InvokeAI. - Move lora, textual_inversion, and model patching support into backend/embeddings. - Restore support for model cache statistics collection (a little ugly, needs work). - Fixed up invocations that load and patch models. - Move seamless and silencewarnings utils into better location 2024-02-06 03:56:32 +00:00			`new_token_ids = new_token_ids[0:max_length]`

			`return new_token_ids`