InvokeAI/invokeai/backend/prompting/conditioning.py

"""
This module handles the generation of the conditioning tensors.

Useful function exports:

get_uc_and_c_and_ec()           get the conditioned and unconditioned latent, and edited conditioning if we're doing cross-attention control

"""
import re
from typing import Any, Optional, Union

from compel import Compel
from compel.prompt_parser import (
    Blend,
    CrossAttentionControlSubstitute,
    FlattenedPrompt,
    Fragment,
    PromptParser,
)
from transformers import CLIPTokenizer

from invokeai.backend.globals import Globals

from ..stable_diffusion import InvokeAIDiffuserComponent
from ..util import torch_dtype


def get_tokenizer(model) -> CLIPTokenizer:
    # TODO remove legacy ckpt fallback handling
    return (
        getattr(model, "tokenizer", None)  # diffusers
        or model.cond_stage_model.tokenizer
    )  # ldm


def get_text_encoder(model) -> Any:
    # TODO remove legacy ckpt fallback handling
    return getattr(
        model, "text_encoder", None
    ) or UnsqueezingLDMTransformer(  # diffusers
        model.cond_stage_model.transformer
    )  # ldm


class UnsqueezingLDMTransformer:
    def __init__(self, ldm_transformer):
        self.ldm_transformer = ldm_transformer

    @property
    def device(self):
        return self.ldm_transformer.device

    def __call__(self, *args, **kwargs):
        insufficiently_unsqueezed_tensor = self.ldm_transformer(*args, **kwargs)
        return insufficiently_unsqueezed_tensor.unsqueeze(0)


def get_uc_and_c_and_ec(
    prompt_string, model, log_tokens=False, skip_normalize_legacy_blend=False
):
    # lazy-load any deferred textual inversions.
    # this might take a couple of seconds the first time a textual inversion is used.
    model.textual_inversion_manager.create_deferred_token_ids_for_any_trigger_terms(
        prompt_string
    )

    tokenizer = get_tokenizer(model)
    text_encoder = get_text_encoder(model)
    compel = Compel(
        tokenizer=tokenizer,
        text_encoder=text_encoder,
        textual_inversion_manager=model.textual_inversion_manager,
        dtype_for_device_getter=torch_dtype,
        truncate_long_prompts=False
    )

    # get rid of any newline characters
    prompt_string = prompt_string.replace("\n", " ")
    (
        positive_prompt_string,
        negative_prompt_string,
    ) = split_prompt_to_positive_and_negative(prompt_string)
    legacy_blend = try_parse_legacy_blend(
        positive_prompt_string, skip_normalize_legacy_blend
    )
    positive_prompt: Union[FlattenedPrompt, Blend]
    if legacy_blend is not None:
        positive_prompt = legacy_blend
    else:
        positive_prompt = Compel.parse_prompt_string(positive_prompt_string)
    negative_prompt: Union[FlattenedPrompt, Blend] = Compel.parse_prompt_string(
        negative_prompt_string
    )

    if log_tokens or getattr(Globals, "log_tokenization", False):
        log_tokenization(positive_prompt, negative_prompt, tokenizer=tokenizer)

    c, options = compel.build_conditioning_tensor_for_prompt_object(positive_prompt)
    uc, _ = compel.build_conditioning_tensor_for_prompt_object(negative_prompt)
    [c, uc] = compel.pad_conditioning_tensors_to_same_length([c, uc])

    tokens_count = get_max_token_count(tokenizer, positive_prompt)

    ec = InvokeAIDiffuserComponent.ExtraConditioningInfo(
        tokens_count_including_eos_bos=tokens_count,
        cross_attention_control_args=options.get("cross_attention_control", None),
    )
    return uc, c, ec


def get_prompt_structure(
    prompt_string, skip_normalize_legacy_blend: bool = False
) -> (Union[FlattenedPrompt, Blend], FlattenedPrompt):
    (
        positive_prompt_string,
        negative_prompt_string,
    ) = split_prompt_to_positive_and_negative(prompt_string)
    legacy_blend = try_parse_legacy_blend(
        positive_prompt_string, skip_normalize_legacy_blend
    )
    positive_prompt: Union[FlattenedPrompt, Blend]
    if legacy_blend is not None:
        positive_prompt = legacy_blend
    else:
        positive_prompt = Compel.parse_prompt_string(positive_prompt_string)
    negative_prompt: Union[FlattenedPrompt, Blend] = Compel.parse_prompt_string(
        negative_prompt_string
    )

    return positive_prompt, negative_prompt


def get_max_token_count(
    tokenizer, prompt: Union[FlattenedPrompt, Blend], truncate_if_too_long=False
) -> int:
    if type(prompt) is Blend:
        blend: Blend = prompt
        return max(
            [
                get_max_token_count(tokenizer, c, truncate_if_too_long)
                for c in blend.prompts
            ]
        )
    else:
        return len(
            get_tokens_for_prompt_object(tokenizer, prompt, truncate_if_too_long)
        )


def get_tokens_for_prompt_object(
    tokenizer, parsed_prompt: FlattenedPrompt, truncate_if_too_long=True
) -> [str]:
    if type(parsed_prompt) is Blend:
        raise ValueError(
            "Blend is not supported here - you need to get tokens for each of its .children"
        )

    text_fragments = [
        x.text
        if type(x) is Fragment
        else (
            " ".join([f.text for f in x.original])
            if type(x) is CrossAttentionControlSubstitute
            else str(x)
        )
        for x in parsed_prompt.children
    ]
    text = " ".join(text_fragments)
    tokens = tokenizer.tokenize(text)
    if truncate_if_too_long:
        max_tokens_length = tokenizer.model_max_length - 2  # typically 75
        tokens = tokens[0:max_tokens_length]
    return tokens


def split_prompt_to_positive_and_negative(prompt_string_uncleaned: str):
    unconditioned_words = ""
    unconditional_regex = r"\[(.*?)\]"
    unconditionals = re.findall(unconditional_regex, prompt_string_uncleaned)
    if len(unconditionals) > 0:
        unconditioned_words = " ".join(unconditionals)

        # Remove Unconditioned Words From Prompt
        unconditional_regex_compile = re.compile(unconditional_regex)
        clean_prompt = unconditional_regex_compile.sub(" ", prompt_string_uncleaned)
        prompt_string_cleaned = re.sub(" +", " ", clean_prompt)
    else:
        prompt_string_cleaned = prompt_string_uncleaned
    return prompt_string_cleaned, unconditioned_words


def log_tokenization(
    positive_prompt: Union[Blend, FlattenedPrompt],
    negative_prompt: Union[Blend, FlattenedPrompt],
    tokenizer,
):
    print(f"\n>> [TOKENLOG] Parsed Prompt: {positive_prompt}")
    print(f"\n>> [TOKENLOG] Parsed Negative Prompt: {negative_prompt}")

    log_tokenization_for_prompt_object(positive_prompt, tokenizer)
    log_tokenization_for_prompt_object(
        negative_prompt, tokenizer, display_label_prefix="(negative prompt)"
    )


def log_tokenization_for_prompt_object(
    p: Union[Blend, FlattenedPrompt], tokenizer, display_label_prefix=None
):
    display_label_prefix = display_label_prefix or ""
    if type(p) is Blend:
        blend: Blend = p
        for i, c in enumerate(blend.prompts):
            log_tokenization_for_prompt_object(
                c,
                tokenizer,
                display_label_prefix=f"{display_label_prefix}(blend part {i + 1}, weight={blend.weights[i]})",
            )
    elif type(p) is FlattenedPrompt:
        flattened_prompt: FlattenedPrompt = p
        if flattened_prompt.wants_cross_attention_control:
            original_fragments = []
            edited_fragments = []
            for f in flattened_prompt.children:
                if type(f) is CrossAttentionControlSubstitute:
                    original_fragments += f.original
                    edited_fragments += f.edited
                else:
                    original_fragments.append(f)
                    edited_fragments.append(f)

            original_text = " ".join([x.text for x in original_fragments])
            log_tokenization_for_text(
                original_text,
                tokenizer,
                display_label=f"{display_label_prefix}(.swap originals)",
            )
            edited_text = " ".join([x.text for x in edited_fragments])
            log_tokenization_for_text(
                edited_text,
                tokenizer,
                display_label=f"{display_label_prefix}(.swap replacements)",
            )
        else:
            text = " ".join([x.text for x in flattened_prompt.children])
            log_tokenization_for_text(
                text, tokenizer, display_label=display_label_prefix
            )


def log_tokenization_for_text(text, tokenizer, display_label=None, truncate_if_too_long=False):
    """shows how the prompt is tokenized
    # usually tokens have '</w>' to indicate end-of-word,
    # but for readability it has been replaced with ' '
    """
    tokens = tokenizer.tokenize(text)
    tokenized = ""
    discarded = ""
    usedTokens = 0
    totalTokens = len(tokens)

    for i in range(0, totalTokens):
        token = tokens[i].replace("</w>", " ")
        # alternate color
        s = (usedTokens % 6) + 1
        if truncate_if_too_long and i >= tokenizer.model_max_length:
            discarded = discarded + f"\x1b[0;3{s};40m{token}"
        else:
            tokenized = tokenized + f"\x1b[0;3{s};40m{token}"
            usedTokens += 1

    if usedTokens > 0:
        print(f'\n>> [TOKENLOG] Tokens {display_label or ""} ({usedTokens}):')
        print(f"{tokenized}\x1b[0m")

    if discarded != "":
        print(f"\n>> [TOKENLOG] Tokens Discarded ({totalTokens - usedTokens}):")
        print(f"{discarded}\x1b[0m")


def try_parse_legacy_blend(text: str, skip_normalize: bool = False) -> Optional[Blend]:
    weighted_subprompts = split_weighted_subprompts(text, skip_normalize=skip_normalize)
    if len(weighted_subprompts) <= 1:
        return None
    strings = [x[0] for x in weighted_subprompts]
    weights = [x[1] for x in weighted_subprompts]

    pp = PromptParser()
    parsed_conjunctions = [pp.parse_conjunction(x) for x in strings]
    flattened_prompts = [x.prompts[0] for x in parsed_conjunctions]

    return Blend(
        prompts=flattened_prompts, weights=weights, normalize_weights=not skip_normalize
    )


def split_weighted_subprompts(text, skip_normalize=False) -> list:
    """
    Legacy blend parsing.

    grabs all text up to the first occurrence of ':'
    uses the grabbed text as a sub-prompt, and takes the value following ':' as weight
    if ':' has no value defined, defaults to 1.0
    repeats until no text remaining
    """
    prompt_parser = re.compile(
        """
            (?P<prompt>     # capture group for 'prompt'
            (?:\\\:|[^:])+  # match one or more non ':' characters or escaped colons '\:'
            )               # end 'prompt'
            (?:             # non-capture group
            :+              # match one or more ':' characters
            (?P<weight>     # capture group for 'weight'
            -?\d+(?:\.\d+)? # match positive or negative integer or decimal number
            )?              # end weight capture group, make optional
            \s*             # strip spaces after weight
            |               # OR
            $               # else, if no ':' then match end of line
            )               # end non-capture group
            """,
        re.VERBOSE,
    )
    parsed_prompts = [
        (match.group("prompt").replace("\\:", ":"), float(match.group("weight") or 1))
        for match in re.finditer(prompt_parser, text)
    ]
    if skip_normalize:
        return parsed_prompts
    weight_sum = sum(map(lambda x: x[1], parsed_prompts))
    if weight_sum == 0:
        print(
            "* Warning: Subprompt weights add up to zero. Discarding and using even weights instead."
        )
        equal_weight = 1 / max(len(parsed_prompts), 1)
        return [(x[0], equal_weight) for x in parsed_prompts]
    return [(x[0], x[1] / weight_sum) for x in parsed_prompts]