Merge branch 'main' into bugfix/embed-loading-messages
@@ -21,7 +21,7 @@ import os
 import re
 import torch
 from pathlib import Path
-from ldm.invoke.globals import Globals
+from ldm.invoke.globals import Globals, global_cache_dir
 from safetensors.torch import load_file
 
 try:
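The only change in this hunk is pulling `global_cache_dir` into scope so the converter can point Hugging Face downloads at InvokeAI's own model cache instead of the default `~/.cache/huggingface` location. Below is a minimal sketch of how such a helper gets threaded into `from_pretrained` calls; the body of `global_cache_dir` here is a hypothetical stand-in (the real one lives in `ldm.invoke.globals` and may resolve the root differently), and `INVOKEAI_ROOT` is an assumed environment variable.

```python
# Hypothetical stand-in for ldm.invoke.globals.global_cache_dir, shown only to
# illustrate how a shared cache root gets routed into from_pretrained calls.
import os
from pathlib import Path

from transformers import CLIPTextModel


def global_cache_dir(subfolder: str = "hub") -> Path:
    # Assumption: the cache root comes from an environment variable, with a
    # home-directory fallback; the real helper may resolve it differently.
    root = os.environ.get("INVOKEAI_ROOT", os.path.expanduser("~/invokeai"))
    return Path(root) / "models" / subfolder


# Every download then lands under the same root instead of ~/.cache/huggingface:
text_model = CLIPTextModel.from_pretrained(
    "openai/clip-vit-large-patch14", cache_dir=global_cache_dir("hub")
)
```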
@@ -637,7 +637,7 @@ def convert_ldm_bert_checkpoint(checkpoint, config):
 
 
 def convert_ldm_clip_checkpoint(checkpoint):
-    text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+    text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14",cache_dir=global_cache_dir('hub'))
 
     keys = list(checkpoint.keys())
 
@@ -677,7 +677,8 @@ textenc_pattern = re.compile("|".join(protected.keys()))
 
 
 def convert_paint_by_example_checkpoint(checkpoint):
-    config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14")
+    cache_dir = global_cache_dir('hub')
+    config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14",cache_dir=cache_dir)
     model = PaintByExampleImageEncoder(config)
 
     keys = list(checkpoint.keys())
@@ -744,7 +745,8 @@ def convert_paint_by_example_checkpoint(checkpoint):
 
 
 def convert_open_clip_checkpoint(checkpoint):
-    text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
+    cache_dir=global_cache_dir('hub')
+    text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder", cache_dir=cache_dir)
 
     keys = list(checkpoint.keys())
 
@@ -795,6 +797,7 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
     ):
 
     checkpoint = load_file(checkpoint_path) if Path(checkpoint_path).suffix == '.safetensors' else torch.load(checkpoint_path)
+    cache_dir = global_cache_dir('hub')
 
     # Sometimes models don't have the global_step item
     if "global_step" in checkpoint:
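This hunk resolves the cache location once, right after the checkpoint is loaded, so the branches further down can all reuse the same `cache_dir`. A minimal sketch of that "resolve once, pass everywhere" pattern follows; `load_shared_components` is a hypothetical helper for illustration, not code from the repository.

```python
# Sketch of the pattern introduced here: resolve the cache directory once,
# then hand it to every from_pretrained call so all branches share one cache.
from transformers import AutoFeatureExtractor, CLIPTokenizer


def load_shared_components(cache_dir):
    # Both downloads resolve against the same cache directory, so a second
    # conversion run reuses the files instead of fetching them again.
    tokenizer = CLIPTokenizer.from_pretrained(
        "openai/clip-vit-large-patch14", cache_dir=cache_dir
    )
    feature_extractor = AutoFeatureExtractor.from_pretrained(
        "CompVis/stable-diffusion-safety-checker", cache_dir=cache_dir
    )
    return tokenizer, feature_extractor
```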
@@ -904,7 +907,7 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
 
     if model_type == "FrozenOpenCLIPEmbedder":
         text_model = convert_open_clip_checkpoint(checkpoint)
-        tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer")
+        tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer",cache_dir=global_cache_dir('diffusers'))
         pipe = StableDiffusionPipeline(
             vae=vae,
             text_encoder=text_model,
@@ -917,8 +920,8 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
         )
     elif model_type == "PaintByExample":
         vision_model = convert_paint_by_example_checkpoint(checkpoint)
-        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-        feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
+        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14",cache_dir=cache_dir)
+        feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker",cache_dir=cache_dir)
         pipe = PaintByExamplePipeline(
             vae=vae,
             image_encoder=vision_model,
@@ -929,9 +932,9 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
         )
     elif model_type in ['FrozenCLIPEmbedder','WeightedFrozenCLIPEmbedder']:
         text_model = convert_ldm_clip_checkpoint(checkpoint)
-        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-        safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
-        feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
+        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14",cache_dir=cache_dir)
+        safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker",cache_dir=cache_dir)
+        feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker",cache_dir=cache_dir)
         pipe = StableDiffusionPipeline(
             vae=vae,
             text_encoder=text_model,
@@ -944,7 +947,7 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
     else:
         text_config = create_ldm_bert_config(original_config)
         text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
-        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
+        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased",cache_dir=cache_dir)
         pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
 
     pipe.save_pretrained(
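Taken together, the hunks above route every auxiliary download in the conversion path (tokenizer, text encoder, safety checker, feature extractor) through the InvokeAI-managed cache; the one outlier is the FrozenOpenCLIPEmbedder branch, whose tokenizer uses `global_cache_dir('diffusers')` rather than `'hub'`. To confirm where the files ended up, something like the sketch below works with recent huggingface_hub-style caches, which store each repo as a `models--<org>--<name>` directory; older library versions may lay the cache out differently.

```python
# List the repositories cached under a given cache_dir (huggingface_hub-style
# layout assumed; adjust the glob if your installed version differs).
from pathlib import Path


def list_cached_repos(cache_dir) -> list[str]:
    return sorted(p.name for p in Path(cache_dir).glob("models--*"))


# After a conversion run this might include, for example,
# 'models--openai--clip-vit-large-patch14'.
```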
@@ -1,18 +1,16 @@
 import math
 import os.path
+from functools import partial
+from typing import Optional
 
+import clip
+import kornia
 import torch
 import torch.nn as nn
-from functools import partial
-import clip
-from einops import rearrange, repeat
+from einops import repeat
 from transformers import CLIPTokenizer, CLIPTextModel
-import kornia
-from ldm.invoke.devices import choose_torch_device
-from ldm.invoke.globals import Globals, global_cache_dir
-#from ldm.modules.textual_inversion_manager import TextualInversionManager
 
+from ldm.invoke.devices import choose_torch_device
+from ldm.invoke.globals import global_cache_dir
 from ldm.modules.x_transformer import (
     Encoder,
     TransformerWrapper,
@@ -654,21 +652,22 @@ class WeightedFrozenCLIPEmbedder(FrozenCLIPEmbedder):
             per_token_weights += [weight] * len(this_fragment_token_ids)
 
         # leave room for bos/eos
-        if len(all_token_ids) > self.max_length - 2:
-            excess_token_count = len(all_token_ids) - self.max_length - 2
+        max_token_count_without_bos_eos_markers = self.max_length - 2
+        if len(all_token_ids) > max_token_count_without_bos_eos_markers:
+            excess_token_count = len(all_token_ids) - max_token_count_without_bos_eos_markers
             # TODO build nice description string of how the truncation was applied
             # this should be done by calling self.tokenizer.convert_ids_to_tokens() then passing the result to
             # self.tokenizer.convert_tokens_to_string() for the token_ids on each side of the truncation limit.
             print(f">> Prompt is {excess_token_count} token(s) too long and has been truncated")
-            all_token_ids = all_token_ids[0:self.max_length]
-            per_token_weights = per_token_weights[0:self.max_length]
+            all_token_ids = all_token_ids[0:max_token_count_without_bos_eos_markers]
+            per_token_weights = per_token_weights[0:max_token_count_without_bos_eos_markers]
 
-        # pad out to a 77-entry array: [eos_token, <prompt tokens>, eos_token, ..., eos_token]
+        # pad out to a 77-entry array: [bos_token, <prompt tokens>, eos_token, pad_token…]
         # (77 = self.max_length)
         all_token_ids = [self.tokenizer.bos_token_id] + all_token_ids + [self.tokenizer.eos_token_id]
         per_token_weights = [1.0] + per_token_weights + [1.0]
         pad_length = self.max_length - len(all_token_ids)
-        all_token_ids += [self.tokenizer.eos_token_id] * pad_length
+        all_token_ids += [self.tokenizer.pad_token_id] * pad_length
         per_token_weights += [1.0] * pad_length
 
         all_token_ids_tensor = torch.tensor(all_token_ids, dtype=torch.long).to(self.device)
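The substantive fix in this hunk is the truncation arithmetic. The old `excess_token_count = len(all_token_ids) - self.max_length - 2` parses as `(len - max_length) - 2` and under-reports the overflow, and the old slice kept `self.max_length` tokens even though bos and eos still had to be prepended and appended. The new code names the real budget (`max_length - 2`) and truncates to it; padding now uses the tokenizer's pad token rather than repeated eos tokens. A worked example with made-up numbers (not taken from the repository):

```python
# Worked example of the truncation fix, using made-up token counts.
max_length = 77
token_count = 80                                  # prompt tokens before bos/eos

old_excess = token_count - max_length - 2         # (80 - 77) - 2 == 1, too low
max_without_bos_eos = max_length - 2              # 75 slots for prompt tokens
new_excess = token_count - max_without_bos_eos    # 80 - 75 == 5, the real excess

assert (old_excess, new_excess) == (1, 5)

# The new slice keeps 75 tokens, so that [bos] + tokens + [eos] is exactly 77
# entries; the old slice kept 77 and only then added the two special tokens.
```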
@@ -3,8 +3,9 @@ import math
 import torch
 from transformers import CLIPTokenizer, CLIPTextModel
 
-from ldm.modules.textual_inversion_manager import TextualInversionManager
+from ldm.invoke.devices import torch_dtype
+from ldm.modules.textual_inversion_manager import TextualInversionManager
 
 
 class WeightedPromptFragmentsToEmbeddingsConverter():
 
@@ -22,8 +23,8 @@ class WeightedPromptFragmentsToEmbeddingsConverter():
         return self.tokenizer.model_max_length
 
     def get_embeddings_for_weighted_prompt_fragments(self,
-                                                     text: list[str],
-                                                     fragment_weights: list[float],
+                                                     text: list[list[str]],
+                                                     fragment_weights: list[list[float]],
                                                      should_return_tokens: bool = False,
                                                      device='cpu'
                                                      ) -> torch.Tensor:
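The new annotations make the batched shape explicit: one inner list of fragments, and one matching inner list of weights, per prompt in the batch. A hypothetical call shape is sketched below; the surrounding converter setup is unchanged and not shown here, so the call itself is left commented.

```python
# Hypothetical inputs matching the new list[list[...]] signature: one inner
# list of fragments and one of weights per prompt in the batch.
text = [["a cat", " sitting on a mat"], ["an oil painting of a ship"]]
fragment_weights = [[1.0, 0.8], [1.0]]

# embeddings = converter.get_embeddings_for_weighted_prompt_fragments(
#     text, fragment_weights, device="cpu")
```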
@@ -198,12 +199,12 @@ class WeightedPromptFragmentsToEmbeddingsConverter():
             all_token_ids = all_token_ids[0:max_token_count_without_bos_eos_markers]
             per_token_weights = per_token_weights[0:max_token_count_without_bos_eos_markers]
 
-        # pad out to a self.max_length-entry array: [eos_token, <prompt tokens>, eos_token, ..., eos_token]
+        # pad out to a self.max_length-entry array: [bos_token, <prompt tokens>, eos_token, pad_token…]
         # (typically self.max_length == 77)
         all_token_ids = [self.tokenizer.bos_token_id] + all_token_ids + [self.tokenizer.eos_token_id]
         per_token_weights = [1.0] + per_token_weights + [1.0]
         pad_length = self.max_length - len(all_token_ids)
-        all_token_ids += [self.tokenizer.eos_token_id] * pad_length
+        all_token_ids += [self.tokenizer.pad_token_id] * pad_length
         per_token_weights += [1.0] * pad_length
 
         all_token_ids_tensor = torch.tensor(all_token_ids, dtype=torch.long, device=device)
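The same padding change lands here: once bos and eos are attached, the rest of the `self.max_length`-slot array is filled with the tokenizer's pad token instead of repeated eos tokens. Below is a short sketch of the resulting `[bos, <prompt tokens>, eos, pad, ...]` layout using the public CLIP tokenizer; fetching it requires network access, and the prompt text is arbitrary.

```python
# Sketch of the padded layout described by the updated comments.
from transformers import CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
max_length = tokenizer.model_max_length                # 77 for this tokenizer

prompt_ids = tokenizer("a photograph of an astronaut",
                       add_special_tokens=False)["input_ids"]
token_ids = [tokenizer.bos_token_id] + prompt_ids + [tokenizer.eos_token_id]
token_ids += [tokenizer.pad_token_id] * (max_length - len(token_ids))

assert len(token_ids) == max_length
assert token_ids[0] == tokenizer.bos_token_id
```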