From d35ec3398d8255f5621ec29f8c0eb0ec8a7c3129 Mon Sep 17 00:00:00 2001
From: Kevin Turner <83819+keturn@users.noreply.github.com>
Date: Fri, 20 Jan 2023 19:23:12 -0800
Subject: [PATCH] fix: use pad_token for padding

Stable Diffusion does not use the eos_token for padding.
---
 ldm/modules/encoders/modules.py               | 18 ++++++++----------
 ldm/modules/prompt_to_embeddings_converter.py | 11 ++++++-----
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/ldm/modules/encoders/modules.py b/ldm/modules/encoders/modules.py
index aafb1299ad..6715b229f1 100644
--- a/ldm/modules/encoders/modules.py
+++ b/ldm/modules/encoders/modules.py
@@ -1,18 +1,16 @@
 import math
-import os.path
+from functools import partial
 from typing import Optional

+import clip
+import kornia
 import torch
 import torch.nn as nn
-from functools import partial
-import clip
-from einops import rearrange, repeat
+from einops import repeat
 from transformers import CLIPTokenizer, CLIPTextModel
-import kornia

-from ldm.invoke.devices import choose_torch_device
-from ldm.invoke.globals import Globals, global_cache_dir
-#from ldm.modules.textual_inversion_manager import TextualInversionManager
+from ldm.invoke.devices import choose_torch_device
+from ldm.invoke.globals import global_cache_dir
 from ldm.modules.x_transformer import (
     Encoder,
     TransformerWrapper,
@@ -663,12 +661,12 @@ class WeightedFrozenCLIPEmbedder(FrozenCLIPEmbedder):
             all_token_ids = all_token_ids[0:self.max_length]
             per_token_weights = per_token_weights[0:self.max_length]

-        # pad out to a 77-entry array: [eos_token, <prompt tokens>, eos_token, ..., eos_token]
+        # pad out to a 77-entry array: [bos_token, <prompt tokens>, eos_token, pad_token…]
         # (77 = self.max_length)
         all_token_ids = [self.tokenizer.bos_token_id] + all_token_ids + [self.tokenizer.eos_token_id]
         per_token_weights = [1.0] + per_token_weights + [1.0]
         pad_length = self.max_length - len(all_token_ids)
-        all_token_ids += [self.tokenizer.eos_token_id] * pad_length
+        all_token_ids += [self.tokenizer.pad_token_id] * pad_length
         per_token_weights += [1.0] * pad_length

         all_token_ids_tensor = torch.tensor(all_token_ids, dtype=torch.long).to(self.device)
diff --git a/ldm/modules/prompt_to_embeddings_converter.py b/ldm/modules/prompt_to_embeddings_converter.py
index ab989e4892..dea15d61b4 100644
--- a/ldm/modules/prompt_to_embeddings_converter.py
+++ b/ldm/modules/prompt_to_embeddings_converter.py
@@ -3,8 +3,9 @@ import math
 import torch
 from transformers import CLIPTokenizer, CLIPTextModel

-from ldm.modules.textual_inversion_manager import TextualInversionManager
 from ldm.invoke.devices import torch_dtype
+from ldm.modules.textual_inversion_manager import TextualInversionManager
+

 class WeightedPromptFragmentsToEmbeddingsConverter():

@@ -22,8 +23,8 @@ class WeightedPromptFragmentsToEmbeddingsConverter():
         return self.tokenizer.model_max_length

     def get_embeddings_for_weighted_prompt_fragments(self,
-                                 text: list[str],
-                                 fragment_weights: list[float],
+                                 text: list[list[str]],
+                                 fragment_weights: list[list[float]],
                                  should_return_tokens: bool = False,
                                  device='cpu'
                                 ) -> torch.Tensor:
@@ -198,12 +199,12 @@ class WeightedPromptFragmentsToEmbeddingsConverter():
             all_token_ids = all_token_ids[0:max_token_count_without_bos_eos_markers]
             per_token_weights = per_token_weights[0:max_token_count_without_bos_eos_markers]

-        # pad out to a self.max_length-entry array: [eos_token, <prompt tokens>, eos_token, ..., eos_token]
+        # pad out to a self.max_length-entry array: [bos_token, <prompt tokens>, eos_token, pad_token…]
         # (typically self.max_length == 77)
         all_token_ids = [self.tokenizer.bos_token_id] + all_token_ids + [self.tokenizer.eos_token_id]
         per_token_weights = [1.0] + per_token_weights + [1.0]
         pad_length = self.max_length - len(all_token_ids)
-        all_token_ids += [self.tokenizer.eos_token_id] * pad_length
+        all_token_ids += [self.tokenizer.pad_token_id] * pad_length
        per_token_weights += [1.0] * pad_length

         all_token_ids_tensor = torch.tensor(all_token_ids, dtype=torch.long, device=device)
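
Aside (not part of the patch): a minimal sketch of how to check which ids a Stable Diffusion CLIP tokenizer assigns to bos/eos/pad and to reproduce the padding scheme the patch uses. The checkpoint name is an assumption; any repo that ships a CLIPTokenizer in a "tokenizer" subfolder works. For many SD 1.x tokenizers the pad token is the same <|endoftext|> id as the eos token, in which case the change is behavior-preserving; printing the ids below shows which case applies for a given checkpoint.

    # Sketch only; checkpoint name is an assumption, not part of this patch.
    from transformers import CLIPTokenizer

    tokenizer = CLIPTokenizer.from_pretrained(
        "runwayml/stable-diffusion-v1-5", subfolder="tokenizer"
    )
    print(tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id)

    # Reproduce the padding scheme from the patch:
    # [bos_token, <prompt tokens>, eos_token, pad_token…] out to model_max_length (77 for CLIP).
    prompt_ids = tokenizer("a photograph of an astronaut",
                           add_special_tokens=False).input_ids
    all_token_ids = [tokenizer.bos_token_id] + prompt_ids + [tokenizer.eos_token_id]
    all_token_ids += [tokenizer.pad_token_id] * (tokenizer.model_max_length - len(all_token_ids))
    assert len(all_token_ids) == tokenizer.model_max_length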