From a3c22b5fe6864a2247e42e8f52770ad65e35ce82 Mon Sep 17 00:00:00 2001
From: Sergey Borisov
Date: Sun, 25 Jun 2023 21:06:22 +0300
Subject: [PATCH] Remove upcast_attention and prediction_type from stable
 diffusion model logic, fix ckpt conversion accordingly

---
 .../convert_ckpt_to_diffusers.py | 47 ++++++++--------
 .../models/stable_diffusion.py   | 56 +++++--------------
 2 files changed, 35 insertions(+), 68 deletions(-)

diff --git a/invokeai/backend/model_management/convert_ckpt_to_diffusers.py b/invokeai/backend/model_management/convert_ckpt_to_diffusers.py
index 5d097f5a4e..1eeee92fb7 100644
--- a/invokeai/backend/model_management/convert_ckpt_to_diffusers.py
+++ b/invokeai/backend/model_management/convert_ckpt_to_diffusers.py
@@ -30,7 +30,7 @@ from invokeai.app.services.config import InvokeAIAppConfig
 from .model_manager import ModelManager
 from .model_cache import ModelCache
-from .models import SchedulerPredictionType, BaseModelType, ModelVariantType
+from .models import BaseModelType, ModelVariantType
 
 try:
     from omegaconf import OmegaConf
@@ -73,7 +73,9 @@ from transformers import (
 
 from ..stable_diffusion import StableDiffusionGeneratorPipeline
 
-MODEL_ROOT = None
+# TODO: redo in future
+#CONVERT_MODEL_ROOT = InvokeAIAppConfig.get_config().models_path / "core" / "convert"
+CONVERT_MODEL_ROOT = InvokeAIAppConfig.get_config().root_path / "models" / "core" / "convert"
 
 def shave_segments(path, n_shave_prefix_segments=1):
     """
@@ -605,7 +607,7 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
     else:
         vae_state_dict = checkpoint
 
-    new_checkpoint = convert_ldm_vae_state_dict(vae_state_dict,config)
+    new_checkpoint = convert_ldm_vae_state_dict(vae_state_dict, config)
     return new_checkpoint
 
 def convert_ldm_vae_state_dict(vae_state_dict, config):
@@ -828,7 +830,7 @@ def convert_ldm_bert_checkpoint(checkpoint, config):
 
 
 def convert_ldm_clip_checkpoint(checkpoint):
-    text_model = CLIPTextModel.from_pretrained(MODEL_ROOT / 'clip-vit-large-patch14')
+    text_model = CLIPTextModel.from_pretrained(CONVERT_MODEL_ROOT / 'clip-vit-large-patch14')
     keys = list(checkpoint.keys())
 
     text_model_dict = {}
@@ -882,7 +884,7 @@ textenc_pattern = re.compile("|".join(protected.keys()))
 
 def convert_open_clip_checkpoint(checkpoint):
     text_model = CLIPTextModel.from_pretrained(
-        MODEL_ROOT / 'stable-diffusion-2-clip',
+        CONVERT_MODEL_ROOT / 'stable-diffusion-2-clip',
         subfolder='text_encoder',
     )
@@ -949,7 +951,7 @@ def convert_open_clip_checkpoint(checkpoint):
 
     return text_model
 
-def replace_checkpoint_vae(checkpoint, vae_path:str):
+def replace_checkpoint_vae(checkpoint, vae_path: str):
     if vae_path.endswith(".safetensors"):
         vae_ckpt = load_file(vae_path)
     else:
@@ -959,7 +961,7 @@ def replace_checkpoint_vae(checkpoint, vae_path:str):
         new_key = f'first_stage_model.{vae_key}'
         checkpoint[new_key] = state_dict[vae_key]
 
-def convert_ldm_vae_to_diffusers(checkpoint, vae_config: DictConfig, image_size: int)->AutoencoderKL:
+def convert_ldm_vae_to_diffusers(checkpoint, vae_config: DictConfig, image_size: int) -> AutoencoderKL:
     vae_config = create_vae_diffusers_config(
         vae_config, image_size=image_size
     )
@@ -979,8 +981,6 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
     original_config_file: str,
     extract_ema: bool = True,
     precision: torch.dtype = torch.float32,
-    upcast_attention: bool = False,
-    prediction_type: SchedulerPredictionType = SchedulerPredictionType.Epsilon,
     scan_needed: bool = True,
 ) -> StableDiffusionPipeline:
     """
@@ -994,8 +994,6 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
     :param checkpoint_path: Path to `.ckpt` file.
     :param original_config_file: Path to `.yaml` config file corresponding to the original architecture. If `None`,
         will be automatically inferred by looking for a key that only exists in SD2.0 models.
-    :param prediction_type: The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion
-        v1.X and Stable Diffusion v2 Base. Use `'v-prediction'` for Stable Diffusion v2.
     :param scheduler_type: Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm", "ddim"]`.
     :param model_type: The pipeline type. `None` to automatically infer, or one of `["FrozenOpenCLIPEmbedder", "FrozenCLIPEmbedder"]`.
     :param extract_ema: Only relevant for
@@ -1003,17 +1001,16 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
         or not. Defaults to `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher
         quality images for inference. Non-EMA weights are usually better to continue fine-tuning.
     :param precision: precision to use - torch.float16, torch.float32 or torch.autocast
-    :param upcast_attention: Whether the attention computation should always be upcasted. This is necessary when
-        running stable diffusion 2.1.
     """
-    config = InvokeAIAppConfig.get_config()
+    if not isinstance(checkpoint_path, Path):
+        checkpoint_path = Path(checkpoint_path)
 
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         verbosity = dlogging.get_verbosity()
         dlogging.set_verbosity_error()
 
-        if str(checkpoint_path).endswith(".safetensors"):
+        if checkpoint_path.suffix == ".safetensors":
             checkpoint = load_file(checkpoint_path)
         else:
             if scan_needed:
@@ -1026,9 +1023,13 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
 
         original_config = OmegaConf.load(original_config_file)
 
-        if model_version == BaseModelType.StableDiffusion2 and prediction_type == SchedulerPredictionType.VPrediction:
+        if model_version == BaseModelType.StableDiffusion2 and original_config["model"]["params"]["parameterization"] == "v":
+            prediction_type = "v_prediction"
+            upcast_attention = True
             image_size = 768
         else:
+            prediction_type = "epsilon"
+            upcast_attention = False
             image_size = 512
 
         #
@@ -1083,7 +1084,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
         if model_type == "FrozenOpenCLIPEmbedder":
             text_model = convert_open_clip_checkpoint(checkpoint)
             tokenizer = CLIPTokenizer.from_pretrained(
-                MODEL_ROOT / 'stable-diffusion-2-clip',
+                CONVERT_MODEL_ROOT / 'stable-diffusion-2-clip',
                 subfolder='tokenizer',
             )
             pipe = StableDiffusionPipeline(
@@ -1099,9 +1100,9 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
 
         elif model_type in ["FrozenCLIPEmbedder", "WeightedFrozenCLIPEmbedder"]:
             text_model = convert_ldm_clip_checkpoint(checkpoint)
-            tokenizer = CLIPTokenizer.from_pretrained(MODEL_ROOT / 'clip-vit-large-patch14')
-            safety_checker = StableDiffusionSafetyChecker.from_pretrained(MODEL_ROOT / 'stable-diffusion-safety-checker')
-            feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ROOT / 'stable-diffusion-safety-checker')
+            tokenizer = CLIPTokenizer.from_pretrained(CONVERT_MODEL_ROOT / 'clip-vit-large-patch14')
+            safety_checker = StableDiffusionSafetyChecker.from_pretrained(CONVERT_MODEL_ROOT / 'stable-diffusion-safety-checker')
+            feature_extractor = AutoFeatureExtractor.from_pretrained(CONVERT_MODEL_ROOT / 'stable-diffusion-safety-checker')
             pipe = StableDiffusionPipeline(
                 vae=vae.to(precision),
                 text_encoder=text_model.to(precision),
@@ -1115,7 +1116,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
         else:
             text_config = create_ldm_bert_config(original_config)
             text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
-            tokenizer = BertTokenizerFast.from_pretrained(MODEL_ROOT / "bert-base-uncased")
+            tokenizer = BertTokenizerFast.from_pretrained(CONVERT_MODEL_ROOT / "bert-base-uncased")
             pipe = LDMTextToImagePipeline(
                 vqvae=vae,
                 bert=text_model,
@@ -1131,7 +1132,6 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
 def convert_ckpt_to_diffusers(
     checkpoint_path: Union[str, Path],
     dump_path: Union[str, Path],
-    model_root: Union[str, Path],
     **kwargs,
 ):
     """
@@ -1139,9 +1139,6 @@ def convert_ckpt_to_diffusers(
     and in addition a path-like object indicating the location of the desired diffusers
     model to be written.
     """
-    # setting global here to avoid massive changes late at night
-    global MODEL_ROOT
-    MODEL_ROOT = Path(model_root) / 'core/convert'
     pipe = load_pipeline_from_original_stable_diffusion_ckpt(checkpoint_path, **kwargs)
 
     pipe.save_pretrained(
diff --git a/invokeai/backend/model_management/models/stable_diffusion.py b/invokeai/backend/model_management/models/stable_diffusion.py
index a269ae12e2..f5112dfebb 100644
--- a/invokeai/backend/model_management/models/stable_diffusion.py
+++ b/invokeai/backend/model_management/models/stable_diffusion.py
@@ -34,7 +34,7 @@ class StableDiffusion1Model(DiffusersModel):
     class CheckpointConfig(ModelConfigBase):
         model_format: Literal[StableDiffusion1ModelFormat.Checkpoint]
         vae: Optional[str] = Field(None)
-        config: Optional[str] = Field(None)
+        config: str
         variant: ModelVariantType
@@ -81,6 +81,8 @@ class StableDiffusion1Model(DiffusersModel):
         else:
             raise Exception("Unknown stable diffusion 1.* model format")
 
+        if ckpt_config_path is None:
+            ckpt_config_path = _select_ckpt_config(BaseModelType.StableDiffusion1, variant)
 
         return cls.create_config(
             path=path,
@@ -116,7 +118,7 @@ class StableDiffusion1Model(DiffusersModel):
                 version=BaseModelType.StableDiffusion1,
                 model_config=config,
                 output_path=output_path,
-            ) # TODO: args
+            )
         else:
             return model_path
@@ -183,13 +185,8 @@ class StableDiffusion2Model(DiffusersModel):
         else:
             raise Exception("Unknown stable diffusion 2.* model format")
 
-        if variant == ModelVariantType.Normal:
-            prediction_type = SchedulerPredictionType.VPrediction
-            upcast_attention = True
-
-        else:
-            prediction_type = SchedulerPredictionType.Epsilon
-            upcast_attention = False
+        if ckpt_config_path is None:
+            ckpt_config_path = _select_ckpt_config(BaseModelType.StableDiffusion2, variant)
 
         return cls.create_config(
             path=path,
             config=ckpt_config_path,
             variant=variant,
-            prediction_type=prediction_type,
-            upcast_attention=upcast_attention,
         )
 
     @classproperty
@@ -227,7 +222,7 @@ class StableDiffusion2Model(DiffusersModel):
                 version=BaseModelType.StableDiffusion2,
                 model_config=config,
                 output_path=output_path,
-            ) # TODO: args
+            )
         else:
             return model_path
@@ -238,18 +233,18 @@ def _select_ckpt_config(version: BaseModelType, variant: ModelVariantType):
             ModelVariantType.Inpaint: "v1-inpainting-inference.yaml",
         },
         BaseModelType.StableDiffusion2: {
-            # code further will manually set upcast_attention and v_prediction
-            ModelVariantType.Normal: "v2-inference.yaml",
+            ModelVariantType.Normal: "v2-inference-v.yaml", # best guess, as we can't differentiate with base(512)
             ModelVariantType.Inpaint: "v2-inpainting-inference.yaml",
             ModelVariantType.Depth: "v2-midas-inference.yaml",
         }
     }
 
+    app_config = InvokeAIAppConfig.get_config()
     try:
-        # TODO: path
-        #model_config.config = app_config.config_dir / "stable-diffusion" / ckpt_configs[version][model_config.variant]
-        #return InvokeAIAppConfig.get_config().legacy_conf_dir / ckpt_configs[version][variant]
-        return InvokeAIAppConfig.get_config().root_dir / "configs" / "stable-diffusion" / ckpt_configs[version][variant]
+        config_path = app_config.legacy_conf_path / ckpt_configs[version][variant]
+        if config_path.is_relative_to(app_config.root_path):
+            config_path = config_path.relative_to(app_config.root_path)
+        return str(config_path)
     except:
         return None
@@ -268,36 +263,14 @@ def _convert_ckpt_and_cache(
     """
     app_config = InvokeAIAppConfig.get_config()
 
-    if model_config.config is None:
-        model_config.config = _select_ckpt_config(version, model_config.variant)
-        if model_config.config is None:
-            raise Exception(f"Model variant {model_config.variant} not supported for {version}")
-
     weights = app_config.root_dir / model_config.path
     config_file = app_config.root_dir / model_config.config
     output_path = Path(output_path)
 
-    if version == BaseModelType.StableDiffusion1:
-        upcast_attention = False
-        prediction_type = SchedulerPredictionType.Epsilon
-
-    elif version == BaseModelType.StableDiffusion2:
-        upcast_attention = model_config.upcast_attention
-        prediction_type = model_config.prediction_type
-
-    else:
-        raise Exception(f"Unknown model provided: {version}")
-
     # return cached version if it exists
     if output_path.exists():
        return output_path
 
-    # TODO: I think that it more correctly to convert with embedded vae
-    # as if user will delete custom vae he will got not embedded but also custom vae
-    #vae_ckpt_path, vae_model = self._get_vae_for_conversion(weights, mconfig)
-
     # to avoid circular import errors
     from ..convert_ckpt_to_diffusers import convert_ckpt_to_diffusers
 
     with SilenceWarnings():
@@ -308,9 +281,6 @@ def _convert_ckpt_and_cache(
             model_variant=model_config.variant,
             original_config_file=config_file,
             extract_ema=True,
-            upcast_attention=upcast_attention,
-            prediction_type=prediction_type,
             scan_needed=True,
-            model_root=app_config.models_path,
         )
     return output_path
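
Notes (reviewer sketches, not part of the patch):

The behavioral core of this change is that prediction_type and upcast_attention
are no longer stored on the SD2 model config; they are derived at conversion
time from the checkpoint's legacy YAML. A minimal sketch of that detection,
assuming a config shaped like the upstream v2-inference-v.yaml (the helper name
and example path below are illustrative, not part of the patch):

    from omegaconf import OmegaConf

    def infer_sd2_params(original_config_file: str) -> dict:
        # SD2-v (768px) configs declare `parameterization: "v"` under
        # model.params; the SD2-base (512px) config omits it (epsilon).
        config = OmegaConf.load(original_config_file)
        if config["model"]["params"].get("parameterization") == "v":
            # v-prediction checkpoints also get attention upcast, matching
            # the patch's assumption for SD 2.1-style models
            return {"prediction_type": "v_prediction", "upcast_attention": True, "image_size": 768}
        return {"prediction_type": "epsilon", "upcast_attention": False, "image_size": 512}

    # infer_sd2_params("configs/stable-diffusion/v2-inference-v.yaml")
    # -> {'prediction_type': 'v_prediction', 'upcast_attention': True, 'image_size': 768}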
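
The rewritten _select_ckpt_config stores the legacy config path relative to the
InvokeAI root whenever possible, and _convert_ckpt_and_cache re-anchors it with
app_config.root_dir, so stored configs survive a relocated root. A sketch of
that round-trip (the root and config locations are assumptions for
illustration; Path.is_relative_to requires Python 3.9+):

    from pathlib import Path

    root = Path("/opt/invokeai")                         # assumed root_path
    legacy_conf = root / "configs" / "stable-diffusion"  # assumed legacy_conf_path

    config_path = legacy_conf / "v2-inference-v.yaml"
    if config_path.is_relative_to(root):
        config_path = config_path.relative_to(root)

    print(config_path)         # configs/stable-diffusion/v2-inference-v.yaml
    print(root / config_path)  # absolute again, as _convert_ckpt_and_cache rebuilds it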
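
With model_root, upcast_attention and prediction_type dropped from the
conversion entry points, a caller now supplies only the checkpoint, an output
directory and the legacy YAML. A hypothetical invocation (the paths are made
up, and the keyword names are assumed from the function body and the call site
in _convert_ckpt_and_cache above):

    import torch

    from invokeai.backend.model_management.convert_ckpt_to_diffusers import (
        convert_ckpt_to_diffusers,
    )
    from invokeai.backend.model_management.models import BaseModelType, ModelVariantType

    convert_ckpt_to_diffusers(
        "models/checkpoints/v2-1_768-ema-pruned.safetensors",  # hypothetical input
        "models/converted/sd-2-1",                             # hypothetical output dir
        model_version=BaseModelType.StableDiffusion2,
        model_variant=ModelVariantType.Normal,
        original_config_file="configs/stable-diffusion/v2-inference-v.yaml",
        extract_ema=True,
        precision=torch.float16,
        scan_needed=True,
    )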