diff --git a/ldm/invoke/CLI.py b/ldm/invoke/CLI.py
index 3a23f890e8..b433e063d1 100644
--- a/ldm/invoke/CLI.py
+++ b/ldm/invoke/CLI.py
@@ -44,11 +44,13 @@ def main():
             print('--max_loaded_models must be >= 1; using 1')
             args.max_loaded_models = 1
 
-    # alert - setting a global here
+    # alert - setting a few globals here
     Globals.try_patchmatch = args.patchmatch
     Globals.always_use_cpu = args.always_use_cpu
     Globals.internet_available = args.internet_available and check_internet()
     Globals.disable_xformers = not args.xformers
+    Globals.ckpt_convert = args.ckpt_convert
+
     print(f'>> Internet connectivity is {Globals.internet_available}')
 
     if not args.conf:
@@ -717,11 +719,16 @@ def optimize_model(model_name_or_path:str, gen, opt, completer):
         print(f'** {model_name_or_path} is already optimized. Will not overwrite. If this is an error, please remove the directory {diffuser_path} and try again.')
         return
 
+    vae = None
+    if input('Replace this model\'s VAE with "stabilityai/sd-vae-ft-mse"? [n] ').strip() in ('y','Y'):
+        vae = dict(repo_id='stabilityai/sd-vae-ft-mse')
+
     new_config = gen.model_manager.convert_and_import(
         ckpt_path,
         diffuser_path,
         model_name=model_name,
         model_description=model_description,
+        vae = vae,
         commit_to_conf=opt.conf,
     )
     if not new_config:
diff --git a/ldm/invoke/args.py b/ldm/invoke/args.py
index c918e4fba7..3904d2f573 100644
--- a/ldm/invoke/args.py
+++ b/ldm/invoke/args.py
@@ -503,6 +503,13 @@ class Args(object):
             help=f'Set model precision. Defaults to auto selected based on device. Options: {", ".join(PRECISION_CHOICES)}',
             default='auto',
         )
+        model_group.add_argument(
+            '--ckpt_convert',
+            action=argparse.BooleanOptionalAction,
+            dest='ckpt_convert',
+            default=False,
+            help='Load legacy ckpt files as diffusers. Pass --no-ckpt_convert to inhibit this behavior',
+        )
         model_group.add_argument(
             '--internet',
            action=argparse.BooleanOptionalAction,
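
Note on the new flag: argparse's BooleanOptionalAction (Python 3.9+) derives the negative form directly from the option string, so the opt-out spelling is --no-ckpt_convert. A minimal, self-contained sketch of the flag's behavior, separate from InvokeAI's Args class and for illustration only:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--ckpt_convert',
    action=argparse.BooleanOptionalAction,   # registers both --ckpt_convert and --no-ckpt_convert
    default=False,
)

print(parser.parse_args([]).ckpt_convert)                     # False (the default)
print(parser.parse_args(['--ckpt_convert']).ckpt_convert)     # True
print(parser.parse_args(['--no-ckpt_convert']).ckpt_convert)  # False
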
diff --git a/ldm/invoke/ckpt_to_diffuser.py b/ldm/invoke/ckpt_to_diffuser.py
index 9b1735f831..fe56051aa3 100644
--- a/ldm/invoke/ckpt_to_diffuser.py
+++ b/ldm/invoke/ckpt_to_diffuser.py
@@ -23,6 +23,7 @@ import torch
 from pathlib import Path
 from ldm.invoke.globals import Globals, global_cache_dir
 from safetensors.torch import load_file
+from typing import Union
 
 try:
     from omegaconf import OmegaConf
@@ -46,9 +47,11 @@ from diffusers import (
 )
 from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
 from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder, PaintByExamplePipeline
-from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+from diffusers.utils import is_safetensors_available
 from transformers import AutoFeatureExtractor, BertTokenizerFast, CLIPTextModel, CLIPTokenizer, CLIPVisionConfig
 
+from ldm.invoke.generator.diffusers_pipeline import StableDiffusionGeneratorPipeline
+
 def shave_segments(path, n_shave_prefix_segments=1):
     """
     Removes segments. Positive values shave the first segments, negative shave the last segments.
@@ -318,11 +321,10 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
     unet_key = "model.diffusion_model."
     # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
     if sum(k.startswith("model_ema") for k in keys) > 100:
-        print(f"Checkpoint {path} has both EMA and non-EMA weights.")
+        print(f" | Checkpoint {path} has both EMA and non-EMA weights.")
         if extract_ema:
             print(
-                "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
-                " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+                ' | Extracting EMA weights (usually better for inference)'
             )
             for key in keys:
                 if key.startswith("model.diffusion_model"):
@@ -330,8 +332,7 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
                     unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
         else:
             print(
-                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
-                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+                ' | Extracting only the non-EMA weights (usually better for fine-tuning)'
             )
 
     for key in keys:
@@ -784,17 +785,44 @@ def convert_open_clip_checkpoint(checkpoint):
 
     return text_model
 
-def convert_ckpt_to_diffuser(checkpoint_path:str,
-                             dump_path:str,
-                             original_config_file:str=None,
-                             num_in_channels:int=None,
-                             scheduler_type:str='pndm',
-                             pipeline_type:str=None,
-                             image_size:int=None,
-                             prediction_type:str=None,
-                             extract_ema:bool=False,
-                             upcast_attn:bool=False,
-                             ):
+def load_pipeline_from_original_stable_diffusion_ckpt(
+        checkpoint_path:str,
+        original_config_file:str=None,
+        num_in_channels:int=None,
+        scheduler_type:str='pndm',
+        pipeline_type:str=None,
+        image_size:int=None,
+        prediction_type:str=None,
+        extract_ema:bool=True,
+        upcast_attn:bool=False,
+        vae:AutoencoderKL=None
+)->StableDiffusionGeneratorPipeline:
+    '''
+    Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
+    config file.
+
+    Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the
+    global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is
+    recommended that you override the default values and/or supply an `original_config_file` wherever possible.
+
+    :param checkpoint_path: Path to the `.ckpt` or `.safetensors` file.
+    :param original_config_file: Path to the `.yaml` config file corresponding to the original architecture.
+        If `None`, will be automatically inferred by looking for a key that only exists in SD2.0 models.
+    :param image_size: The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and
+        Stable Diffusion v2 Base. Use 768 for Stable Diffusion v2.
+    :param prediction_type: The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion
+        v1.X and Stable Diffusion v2 Base. Use `'v-prediction'` for Stable Diffusion v2.
+    :param num_in_channels: The number of input channels. If `None`, the number of input channels will be
+        automatically inferred.
+    :param scheduler_type: Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler",
+        "euler-ancestral", "dpm", "ddim"]`.
+    :param pipeline_type: The pipeline type. `None` to automatically infer, or one of
+        `["FrozenOpenCLIPEmbedder", "FrozenCLIPEmbedder", "PaintByExample"]`.
+    :param extract_ema: Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract
+        the EMA weights or not. Defaults to `True`. EMA weights usually yield higher quality images for inference;
+        non-EMA weights are usually better for continued fine-tuning.
+    :param upcast_attn: Whether the attention computation should always be upcasted. This is necessary when
+        running Stable Diffusion 2.1.
+    :param vae: An `AutoencoderKL` to build into the pipeline in place of the VAE contained in the checkpoint.
+        If `None`, the checkpoint's own VAE is converted and used.
+    '''
     checkpoint = load_file(checkpoint_path) if Path(checkpoint_path).suffix == '.safetensors' else torch.load(checkpoint_path)
     cache_dir = global_cache_dir('hub')
 
@@ -803,7 +831,7 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
     if "global_step" in checkpoint:
         global_step = checkpoint["global_step"]
     else:
-        print("global_step key not found in model")
+        print(" | global_step key not found in model")
         global_step = None
 
     # sometimes there is a state_dict key and sometimes not
@@ -893,12 +921,16 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
 
     unet.load_state_dict(converted_unet_checkpoint)
 
-    # Convert the VAE model.
-    vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
-    converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
+    # Convert the VAE model, or use the one passed
+    if not vae:
+        print(f' | Using checkpoint model\'s original VAE')
+        vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+        converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
 
-    vae = AutoencoderKL(**vae_config)
-    vae.load_state_dict(converted_vae_checkpoint)
+        vae = AutoencoderKL(**vae_config)
+        vae.load_state_dict(converted_vae_checkpoint)
+    else:
+        print(f' | Using external VAE specified in config')
 
     # Convert the text model.
     model_type = pipeline_type
@@ -907,8 +939,11 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
 
     if model_type == "FrozenOpenCLIPEmbedder":
         text_model = convert_open_clip_checkpoint(checkpoint)
-        tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer",cache_dir=global_cache_dir('diffusers'))
-        pipe = StableDiffusionPipeline(
+        tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2",
+                                                  subfolder="tokenizer",
+                                                  cache_dir=global_cache_dir('diffusers')
+                                                  )
+        pipe = StableDiffusionGeneratorPipeline(
             vae=vae,
             text_encoder=text_model,
             tokenizer=tokenizer,
@@ -933,15 +968,14 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
     elif model_type in ['FrozenCLIPEmbedder','WeightedFrozenCLIPEmbedder']:
         text_model = convert_ldm_clip_checkpoint(checkpoint)
         tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14",cache_dir=cache_dir)
-        safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker",cache_dir=cache_dir)
         feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker",cache_dir=cache_dir)
-        pipe = StableDiffusionPipeline(
+        pipe = StableDiffusionGeneratorPipeline(
             vae=vae,
             text_encoder=text_model,
             tokenizer=tokenizer,
             unet=unet,
             scheduler=scheduler,
-            safety_checker=safety_checker,
+            safety_checker=None,
             feature_extractor=feature_extractor,
         )
     else:
@@ -950,7 +984,23 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
         tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased",cache_dir=cache_dir)
         pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
 
+    return pipe
+
+def convert_ckpt_to_diffuser(
+        checkpoint_path:Union[str,Path],
+        dump_path:Union[str,Path],
+        **kwargs,
+):
+    '''
+    Takes all the arguments of load_pipeline_from_original_stable_diffusion_ckpt(),
+    and in addition a path-like object indicating the location of the desired
+    diffusers model to be written.
+    '''
+    pipe = load_pipeline_from_original_stable_diffusion_ckpt(
+        checkpoint_path,
+        **kwargs
+    )
     pipe.save_pretrained(
         dump_path,
-        safe_serialization=1,
+        safe_serialization=is_safetensors_available(),
     )
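
As a rough usage sketch of the two entry points above (file paths here are illustrative, not part of the change): load_pipeline_from_original_stable_diffusion_ckpt() returns an in-memory StableDiffusionGeneratorPipeline, while convert_ckpt_to_diffuser() additionally writes the converted model to dump_path:

from ldm.invoke.ckpt_to_diffuser import (
    convert_ckpt_to_diffuser,
    load_pipeline_from_original_stable_diffusion_ckpt,
)

# In-memory conversion only; nothing is written to disk.
pipe = load_pipeline_from_original_stable_diffusion_ckpt(
    checkpoint_path='models/ldm/stable-diffusion-v1/my-model.ckpt',     # illustrative path
    original_config_file='configs/stable-diffusion/v1-inference.yaml',  # illustrative path
    extract_ema=True,
)

# Convert and write a diffusers-format copy in one step.
convert_ckpt_to_diffuser(
    'models/ldm/stable-diffusion-v1/my-model.ckpt',
    'models/converted/my-model',
    original_config_file='configs/stable-diffusion/v1-inference.yaml',
)
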
diff --git a/ldm/invoke/globals.py b/ldm/invoke/globals.py
index 4bc1d1ccf0..6bfa0ecd9d 100644
--- a/ldm/invoke/globals.py
+++ b/ldm/invoke/globals.py
@@ -51,6 +51,9 @@ Globals.disable_xformers = False
 # whether we are forcing full precision
 Globals.full_precision = False
 
+# whether we should convert ckpt files into diffusers models on the fly
+Globals.ckpt_convert = False
+
 def global_config_file()->Path:
     return Path(Globals.root, Globals.config_dir, Globals.models_file)
diff --git a/ldm/invoke/model_manager.py b/ldm/invoke/model_manager.py
index dbc690ec54..ea8bc8d83c 100644
--- a/ldm/invoke/model_manager.py
+++ b/ldm/invoke/model_manager.py
@@ -150,6 +150,10 @@ class ModelManager(object):
         '''
         Return true if this is a legacy (.ckpt) model
         '''
+        # if we are converting legacy files automatically, then
+        # there are no legacy ckpts!
+        if Globals.ckpt_convert:
+            return False
         info = self.model_info(model_name)
         if 'weights' in info and info['weights'].endswith(('.ckpt','.safetensors')):
             return True
@@ -340,6 +344,26 @@ class ModelManager(object):
             config = os.path.join(Globals.root,config)
         if not os.path.isabs(weights):
             weights = os.path.normpath(os.path.join(Globals.root,weights))
+
+        # if converting automatically to diffusers, then we do the conversion and return
+        # a diffusers pipeline
+        if Globals.ckpt_convert:
+            print(f'>> Converting legacy checkpoint {model_name} into a diffusers model...')
+            from ldm.invoke.ckpt_to_diffuser import load_pipeline_from_original_stable_diffusion_ckpt
+            vae = None
+            if vae_config := self._choose_diffusers_vae(model_name):
+                vae = self._load_vae(vae_config)
+            pipeline = load_pipeline_from_original_stable_diffusion_ckpt(
+                checkpoint_path = weights,
+                original_config_file = config,
+                vae = vae,
+            )
+            return (
+                pipeline.to(self.device).to(torch.float16 if self.precision == 'float16' else torch.float32),
+                width,
+                height,
+                'NOHASH'
+            )
+
         # scan model
         self.scan_model(model_name, weights)
@@ -484,7 +508,7 @@ class ModelManager(object):
         return pipeline, width, height, model_hash
 
     def model_name_or_path(self, model_name:Union[str,DictConfig]) -> str | Path:
-        if isinstance(model_name,DictConfig):
+        if isinstance(model_name,DictConfig) or isinstance(model_name,dict):
             mconfig = model_name
         elif model_name in self.config:
             mconfig = self.config[model_name]
@@ -664,6 +688,7 @@ class ModelManager(object):
                            diffusers_path:Path,
                            model_name=None,
                            model_description=None,
+                           vae= None,
                            commit_to_conf:Path=None,
     )->dict:
         '''
@@ -681,39 +706,24 @@ class ModelManager(object):
         model_description = model_description or 'Optimized version of {model_name}'
         print(f'>> Optimizing {model_name} (30-60s)')
         try:
-            verbosity =transformers.logging.get_verbosity()
-            transformers.logging.set_verbosity_error()
-            convert_ckpt_to_diffuser(ckpt_path, diffusers_path,extract_ema=True)
-            transformers.logging.set_verbosity(verbosity)
-            print(f'>> Success. Optimized model is now located at {str(diffusers_path)}')
-            print(f'>> Writing new config file entry for {model_name}')
+            # By passing the specified VAE to the conversion function, the autoencoder
+            # will be built into the model rather than tacked on afterward via the config file
+            vae_model = self._load_vae(vae) if vae else None
+            convert_ckpt_to_diffuser(
+                ckpt_path,
+                diffusers_path,
+                extract_ema = True,
+                vae = vae_model,
+            )
+            print(f' | Success. Optimized model is now located at {str(diffusers_path)}')
+            print(f' | Writing new config file entry for {model_name}')
             new_config = dict(
                 path=str(diffusers_path),
                 description=model_description,
                 format='diffusers',
             )
-
-            # HACK (LS): in the event that the original entry is using a custom ckpt VAE, we try to
-            # map that VAE onto a diffuser VAE using a hard-coded dictionary.
-            # I would prefer to do this differently: We load the ckpt model into memory, swap the
-            # VAE in memory, and then pass that to convert_ckpt_to_diffuser() so that the swapped
-            # VAE is built into the model. However, when I tried this I got obscure key errors.
-            if model_name in self.config and (vae_ckpt_path := self.model_info(model_name)['vae']):
-                vae_basename = Path(vae_ckpt_path).stem
-                diffusers_vae = None
-                if (diffusers_vae := VAE_TO_REPO_ID.get(vae_basename,None)):
-                    print(f'>> {vae_basename} VAE corresponds to known {diffusers_vae} diffusers version')
-                    new_config.update(
-                        vae = {'repo_id': diffusers_vae}
-                    )
-                else:
-                    print(f'** Custom VAE "{vae_basename}" found, but corresponding diffusers model unknown')
-                    print(f'** Using "stabilityai/sd-vae-ft-mse"; If this isn\'t right, please edit the model config')
-                    new_config.update(
-                        vae = {'repo_id': 'stabilityai/sd-vae-ft-mse'}
-                    )
-
-            self.del_model(model_name)
+            if model_name in self.config:
+                self.del_model(model_name)
             self.add_model(model_name, new_config, True)
             if commit_to_conf:
                 self.commit(commit_to_conf)
@@ -742,6 +752,27 @@ class ModelManager(object):
 
         return search_folder, found_models
 
+    def _choose_diffusers_vae(self, model_name:str, vae:str=None)->Union[dict,str]:
+
+        # In the event that the original entry is using a custom ckpt VAE, we try to
+        # map that VAE onto a diffuser VAE using a hard-coded dictionary.
+        # I would prefer to do this differently: We load the ckpt model into memory, swap the
+        # VAE in memory, and then pass that to convert_ckpt_to_diffuser() so that the swapped
+        # VAE is built into the model. However, when I tried this I got obscure key errors.
+        if vae:
+            return vae
+        if model_name in self.config and (vae_ckpt_path := self.model_info(model_name).get('vae',None)):
+            vae_basename = Path(vae_ckpt_path).stem
+            diffusers_vae = None
+            if (diffusers_vae := VAE_TO_REPO_ID.get(vae_basename,None)):
+                print(f'>> {vae_basename} VAE corresponds to known {diffusers_vae} diffusers version')
+                vae = {'repo_id': diffusers_vae}
+            else:
+                print(f'** Custom VAE "{vae_basename}" found, but corresponding diffusers model unknown')
+                print('** Using "stabilityai/sd-vae-ft-mse"; if this isn\'t right, please edit the model config')
+                vae = {'repo_id': 'stabilityai/sd-vae-ft-mse'}
+        return vae
+
     def _make_cache_room(self) -> None:
         num_loaded_models = len(self.models)
         if num_loaded_models >= self.max_loaded_models:
@@ -976,7 +1007,7 @@ class ModelManager(object):
             f.write(hash)
         return hash
 
-    def _load_vae(self, vae_config):
+    def _load_vae(self, vae_config)->AutoencoderKL:
         vae_args = {}
         name_or_path = self.model_name_or_path(vae_config)
         using_fp16 = self.precision == 'float16'
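
The optimize_model() flow in CLI.py above reduces to a single ModelManager call; a sketch, assuming an already-constructed ModelManager instance named mgr (file paths, the model name, and the config path are all illustrative):

from pathlib import Path

new_config = mgr.convert_and_import(
    Path('models/ldm/stable-diffusion-v1/my-model.ckpt'),  # ckpt to convert
    Path('models/converted/my-model'),                     # destination diffusers folder
    model_name='my-model',
    model_description='Diffusers version of my-model',
    vae=dict(repo_id='stabilityai/sd-vae-ft-mse'),         # built into the converted pipeline
    commit_to_conf='configs/models.yaml',
)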