From fc3378bb74466ef7ca169c371d3bc0c360e2ddd2 Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Thu, 2 Feb 2023 15:15:44 -0500
Subject: [PATCH] Load legacy ckpt files as diffusers models (#2468)

* refactor ckpt_to_diffuser to allow converted pipeline to remain in memory

- This idea was introduced by Damian
- Note that although I attempted to use the updated HuggingFace module
  pipelines/stable_diffusion/convert_from_ckpt.py, it was unable to convert
  safetensors files for reasons I didn't dig into.
- Default is to extract EMA weights.

* add --ckpt_convert option to load legacy ckpt files as diffusers models

- not quite working - I'm getting artifacts and glitches in the converted
  diffuser models
- leave as draft for time being

* do not include safety checker in converted files

* add ability to control which vae is used

API now allows the caller to pass an external VAE model to the checkpoint
conversion process. In this way, if an external VAE is specified in the
checkpoint's config stanza, this VAE will be used when constructing the
diffusers model.

Tested with both regular and inpainting 1.X models. Not tested with
SD 2.X models!

---------

Co-authored-by: Jonathan <34005131+JPPhoto@users.noreply.github.com>
Co-authored-by: Damian Stewart
---
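
For reference, a minimal usage sketch of the conversion API introduced below.
The two function names come from this patch; the checkpoint path, config path,
output directory, and the choice of "stabilityai/sd-vae-ft-mse" as an external
VAE are illustrative only, and the calls assume a configured InvokeAI runtime
(Globals.root set, HuggingFace cache reachable):

    from diffusers import AutoencoderKL
    from ldm.invoke.ckpt_to_diffuser import (
        convert_ckpt_to_diffuser,
        load_pipeline_from_original_stable_diffusion_ckpt,
    )

    # Optional external VAE to build into the converted model in place of the
    # checkpoint's own autoencoder (repo id is the one used elsewhere in this patch).
    vae = AutoencoderKL.from_pretrained('stabilityai/sd-vae-ft-mse')

    # Keep the converted pipeline in memory (nothing is written to disk)...
    pipe = load_pipeline_from_original_stable_diffusion_ckpt(
        checkpoint_path='models/ldm/stable-diffusion-v1/my-model.ckpt',    # illustrative path
        original_config_file='configs/stable-diffusion/v1-inference.yaml', # illustrative path
        extract_ema=True,
        vae=vae,
    )

    # ...or write it out as a diffusers model directory.
    convert_ckpt_to_diffuser(
        'models/ldm/stable-diffusion-v1/my-model.ckpt',
        'models/converted/my-model',
        original_config_file='configs/stable-diffusion/v1-inference.yaml',
        extract_ema=True,
        vae=vae,
    )
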
 ldm/invoke/CLI.py              |   9 ++-
 ldm/invoke/args.py             |   7 +++
 ldm/invoke/ckpt_to_diffuser.py | 108 ++++++++++++++++++++++++---------
 ldm/invoke/globals.py          |   3 +
 ldm/invoke/model_manager.py    |  91 ++++++++++++++++++---------
 5 files changed, 158 insertions(+), 60 deletions(-)

diff --git a/ldm/invoke/CLI.py b/ldm/invoke/CLI.py
index 96f8094643..e9e0c3c0f4 100644
--- a/ldm/invoke/CLI.py
+++ b/ldm/invoke/CLI.py
@@ -73,11 +73,13 @@ def main():
         print('--max_loaded_models must be >= 1; using 1')
         args.max_loaded_models = 1
 
-    # alert - setting a global here
+    # alert - setting a few globals here
     Globals.try_patchmatch = args.patchmatch
     Globals.always_use_cpu = args.always_use_cpu
     Globals.internet_available = args.internet_available and check_internet()
     Globals.disable_xformers = not args.xformers
+    Globals.ckpt_convert = args.ckpt_convert
+
     print(f'>> Internet connectivity is {Globals.internet_available}')
 
     if not args.conf:
@@ -746,11 +748,16 @@ def optimize_model(model_name_or_path:str, gen, opt, completer):
         print(f'** {model_name_or_path} is already optimized. Will not overwrite. If this is an error, please remove the directory {diffuser_path} and try again.')
         return
 
+    vae = None
+    if input('Replace this model\'s VAE with "stabilityai/sd-vae-ft-mse"? [n] ').strip() in ('y','Y'):
+        vae = dict(repo_id='stabilityai/sd-vae-ft-mse')
+
     new_config = gen.model_manager.convert_and_import(
         ckpt_path,
         diffuser_path,
         model_name=model_name,
         model_description=model_description,
+        vae = vae,
         commit_to_conf=opt.conf,
     )
     if not new_config:
diff --git a/ldm/invoke/args.py b/ldm/invoke/args.py
index c918e4fba7..3904d2f573 100644
--- a/ldm/invoke/args.py
+++ b/ldm/invoke/args.py
@@ -503,6 +503,13 @@ class Args(object):
             help=f'Set model precision. Defaults to auto selected based on device. Options: {", ".join(PRECISION_CHOICES)}',
             default='auto',
         )
+        model_group.add_argument(
+            '--ckpt_convert',
+            action=argparse.BooleanOptionalAction,
+            dest='ckpt_convert',
+            default=False,
+            help='Load legacy ckpt files as diffusers. Pass --no-ckpt-convert to inhibit this behavior',
+        )
         model_group.add_argument(
             '--internet',
             action=argparse.BooleanOptionalAction,
diff --git a/ldm/invoke/ckpt_to_diffuser.py b/ldm/invoke/ckpt_to_diffuser.py
index 9b1735f831..fe56051aa3 100644
--- a/ldm/invoke/ckpt_to_diffuser.py
+++ b/ldm/invoke/ckpt_to_diffuser.py
@@ -23,6 +23,7 @@ import torch
 from pathlib import Path
 from ldm.invoke.globals import Globals, global_cache_dir
 from safetensors.torch import load_file
+from typing import Union
 
 try:
     from omegaconf import OmegaConf
@@ -46,9 +47,11 @@ from diffusers import (
 )
 from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
 from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder, PaintByExamplePipeline
-from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+from diffusers.utils import is_safetensors_available
 from transformers import AutoFeatureExtractor, BertTokenizerFast, CLIPTextModel, CLIPTokenizer, CLIPVisionConfig
 
+from ldm.invoke.generator.diffusers_pipeline import StableDiffusionGeneratorPipeline
+
 def shave_segments(path, n_shave_prefix_segments=1):
     """
     Removes segments. Positive values shave the first segments, negative shave the last segments.
@@ -318,11 +321,10 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
     unet_key = "model.diffusion_model."
     # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
     if sum(k.startswith("model_ema") for k in keys) > 100:
-        print(f"Checkpoint {path} has both EMA and non-EMA weights.")
+        print(f" | Checkpoint {path} has both EMA and non-EMA weights.")
         if extract_ema:
             print(
-                "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
-                " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+                ' | Extracting EMA weights (usually better for inference)'
             )
             for key in keys:
                 if key.startswith("model.diffusion_model"):
@@ -330,8 +332,7 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
                 unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
         else:
             print(
-                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
-                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+                ' | Extracting only the non-EMA weights (usually better for fine-tuning)'
             )
 
             for key in keys:
@@ -784,17 +785,44 @@ def convert_open_clip_checkpoint(checkpoint):
 
     return text_model
 
-def convert_ckpt_to_diffuser(checkpoint_path:str,
-                             dump_path:str,
-                             original_config_file:str=None,
-                             num_in_channels:int=None,
-                             scheduler_type:str='pndm',
-                             pipeline_type:str=None,
-                             image_size:int=None,
-                             prediction_type:str=None,
-                             extract_ema:bool=False,
-                             upcast_attn:bool=False,
-                             ):
+def load_pipeline_from_original_stable_diffusion_ckpt(
+        checkpoint_path:str,
+        original_config_file:str=None,
+        num_in_channels:int=None,
+        scheduler_type:str='pndm',
+        pipeline_type:str=None,
+        image_size:int=None,
+        prediction_type:str=None,
+        extract_ema:bool=True,
+        upcast_attn:bool=False,
+        vae:AutoencoderKL=None
+)->StableDiffusionGeneratorPipeline:
+    '''
+    Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
+    config file.
+
+    Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the
+    global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is
+    recommended that you override the default values and/or supply an `original_config_file` wherever possible.
+
+    :param checkpoint_path: Path to `.ckpt` file.
+    :param original_config_file: Path to `.yaml` config file corresponding to the original architecture.
+        If `None`, will be automatically inferred by looking for a key that only exists in SD2.0 models.
+    :param image_size: The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and
+        Stable Diffusion v2 Base. Use 768 for Stable Diffusion v2.
+    :param prediction_type: The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion
+        v1.X and Stable Diffusion v2 Base. Use `'v-prediction'` for Stable Diffusion v2.
+    :param num_in_channels: The number of input channels. If `None`, the number of input channels will be
+        automatically inferred.
+    :param scheduler_type: Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler",
+        "euler-ancestral", "dpm", "ddim"]`.
+    :param pipeline_type: The pipeline type. `None` to automatically infer, or one of
+        `["FrozenOpenCLIPEmbedder", "FrozenCLIPEmbedder", "PaintByExample"]`.
+    :param extract_ema: Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract
+        the EMA weights or not. Defaults to `True`. EMA weights usually yield higher quality images for
+        inference; non-EMA weights are usually better to continue fine-tuning.
+    :param upcast_attn: Whether the attention computation should always be upcasted. This is necessary when
+        running stable diffusion 2.1.
+    :param vae: A diffusers `AutoencoderKL` to build into the converted pipeline in place of the VAE contained
+        in the checkpoint (optional).
+    '''
     checkpoint = load_file(checkpoint_path) if Path(checkpoint_path).suffix == '.safetensors' else torch.load(checkpoint_path)
     cache_dir = global_cache_dir('hub')
@@ -803,7 +831,7 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
     if "global_step" in checkpoint:
         global_step = checkpoint["global_step"]
     else:
-        print("global_step key not found in model")
+        print(" | global_step key not found in model")
         global_step = None
 
     # sometimes there is a state_dict key and sometimes not
@@ -893,12 +921,16 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
 
     unet.load_state_dict(converted_unet_checkpoint)
 
-    # Convert the VAE model.
-    vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
-    converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
+    # Convert the VAE model, or use the one passed
+    if not vae:
+        print(f' | Using checkpoint model\'s original VAE')
+        vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+        converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
 
-    vae = AutoencoderKL(**vae_config)
-    vae.load_state_dict(converted_vae_checkpoint)
+        vae = AutoencoderKL(**vae_config)
+        vae.load_state_dict(converted_vae_checkpoint)
+    else:
+        print(f' | Using external VAE specified in config')
 
     # Convert the text model.
     model_type = pipeline_type
@@ -907,8 +939,11 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
 
     if model_type == "FrozenOpenCLIPEmbedder":
         text_model = convert_open_clip_checkpoint(checkpoint)
-        tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer",cache_dir=global_cache_dir('diffusers'))
-        pipe = StableDiffusionPipeline(
+        tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2",
+                                                  subfolder="tokenizer",
+                                                  cache_dir=global_cache_dir('diffusers')
+                                                  )
+        pipe = StableDiffusionGeneratorPipeline(
             vae=vae,
             text_encoder=text_model,
             tokenizer=tokenizer,
@@ -933,15 +968,14 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
     elif model_type in ['FrozenCLIPEmbedder','WeightedFrozenCLIPEmbedder']:
         text_model = convert_ldm_clip_checkpoint(checkpoint)
         tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14",cache_dir=cache_dir)
-        safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker",cache_dir=cache_dir)
         feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker",cache_dir=cache_dir)
-        pipe = StableDiffusionPipeline(
+        pipe = StableDiffusionGeneratorPipeline(
             vae=vae,
             text_encoder=text_model,
             tokenizer=tokenizer,
             unet=unet,
             scheduler=scheduler,
-            safety_checker=safety_checker,
+            safety_checker=None,
             feature_extractor=feature_extractor,
         )
     else:
@@ -950,7 +984,23 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
         tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased",cache_dir=cache_dir)
         pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
 
+    return pipe
+
+def convert_ckpt_to_diffuser(
+        checkpoint_path:Union[str,Path],
+        dump_path:Union[str,Path],
+        **kwargs,
+):
+    '''
+    Takes all the arguments of load_pipeline_from_original_stable_diffusion_ckpt(),
+    and in addition a path-like object indicating the location of the desired diffusers
+    model to be written.
+    '''
+    pipe = load_pipeline_from_original_stable_diffusion_ckpt(
+        checkpoint_path,
+        **kwargs
+    )
     pipe.save_pretrained(
         dump_path,
-        safe_serialization=1,
+        safe_serialization=is_safetensors_available(),
     )
diff --git a/ldm/invoke/globals.py b/ldm/invoke/globals.py
index 4bc1d1ccf0..6bfa0ecd9d 100644
--- a/ldm/invoke/globals.py
+++ b/ldm/invoke/globals.py
@@ -51,6 +51,9 @@ Globals.disable_xformers = False
 # whether we are forcing full precision
 Globals.full_precision = False
 
+# whether we should convert ckpt files into diffusers models on the fly
+Globals.ckpt_convert = False
+
 def global_config_file()->Path:
     return Path(Globals.root, Globals.config_dir, Globals.models_file)
 
diff --git a/ldm/invoke/model_manager.py b/ldm/invoke/model_manager.py
index dbc690ec54..ea8bc8d83c 100644
--- a/ldm/invoke/model_manager.py
+++ b/ldm/invoke/model_manager.py
@@ -150,6 +150,10 @@ class ModelManager(object):
         '''
         Return true if this is a legacy (.ckpt) model
         '''
+        # if we are converting legacy files automatically, then
+        # there are no legacy ckpts!
+        if Globals.ckpt_convert:
+            return False
         info = self.model_info(model_name)
         if 'weights' in info and info['weights'].endswith(('.ckpt','.safetensors')):
             return True
@@ -340,6 +344,26 @@ class ModelManager(object):
             config = os.path.join(Globals.root,config)
         if not os.path.isabs(weights):
             weights = os.path.normpath(os.path.join(Globals.root,weights))
+
+        # if converting automatically to diffusers, then we do the conversion and return
+        # a diffusers pipeline
+        if Globals.ckpt_convert:
+            print(f'>> Converting legacy checkpoint {model_name} into a diffusers model...')
+            from ldm.invoke.ckpt_to_diffuser import load_pipeline_from_original_stable_diffusion_ckpt
+            vae = None
+            if vae_config := self._choose_diffusers_vae(model_name):
+                vae = self._load_vae(vae_config)
+            pipeline = load_pipeline_from_original_stable_diffusion_ckpt(
+                checkpoint_path = weights,
+                original_config_file = config,
+                vae = vae,
+            )
+            return (
+                pipeline.to(self.device).to(torch.float16 if self.precision == 'float16' else torch.float32),
+                width,
+                height,
+                'NOHASH'
+            )
+
         # scan model
         self.scan_model(model_name, weights)
 
@@ -484,7 +508,7 @@ class ModelManager(object):
         return pipeline, width, height, model_hash
 
     def model_name_or_path(self, model_name:Union[str,DictConfig]) -> str | Path:
-        if isinstance(model_name,DictConfig):
+        if isinstance(model_name,DictConfig) or isinstance(model_name,dict):
             mconfig = model_name
         elif model_name in self.config:
             mconfig = self.config[model_name]
@@ -664,6 +688,7 @@ class ModelManager(object):
                            diffusers_path:Path,
                            model_name=None,
                            model_description=None,
+                           vae= None,
                            commit_to_conf:Path=None,
     )->dict:
         '''
@@ -681,39 +706,24 @@ class ModelManager(object):
         model_description = model_description or 'Optimized version of {model_name}'
         print(f'>> Optimizing {model_name} (30-60s)')
         try:
-            verbosity =transformers.logging.get_verbosity()
-            transformers.logging.set_verbosity_error()
-            convert_ckpt_to_diffuser(ckpt_path, diffusers_path,extract_ema=True)
-            transformers.logging.set_verbosity(verbosity)
-            print(f'>> Success. Optimized model is now located at {str(diffusers_path)}')
-            print(f'>> Writing new config file entry for {model_name}')
+            # By passing the specified VAE to the conversion function, the autoencoder
+            # will be built into the model rather than tacked on afterward via the config file
+            vae_model = self._load_vae(vae) if vae else None
+            convert_ckpt_to_diffuser(
+                ckpt_path,
+                diffusers_path,
+                extract_ema = True,
+                vae = vae_model,
+            )
+            print(f' | Success. Optimized model is now located at {str(diffusers_path)}')
+            print(f' | Writing new config file entry for {model_name}')
             new_config = dict(
                 path=str(diffusers_path),
                 description=model_description,
                 format='diffusers',
             )
-
-            # HACK (LS): in the event that the original entry is using a custom ckpt VAE, we try to
-            # map that VAE onto a diffuser VAE using a hard-coded dictionary.
-            # I would prefer to do this differently: We load the ckpt model into memory, swap the
-            # VAE in memory, and then pass that to convert_ckpt_to_diffuser() so that the swapped
-            # VAE is built into the model. However, when I tried this I got obscure key errors.
-            if model_name in self.config and (vae_ckpt_path := self.model_info(model_name)['vae']):
-                vae_basename = Path(vae_ckpt_path).stem
-                diffusers_vae = None
-                if (diffusers_vae := VAE_TO_REPO_ID.get(vae_basename,None)):
-                    print(f'>> {vae_basename} VAE corresponds to known {diffusers_vae} diffusers version')
-                    new_config.update(
-                        vae = {'repo_id': diffusers_vae}
-                    )
-                else:
-                    print(f'** Custom VAE "{vae_basename}" found, but corresponding diffusers model unknown')
-                    print(f'** Using "stabilityai/sd-vae-ft-mse"; If this isn\'t right, please edit the model config')
-                    new_config.update(
-                        vae = {'repo_id': 'stabilityai/sd-vae-ft-mse'}
-                    )
-
-            self.del_model(model_name)
+            if model_name in self.config:
+                self.del_model(model_name)
             self.add_model(model_name, new_config, True)
             if commit_to_conf:
                 self.commit(commit_to_conf)
@@ -742,6 +752,27 @@ class ModelManager(object):
 
         return search_folder, found_models
 
+    def _choose_diffusers_vae(self, model_name:str, vae:str=None)->Union[dict,str]:
+
+        # In the event that the original entry is using a custom ckpt VAE, we try to
+        # map that VAE onto a diffuser VAE using a hard-coded dictionary.
+        # I would prefer to do this differently: We load the ckpt model into memory, swap the
+        # VAE in memory, and then pass that to convert_ckpt_to_diffuser() so that the swapped
+        # VAE is built into the model. However, when I tried this I got obscure key errors.
+        if vae:
+            return vae
+        if model_name in self.config and (vae_ckpt_path := self.model_info(model_name).get('vae',None)):
+            vae_basename = Path(vae_ckpt_path).stem
+            diffusers_vae = None
+            if (diffusers_vae := VAE_TO_REPO_ID.get(vae_basename,None)):
+                print(f'>> {vae_basename} VAE corresponds to known {diffusers_vae} diffusers version')
+                vae = {'repo_id': diffusers_vae}
+            else:
+                print(f'** Custom VAE "{vae_basename}" found, but corresponding diffusers model unknown')
+                print('** Using "stabilityai/sd-vae-ft-mse"; If this isn\'t right, please edit the model config')
+                vae = {'repo_id': 'stabilityai/sd-vae-ft-mse'}
+        return vae
+
     def _make_cache_room(self) -> None:
         num_loaded_models = len(self.models)
         if num_loaded_models >= self.max_loaded_models:
@@ -976,7 +1007,7 @@ class ModelManager(object):
             f.write(hash)
             return hash
 
-    def _load_vae(self, vae_config):
+    def _load_vae(self, vae_config)->AutoencoderKL:
         vae_args = {}
         name_or_path = self.model_name_or_path(vae_config)
         using_fp16 = self.precision == 'float16'
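
A further sketch, for reference, of how the extended convert_and_import() entry
point is driven. This mirrors the !optimize path in CLI.py above; `gen` stands
for the running Generate instance as in that code, and the model name and paths
are illustrative only:

    from pathlib import Path

    # Import a legacy checkpoint as a diffusers model, building the external VAE
    # directly into it (same default repo id the CLI prompt above offers).
    new_config = gen.model_manager.convert_and_import(
        Path('models/ldm/stable-diffusion-v1/my-model.ckpt'),  # ckpt to convert (illustrative)
        Path('models/converted/my-model'),                     # destination diffusers directory
        model_name='my-model',
        model_description='Diffusers version of my-model',
        vae=dict(repo_id='stabilityai/sd-vae-ft-mse'),
        commit_to_conf=Path('configs/models.yaml'),            # illustrative config path
    )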