Merge branch 'main' into dev/installer

commit 2202288eb2
Lincoln Stein, 2023-02-02 15:17:40 -05:00 (committed by GitHub)
5 changed files with 158 additions and 60 deletions

View File

@@ -44,11 +44,13 @@ def main():
print('--max_loaded_models must be >= 1; using 1')
args.max_loaded_models = 1
# alert - setting a global here
# alert - setting a few globals here
Globals.try_patchmatch = args.patchmatch
Globals.always_use_cpu = args.always_use_cpu
Globals.internet_available = args.internet_available and check_internet()
Globals.disable_xformers = not args.xformers
Globals.ckpt_convert = args.ckpt_convert
print(f'>> Internet connectivity is {Globals.internet_available}')
if not args.conf:
@@ -717,11 +719,16 @@ def optimize_model(model_name_or_path:str, gen, opt, completer):
print(f'** {model_name_or_path} is already optimized. Will not overwrite. If this is an error, please remove the directory {diffuser_path} and try again.')
return
vae = None
if input('Replace this model\'s VAE with "stabilityai/sd-vae-ft-mse"? [n] ').strip() in ('y','Y'):
vae = dict(repo_id='stabilityai/sd-vae-ft-mse')
new_config = gen.model_manager.convert_and_import(
ckpt_path,
diffuser_path,
model_name=model_name,
model_description=model_description,
vae = vae,
commit_to_conf=opt.conf,
)
if not new_config:

View File

@@ -503,6 +503,13 @@ class Args(object):
help=f'Set model precision. Defaults to auto selected based on device. Options: {", ".join(PRECISION_CHOICES)}',
default='auto',
)
model_group.add_argument(
'--ckpt_convert',
action=argparse.BooleanOptionalAction,
dest='ckpt_convert',
default=False,
help='Load legacy ckpt files as diffusers. Pass --no-ckpt_convert to inhibit this behavior',
)
model_group.add_argument(
'--internet',
action=argparse.BooleanOptionalAction,
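
Because --ckpt_convert is declared with argparse.BooleanOptionalAction (available in Python 3.9+), argparse derives the negative form from the option string itself, so the single declaration above accepts both --ckpt_convert and --no-ckpt_convert. A minimal stand-alone sketch of that behavior (the parser below is illustrative, not InvokeAI's own):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--ckpt_convert',
    action=argparse.BooleanOptionalAction,
    default=False,
    help='Load legacy ckpt files as diffusers',
)

print(parser.parse_args([]))                     # Namespace(ckpt_convert=False)
print(parser.parse_args(['--ckpt_convert']))     # Namespace(ckpt_convert=True)
print(parser.parse_args(['--no-ckpt_convert']))  # Namespace(ckpt_convert=False)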

View File

@@ -23,6 +23,7 @@ import torch
from pathlib import Path
from ldm.invoke.globals import Globals, global_cache_dir
from safetensors.torch import load_file
from typing import Union
try:
from omegaconf import OmegaConf
@@ -46,9 +47,11 @@ from diffusers import (
)
from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder, PaintByExamplePipeline
from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
from diffusers.utils import is_safetensors_available
from transformers import AutoFeatureExtractor, BertTokenizerFast, CLIPTextModel, CLIPTokenizer, CLIPVisionConfig
from ldm.invoke.generator.diffusers_pipeline import StableDiffusionGeneratorPipeline
def shave_segments(path, n_shave_prefix_segments=1):
"""
Removes segments. Positive values shave the first segments, negative shave the last segments.
@@ -318,11 +321,10 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
unet_key = "model.diffusion_model."
# at least 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
if sum(k.startswith("model_ema") for k in keys) > 100:
print(f"Checkpoint {path} has both EMA and non-EMA weights.")
print(f" | Checkpoint {path} has both EMA and non-EMA weights.")
if extract_ema:
print(
"In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
" weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
' | Extracting EMA weights (usually better for inference)'
)
for key in keys:
if key.startswith("model.diffusion_model"):
@@ -330,8 +332,7 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False
unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
else:
print(
"In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
" weights (usually better for inference), please make sure to add the `--extract_ema` flag."
' | Extracting only the non-EMA weights (usually better for fine-tuning)'
)
for key in keys:
@@ -784,17 +785,44 @@ def convert_open_clip_checkpoint(checkpoint):
return text_model
def convert_ckpt_to_diffuser(checkpoint_path:str,
dump_path:str,
original_config_file:str=None,
num_in_channels:int=None,
scheduler_type:str='pndm',
pipeline_type:str=None,
image_size:int=None,
prediction_type:str=None,
extract_ema:bool=False,
upcast_attn:bool=False,
):
def load_pipeline_from_original_stable_diffusion_ckpt(
checkpoint_path:str,
original_config_file:str=None,
num_in_channels:int=None,
scheduler_type:str='pndm',
pipeline_type:str=None,
image_size:int=None,
prediction_type:str=None,
extract_ema:bool=True,
upcast_attn:bool=False,
vae:AutoencoderKL=None
)->StableDiffusionGeneratorPipeline:
'''
Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
config file.
Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the
global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is
recommended that you override the default values and/or supply an `original_config_file` wherever possible.
:param checkpoint_path: Path to `.ckpt` file.
:param original_config_file: Path to `.yaml` config file corresponding to the original architecture.
If `None`, will be automatically inferred by looking for a key that only exists in SD2.0 models.
:param image_size: The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2
Base. Use 768 for Stable Diffusion v2.
:param prediction_type: The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion
v1.X and Stable Diffusion v2 Base. Use `'v-prediction'` for Stable Diffusion v2.
:param num_in_channels: The number of input channels. If `None` number of input channels will be automatically
inferred.
:param scheduler_type: Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler",
"euler-ancestral", "dpm", "ddim"]`.
:param model_type: The pipeline type. `None` to automatically infer, or one of
`["FrozenOpenCLIPEmbedder", "FrozenCLIPEmbedder", "PaintByExample"]`.
:param extract_ema: Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract
the EMA weights or not. Defaults to `True`. EMA weights usually yield higher quality images for inference;
non-EMA weights are usually better for continued fine-tuning.
:param vae: An externally loaded AutoencoderKL to build into the pipeline in place of the checkpoint's own VAE.
If `None`, the VAE is converted from the checkpoint.
:param upcast_attention: Whether the attention computation should always be upcasted. This is necessary when
running stable diffusion 2.1.
'''
checkpoint = load_file(checkpoint_path) if Path(checkpoint_path).suffix == '.safetensors' else torch.load(checkpoint_path)
cache_dir = global_cache_dir('hub')
@@ -803,7 +831,7 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
if "global_step" in checkpoint:
global_step = checkpoint["global_step"]
else:
print("global_step key not found in model")
print(" | global_step key not found in model")
global_step = None
# sometimes there is a state_dict key and sometimes not
@@ -893,12 +921,16 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
unet.load_state_dict(converted_unet_checkpoint)
# Convert the VAE model.
vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
# Convert the VAE model, or use the one passed
if not vae:
print(f' | Using checkpoint model\'s original VAE')
vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
vae = AutoencoderKL(**vae_config)
vae.load_state_dict(converted_vae_checkpoint)
vae = AutoencoderKL(**vae_config)
vae.load_state_dict(converted_vae_checkpoint)
else:
print(f' | Using external VAE specified in config')
# Convert the text model.
model_type = pipeline_type
@@ -907,8 +939,11 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
if model_type == "FrozenOpenCLIPEmbedder":
text_model = convert_open_clip_checkpoint(checkpoint)
tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer",cache_dir=global_cache_dir('diffusers'))
pipe = StableDiffusionPipeline(
tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2",
subfolder="tokenizer",
cache_dir=global_cache_dir('diffusers')
)
pipe = StableDiffusionGeneratorPipeline(
vae=vae,
text_encoder=text_model,
tokenizer=tokenizer,
@@ -933,15 +968,14 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
elif model_type in ['FrozenCLIPEmbedder','WeightedFrozenCLIPEmbedder']:
text_model = convert_ldm_clip_checkpoint(checkpoint)
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14",cache_dir=cache_dir)
safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker",cache_dir=cache_dir)
feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker",cache_dir=cache_dir)
pipe = StableDiffusionPipeline(
pipe = StableDiffusionGeneratorPipeline(
vae=vae,
text_encoder=text_model,
tokenizer=tokenizer,
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
safety_checker=None,
feature_extractor=feature_extractor,
)
else:
@@ -950,7 +984,23 @@ def convert_ckpt_to_diffuser(checkpoint_path:str,
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased",cache_dir=cache_dir)
pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
return pipe
def convert_ckpt_to_diffuser(
checkpoint_path:Union[str,Path],
dump_path:Union[str,Path],
**kwargs,
):
'''
Takes all the arguments of load_pipeline_from_original_stable_diffusion_ckpt(),
plus a path-like object (dump_path) indicating where the converted diffusers
model should be written.
'''
pipe = load_pipeline_from_original_stable_diffusion_ckpt(
checkpoint_path,
**kwargs
)
pipe.save_pretrained(
dump_path,
safe_serialization=1,
safe_serialization=is_safetensors_available(),
)
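
The conversion code is now split into two layers: load_pipeline_from_original_stable_diffusion_ckpt() builds the pipeline in memory (optionally with an externally supplied VAE), while convert_ckpt_to_diffuser() wraps it and writes the result to disk. A rough usage sketch under those assumptions; the file paths and the v1-inference.yaml name below are placeholders, not values from this commit:

from ldm.invoke.ckpt_to_diffuser import (
    convert_ckpt_to_diffuser,
    load_pipeline_from_original_stable_diffusion_ckpt,
)

# Load a legacy checkpoint straight into memory as a diffusers-style pipeline,
# converting the VAE embedded in the checkpoint (vae=None is the default).
pipe = load_pipeline_from_original_stable_diffusion_ckpt(
    checkpoint_path='models/my-model.ckpt',            # placeholder path
    original_config_file='configs/v1-inference.yaml',  # placeholder path
    extract_ema=True,
)

# Or convert and write the diffusers model to disk in one call;
# convert_ckpt_to_diffuser() forwards its keyword arguments to the loader
# and then calls save_pretrained() on the dump_path.
convert_ckpt_to_diffuser(
    'models/my-model.ckpt',       # placeholder checkpoint path
    'models/my-model-diffusers',  # placeholder output directory
    extract_ema=True,
)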

View File

@@ -51,6 +51,9 @@ Globals.disable_xformers = False
# whether we are forcing full precision
Globals.full_precision = False
# whether we should convert ckpt files into diffusers models on the fly
Globals.ckpt_convert = False
def global_config_file()->Path:
return Path(Globals.root, Globals.config_dir, Globals.models_file)

View File

@@ -150,6 +150,10 @@ class ModelManager(object):
'''
Return true if this is a legacy (.ckpt) model
'''
# if we are converting legacy files automatically, then
# there are no legacy ckpts!
if Globals.ckpt_convert:
return False
info = self.model_info(model_name)
if 'weights' in info and info['weights'].endswith(('.ckpt','.safetensors')):
return True
@@ -340,6 +344,26 @@ class ModelManager(object):
config = os.path.join(Globals.root,config)
if not os.path.isabs(weights):
weights = os.path.normpath(os.path.join(Globals.root,weights))
# if converting automatically to diffusers, then we do the conversion and return
# a diffusers pipeline
if Globals.ckpt_convert:
print(f'>> Converting legacy checkpoint {model_name} into a diffusers model...')
from ldm.invoke.ckpt_to_diffuser import load_pipeline_from_original_stable_diffusion_ckpt
if vae_config := self._choose_diffusers_vae(model_name):
vae = self._load_vae(vae_config)
pipeline = load_pipeline_from_original_stable_diffusion_ckpt(
checkpoint_path = weights,
original_config_file = config,
vae = vae,
)
return (
pipeline.to(self.device).to(torch.float16 if self.precision == 'float16' else torch.float32),
width,
height,
'NOHASH'
)
# scan model
self.scan_model(model_name, weights)
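
Taken together with the Args and Globals changes, the new flag short-circuits the legacy code path entirely. A hedged sketch of the effect (the model name is a placeholder, and is_legacy() is the presumed name of the method whose body appears in the first hunk of this file, since its def line is not shown):

from ldm.invoke.globals import Globals

# Normally set from the --ckpt_convert CLI argument in main().
Globals.ckpt_convert = True

# With the flag on, ModelManager reports that there are no legacy models:
#   manager.is_legacy('stable-diffusion-1.5')  # -> False  (placeholder name)
# and any entry whose weights point at a .ckpt/.safetensors file is converted
# in memory at load time via load_pipeline_from_original_stable_diffusion_ckpt(),
# with the VAE selected by _choose_diffusers_vae() built into the pipeline.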
@@ -484,7 +508,7 @@ class ModelManager(object):
return pipeline, width, height, model_hash
def model_name_or_path(self, model_name:Union[str,DictConfig]) -> str | Path:
if isinstance(model_name,DictConfig):
if isinstance(model_name,DictConfig) or isinstance(model_name,dict):
mconfig = model_name
elif model_name in self.config:
mconfig = self.config[model_name]
@@ -664,6 +688,7 @@ class ModelManager(object):
diffusers_path:Path,
model_name=None,
model_description=None,
vae= None,
commit_to_conf:Path=None,
)->dict:
'''
@@ -681,39 +706,24 @@ class ModelManager(object):
model_description = model_description or 'Optimized version of {model_name}'
print(f'>> Optimizing {model_name} (30-60s)')
try:
verbosity =transformers.logging.get_verbosity()
transformers.logging.set_verbosity_error()
convert_ckpt_to_diffuser(ckpt_path, diffusers_path,extract_ema=True)
transformers.logging.set_verbosity(verbosity)
print(f'>> Success. Optimized model is now located at {str(diffusers_path)}')
print(f'>> Writing new config file entry for {model_name}')
# By passing the specified VAE to the conversion function, the autoencoder
# will be built into the model rather than tacked on afterward via the config file
vae_model = self._load_vae(vae) if vae else None
convert_ckpt_to_diffuser(
ckpt_path,
diffusers_path,
extract_ema = True,
vae = vae_model,
)
print(f' | Success. Optimized model is now located at {str(diffusers_path)}')
print(f' | Writing new config file entry for {model_name}')
new_config = dict(
path=str(diffusers_path),
description=model_description,
format='diffusers',
)
# HACK (LS): in the event that the original entry is using a custom ckpt VAE, we try to
# map that VAE onto a diffuser VAE using a hard-coded dictionary.
# I would prefer to do this differently: We load the ckpt model into memory, swap the
# VAE in memory, and then pass that to convert_ckpt_to_diffuser() so that the swapped
# VAE is built into the model. However, when I tried this I got obscure key errors.
if model_name in self.config and (vae_ckpt_path := self.model_info(model_name)['vae']):
vae_basename = Path(vae_ckpt_path).stem
diffusers_vae = None
if (diffusers_vae := VAE_TO_REPO_ID.get(vae_basename,None)):
print(f'>> {vae_basename} VAE corresponds to known {diffusers_vae} diffusers version')
new_config.update(
vae = {'repo_id': diffusers_vae}
)
else:
print(f'** Custom VAE "{vae_basename}" found, but corresponding diffusers model unknown')
print(f'** Using "stabilityai/sd-vae-ft-mse"; If this isn\'t right, please edit the model config')
new_config.update(
vae = {'repo_id': 'stabilityai/sd-vae-ft-mse'}
)
self.del_model(model_name)
if model_name in self.config:
self.del_model(model_name)
self.add_model(model_name, new_config, True)
if commit_to_conf:
self.commit(commit_to_conf)
@@ -742,6 +752,27 @@ class ModelManager(object):
return search_folder, found_models
def _choose_diffusers_vae(self, model_name:str, vae:str=None)->Union[dict,str]:
# In the event that the original entry is using a custom ckpt VAE, we try to
# map that VAE onto a diffuser VAE using a hard-coded dictionary.
# I would prefer to do this differently: We load the ckpt model into memory, swap the
# VAE in memory, and then pass that to convert_ckpt_to_diffuser() so that the swapped
# VAE is built into the model. However, when I tried this I got obscure key errors.
if vae:
return vae
if model_name in self.config and (vae_ckpt_path := self.model_info(model_name).get('vae',None)):
vae_basename = Path(vae_ckpt_path).stem
diffusers_vae = None
if (diffusers_vae := VAE_TO_REPO_ID.get(vae_basename,None)):
print(f'>> {vae_basename} VAE corresponds to known {diffusers_vae} diffusers version')
vae = {'repo_id': diffusers_vae}
else:
print(f'** Custom VAE "{vae_basename}" found, but corresponding diffusers model unknown')
print('** Using "stabilityai/sd-vae-ft-mse"; If this isn\'t right, please edit the model config')
vae = {'repo_id': 'stabilityai/sd-vae-ft-mse'}
return vae
def _make_cache_room(self) -> None:
num_loaded_models = len(self.models)
if num_loaded_models >= self.max_loaded_models:
@@ -976,7 +1007,7 @@ class ModelManager(object):
f.write(hash)
return hash
def _load_vae(self, vae_config):
def _load_vae(self, vae_config)->AutoencoderKL:
vae_args = {}
name_or_path = self.model_name_or_path(vae_config)
using_fp16 = self.precision == 'float16'