Fix bugs in merge and convert process (#2491)

1. The convert module was converting ckpt models into StableDiffusionGeneratorPipeline objects for in-memory use, but when those objects were saved to disk they produced files that could not be merged with StableDiffusionPipeline models. I have added a flag (`return_generator_pipeline`) that selects which pipeline class to return, so that both in-memory and on-disk conversions work properly. 2. This PR also fixes an issue with `invoke.sh` not invoking the correct commands for the textual inversion and merge scripts. 3. Quench nags during the merge process about the safety checker being turned off.
2024-08-30 20:32:17 +00:00 · 2023-02-04 09:40:09 -05:00
parent e96ac61cb3 5145df21d9
commit f76d57637e
4 changed files with 183 additions and 181 deletions
--- a/installer/templates/invoke.sh.in
+++ b/installer/templates/invoke.sh.in
@ -47,11 +47,11 @@ if [ "$0" != "bash" ]; then
            ;;
        3)
            echo "Starting Textual Inversion:"
-            exec textual_inversion --gui $@
+            exec invokeai-ti --gui $@
            ;;
        4)
            echo "Merging Models:"
-            exec merge_models --gui $@
+            exec invokeai-merge --gui $@
            ;;
        5)
            echo "Developer Console:"
--- a/ldm/invoke/ckpt_to_diffuser.py
+++ b/ldm/invoke/ckpt_to_diffuser.py
@ -20,6 +20,7 @@
 import os
 import re
 import torch
+import warnings
 from pathlib import Path
 from ldm.invoke.globals import Globals, global_cache_dir
 from safetensors.torch import load_file
@ -44,6 +45,7 @@ from diffusers import (
    PNDMScheduler,
    StableDiffusionPipeline,
    UNet2DConditionModel,
+    logging as dlogging,
 )
 from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
 from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder, PaintByExamplePipeline
@ -795,8 +797,9 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
        prediction_type:str=None,
        extract_ema:bool=True,
        upcast_attn:bool=False,
-        vae:AutoencoderKL=None
-)->StableDiffusionGeneratorPipeline:
+        vae:AutoencoderKL=None,
+        return_generator_pipeline:bool=False,
+)->Union[StableDiffusionPipeline,StableDiffusionGeneratorPipeline]:
    '''
    Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
    config file.
@ -823,166 +826,173 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
    :param upcast_attention: Whether the attention computation should always be upcasted. This is necessary when
    running stable diffusion 2.1.
    '''
+    
+    with warnings.catch_warnings():
+        warnings.simplefilter('ignore')
+        verbosity = dlogging.get_verbosity()
+        dlogging.set_verbosity_error()

-    checkpoint = load_file(checkpoint_path) if Path(checkpoint_path).suffix == '.safetensors' else torch.load(checkpoint_path)
-    cache_dir = global_cache_dir('hub')
+        checkpoint = load_file(checkpoint_path) if Path(checkpoint_path).suffix == '.safetensors' else torch.load(checkpoint_path)
+        cache_dir = global_cache_dir('hub')
+        pipeline_class = StableDiffusionGeneratorPipeline if return_generator_pipeline else StableDiffusionPipeline

-    # Sometimes models don't have the global_step item
-    if "global_step" in checkpoint:
-        global_step = checkpoint["global_step"]
-    else:
-        print("  | global_step key not found in model")
-        global_step = None
-
-    # sometimes there is a state_dict key and sometimes not
-    if 'state_dict' in checkpoint:
-        checkpoint = checkpoint["state_dict"]
-
-    upcast_attention = False
-    if original_config_file is None:
-        key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight"
-
-        if key_name in checkpoint and checkpoint[key_name].shape[-1] == 1024:
-            original_config_file = os.path.join(Globals.root,'configs','stable-diffusion','v2-inference-v.yaml')
-
-            if global_step == 110000:
-                # v2.1 needs to upcast attention
-                upcast_attention = True
+        # Sometimes models don't have the global_step item
+        if "global_step" in checkpoint:
+            global_step = checkpoint["global_step"]
        else:
-            original_config_file = os.path.join(Globals.root,'configs','stable-diffusion','v1-inference.yaml')
+            print("  | global_step key not found in model")
+            global_step = None

-    original_config = OmegaConf.load(original_config_file)
+        # sometimes there is a state_dict key and sometimes not
+        if 'state_dict' in checkpoint:
+            checkpoint = checkpoint["state_dict"]

-    if num_in_channels is not None:
-        original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
+        upcast_attention = False
+        if original_config_file is None:
+            key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight"

-    if (
-        "parameterization" in original_config["model"]["params"]
-        and original_config["model"]["params"]["parameterization"] == "v"
-    ):
-        if prediction_type is None:
-            # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"`
-            # as it relies on a brittle global step parameter here
-            prediction_type = "epsilon" if global_step == 875000 else "v_prediction"
-        if image_size is None:
-            # NOTE: For stable diffusion 2 base one has to pass `image_size==512`
-            # as it relies on a brittle global step parameter here
-            image_size = 512 if global_step == 875000 else 768
-    else:
-        if prediction_type is None:
-            prediction_type = "epsilon"
-        if image_size is None:
-            image_size = 512
+            if key_name in checkpoint and checkpoint[key_name].shape[-1] == 1024:
+                original_config_file = os.path.join(Globals.root,'configs','stable-diffusion','v2-inference-v.yaml')

-    num_train_timesteps = original_config.model.params.timesteps
-    beta_start = original_config.model.params.linear_start
-    beta_end = original_config.model.params.linear_end
+                if global_step == 110000:
+                    # v2.1 needs to upcast attention
+                    upcast_attention = True
+            else:
+                original_config_file = os.path.join(Globals.root,'configs','stable-diffusion','v1-inference.yaml')

-    scheduler = DDIMScheduler(
-        beta_end=beta_end,
-        beta_schedule="scaled_linear",
-        beta_start=beta_start,
-        num_train_timesteps=num_train_timesteps,
-        steps_offset=1,
-        clip_sample=False,
-        set_alpha_to_one=False,
-        prediction_type=prediction_type,
-    )
-    # make sure scheduler works correctly with DDIM
-    scheduler.register_to_config(clip_sample=False)
+        original_config = OmegaConf.load(original_config_file)

-    if scheduler_type == "pndm":
-        config = dict(scheduler.config)
-        config["skip_prk_steps"] = True
-        scheduler = PNDMScheduler.from_config(config)
-    elif scheduler_type == "lms":
-        scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
-    elif scheduler_type == "heun":
-        scheduler = HeunDiscreteScheduler.from_config(scheduler.config)
-    elif scheduler_type == "euler":
-        scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
-    elif scheduler_type == "euler-ancestral":
-        scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
-    elif scheduler_type == "dpm":
-        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
-    elif scheduler_type == "ddim":
-        scheduler = scheduler
-    else:
-        raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
+        if num_in_channels is not None:
+            original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels

-    # Convert the UNet2DConditionModel model.
-    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
-    unet_config["upcast_attention"] = upcast_attention
-    unet = UNet2DConditionModel(**unet_config)
+        if (
+            "parameterization" in original_config["model"]["params"]
+            and original_config["model"]["params"]["parameterization"] == "v"
+        ):
+            if prediction_type is None:
+                # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"`
+                # as it relies on a brittle global step parameter here
+                prediction_type = "epsilon" if global_step == 875000 else "v_prediction"
+            if image_size is None:
+                # NOTE: For stable diffusion 2 base one has to pass `image_size==512`
+                # as it relies on a brittle global step parameter here
+                image_size = 512 if global_step == 875000 else 768
+        else:
+            if prediction_type is None:
+                prediction_type = "epsilon"
+            if image_size is None:
+                image_size = 512

-    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
-        checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema
-    )
+        num_train_timesteps = original_config.model.params.timesteps
+        beta_start = original_config.model.params.linear_start
+        beta_end = original_config.model.params.linear_end

-    unet.load_state_dict(converted_unet_checkpoint)
-
-    # Convert the VAE model, or use the one passed
-    if not vae:
-        print(f'  | Using checkpoint model\'s original VAE')
-        vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
-        converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
-
-        vae = AutoencoderKL(**vae_config)
-        vae.load_state_dict(converted_vae_checkpoint)
-    else:
-        print(f'  | Using external VAE specified in config')
-
-    # Convert the text model.
-    model_type = pipeline_type
-    if model_type is None:
-        model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
-
-    if model_type == "FrozenOpenCLIPEmbedder":
-        text_model = convert_open_clip_checkpoint(checkpoint)
-        tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2",
-                                                  subfolder="tokenizer",
-                                                  cache_dir=global_cache_dir('diffusers')
-                                                  )
-        pipe = StableDiffusionGeneratorPipeline(
-            vae=vae,
-            text_encoder=text_model,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=None,
-            feature_extractor=None,
-            requires_safety_checker=False,
+        scheduler = DDIMScheduler(
+            beta_end=beta_end,
+            beta_schedule="scaled_linear",
+            beta_start=beta_start,
+            num_train_timesteps=num_train_timesteps,
+            steps_offset=1,
+            clip_sample=False,
+            set_alpha_to_one=False,
+            prediction_type=prediction_type,
        )
-    elif model_type == "PaintByExample":
-        vision_model = convert_paint_by_example_checkpoint(checkpoint)
-        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14",cache_dir=cache_dir)
-        feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker",cache_dir=cache_dir)
-        pipe = PaintByExamplePipeline(
-            vae=vae,
-            image_encoder=vision_model,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=None,
-            feature_extractor=feature_extractor,
+        # make sure scheduler works correctly with DDIM
+        scheduler.register_to_config(clip_sample=False)
+
+        if scheduler_type == "pndm":
+            config = dict(scheduler.config)
+            config["skip_prk_steps"] = True
+            scheduler = PNDMScheduler.from_config(config)
+        elif scheduler_type == "lms":
+            scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
+        elif scheduler_type == "heun":
+            scheduler = HeunDiscreteScheduler.from_config(scheduler.config)
+        elif scheduler_type == "euler":
+            scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
+        elif scheduler_type == "euler-ancestral":
+            scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
+        elif scheduler_type == "dpm":
+            scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
+        elif scheduler_type == "ddim":
+            scheduler = scheduler
+        else:
+            raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
+
+        # Convert the UNet2DConditionModel model.
+        unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
+        unet_config["upcast_attention"] = upcast_attention
+        unet = UNet2DConditionModel(**unet_config)
+
+        converted_unet_checkpoint = convert_ldm_unet_checkpoint(
+            checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema
        )
-    elif model_type in ['FrozenCLIPEmbedder','WeightedFrozenCLIPEmbedder']:
-        text_model = convert_ldm_clip_checkpoint(checkpoint)
-        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14",cache_dir=cache_dir)
-        feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker",cache_dir=cache_dir)
-        pipe = StableDiffusionGeneratorPipeline(
-            vae=vae,
-            text_encoder=text_model,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=None,
-            feature_extractor=feature_extractor,
-        )
-    else:
-        text_config = create_ldm_bert_config(original_config)
-        text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
-        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased",cache_dir=cache_dir)
-        pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
+
+        unet.load_state_dict(converted_unet_checkpoint)
+
+        # Convert the VAE model, or use the one passed
+        if not vae:
+            print('  | Using checkpoint model\'s original VAE')
+            vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
+            converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
+
+            vae = AutoencoderKL(**vae_config)
+            vae.load_state_dict(converted_vae_checkpoint)
+        else:
+            print('  | Using external VAE specified in config')
+
+        # Convert the text model.
+        model_type = pipeline_type
+        if model_type is None:
+            model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
+
+        if model_type == "FrozenOpenCLIPEmbedder":
+            text_model = convert_open_clip_checkpoint(checkpoint)
+            tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2",
+                                                      subfolder="tokenizer",
+                                                      cache_dir=global_cache_dir('diffusers')
+                                                      )
+            pipe = pipeline_class(
+                vae=vae,
+                text_encoder=text_model,
+                tokenizer=tokenizer,
+                unet=unet,
+                scheduler=scheduler,
+                safety_checker=None,
+                feature_extractor=None,
+                requires_safety_checker=False,
+            )
+        elif model_type == "PaintByExample":
+            vision_model = convert_paint_by_example_checkpoint(checkpoint)
+            tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14",cache_dir=cache_dir)
+            feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker",cache_dir=cache_dir)
+            pipe = PaintByExamplePipeline(
+                vae=vae,
+                image_encoder=vision_model,
+                unet=unet,
+                scheduler=scheduler,
+                safety_checker=None,
+                feature_extractor=feature_extractor,
+            )
+        elif model_type in ['FrozenCLIPEmbedder','WeightedFrozenCLIPEmbedder']:
+            text_model = convert_ldm_clip_checkpoint(checkpoint)
+            tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14",cache_dir=cache_dir)
+            feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker",cache_dir=cache_dir)
+            pipe = pipeline_class(
+                vae=vae,
+                text_encoder=text_model,
+                tokenizer=tokenizer,
+                unet=unet,
+                scheduler=scheduler,
+                safety_checker=None,
+                feature_extractor=feature_extractor,
+            )
+        else:
+            text_config = create_ldm_bert_config(original_config)
+            text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
+            tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased",cache_dir=cache_dir)
+            pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
+    dlogging.set_verbosity(verbosity)

    return pipe

@ -1000,6 +1010,7 @@ def convert_ckpt_to_diffuser(
        checkpoint_path,
        **kwargs
    )
+    
    pipe.save_pretrained(
        dump_path,
        safe_serialization=is_safetensors_available(),
--- a/ldm/invoke/merge_diffusers.py
+++ b/ldm/invoke/merge_diffusers.py
@ -8,13 +8,14 @@ import argparse
 import curses
 import os
 import sys
+import traceback
+import warnings
 from argparse import Namespace
 from pathlib import Path
 from typing import List, Union

 import npyscreen
-import warnings
-from diffusers import DiffusionPipeline
+from diffusers import DiffusionPipeline, logging as dlogging
 from omegaconf import OmegaConf

 from ldm.invoke.globals import (
@ -46,18 +47,24 @@ def merge_diffusion_models(
    **kwargs - the default DiffusionPipeline.get_config_dict kwargs:
         cache_dir, resume_download, force_download, proxies, local_files_only, use_auth_token, revision, torch_dtype, device_map
    """
-    pipe = DiffusionPipeline.from_pretrained(
-        model_ids_or_paths[0],
-        cache_dir=kwargs.get("cache_dir", global_cache_dir()),
-        custom_pipeline="checkpoint_merger",
-    )
-    merged_pipe = pipe.merge(
-        pretrained_model_name_or_path_list=model_ids_or_paths,
-        alpha=alpha,
-        interp=interp,
-        force=force,
-        **kwargs,
-    )
+    with warnings.catch_warnings():
+        warnings.simplefilter('ignore')
+        verbosity = dlogging.get_verbosity()
+        dlogging.set_verbosity_error()
+        
+        pipe = DiffusionPipeline.from_pretrained(
+            model_ids_or_paths[0],
+            cache_dir=kwargs.get("cache_dir", global_cache_dir()),
+            custom_pipeline="checkpoint_merger",
+        )
+        merged_pipe = pipe.merge(
+            pretrained_model_name_or_path_list=model_ids_or_paths,
+            alpha=alpha,
+            interp=interp,
+            force=force,
+            **kwargs,
+        )
+        dlogging.set_verbosity(verbosity)
    return merged_pipe


@ -443,22 +450,5 @@ def main():
    ] = cache_dir  # because not clear the merge pipeline is honoring cache_dir
    args.cache_dir = cache_dir

-    with warnings.catch_warnings():
-        warnings.simplefilter('ignore')
-        try:
-            if args.front_end:
-                run_gui(args)
-            else:
-                run_cli(args)
-            print(f'>> Conversion successful.')
-        except Exception as e:
-            if str(e).startswith('Not enough space'):
-                print('** Not enough horizontal space! Try making the window wider, or relaunch with a smaller starting size.')
-            else:
-                print(f"** An error occurred while merging the pipelines: {str(e)}")
-            sys.exit(-1)
-        except KeyboardInterrupt:
-            sys.exit(-1)
-
 if __name__ == "__main__":
    main()
--- a/ldm/invoke/model_manager.py
+++ b/ldm/invoke/model_manager.py
@ -356,6 +356,7 @@ class ModelManager(object):
                checkpoint_path = weights,
                original_config_file = config,
                vae = vae,
+                return_generator_pipeline=True,
            )
            return (
                pipeline.to(self.device).to(torch.float16 if self.precision == 'float16' else torch.float32),