Support both v2-v and v2-e legacy ckpt models in v2.3 (#2907)

# Support SD version 2 "epsilon" and "v-predict" inference configurations in v2.3

This is a port of the `main` PR #2870 back into v2.3. It allows both
"epsilon" V2 inference models (e.g. "v2-base") and "v-predict" models
(e.g. "v2-768") to be imported and converted into correct diffusers
models. Conversion depends on picking the right configuration file, and
since there is no intrinsic difference between the two kinds of
checkpoint, when we detect that a V2 model is being imported we fall
back to asking the user to select the model type.
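
For anyone verifying a conversion afterward, the epsilon/v-predict distinction ends up recorded as the `prediction_type` field of the scheduler in the emitted diffusers folder. A minimal check, assuming the `diffusers` package and a hypothetical output directory:

```python
# Sketch only: "models/converted-sd2" is a hypothetical conversion output path.
from diffusers import DDIMScheduler

sched = DDIMScheduler.from_pretrained("models/converted-sd2", subfolder="scheduler")
# Expect "epsilon" for v2-base style checkpoints and "v_prediction" for 768px models.
print(sched.config.prediction_type)
```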
commit b5bd5240b6 by blessedcoolant, 2023-03-12 04:42:16 +13:00 (committed via GitHub)
9 changed files with 236 additions and 64 deletions

View File

@@ -0,0 +1,67 @@
model:
  base_learning_rate: 1.0e-4
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False # we set this to false because this is an inference only config

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        use_checkpoint: True
        use_fp16: True
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          #attn_type: "vanilla-xformers"
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
      params:
        freeze: True
        layer: "penultimate"

File diff suppressed because one or more lines are too long

View File

@@ -5,7 +5,7 @@
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>InvokeAI - A Stable Diffusion Toolkit</title>
    <link rel="shortcut icon" type="icon" href="./assets/favicon-0d253ced.ico" />
    <script type="module" crossorigin src="./assets/index-720872d1.js"></script>
    <script type="module" crossorigin src="./assets/index-c09cf9ca.js"></script>
    <link rel="stylesheet" href="./assets/index-14cb2922.css">
  </head>

View File

@@ -365,7 +365,8 @@
  "convertToDiffusersHelpText6": "Do you wish to convert this model?",
  "convertToDiffusersSaveLocation": "Save Location",
  "v1": "v1",
  "v2": "v2",
  "v2_base": "v2 (512px)",
  "v2_768": "v2 (768px)",
  "inpainting": "v1 Inpainting",
  "customConfig": "Custom Config",
  "pathToCustomConfig": "Path To Custom Config",

View File

@@ -365,7 +365,8 @@
  "convertToDiffusersHelpText6": "Do you wish to convert this model?",
  "convertToDiffusersSaveLocation": "Save Location",
  "v1": "v1",
  "v2": "v2",
  "v2_base": "v2 (512px)",
  "v2_768": "v2 (768px)",
  "inpainting": "v1 Inpainting",
  "customConfig": "Custom Config",
  "pathToCustomConfig": "Path To Custom Config",

View File

@@ -181,7 +181,8 @@ export default function SearchModels() {
  const configFiles = {
    v1: 'configs/stable-diffusion/v1-inference.yaml',
    v2: 'configs/stable-diffusion/v2-inference-v.yaml',
    v2_base: 'configs/stable-diffusion/v2-inference-v.yaml',
    v2_768: 'configs/stable-diffusion/v2-inference-v.yaml',
    inpainting: 'configs/stable-diffusion/v1-inpainting-inference.yaml',
    custom: pathToConfig,
  };
@@ -385,7 +386,8 @@
            >
              <Flex gap={4}>
                <Radio value="v1">{t('modelManager.v1')}</Radio>
                <Radio value="v2">{t('modelManager.v2')}</Radio>
                <Radio value="v2_base">{t('modelManager.v2_base')}</Radio>
                <Radio value="v2_768">{t('modelManager.v2_768')}</Radio>
                <Radio value="inpainting">
                  {t('modelManager.inpainting')}
                </Radio>

View File

@@ -22,7 +22,7 @@ from ..generate import Generate
from .args import (Args, dream_cmd_from_png, metadata_dumps,
                   metadata_from_png)
from .generator.diffusers_pipeline import PipelineIntermediateState
from .globals import Globals
from .globals import Globals, global_config_dir
from .image_util import make_grid
from .log import write_log
from .model_manager import ModelManager
@@ -33,7 +33,6 @@ from ..util import url_attachment_name
# global used in multiple functions (fix)
infile = None

def main():
    """Initialize command-line parsers and the diffusion model"""
    global infile
@@ -66,6 +65,9 @@ def main():
    Globals.sequential_guidance = args.sequential_guidance
    Globals.ckpt_convert = args.ckpt_convert

    # run any post-install patches needed
    run_patches()

    print(f">> Internet connectivity is {Globals.internet_available}")

    if not args.conf:
@@ -662,7 +664,16 @@ def import_model(model_path: str, gen, opt, completer, convert=False):
        )
    if not imported_name:
        print("** Import failed or was skipped")
        if config_file := _pick_configuration_file(completer):
            imported_name = gen.model_manager.heuristic_import(
                model_path,
                model_name=model_name,
                description=model_desc,
                convert=convert,
                model_config_file=config_file,
            )
    if not imported_name:
        print("** Aborting import.")
        return

    if not _verify_load(imported_name, gen):
@@ -676,6 +687,48 @@ def import_model(model_path: str, gen, opt, completer, convert=False):
    completer.update_models(gen.model_manager.list_models())
    print(f">> {imported_name} successfully installed")

def _pick_configuration_file(completer) -> Path:
    print(
"""
Please select the type of this model:
[1] A Stable Diffusion v1.x ckpt/safetensors model
[2] A Stable Diffusion v1.x inpainting ckpt/safetensors model
[3] A Stable Diffusion v2.x base model (512 pixels)
[4] A Stable Diffusion v2.x v-predictive model (768 pixels)
[5] Other (you will be prompted to enter the config file path)
[Q] I have no idea! Skip the import.
""")
    choices = [
        global_config_dir() / 'stable-diffusion' / x
        for x in [
            'v1-inference.yaml',
            'v1-inpainting-inference.yaml',
            'v2-inference.yaml',
            'v2-inference-v.yaml',
        ]
    ]
    ok = False
    while not ok:
        try:
            choice = input('select 1-5, Q > ').strip()
            if choice.startswith(('q', 'Q')):
                return
            if choice == '5':
                completer.complete_extensions(('.yaml',))
                choice = Path(input('Select config file for this model> ').strip()).absolute()
                completer.complete_extensions(None)
                ok = choice.exists()
            else:
                choice = choices[int(choice) - 1]
                ok = True
        except (ValueError, IndexError):
            print(f'{choice} is not a valid choice')
        except EOFError:
            return
    return choice

def _verify_load(model_name: str, gen) -> bool:
    print(">> Verifying that new model loads...")
    current_model = gen.model_name
@@ -1237,6 +1290,20 @@ def check_internet() -> bool:
    except:
        return False

# This routine performs any patch-ups needed after installation
def run_patches():
    # install ckpt configuration files that may have been added to the
    # distro after original root directory configuration
    import invokeai.configs as conf
    from shutil import copyfile

    root_configs = Path(global_config_dir(), 'stable-diffusion')
    repo_configs = Path(conf.__path__[0], 'stable-diffusion')
    for src in repo_configs.iterdir():
        dest = root_configs / src.name
        if not dest.exists():
            copyfile(src, dest)

if __name__ == '__main__':
    main()
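
Taken together, the retry path in import_model() composes with the new chooser roughly like this (a sketch with a hypothetical checkpoint path; `gen` and `completer` are the objects main() already wires up):

```python
# Hypothetical follow-up after an automatic import fails:
config_file = _pick_configuration_file(completer)    # user answers 1-5 or Q
if config_file is not None:
    imported_name = gen.model_manager.heuristic_import(
        "/downloads/mystery-sd2-model.safetensors",  # hypothetical path
        convert=True,
        model_config_file=config_file,  # explicit config bypasses probe_model_type()
    )
```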

View File

@@ -862,12 +862,16 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
    if original_config_file is None:
        model_type = ModelManager.probe_model_type(checkpoint)
        if model_type == SDLegacyType.V2:
        if model_type == SDLegacyType.V2_v:
            original_config_file = global_config_dir() / 'stable-diffusion' / 'v2-inference-v.yaml'
            if global_step == 110000:
                # v2.1 needs to upcast attention
                upcast_attention = True
        elif model_type == SDLegacyType.V2_e:
            original_config_file = (
                global_config_dir() / "stable-diffusion" / "v2-inference.yaml"
            )
        elif model_type == SDLegacyType.V1_INPAINT:
            original_config_file = global_config_dir() / 'stable-diffusion' / 'v1-inpainting-inference.yaml'
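
The `global_step == 110000` special case matches the SD 2.1 768px checkpoint, which needs its attention layers upcast to float32 to avoid numerical problems under fp16. On the diffusers side that setting survives as a UNet config flag; one way to inspect it on a published model (a sketch, assuming the `diffusers` package and Hub access):

```python
from diffusers import UNet2DConditionModel

# stabilityai/stable-diffusion-2-1 is the published v-prediction 768px model.
unet = UNet2DConditionModel.from_pretrained(
    "stabilityai/stable-diffusion-2-1", subfolder="unet"
)
print(unet.config.upcast_attention)  # expected: True
```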

View File

@@ -47,6 +47,8 @@ class SDLegacyType(Enum):
    V1 = 1
    V1_INPAINT = 2
    V2 = 3
    V2_e = 4
    V2_v = 5
    UNKNOWN = 99
@@ -724,15 +726,25 @@
        format. Valid return values include:
            SDLegacyType.V1
            SDLegacyType.V1_INPAINT
            SDLegacyType.V2
            SDLegacyType.V2   (V2 prediction type unknown)
            SDLegacyType.V2_e (V2 using 'epsilon' prediction type)
            SDLegacyType.V2_v (V2 using 'v_prediction' prediction type)
            SDLegacyType.UNKNOWN
        """
        key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight"
        if key_name in checkpoint and checkpoint[key_name].shape[-1] == 1024:
            return SDLegacyType.V2
        global_step = checkpoint.get('global_step')
        state_dict = checkpoint.get("state_dict") or checkpoint
        try:
            state_dict = checkpoint.get("state_dict") or checkpoint
            key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight"
            if key_name in state_dict and state_dict[key_name].shape[-1] == 1024:
                if global_step == 220000:
                    return SDLegacyType.V2_e
                elif global_step == 110000:
                    return SDLegacyType.V2_v
                else:
                    return SDLegacyType.V2
            # otherwise we assume a V1 file
            in_channels = state_dict[
                "model.diffusion_model.input_blocks.0.0.weight"
            ].shape[1]
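
Read on its own, the probe reduces to a handful of lines. A standalone sketch of the same heuristic (hypothetical checkpoint path, `torch` assumed):

```python
import torch

ckpt = torch.load("mystery.ckpt", map_location="cpu")  # hypothetical file
sd = ckpt.get("state_dict") or ckpt
key = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight"
if key in sd and sd[key].shape[-1] == 1024:
    # context_dim of 1024 means OpenCLIP conditioning, i.e. an SD-2 checkpoint;
    # the saved training step count is the only hint at the parameterization.
    print({220000: "V2_e", 110000: "V2_v"}.get(ckpt.get("global_step"), "V2"))
```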
@@ -746,12 +758,13 @@
        return SDLegacyType.UNKNOWN

    def heuristic_import(
        self,
        path_url_or_repo: str,
        convert: bool = False,
        model_name: str = None,
        description: str = None,
        commit_to_conf: Path = None,
        self,
        path_url_or_repo: str,
        convert: bool = False,
        model_name: str = None,
        description: str = None,
        model_config_file: Path = None,
        commit_to_conf: Path = None,
    ) -> str:
        """
        Accept a string which could be:
@@ -849,7 +862,7 @@
        if model_path.stem in self.config:  # already imported
            print(" | Already imported. Skipping")
            return
            return model_path.stem

        # another round of heuristics to guess the correct config file.
        checkpoint = (
@@ -857,32 +870,49 @@
            if model_path.suffix == ".safetensors"
            else torch.load(model_path)
        )
        model_type = self.probe_model_type(checkpoint)
        # additional probing needed if no config file provided
        if model_config_file is None:
            model_type = self.probe_model_type(checkpoint)
            if model_type == SDLegacyType.V1:
                print(" | SD-v1 model detected")
                model_config_file = Path(
                    Globals.root, "configs/stable-diffusion/v1-inference.yaml"
                )
            elif model_type == SDLegacyType.V1_INPAINT:
                print(" | SD-v1 inpainting model detected")
                model_config_file = Path(
                    Globals.root, "configs/stable-diffusion/v1-inpainting-inference.yaml"
                )
            elif model_type == SDLegacyType.V2_v:
                print(" | SD-v2-v model detected")
                model_config_file = Path(
                    Globals.root, "configs/stable-diffusion/v2-inference-v.yaml"
                )
            elif model_type == SDLegacyType.V2_e:
                print(" | SD-v2-e model detected")
                model_config_file = Path(
                    Globals.root, "configs/stable-diffusion/v2-inference.yaml"
                )
            elif model_type == SDLegacyType.V2:
                print(
                    f"** {thing} is a V2 checkpoint file, but its parameterization cannot be determined. Please provide configuration file path."
                )
                return
            else:
                print(
                    f"** {thing} is a legacy checkpoint file but not a known Stable Diffusion model. Please provide configuration file path."
                )
                return
        model_config_file = None
        if model_type == SDLegacyType.V1:
            print(" | SD-v1 model detected")
            model_config_file = Path(
                Globals.root, "configs/stable-diffusion/v1-inference.yaml"
            )
        elif model_type == SDLegacyType.V1_INPAINT:
            print(" | SD-v1 inpainting model detected")
            model_config_file = Path(
                Globals.root, "configs/stable-diffusion/v1-inpainting-inference.yaml"
            )
        elif model_type == SDLegacyType.V2:
            print(
                " | SD-v2 model detected; model will be converted to diffusers format"
            )
            model_config_file = Path(
                Globals.root, "configs/stable-diffusion/v2-inference-v.yaml"
            )
        if model_config_file.name.startswith('v2'):
            convert = True
        else:
            print(
                f"** {thing} is a legacy checkpoint file but not in a known Stable Diffusion model. Skipping import"
                " | This SD-v2 model will be converted to diffusers format for use"
            )
            return

        if convert:
            diffuser_path = Path(