add support for controlnet & sdxl conversion - not fully working

2024-08-30 20:32:17 +00:00 · 2023-07-22 20:12:16 -04:00
parent 907ff165be
commit 5607794dbb
10 changed files with 1519 additions and 680 deletions
--- a/invokeai/backend/install/invokeai_configure.py
+++ b/invokeai/backend/install/invokeai_configure.py
@ -55,6 +55,7 @@ from invokeai.frontend.install.widgets import (
 from invokeai.backend.install.legacy_arg_parsing import legacy_parser
 from invokeai.backend.install.model_install_backend import (
    hf_download_from_pretrained,
+    hf_download_with_resume,
    InstallSelections,
    ModelInstall,
 )
@ -204,6 +205,13 @@ def download_conversion_models():
        pipeline = CLIPTextModel.from_pretrained(repo_id, subfolder="text_encoder", **kwargs)
        pipeline.save_pretrained(target_dir / 'stable-diffusion-2-clip' / 'text_encoder', safe_serialization=True)

+        repo_id = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
+        _, model_name = repo_id.split('/')
+        tokenizer_2 = CLIPTokenizer.from_pretrained(repo_id, **kwargs)
+        tokenizer_2.save_pretrained(target_dir / model_name, safe_serialization=True)
+        # for some reason config.json never downloads
+        hf_download_with_resume(repo_id, target_dir / model_name, "config.json")
+
        # VAE
        logger.info('Downloading stable diffusion VAE')
        vae = AutoencoderKL.from_pretrained('stabilityai/sd-vae-ft-mse', **kwargs)
--- a/invokeai/backend/install/model_install_backend.py
+++ b/invokeai/backend/install/model_install_backend.py
@ -58,7 +58,15 @@ LEGACY_CONFIGS = {
            SchedulerPredictionType.Epsilon: 'v2-inpainting-inference.yaml',
            SchedulerPredictionType.VPrediction: 'v2-inpainting-inference-v.yaml',
        }
-    }
+    },
+    
+    BaseModelType.StableDiffusionXL: {
+        ModelVariantType.Normal: 'sd_xl_base.yaml',
+    },
+    
+    BaseModelType.StableDiffusionXLRefiner: {
+        ModelVariantType.Normal: 'sd_xl_refiner.yaml',
+    },
 }

@dataclass
@ -329,6 +337,7 @@ class ModelInstall(object):
            description = str(description),
            model_format = info.format,
            )
+        legacy_conf = None
        if info.model_type == ModelType.Main:
            attributes.update(dict(variant = info.variant_type,))
            if info.format=="checkpoint":
@ -343,11 +352,17 @@ class ModelInstall(object):
                except KeyError:
                    legacy_conf = Path(self.config.legacy_conf_dir, 'v1-inference.yaml')  # best guess
                    
-                attributes.update(
-                    dict(
-                        config = str(legacy_conf)
-                    )
+        if info.model_type == ModelType.ControlNet and info.format=="checkpoint":
+            possible_conf = path.with_suffix('.yaml')
+            if possible_conf.exists():
+                legacy_conf = str(self.relative_to_root(possible_conf))
+
+        if legacy_conf:
+            attributes.update(
+                dict(
+                    config = str(legacy_conf)
                )
+            )
        return attributes

    def relative_to_root(self, path: Path)->Path:
--- a/invokeai/backend/model_management/convert_ckpt_to_diffusers.py
+++ b/invokeai/backend/model_management/convert_ckpt_to_diffusers.py
--- a/invokeai/backend/model_management/model_manager.py
+++ b/invokeai/backend/model_management/model_manager.py
@ -673,6 +673,7 @@ class ModelManager(object):

        self.models[model_key] = model_config
        self.commit()
+
        return AddModelResult(
            name = model_name,
            model_type = model_type,
@ -840,7 +841,7 @@ class ModelManager(object):
        Returns the preamble for the config file.
        """
        return textwrap.dedent(
-            """\
+            """
            # This file describes the alternative machine learning models
            # available to InvokeAI script.
            #
--- a/invokeai/backend/model_management/model_probe.py
+++ b/invokeai/backend/model_management/model_probe.py
@ -253,7 +253,8 @@ class PipelineCheckpointProbe(CheckpointProbeBase):
            return BaseModelType.StableDiffusion1
        if key_name in state_dict and state_dict[key_name].shape[-1] == 1024:
            return BaseModelType.StableDiffusion2
-        # TODO: Verify that this is correct! Need an XL checkpoint file for this.
+        # TODO: This is just a guess based on N=1
+        key_name = 'model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_k.weight'
        if key_name in state_dict and state_dict[key_name].shape[-1] == 2048:
            return BaseModelType.StableDiffusionXL
        raise InvalidModelException("Cannot determine base type")
--- a/invokeai/backend/model_management/models/controlnet.py
+++ b/invokeai/backend/model_management/models/controlnet.py
@ -1,7 +1,8 @@
 import os
 import torch
 from enum import Enum
-from typing import Optional
+from pathlib import Path
+from typing import Optional, Literal
 from .base import (
    ModelBase,
    ModelConfigBase,
@ -15,6 +16,7 @@ from .base import (
    InvalidModelException,
    ModelNotFoundException,
 )
+from invokeai.app.services.config import InvokeAIAppConfig

 class ControlNetModelFormat(str, Enum):
    Checkpoint = "checkpoint"
@ -24,8 +26,12 @@ class ControlNetModel(ModelBase):
    #model_class: Type
    #model_size: int

-    class Config(ModelConfigBase):
-        model_format: ControlNetModelFormat
+    class DiffusersConfig(ModelConfigBase):
+        model_format: Literal[ControlNetModelFormat.Diffusers]
+
+    class CheckpointConfig(ModelConfigBase):
+        model_format: Literal[ControlNetModelFormat.Checkpoint]
+        config: str

    def __init__(self, model_path: str, base_model: BaseModelType, model_type: ModelType):
        assert model_type == ModelType.ControlNet
@ -99,13 +105,51 @@ class ControlNetModel(ModelBase):

    @classmethod
    def convert_if_required(
+            cls,
+            model_path: str,
+            output_path: str,
+            config: ModelConfigBase,
+            base_model: BaseModelType,
+    ) -> str:
+        if cls.detect_format(model_path) == ControlNetModelFormat.Checkpoint:
+            return _convert_controlnet_ckpt_and_cache(
+                model_path = model_path,
+                model_config = config.config,
+                output_path = output_path,
+                base_model = base_model,
+                )
+        else:
+            return model_path
+
+@classmethod
+def _convert_controlnet_ckpt_and_cache(
        cls,
        model_path: str,
        output_path: str,
-        config: ModelConfigBase, # empty config or config of parent model
        base_model: BaseModelType,
-    ) -> str:
-        if cls.detect_format(model_path) != ControlNetModelFormat.Diffusers:
-            raise NotImplementedError("Checkpoint controlnet models currently unsupported")
-        else:
-            return model_path
+        model_config: ControlNetModel.CheckpointConfig,
+) -> str:
+    """
+    Convert the controlnet from checkpoint format to diffusers format,
+    cache it to disk, and return Path to converted
+    file. If already on disk then just returns Path.
+    """
+    app_config = InvokeAIAppConfig.get_config()
+    weights = app_config.root_path / model_path
+    output_path = Path(output_path)
+
+    # return cached version if it exists
+    if output_path.exists():
+        return output_path
+
+    # to avoid circular import errors
+    from ..convert_ckpt_to_diffusers import convert_controlnet_to_diffusers
+    convert_controlnet_to_diffusers(
+        weights,
+        output_path,
+        original_config_file = app_config.root_path / model_config,
+        image_size = 512,
+        scan_needed = True,
+        from_safetensors = weights.suffix == ".safetensors"
+    )
+    return output_path
--- a/invokeai/backend/model_management/models/sdxl.py
+++ b/invokeai/backend/model_management/models/sdxl.py
@ -48,7 +48,7 @@ class StableDiffusionXLModel(DiffusersModel):
        if model_format == StableDiffusionXLModelFormat.Checkpoint:
            if ckpt_config_path:
                ckpt_config = OmegaConf.load(ckpt_config_path)
-                ckpt_config["model"]["params"]["unet_config"]["params"]["in_channels"]
+                in_channels = ckpt_config["model"]["params"]["unet_config"]["params"]["in_channels"]

            else:
                checkpoint = read_checkpoint_meta(path)
@ -109,6 +109,13 @@ class StableDiffusionXLModel(DiffusersModel):
        base_model: BaseModelType,
    ) -> str:
        if isinstance(config, cls.CheckpointConfig):
-            raise NotImplementedError('conversion of SDXL checkpoint models to diffusers format is not yet supported')
+            from invokeai.backend.model_management.models.stable_diffusion import _convert_ckpt_and_cache
+            return _convert_ckpt_and_cache(
+                version=base_model,
+                model_config=config,
+                output_path=output_path,
+                model_type='SDXL',
+                no_safetensors=True, # giving errors for some reason
+            )
        else:
            return model_path
--- a/invokeai/backend/model_management/models/stable_diffusion.py
+++ b/invokeai/backend/model_management/models/stable_diffusion.py
@ -15,6 +15,7 @@ from .base import (
    classproperty,
    InvalidModelException,
 )
+from .sdxl import StableDiffusionXLModel
 from invokeai.app.services.config import InvokeAIAppConfig
 from omegaconf import OmegaConf

@ -235,42 +236,16 @@ class StableDiffusion2Model(DiffusersModel):
        else:
            return model_path

-def _select_ckpt_config(version: BaseModelType, variant: ModelVariantType):
-    ckpt_configs = {
-        BaseModelType.StableDiffusion1: {
-            ModelVariantType.Normal: "v1-inference.yaml",
-            ModelVariantType.Inpaint: "v1-inpainting-inference.yaml",
-        },
-        BaseModelType.StableDiffusion2: {
-            ModelVariantType.Normal: "v2-inference-v.yaml", # best guess, as we can't differentiate with base(512)
-            ModelVariantType.Inpaint: "v2-inpainting-inference.yaml",
-            ModelVariantType.Depth: "v2-midas-inference.yaml",
-        },
-        # note that these .yaml files don't yet exist!
-        BaseModelType.StableDiffusionXL: {
-            ModelVariantType.Normal: "xl-inference-v.yaml",
-            ModelVariantType.Inpaint: "xl-inpainting-inference.yaml",
-            ModelVariantType.Depth: "xl-midas-inference.yaml",
-        }
-    }
-
-    app_config = InvokeAIAppConfig.get_config()
-    try:
-        config_path = app_config.legacy_conf_path / ckpt_configs[version][variant]
-        if config_path.is_relative_to(app_config.root_path):
-            config_path = config_path.relative_to(app_config.root_path)
-        return str(config_path)
-            
-    except:
-        return None
-
-
 # TODO: rework
-# Note that convert_ckpt_to_diffuses does not currently support conversion of SDXL models
 def _convert_ckpt_and_cache(
-    version: BaseModelType,
-    model_config: Union[StableDiffusion1Model.CheckpointConfig, StableDiffusion2Model.CheckpointConfig],
-    output_path: str,
+        version: BaseModelType,
+        model_config: Union[StableDiffusion1Model.CheckpointConfig,
+                            StableDiffusion2Model.CheckpointConfig,
+                            StableDiffusionXLModel.CheckpointConfig,
+                            ],
+        output_path: str,
+        use_save_model: bool=False,
+        **kwargs,
 ) -> str:
    """
    Convert the checkpoint model indicated in mconfig into a
@ -298,5 +273,42 @@ def _convert_ckpt_and_cache(
            original_config_file=config_file,
            extract_ema=True,
            scan_needed=True,
+            from_safetensors = weights.suffix == ".safetensors",
+            **kwargs,
        )
    return output_path
+
+def _select_ckpt_config(version: BaseModelType, variant: ModelVariantType):
+    ckpt_configs = {
+        BaseModelType.StableDiffusion1: {
+            ModelVariantType.Normal: "v1-inference.yaml",
+            ModelVariantType.Inpaint: "v1-inpainting-inference.yaml",
+        },
+        BaseModelType.StableDiffusion2: {
+            ModelVariantType.Normal: "v2-inference-v.yaml", # best guess, as we can't differentiate with base(512)
+            ModelVariantType.Inpaint: "v2-inpainting-inference.yaml",
+            ModelVariantType.Depth: "v2-midas-inference.yaml",
+        },
+        BaseModelType.StableDiffusionXL: {
+            ModelVariantType.Normal: "sd_xl_base.yaml",
+            ModelVariantType.Inpaint: None,
+            ModelVariantType.Depth: None,
+        },
+        BaseModelType.StableDiffusionXLRefiner: {
+            ModelVariantType.Normal: "sd_xl_refiner.yaml",
+            ModelVariantType.Inpaint: None,
+            ModelVariantType.Depth: None,
+        },
+    }
+
+    app_config = InvokeAIAppConfig.get_config()
+    try:
+        config_path = app_config.legacy_conf_path / ckpt_configs[version][variant]
+        if config_path.is_relative_to(app_config.root_path):
+            config_path = config_path.relative_to(app_config.root_path)
+        return str(config_path)
+            
+    except:
+        return None
+
+
--- a/invokeai/configs/stable-diffusion/sd_xl_base.yaml
+++ b/invokeai/configs/stable-diffusion/sd_xl_base.yaml
@ -0,0 +1,98 @@
+model:
+  target: sgm.models.diffusion.DiffusionEngine
+  params:
+    scale_factor: 0.13025
+    disable_first_stage_autocast: True
+
+    denoiser_config:
+      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
+      params:
+        num_idx: 1000
+
+        weighting_config:
+          target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
+        scaling_config:
+          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
+        discretization_config:
+          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
+
+    network_config:
+      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        adm_in_channels: 2816
+        num_classes: sequential
+        use_checkpoint: True
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [4, 2]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4]
+        num_head_channels: 64
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: [1, 2, 10]  # note: the first is unused (due to attn_res starting at 2) 32, 16, 8 --> 64, 32, 16
+        context_dim: 2048
+        spatial_transformer_attn_type: softmax-xformers
+        legacy: False
+
+    conditioner_config:
+      target: sgm.modules.GeneralConditioner
+      params:
+        emb_models:
+          # crossattn cond
+          - is_trainable: False
+            input_key: txt
+            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
+            params:
+              layer: hidden
+              layer_idx: 11
+          # crossattn and vector cond
+          - is_trainable: False
+            input_key: txt
+            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
+            params:
+              arch: ViT-bigG-14
+              version: laion2b_s39b_b160k
+              freeze: True
+              layer: penultimate
+              always_return_pooled: True
+              legacy: False
+          # vector cond
+          - is_trainable: False
+            input_key: original_size_as_tuple
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256  # multiplied by two
+          # vector cond
+          - is_trainable: False
+            input_key: crop_coords_top_left
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256  # multiplied by two
+          # vector cond
+          - is_trainable: False
+            input_key: target_size_as_tuple
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256  # multiplied by two
+
+    first_stage_config:
+      target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          attn_type: vanilla-xformers
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 4, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
--- a/invokeai/configs/stable-diffusion/sd_xl_refiner.yaml
+++ b/invokeai/configs/stable-diffusion/sd_xl_refiner.yaml
@ -0,0 +1,91 @@
+model:
+  target: sgm.models.diffusion.DiffusionEngine
+  params:
+    scale_factor: 0.13025
+    disable_first_stage_autocast: True
+
+    denoiser_config:
+      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
+      params:
+        num_idx: 1000
+
+        weighting_config:
+          target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
+        scaling_config:
+          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
+        discretization_config:
+          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
+
+    network_config:
+      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        adm_in_channels: 2560
+        num_classes: sequential
+        use_checkpoint: True
+        in_channels: 4
+        out_channels: 4
+        model_channels: 384
+        attention_resolutions: [4, 2]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4, 4]
+        num_head_channels: 64
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 4
+        context_dim: [1280, 1280, 1280, 1280]  # 1280
+        spatial_transformer_attn_type: softmax-xformers
+        legacy: False
+
+    conditioner_config:
+      target: sgm.modules.GeneralConditioner
+      params:
+        emb_models:
+          # crossattn and vector cond
+          - is_trainable: False
+            input_key: txt
+            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
+            params:
+              arch: ViT-bigG-14
+              version: laion2b_s39b_b160k
+              legacy: False
+              freeze: True
+              layer: penultimate
+              always_return_pooled: True
+          # vector cond
+          - is_trainable: False
+            input_key: original_size_as_tuple
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256  # multiplied by two
+          # vector cond
+          - is_trainable: False
+            input_key: crop_coords_top_left
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256  # multiplied by two
+          # vector cond
+          - is_trainable: False
+            input_key: aesthetic_score
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256  # multiplied by one
+
+    first_stage_config:
+      target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          attn_type: vanilla-xformers
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 4, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity