From a3c22b5fe6864a2247e42e8f52770ad65e35ce82 Mon Sep 17 00:00:00 2001
From: Sergey Borisov
Date: Sun, 25 Jun 2023 21:06:22 +0300
Subject: [PATCH] Remove upcast_attention and prediction_type from stable
 diffusion model logic, fix ckpt conversion accordingly

---
 .../convert_ckpt_to_diffusers.py | 47 ++++++++--------
 .../models/stable_diffusion.py   | 56 +++++--------------
 2 files changed, 35 insertions(+), 68 deletions(-)

diff --git a/invokeai/backend/model_management/convert_ckpt_to_diffusers.py b/invokeai/backend/model_management/convert_ckpt_to_diffusers.py
index 5d097f5a4e..1eeee92fb7 100644
--- a/invokeai/backend/model_management/convert_ckpt_to_diffusers.py
+++ b/invokeai/backend/model_management/convert_ckpt_to_diffusers.py
@@ -30,7 +30,7 @@ from invokeai.app.services.config import InvokeAIAppConfig
 from .model_manager import ModelManager
 from .model_cache import ModelCache
-from .models import SchedulerPredictionType, BaseModelType, ModelVariantType
+from .models import BaseModelType, ModelVariantType
 
 try:
     from omegaconf import OmegaConf
@@ -73,7 +73,9 @@ from transformers import (
 
 from ..stable_diffusion import StableDiffusionGeneratorPipeline
 
-MODEL_ROOT = None
+# TODO: redo in future
+#CONVERT_MODEL_ROOT = InvokeAIAppConfig.get_config().models_path / "core" / "convert"
+CONVERT_MODEL_ROOT = InvokeAIAppConfig.get_config().root_path / "models" / "core" / "convert"
 
 def shave_segments(path, n_shave_prefix_segments=1):
     """
@@ -605,7 +607,7 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
     else:
         vae_state_dict = checkpoint
 
-    new_checkpoint = convert_ldm_vae_state_dict(vae_state_dict,config)
+    new_checkpoint = convert_ldm_vae_state_dict(vae_state_dict, config)
     return new_checkpoint
 
 def convert_ldm_vae_state_dict(vae_state_dict, config):
@@ -828,7 +830,7 @@ def convert_ldm_bert_checkpoint(checkpoint, config):
 
 
 def convert_ldm_clip_checkpoint(checkpoint):
-    text_model = CLIPTextModel.from_pretrained(MODEL_ROOT / 'clip-vit-large-patch14')
+    text_model = CLIPTextModel.from_pretrained(CONVERT_MODEL_ROOT / 'clip-vit-large-patch14')
     keys = list(checkpoint.keys())
 
     text_model_dict = {}
@@ -882,7 +884,7 @@ textenc_pattern = re.compile("|".join(protected.keys()))
 
 def convert_open_clip_checkpoint(checkpoint):
     text_model = CLIPTextModel.from_pretrained(
-        MODEL_ROOT / 'stable-diffusion-2-clip',
+        CONVERT_MODEL_ROOT / 'stable-diffusion-2-clip',
         subfolder='text_encoder',
     )
@@ -949,7 +951,7 @@ def convert_open_clip_checkpoint(checkpoint):
 
     return text_model
 
-def replace_checkpoint_vae(checkpoint, vae_path:str):
+def replace_checkpoint_vae(checkpoint, vae_path: str):
     if vae_path.endswith(".safetensors"):
         vae_ckpt = load_file(vae_path)
     else:
@@ -959,7 +961,7 @@ def replace_checkpoint_vae(checkpoint, vae_path:str):
         new_key = f'first_stage_model.{vae_key}'
         checkpoint[new_key] = state_dict[vae_key]
 
-def convert_ldm_vae_to_diffusers(checkpoint, vae_config: DictConfig, image_size: int)->AutoencoderKL:
+def convert_ldm_vae_to_diffusers(checkpoint, vae_config: DictConfig, image_size: int) -> AutoencoderKL:
     vae_config = create_vae_diffusers_config(
         vae_config, image_size=image_size
     )
@@ -979,8 +981,6 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
     original_config_file: str,
     extract_ema: bool = True,
     precision: torch.dtype = torch.float32,
-    upcast_attention: bool = False,
-    prediction_type: SchedulerPredictionType = SchedulerPredictionType.Epsilon,
     scan_needed: bool = True,
 ) -> StableDiffusionPipeline:
     """
@@ -994,8 +994,6 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
     :param checkpoint_path: Path to `.ckpt` file.
     :param original_config_file: Path to `.yaml` config file corresponding to the original architecture. If `None`,
         will be automatically inferred by looking for a key that only exists in SD2.0 models.
-    :param prediction_type: The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion
-        v1.X and Stable Diffusion v2 Base. Use `'v-prediction'` for Stable Diffusion v2.
     :param scheduler_type: Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm", "ddim"]`.
     :param model_type: The pipeline type. `None` to automatically infer, or one of `["FrozenOpenCLIPEmbedder", "FrozenCLIPEmbedder"]`.
     :param extract_ema: Only relevant for
@@ -1003,17 +1001,16 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
         or not. Defaults to `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher
         quality images for inference. Non-EMA weights are usually better to continue fine-tuning.
     :param precision: precision to use - torch.float16, torch.float32 or torch.autocast
-    :param upcast_attention: Whether the attention computation should always be upcasted. This is necessary when
-        running stable diffusion 2.1.
     """
-    config = InvokeAIAppConfig.get_config()
+    if not isinstance(checkpoint_path, Path):
+        checkpoint_path = Path(checkpoint_path)
 
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         verbosity = dlogging.get_verbosity()
         dlogging.set_verbosity_error()
 
-        if str(checkpoint_path).endswith(".safetensors"):
+        if checkpoint_path.suffix == ".safetensors":
             checkpoint = load_file(checkpoint_path)
         else:
             if scan_needed:
@@ -1026,9 +1023,13 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
 
         original_config = OmegaConf.load(original_config_file)
 
-        if model_version == BaseModelType.StableDiffusion2 and prediction_type == SchedulerPredictionType.VPrediction:
+        if model_version == BaseModelType.StableDiffusion2 and original_config["model"]["params"]["parameterization"] == "v":
+            prediction_type = "v_prediction"
+            upcast_attention = True
             image_size = 768
         else:
+            prediction_type = "epsilon"
+            upcast_attention = False
             image_size = 512
 
         #
@@ -1083,7 +1084,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
         if model_type == "FrozenOpenCLIPEmbedder":
             text_model = convert_open_clip_checkpoint(checkpoint)
             tokenizer = CLIPTokenizer.from_pretrained(
-                MODEL_ROOT / 'stable-diffusion-2-clip',
+                CONVERT_MODEL_ROOT / 'stable-diffusion-2-clip',
                 subfolder='tokenizer',
             )
             pipe = StableDiffusionPipeline(
@@ -1099,9 +1100,9 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
 
         elif model_type in ["FrozenCLIPEmbedder", "WeightedFrozenCLIPEmbedder"]:
             text_model = convert_ldm_clip_checkpoint(checkpoint)
-            tokenizer = CLIPTokenizer.from_pretrained(MODEL_ROOT / 'clip-vit-large-patch14')
-            safety_checker = StableDiffusionSafetyChecker.from_pretrained(MODEL_ROOT / 'stable-diffusion-safety-checker')
-            feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ROOT / 'stable-diffusion-safety-checker')
+            tokenizer = CLIPTokenizer.from_pretrained(CONVERT_MODEL_ROOT / 'clip-vit-large-patch14')
+            safety_checker = StableDiffusionSafetyChecker.from_pretrained(CONVERT_MODEL_ROOT / 'stable-diffusion-safety-checker')
+            feature_extractor = AutoFeatureExtractor.from_pretrained(CONVERT_MODEL_ROOT / 'stable-diffusion-safety-checker')
             pipe = StableDiffusionPipeline(
                 vae=vae.to(precision),
                 text_encoder=text_model.to(precision),
@@ -1115,7 +1116,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
         else:
             text_config = create_ldm_bert_config(original_config)
             text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
-            tokenizer = BertTokenizerFast.from_pretrained(MODEL_ROOT / "bert-base-uncased")
+            tokenizer = BertTokenizerFast.from_pretrained(CONVERT_MODEL_ROOT / "bert-base-uncased")
             pipe = LDMTextToImagePipeline(
                 vqvae=vae,
                 bert=text_model,
@@ -1131,7 +1132,6 @@ def load_pipeline_from_original_stable_diffusion_ckpt(
 def convert_ckpt_to_diffusers(
     checkpoint_path: Union[str, Path],
     dump_path: Union[str, Path],
-    model_root: Union[str, Path],
     **kwargs,
 ):
     """
@@ -1139,9 +1139,6 @@ def convert_ckpt_to_diffusers(
     and in addition a path-like object indicating the location of the desired diffusers
     model to be written.
     """
-    # setting global here to avoid massive changes late at night
-    global MODEL_ROOT
-    MODEL_ROOT = Path(model_root) / 'core/convert'
     pipe = load_pipeline_from_original_stable_diffusion_ckpt(checkpoint_path, **kwargs)
 
     pipe.save_pretrained(
diff --git a/invokeai/backend/model_management/models/stable_diffusion.py b/invokeai/backend/model_management/models/stable_diffusion.py
index a269ae12e2..f5112dfebb 100644
--- a/invokeai/backend/model_management/models/stable_diffusion.py
+++ b/invokeai/backend/model_management/models/stable_diffusion.py
@@ -34,7 +34,7 @@ class StableDiffusion1Model(DiffusersModel):
     class CheckpointConfig(ModelConfigBase):
         model_format: Literal[StableDiffusion1ModelFormat.Checkpoint]
         vae: Optional[str] = Field(None)
-        config: Optional[str] = Field(None)
+        config: str
         variant: ModelVariantType
@@ -81,6 +81,8 @@ class StableDiffusion1Model(DiffusersModel):
         else:
             raise Exception("Unknown stable diffusion 1.* model format")
 
+        if ckpt_config_path is None:
+            ckpt_config_path = _select_ckpt_config(BaseModelType.StableDiffusion1, variant)
 
         return cls.create_config(
             path=path,
@@ -116,7 +118,7 @@ class StableDiffusion1Model(DiffusersModel):
                 version=BaseModelType.StableDiffusion1,
                 model_config=config,
                 output_path=output_path,
-            ) # TODO: args
+            )
         else:
             return model_path
@@ -183,13 +185,8 @@ class StableDiffusion2Model(DiffusersModel):
         else:
             raise Exception("Unknown stable diffusion 2.* model format")
 
-        if variant == ModelVariantType.Normal:
-            prediction_type = SchedulerPredictionType.VPrediction
-            upcast_attention = True
-
-        else:
-            prediction_type = SchedulerPredictionType.Epsilon
-            upcast_attention = False
+        if ckpt_config_path is None:
+            ckpt_config_path = _select_ckpt_config(BaseModelType.StableDiffusion2, variant)
 
         return cls.create_config(
             path=path,
             config=ckpt_config_path,
             variant=variant,
-            prediction_type=prediction_type,
-            upcast_attention=upcast_attention,
         )
 
     @classproperty
@@ -227,7 +222,7 @@ class StableDiffusion2Model(DiffusersModel):
                 version=BaseModelType.StableDiffusion2,
                 model_config=config,
                 output_path=output_path,
-            ) # TODO: args
+            )
         else:
             return model_path
@@ -238,18 +233,18 @@ def _select_ckpt_config(version: BaseModelType, variant: ModelVariantType):
             ModelVariantType.Inpaint: "v1-inpainting-inference.yaml",
         },
         BaseModelType.StableDiffusion2: {
-            # code further will manually set upcast_attention and v_prediction
-            ModelVariantType.Normal: "v2-inference.yaml",
+            ModelVariantType.Normal: "v2-inference-v.yaml", # best guess, as we can't differentiate with base(512)
             ModelVariantType.Inpaint: "v2-inpainting-inference.yaml",
             ModelVariantType.Depth: "v2-midas-inference.yaml",
         }
     }
 
+    app_config = InvokeAIAppConfig.get_config()
     try:
-        # TODO: path
-        #model_config.config = app_config.config_dir / "stable-diffusion" / ckpt_configs[version][model_config.variant]
-        #return InvokeAIAppConfig.get_config().legacy_conf_dir / ckpt_configs[version][variant]
-        return InvokeAIAppConfig.get_config().root_dir / "configs" / "stable-diffusion" / ckpt_configs[version][variant]
+        config_path = app_config.legacy_conf_path / ckpt_configs[version][variant]
+        if config_path.is_relative_to(app_config.root_path):
+            config_path = config_path.relative_to(app_config.root_path)
+        return str(config_path)
     except:
         return None
@@ -268,36 +263,14 @@ def _convert_ckpt_and_cache(
     """
     app_config = InvokeAIAppConfig.get_config()
 
-    if model_config.config is None:
-        model_config.config = _select_ckpt_config(version, model_config.variant)
-        if model_config.config is None:
-            raise Exception(f"Model variant {model_config.variant} not supported for {version}")
-
     weights = app_config.root_dir / model_config.path
     config_file = app_config.root_dir / model_config.config
     output_path = Path(output_path)
 
-    if version == BaseModelType.StableDiffusion1:
-        upcast_attention = False
-        prediction_type = SchedulerPredictionType.Epsilon
-
-    elif version == BaseModelType.StableDiffusion2:
-        upcast_attention = model_config.upcast_attention
-        prediction_type = model_config.prediction_type
-
-    else:
-        raise Exception(f"Unknown model provided: {version}")
-
     # return cached version if it exists
     if output_path.exists():
        return output_path
 
-    # TODO: I think that it more correctly to convert with embedded vae
-    # as if user will delete custom vae he will got not embedded but also custom vae
-    #vae_ckpt_path, vae_model = self._get_vae_for_conversion(weights, mconfig)
-
     # to avoid circular import errors
     from ..convert_ckpt_to_diffusers import convert_ckpt_to_diffusers
 
     with SilenceWarnings():
@@ -308,9 +281,6 @@ def _convert_ckpt_and_cache(
             model_variant=model_config.variant,
             original_config_file=config_file,
             extract_ema=True,
-            upcast_attention=upcast_attention,
-            prediction_type=prediction_type,
             scan_needed=True,
-            model_root=app_config.models_path,
         )
     return output_path
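
Notes (reviewer sketches, not part of the patch):

The behavioral core of this change is that prediction_type and upcast_attention
are no longer stored on the SD2 model config; they are derived at conversion
time from the checkpoint's legacy YAML. A minimal sketch of that detection,
assuming a config shaped like the upstream v2-inference-v.yaml (the helper name
and example path below are illustrative, not part of the patch):

    from omegaconf import OmegaConf

    def infer_sd2_params(original_config_file: str) -> dict:
        # SD2-v (768px) configs declare `parameterization: "v"` under
        # model.params; the SD2-base (512px) config omits it (epsilon).
        config = OmegaConf.load(original_config_file)
        if config["model"]["params"].get("parameterization") == "v":
            # v-prediction checkpoints also get attention upcast, matching
            # the patch's assumption for SD 2.1-style models
            return {"prediction_type": "v_prediction", "upcast_attention": True, "image_size": 768}
        return {"prediction_type": "epsilon", "upcast_attention": False, "image_size": 512}

    # infer_sd2_params("configs/stable-diffusion/v2-inference-v.yaml")
    # -> {'prediction_type': 'v_prediction', 'upcast_attention': True, 'image_size': 768}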
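
The rewritten _select_ckpt_config stores the legacy config path relative to the
InvokeAI root whenever possible, and _convert_ckpt_and_cache re-anchors it with
app_config.root_dir, so stored configs survive a relocated root. A sketch of
that round-trip (the root and config locations are assumptions for
illustration; Path.is_relative_to requires Python 3.9+):

    from pathlib import Path

    root = Path("/opt/invokeai")                         # assumed root_path
    legacy_conf = root / "configs" / "stable-diffusion"  # assumed legacy_conf_path

    config_path = legacy_conf / "v2-inference-v.yaml"
    if config_path.is_relative_to(root):
        config_path = config_path.relative_to(root)

    print(config_path)         # configs/stable-diffusion/v2-inference-v.yaml
    print(root / config_path)  # absolute again, as _convert_ckpt_and_cache rebuilds it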
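
With model_root, upcast_attention and prediction_type dropped from the
conversion entry points, a caller now supplies only the checkpoint, an output
directory and the legacy YAML. A hypothetical invocation (the paths are made
up, and the keyword names are assumed from the function body and the call site
in _convert_ckpt_and_cache above):

    import torch

    from invokeai.backend.model_management.convert_ckpt_to_diffusers import (
        convert_ckpt_to_diffusers,
    )
    from invokeai.backend.model_management.models import BaseModelType, ModelVariantType

    convert_ckpt_to_diffusers(
        "models/checkpoints/v2-1_768-ema-pruned.safetensors",  # hypothetical input
        "models/converted/sd-2-1",                             # hypothetical output dir
        model_version=BaseModelType.StableDiffusion2,
        model_variant=ModelVariantType.Normal,
        original_config_file="configs/stable-diffusion/v2-inference-v.yaml",
        extract_ema=True,
        precision=torch.float16,
        scan_needed=True,
    )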