preload_models interactively downloads sd model files

2024-08-30 20:32:17 +00:00 · 2022-10-30 12:19:05 -04:00 · 2022-10-30 12:19:05 -04:00 · e7368d7231
commit e7368d7231
parent 07c3c57cde 5319796e58
43 changed files with 730 additions and 2174 deletions
--- a/.github/workflows/test-invoke-conda.yml
+++ b/.github/workflows/test-invoke-conda.yml
@ -84,7 +84,9 @@ jobs:

      - name: run preload_models.py
        id: run-preload-models
-        run: python scripts/preload_models.py
+        run: |
+          python scripts/preload_models.py \
+            --no-interactive

      - name: Run the tests
        id: run-tests
--- a/.gitignore
+++ b/.gitignore
@ -199,7 +199,13 @@ checkpoints
 .scratch/
 .vscode/
 gfpgan/
-models/ldm/stable-diffusion-v1/model.sha256
+models/ldm/stable-diffusion-v1/*.sha256

 # GFPGAN model files
 gfpgan/
+
+# config file (will be created by installer)
+configs/models.yaml
+
+# weights (will be created by installer)
+models/ldm/stable-diffusion-v1/*.ckpt
--- a/configs/autoencoder/autoencoder_kl_16x16x16.yaml
+++ b/configs/autoencoder/autoencoder_kl_16x16x16.yaml
@ -1,54 +0,0 @@
-model:
-  base_learning_rate: 4.5e-6
-  target: ldm.models.autoencoder.AutoencoderKL
-  params:
-    monitor: "val/rec_loss"
-    embed_dim: 16
-    lossconfig:
-      target: ldm.modules.losses.LPIPSWithDiscriminator
-      params:
-        disc_start: 50001
-        kl_weight: 0.000001
-        disc_weight: 0.5
-
-    ddconfig:
-      double_z: True
-      z_channels: 16
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult: [ 1,1,2,2,4]  # num_down = len(ch_mult)-1
-      num_res_blocks: 2
-      attn_resolutions: [16]
-      dropout: 0.0
-
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 12
-    wrap: True
-    train:
-      target: ldm.data.imagenet.ImageNetSRTrain
-      params:
-        size: 256
-        degradation: pil_nearest
-    validation:
-      target: ldm.data.imagenet.ImageNetSRValidation
-      params:
-        size: 256
-        degradation: pil_nearest
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 1000
-        max_images: 8
-        increase_log_steps: True
-
-  trainer:
-    benchmark: True
-    accumulate_grad_batches: 2
--- a/configs/autoencoder/autoencoder_kl_32x32x4.yaml
+++ b/configs/autoencoder/autoencoder_kl_32x32x4.yaml
@ -1,53 +0,0 @@
-model:
-  base_learning_rate: 4.5e-6
-  target: ldm.models.autoencoder.AutoencoderKL
-  params:
-    monitor: "val/rec_loss"
-    embed_dim: 4
-    lossconfig:
-      target: ldm.modules.losses.LPIPSWithDiscriminator
-      params:
-        disc_start: 50001
-        kl_weight: 0.000001
-        disc_weight: 0.5
-
-    ddconfig:
-      double_z: True
-      z_channels: 4
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult: [ 1,2,4,4 ]  # num_down = len(ch_mult)-1
-      num_res_blocks: 2
-      attn_resolutions: [ ]
-      dropout: 0.0
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 12
-    wrap: True
-    train:
-      target: ldm.data.imagenet.ImageNetSRTrain
-      params:
-        size: 256
-        degradation: pil_nearest
-    validation:
-      target: ldm.data.imagenet.ImageNetSRValidation
-      params:
-        size: 256
-        degradation: pil_nearest
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 1000
-        max_images: 8
-        increase_log_steps: True
-
-  trainer:
-    benchmark: True
-    accumulate_grad_batches: 2
--- a/configs/autoencoder/autoencoder_kl_64x64x3.yaml
+++ b/configs/autoencoder/autoencoder_kl_64x64x3.yaml
@ -1,54 +0,0 @@
-model:
-  base_learning_rate: 4.5e-6
-  target: ldm.models.autoencoder.AutoencoderKL
-  params:
-    monitor: "val/rec_loss"
-    embed_dim: 3
-    lossconfig:
-      target: ldm.modules.losses.LPIPSWithDiscriminator
-      params:
-        disc_start: 50001
-        kl_weight: 0.000001
-        disc_weight: 0.5
-
-    ddconfig:
-      double_z: True
-      z_channels: 3
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult: [ 1,2,4 ]  # num_down = len(ch_mult)-1
-      num_res_blocks: 2
-      attn_resolutions: [ ]
-      dropout: 0.0
-
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 12
-    wrap: True
-    train:
-      target: ldm.data.imagenet.ImageNetSRTrain
-      params:
-        size: 256
-        degradation: pil_nearest
-    validation:
-      target: ldm.data.imagenet.ImageNetSRValidation
-      params:
-        size: 256
-        degradation: pil_nearest
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 1000
-        max_images: 8
-        increase_log_steps: True
-
-  trainer:
-    benchmark: True
-    accumulate_grad_batches: 2
--- a/configs/autoencoder/autoencoder_kl_8x8x64.yaml
+++ b/configs/autoencoder/autoencoder_kl_8x8x64.yaml
@ -1,53 +0,0 @@
-model:
-  base_learning_rate: 4.5e-6
-  target: ldm.models.autoencoder.AutoencoderKL
-  params:
-    monitor: "val/rec_loss"
-    embed_dim: 64
-    lossconfig:
-      target: ldm.modules.losses.LPIPSWithDiscriminator
-      params:
-        disc_start: 50001
-        kl_weight: 0.000001
-        disc_weight: 0.5
-
-    ddconfig:
-      double_z: True
-      z_channels: 64
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult: [ 1,1,2,2,4,4]  # num_down = len(ch_mult)-1
-      num_res_blocks: 2
-      attn_resolutions: [16,8]
-      dropout: 0.0
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 12
-    wrap: True
-    train:
-      target: ldm.data.imagenet.ImageNetSRTrain
-      params:
-        size: 256
-        degradation: pil_nearest
-    validation:
-      target: ldm.data.imagenet.ImageNetSRValidation
-      params:
-        size: 256
-        degradation: pil_nearest
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 1000
-        max_images: 8
-        increase_log_steps: True
-
-  trainer:
-    benchmark: True
-    accumulate_grad_batches: 2
--- a/configs/latent-diffusion/celebahq-ldm-vq-4.yaml
+++ b/configs/latent-diffusion/celebahq-ldm-vq-4.yaml
@ -1,86 +0,0 @@
-model:
-  base_learning_rate: 2.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    image_size: 64
-    channels: 3
-    monitor: val/loss_simple_ema
-
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 224
-        attention_resolutions:
-        # note: this isn\t actually the resolution but
-        # the downsampling factor, i.e. this corresnponds to
-        # attention on spatial resolution 8,16,32, as the
-        # spatial reolution of the latents is 64 for f4
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        num_head_channels: 32
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        ckpt_path: models/first_stage_models/vq-f4/model.ckpt
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config: __is_unconditional__
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 48
-    num_workers: 5
-    wrap: false
-    train:
-      target: taming.data.faceshq.CelebAHQTrain
-      params:
-        size: 256
-    validation:
-      target: taming.data.faceshq.CelebAHQValidation
-      params:
-        size: 256
-
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 5000
-        max_images: 8
-        increase_log_steps: False
-
-  trainer:
-    benchmark: True
--- a/configs/latent-diffusion/cin-ldm-vq-f8.yaml
+++ b/configs/latent-diffusion/cin-ldm-vq-f8.yaml
@ -1,98 +0,0 @@
-model:
-  base_learning_rate: 1.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: class_label
-    image_size: 32
-    channels: 4
-    cond_stage_trainable: true
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 32
-        in_channels: 4
-        out_channels: 4
-        model_channels: 256
-        attention_resolutions:
-        #note: this isn\t actually the resolution but
-        # the downsampling factor, i.e. this corresnponds to
-        # attention on spatial resolution 8,16,32, as the
-        # spatial reolution of the latents is 32 for f8
-        - 4
-        - 2
-        - 1
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 4
-        num_head_channels: 32
-        use_spatial_transformer: true
-        transformer_depth: 1
-        context_dim: 512
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 4
-        n_embed: 16384
-        ckpt_path: configs/first_stage_models/vq-f8/model.yaml
-        ddconfig:
-          double_z: false
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions:
-          - 32
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.ClassEmbedder
-      params:
-        embed_dim: 512
-        key: class_label
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 64
-    num_workers: 12
-    wrap: false
-    train:
-      target: ldm.data.imagenet.ImageNetTrain
-      params:
-        config:
-          size: 256
-    validation:
-      target: ldm.data.imagenet.ImageNetValidation
-      params:
-        config:
-          size: 256
-
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 5000
-        max_images: 8
-        increase_log_steps: False
-
-  trainer:
-    benchmark: True
--- a/configs/latent-diffusion/cin256-v2.yaml
+++ b/configs/latent-diffusion/cin256-v2.yaml
@ -1,68 +0,0 @@
-model:
-  base_learning_rate: 0.0001
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: class_label
-    image_size: 64
-    channels: 3
-    cond_stage_trainable: true
-    conditioning_key: crossattn
-    monitor: val/loss
-    use_ema: False
-    
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 192
-        attention_resolutions:
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 5
-        num_heads: 1
-        use_spatial_transformer: true
-        transformer_depth: 1
-        context_dim: 512
-    
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.ClassEmbedder
-      params:
-        n_classes: 1001
-        embed_dim: 512
-        key: class_label
--- a/configs/latent-diffusion/ffhq-ldm-vq-4.yaml
+++ b/configs/latent-diffusion/ffhq-ldm-vq-4.yaml
@ -1,85 +0,0 @@
-model:
-  base_learning_rate: 2.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    image_size: 64
-    channels: 3
-    monitor: val/loss_simple_ema
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 224
-        attention_resolutions:
-        # note: this isn\t actually the resolution but
-        # the downsampling factor, i.e. this corresnponds to
-        # attention on spatial resolution 8,16,32, as the
-        # spatial reolution of the latents is 64 for f4
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        num_head_channels: 32
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        ckpt_path: configs/first_stage_models/vq-f4/model.yaml
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config: __is_unconditional__
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 42
-    num_workers: 5
-    wrap: false
-    train:
-      target: taming.data.faceshq.FFHQTrain
-      params:
-        size: 256
-    validation:
-      target: taming.data.faceshq.FFHQValidation
-      params:
-        size: 256
-
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 5000
-        max_images: 8
-        increase_log_steps: False
-
-  trainer:
-    benchmark: True
--- a/configs/latent-diffusion/lsun_bedrooms-ldm-vq-4.yaml
+++ b/configs/latent-diffusion/lsun_bedrooms-ldm-vq-4.yaml
@ -1,85 +0,0 @@
-model:
-  base_learning_rate: 2.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    image_size: 64
-    channels: 3
-    monitor: val/loss_simple_ema
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 224
-        attention_resolutions:
-        # note: this isn\t actually the resolution but
-        # the downsampling factor, i.e. this corresnponds to
-        # attention on spatial resolution 8,16,32, as the
-        # spatial reolution of the latents is 64 for f4
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        num_head_channels: 32
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        ckpt_path: configs/first_stage_models/vq-f4/model.yaml
-        embed_dim: 3
-        n_embed: 8192
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config: __is_unconditional__
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 48
-    num_workers: 5
-    wrap: false
-    train:
-      target: ldm.data.lsun.LSUNBedroomsTrain
-      params:
-        size: 256
-    validation:
-      target: ldm.data.lsun.LSUNBedroomsValidation
-      params:
-        size: 256
-
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 5000
-        max_images: 8
-        increase_log_steps: False
-
-  trainer:
-    benchmark: True
--- a/configs/latent-diffusion/lsun_churches-ldm-kl-8.yaml
+++ b/configs/latent-diffusion/lsun_churches-ldm-kl-8.yaml
@ -1,91 +0,0 @@
-model:
-  base_learning_rate: 5.0e-5   # set to target_lr by starting main.py with '--scale_lr False'
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0155
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    loss_type: l1
-    first_stage_key: "image"
-    cond_stage_key: "image"
-    image_size: 32
-    channels: 4
-    cond_stage_trainable: False
-    concat_mode: False
-    scale_by_std: True
-    monitor: 'val/loss_simple_ema'
-
-    scheduler_config: # 10000 warmup steps
-      target: ldm.lr_scheduler.LambdaLinearScheduler
-      params:
-        warm_up_steps: [10000]
-        cycle_lengths: [10000000000000]
-        f_start: [1.e-6]
-        f_max: [1.]
-        f_min: [ 1.]
-
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 32
-        in_channels: 4
-        out_channels: 4
-        model_channels: 192
-        attention_resolutions: [ 1, 2, 4, 8 ]   # 32, 16, 8, 4
-        num_res_blocks: 2
-        channel_mult: [ 1,2,2,4,4 ]  # 32, 16, 8, 4, 2
-        num_heads: 8
-        use_scale_shift_norm: True
-        resblock_updown: True
-
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: "val/rec_loss"
-        ckpt_path: "models/first_stage_models/kl-f8/model.ckpt"
-        ddconfig:
-          double_z: True
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult: [ 1,2,4,4 ]  # num_down = len(ch_mult)-1
-          num_res_blocks: 2
-          attn_resolutions: [ ]
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config: "__is_unconditional__"
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 96
-    num_workers: 5
-    wrap: False
-    train:
-      target: ldm.data.lsun.LSUNChurchesTrain
-      params:
-        size: 256
-    validation:
-      target: ldm.data.lsun.LSUNChurchesValidation
-      params:
-        size: 256
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 5000
-        max_images: 8
-        increase_log_steps: False
-
-
-  trainer:
-    benchmark: True
--- a/configs/latent-diffusion/txt2img-1p4B-eval.yaml
+++ b/configs/latent-diffusion/txt2img-1p4B-eval.yaml
@ -1,71 +0,0 @@
-model:
-  base_learning_rate: 5.0e-05
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.00085
-    linear_end: 0.012
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: caption
-    image_size: 32
-    channels: 4
-    cond_stage_trainable: true
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False
-
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 32
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions:
-        - 4
-        - 2
-        - 1
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 4
-        - 4
-        num_heads: 8
-        use_spatial_transformer: true
-        transformer_depth: 1
-        context_dim: 1280
-        use_checkpoint: true
-        legacy: False
-
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.BERTEmbedder
-      params:
-        n_embed: 1280
-        n_layer: 32
--- a/configs/models.yaml
+++ b/configs/models.yaml
@ -1,29 +1,36 @@
 # This file describes the alternative machine learning models
-# available to the dream script.
+# available to the InvokeAI script.
 #
 # To add a new model, follow the examples below. Each
 # model requires a model config file, a weights file,
 # and the width and height of the images it
 # was trained on.
 stable-diffusion-1.4:
-  config: configs/stable-diffusion/v1-inference.yaml
-  weights: models/ldm/stable-diffusion-v1/model.ckpt
-#  vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
-  description: Stable Diffusion inference model version 1.4
-  width: 512
-  height: 512
-  default: true
-inpainting-1.5:
-  description: runwayML tuned inpainting model v1.5
-  weights: models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt
-  config: configs/stable-diffusion/v1-inpainting-inference.yaml
-#  vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+  config: ./configs/stable-diffusion/v1-inference.yaml
+  weights: ./models/ldm/stable-diffusion-v1/sd-v1-4.ckpt
+  vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+  description: The original Stable Diffusion version 1.4 weight file (4.27 GB)
  width: 512
  height: 512
 stable-diffusion-1.5:
-  config: configs/stable-diffusion/v1-inference.yaml
-  weights: models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt
-#  vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
-  description: Stable Diffusion inference model version 1.5
+  description: The newest Stable Diffusion version 1.5 weight file (4.27 GB)
+  weights: ./models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt
+  config: ./configs/stable-diffusion/v1-inference.yaml
  width: 512
  height: 512
+  vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+  default: true
+inpainting-1.5:
+  description: RunwayML SD 1.5 model optimized for inpainting (4.27 GB)
+  weights: ./models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt
+  config: ./configs/stable-diffusion/v1-inpainting-inference.yaml
+  width: 512
+  height: 512
+  vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+waifu-diffusion-1.3:
+  description: Stable Diffusion 1.4 fine tuned on anime-styled images (4.27 GB)
+  weights: ./models/ldm/stable-diffusion-v1/model-epoch09-float32.ckpt
+  config: ./configs/stable-diffusion/v1-inference.yaml
+  width: 512
+  height: 512
+  vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
--- a/configs/retrieval-augmented-diffusion/768x768.yaml
+++ b/configs/retrieval-augmented-diffusion/768x768.yaml
@ -1,68 +0,0 @@
-model:
-  base_learning_rate: 0.0001
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.015
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: jpg
-    cond_stage_key: nix
-    image_size: 48
-    channels: 16
-    cond_stage_trainable: false
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_by_std: false
-    scale_factor: 0.22765929
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 48
-        in_channels: 16
-        out_channels: 16
-        model_channels: 448
-        attention_resolutions:
-        - 4
-        - 2
-        - 1
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        use_scale_shift_norm: false
-        resblock_updown: false
-        num_head_channels: 32
-        use_spatial_transformer: true
-        transformer_depth: 1
-        context_dim: 768
-        use_checkpoint: true
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        monitor: val/rec_loss
-        embed_dim: 16
-        ddconfig:
-          double_z: true
-          z_channels: 16
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 1
-          - 2
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions:
-          - 16
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: torch.nn.Identity
--- a/docs/features/CLI.md
+++ b/docs/features/CLI.md
@ -385,7 +385,7 @@ automatically.
 Example:

 <pre>
-invoke> <b>!import_model models/ldm/stable-diffusion-v1/	model-epoch08-float16.ckpt</b>
+invoke> <b>!import_model models/ldm/stable-diffusion-v1/model-epoch08-float16.ckpt</b>
 >> Model import in process. Please enter the values needed to configure this model:

 Name for this model: <b>waifu-diffusion</b>
--- a/docs/installation/INSTALLING_MODELS.md
+++ b/docs/installation/INSTALLING_MODELS.md
@ -0,0 +1,267 @@
+---
+title: Installing Models
+---
+
+# :octicons-paintbrush-16: Installing Models
+
+## Model Weight Files
+
+The model weight files ('*.ckpt') are the Stable Diffusion "secret
+sauce". They are the product of training the AI on millions of
+captioned images gathered from multiple sources.
+
+Originally there was only a single Stable Diffusion weights file,
+which many people named `model.ckpt`. Now there are dozens or more
+that have been "fine tuned" to provide particular styles, genres, or
+other features. InvokeAI allows you to install and run multiple model
+weight files and switch between them quickly in the command-line and
+web interfaces.
+
+This manual will guide you through installing and configuring model
+weight files.
+
+## Base Models
+
+InvokeAI comes with support for a good initial set of models listed in
+the model configuration file `configs/models.yaml`. They are:
+
+| Model                   | Weight File                   |   Description                    | DOWNLOAD FROM            |
+| ----------------------  | ----------------------------- |--------------------------------- | ----------------|
+| stable-diffusion-1.5    | v1-5-pruned-emaonly.ckpt      | Most recent version of base Stable Diffusion model| https://huggingface.co/runwayml/stable-diffusion-v1-5 |
+| stable-diffusion-1.4    | sd-v1-4.ckpt                  | Previous version of base Stable Diffusion model | https://huggingface.co/CompVis/stable-diffusion-v-1-4-original |
+| inpainting-1.5          | sd-v1-5-inpainting.ckpt       | Stable Diffusion 1.5 model specialized for inpainting | https://huggingface.co/runwayml/stable-diffusion-inpainting |
+| waifu-diffusion-1.3     | model-epoch09-float32.ckpt    | Stable Diffusion 1.4 trained to produce anime images | https://huggingface.co/hakurei/waifu-diffusion-v1-3 |
+| &lt;all models&gt;      | vae-ft-mse-840000-ema-pruned.ckpt   | A fine-tuned VAE add-on file that improves face generation | https://huggingface.co/stabilityai/sd-vae-ft-mse-original/ |
+
+
+Note that these files are covered by an "Ethical AI" license which
+forbids certain uses. You will need to create an account on the
+Hugging Face website and accept the license terms before you can
+access the files.
+
+The predefined configuration file for InvokeAI (located at
+`configs/models.yaml`) provides entries for each of these weights
+files. `stable-diffusion-1.5` is the default model used, and we
+strongly recommend that you install this weights file if nothing else.
+
+## Community-Contributed Models
+
+There are too many to list here and more are being contributed every
+day. Hugging Face maintains a [fast-growing
+repository](https://huggingface.co/sd-concepts-library) of fine-tune
+(".bin") models that can be imported into InvokeAI by passing the
+`--embedding_path` option to the `invoke.py` command.
+
+[This page](https://rentry.org/sdmodels) hosts a large list of
+official and unofficial Stable Diffusion models and where they can be
+obtained.
+
+## Installation
+
+There are three ways to install weights files:
+
+1. During InvokeAI installation, the `preload_models.py` script can
+download them for you.
+
+2. You can use the command-line interface (CLI) to import, configure
+and modify new model files.
+
+3. You can download the files manually and add the appropriate entries
+to `models.yaml`.
+
+### Installation via `preload_models.py`
+
+This is the most automatic way. Run `scripts/preload_models.py` from
+the console.  It will ask you to select which models to download and
+lead you through the steps of setting up a Hugging Face account if you
+haven't done so already.
+
+To start, from within the InvokeAI directory run the command `python
+scripts/preload_models.py` (Linux/MacOS) or `python
+scripts\preload_models.py` (Windows):
+
+```
+Loading Python libraries...
+
+** INTRODUCTION **
+Welcome to InvokeAI. This script will help download the Stable Diffusion weight files
+and other large models that are needed for text to image generation. At any point you may interrupt
+this program and resume later.
+
+** WEIGHT SELECTION **
+Would you like to download the Stable Diffusion model weights now? [y] 
+
+Choose the weight file(s) you wish to download. Before downloading you 
+will be given the option to view and change your selections.
+
+[1] stable-diffusion-1.5:
+    The newest Stable Diffusion version 1.5 weight file (4.27 GB) (recommended)
+    Download? [y] 
+[2] inpainting-1.5:
+    RunwayML SD 1.5 model optimized for inpainting (4.27 GB) (recommended)
+    Download? [y] 
+[3] stable-diffusion-1.4:
+    The original Stable Diffusion version 1.4 weight file (4.27 GB) 
+    Download? [n] n
+[4] waifu-diffusion-1.3:
+    Stable Diffusion 1.4 fine tuned on anime-styled images (4.27) 
+    Download? [n] y
+[5] ft-mse-improved-autoencoder-840000:
+    StabilityAI improved autoencoder fine-tuned for human faces (recommended; 335 MB) (recommended)
+    Download? [y] y
+The following weight files will be downloaded:
+   [1] stable-diffusion-1.5*
+   [2] inpainting-1.5
+   [4] waifu-diffusion-1.3
+   [5] ft-mse-improved-autoencoder-840000
+*default
+Ok to download? [y] 
+** LICENSE AGREEMENT FOR WEIGHT FILES **
+
+1. To download the Stable Diffusion weight files you need to read and accept the
+   CreativeML Responsible AI license. If you have not already done so, please 
+   create an account using the "Sign Up" button:
+
+   https://huggingface.co 
+
+   You will need to verify your email address as part of the HuggingFace
+   registration process.
+
+2. After creating the account, login under your account and accept
+   the license terms located here:
+
+   https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
+
+Press <enter> when you are ready to continue:
+...
+```
+
+When the script is complete, you will find the downloaded weights
+files in `models/ldm/stable-diffusion-v1` and a matching configuration
+file in `configs/models.yaml`.
+
+You can run the script again to add any models you didn't select the
+first time. Note that as a safety measure the script will _never_
+remove a previously-installed weights file. You will have to do this
+manually.
+
+### Installation via the CLI
+
+You can install a new model, including any of the community-supported
+ones, via the command-line client's `!import_model` command.
+
+1. First download the desired model weights file and place it under `models/ldm/stable-diffusion-v1/`.
+   You may rename the weights file to something more memorable if you wish. Record the path of the
+   weights file (e.g. `models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt`)
+
+2. Launch the `invoke.py` CLI with `python scripts/invoke.py`.
+
+3. At the `invoke>` command-line, enter the command `!import_model <path to model>`.
+   For example:
+
+   `invoke> !import_model models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt`
+
+   (Hint - the CLI supports file path autocompletion. Type a bit of the path
+   name and hit <tab> in order to get a choice of possible completions.)
+
+4. Follow the wizard's instructions to complete installation as shown in the example
+   here:
+
+```
+invoke> <b>!import_model models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt</b>
+>> Model import in process. Please enter the values needed to configure this model:
+
+Name for this model: <b>arabian-nights</b>
+Description of this model: <b>Arabian Nights Fine Tune v1.0</b>
+Configuration file for this model: <b>configs/stable-diffusion/v1-inference.yaml</b>
+Default image width: <b>512</b>
+Default image height: <b>512</b>
+>> New configuration:
+arabian-nights:
+  config: configs/stable-diffusion/v1-inference.yaml
+  description: Arabian Nights Fine Tune v1.0
+  height: 512
+  weights: models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt
+  width: 512
+OK to import [n]? <b>y</b>
+>> Caching model stable-diffusion-1.4 in system RAM
+>> Loading arabian-nights from models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt
+   | LatentDiffusion: Running in eps-prediction mode
+   | DiffusionWrapper has 859.52 M params.
+   | Making attention of type 'vanilla' with 512 in_channels
+   | Working with z of shape (1, 4, 32, 32) = 4096 dimensions.
+   | Making attention of type 'vanilla' with 512 in_channels
+   | Using faster float16 precision
+
+```
+
+If you've previously installed the fine-tune VAE file `vae-ft-mse-840000-ema-pruned.ckpt`,
+the wizard will also ask you if you want to add this VAE to the model.
+
+The appropriate entry for this model will be added to `configs/models.yaml` and it will
+be available to use in the CLI immediately.
+
+The CLI has additional commands for switching among, viewing, editing,
+and deleting the available models. These are described in [Command Line
+Client](../features/CLI.md#model-selection-and-importation), but the two most
+frequently-used are `!models` and `!switch <name of model>`. The first
+prints a table of models that InvokeAI knows about and their load
+status. The second will load the requested model and lets you switch
+back and forth quickly among loaded models.
+
+### Manual editing of `configs/models.yaml`
+
+If you are comfortable with a text editor then you may simply edit
+`models.yaml` directly.
+
+First you need to download the desired .ckpt file and place it in
+`models/ldm/stable-diffusion-v1` as described in step #1 in the
+previous section. Record the path to the weights file,
+e.g. `models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt`
+
+Then using a **text** editor (e.g. the Windows Notepad application),
+open the file `configs/models.yaml`, and add a new stanza that follows
+this model:
+
+```
+arabian-nights-1.0:
+  description: A great fine-tune in Arabian Nights style
+  weights: ./models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt
+  config: ./configs/stable-diffusion/v1-inference.yaml
+  width: 512
+  height: 512
+  vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+  default: false
+```
+
+* arabian-nights-1.0
+  - This is the name of the model that you will refer to from within the
+  CLI and the WebGUI when you need to load and use the model.
+
+* description
+  - Any description that you want to add to the model to remind you what
+    it is.
+
+* weights
+  - Relative path to the .ckpt weights file for this model.
+
+* config
+  - This is the confusingly-named configuration file for the model itself.
+  Use `./configs/stable-diffusion/v1-inference.yaml` unless the model happens
+  to need a custom configuration, in which case the place you downloaded it
+  from will tell you what to use instead. For example, the runwayML custom
+  inpainting model requires the file `configs/stable-diffusion/v1-inpainting-inference.yaml`.
+  This is already included in the InvokeAI distribution and is configured automatically
+  for you by the `preload_models.py` script.
+
+* vae
+  - If you want to add a VAE file to the model, then enter its path here.
+
+* width, height
+  - This is the width and height of the images used to train the model.
+  Currently they are always 512 and 512.
+
+Save the `models.yaml` and relaunch InvokeAI. The new model should now be
+available for your use.
+
+
--- a/docs/installation/INSTALL_LINUX.md
+++ b/docs/installation/INSTALL_LINUX.md
@ -1,5 +1,5 @@
 ---
-title: Linux
+title: Manual Installation, Linux
 ---

 # :fontawesome-brands-linux: Linux
@ -63,24 +63,16 @@ title: Linux
        model loading scheme to allow the script to work on GPU machines that are not
        internet connected. See [Preload Models](../features/OTHER.md#preload-models)

-7. Now you need to install the weights for the stable diffusion model.
+7. Install the weights for the stable diffusion model.

-      - For running with the released weights, you will first need to set up an acount
-        with [Hugging Face](https://huggingface.co).
-      - Use your credentials to log in, and then point your browser [here](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original).
-      - You may be asked to sign a license agreement at this point.
-      - Click on "Files and versions" near the top of the page, and then click on the
-        file named "sd-v1-4.ckpt". You'll be taken to a page that prompts you to click
-        the "download" link. Save the file somewhere safe on your local machine.
+- Sign up at https://huggingface.co
+- Go to the [Stable Diffusion model page](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original)
+- Accept the terms and click Access Repository
+- Download [v1-5-pruned-emaonly.ckpt (4.27 GB)](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt)
+and move it into this directory under `models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt`

-      Now run the following commands from within the stable-diffusion directory.
-      This will create a symbolic link from the stable-diffusion model.ckpt file, to
-      the true location of the `sd-v1-4.ckpt` file.
-
-    ```bash
-    (invokeai) ~/InvokeAI$ mkdir -p models/ldm/stable-diffusion-v1
-    (invokeai) ~/InvokeAI$ ln -sf /path/to/sd-v1-4.ckpt models/ldm/stable-diffusion-v1/model.ckpt
-    ```
+There are many other models that you can use. Please see
+[Installing Models](INSTALLING_MODELS.md) for details.

 8. Start generating images!

--- a/docs/installation/INSTALL_MAC.md
+++ b/docs/installation/INSTALL_MAC.md
@ -1,5 +1,5 @@
 ---
-title: macOS
+title: Manual Installation, macOS
 ---

 # :fontawesome-brands-apple: macOS
@ -24,9 +24,15 @@ First you need to download a large checkpoint file.
 1. Sign up at https://huggingface.co
 2. Go to the [Stable diffusion diffusion model page](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original)
 3. Accept the terms and click Access Repository
-4. Download [sd-v1-4.ckpt (4.27 GB)](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/blob/main/sd-v1-4.ckpt) and note where you have saved it (probably the Downloads folder). You may want to move it somewhere else for longer term storage - SD needs this file to run.
+4. Download [v1-5-pruned-emaonly.ckpt (4.27 GB)](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt)
+and move it into this directory under `models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt`

-While that is downloading, open Terminal and run the following commands one at a time, reading the comments and taking care to run the appropriate command for your Mac's architecture (Intel or M1).
+There are many other models that you can try. Please see
+[Installing Models](INSTALLING_MODELS.md) for details.
+
+While that is downloading, open Terminal and run the following
+commands one at a time, reading the comments and taking care to run
+the appropriate command for your Mac's architecture (Intel or M1).

 !!! todo "Homebrew"

--- a/docs/installation/INSTALL_WINDOWS.md
+++ b/docs/installation/INSTALL_WINDOWS.md
@ -1,5 +1,5 @@
 ---
-title: Windows
+title: Manual Installation, Windows
 ---

 # :fontawesome-brands-windows: Windows
@ -83,23 +83,14 @@ in the wiki

 8. Now you need to install the weights for the big stable diffusion model.

-      1. For running with the released weights, you will first need to set up an acount with Hugging Face (https://huggingface.co).
-      2. Use your credentials to log in, and then point your browser at https://huggingface.co/CompVis/stable-diffusion-v-1-4-original.
-      3. You may be asked to sign a license agreement at this point.
-      4. Click on "Files and versions" near the top of the page, and then click on the file named `sd-v1-4.ckpt`. You'll be taken to a page that
-        prompts you to click the "download" link. Now save the file somewhere safe on your local machine.
-      5. The weight file is >4 GB in size, so
-        downloading may take a while.
+   - Sign up at https://huggingface.co
+   - Go to the [Stable Diffusion model page](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original)
+   - Accept the terms and click Access Repository
+   - Download [v1-5-pruned-emaonly.ckpt (4.27 GB)](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt)
+     and move it into this directory under `models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt`

-    Now run the following commands from **within the InvokeAI directory** to copy the weights file to the right place:
-
-    ```batch
-    mkdir -p models\ldm\stable-diffusion-v1
-    copy C:\path\to\sd-v1-4.ckpt models\ldm\stable-diffusion-v1\model.ckpt
-    ```
-
-    Please replace `C:\path\to\sd-v1.4.ckpt` with the correct path to wherever you stashed this file. If you prefer not to copy or move the .ckpt file,
-    you may instead create a shortcut to it from within `models\ldm\stable-diffusion-v1\`.
+   There are many other models that you can use. Please see
+   [Installing Models](INSTALLING_MODELS.md) for details.

 9. Start generating images!

--- a/ldm/invoke/model_cache.py
+++ b/ldm/invoke/model_cache.py
@ -227,11 +227,14 @@ class ModelCache(object):
            print('   | Using more accurate float32 precision')

        # look and load a matching vae file. Code borrowed from AUTOMATIC1111 modules/sd_models.py
-        if vae and os.path.exists(vae):
-            print(f'   | Loading VAE weights from: {vae}')
-            vae_ckpt = torch.load(vae, map_location="cpu")
-            vae_dict = {k: v for k, v in vae_ckpt["state_dict"].items() if k[0:4] != "loss"}
-            model.first_stage_model.load_state_dict(vae_dict, strict=False)
+        if vae:
+            if os.path.exists(vae):
+                print(f'   | Loading VAE weights from: {vae}')
+                vae_ckpt = torch.load(vae, map_location="cpu")
+                vae_dict = {k: v for k, v in vae_ckpt["state_dict"].items() if k[0:4] != "loss"}
+                model.first_stage_model.load_state_dict(vae_dict, strict=False)
+            else:
+                print(f'   | VAE file {vae} not found. Skipping.')

        model.to(self.device)
        # model.to doesn't change the cond_stage_model.device used to move the tokenizer output, so set it here
@ -281,7 +284,7 @@ class ModelCache(object):
        Returns the preamble for the config file.
        '''
        return '''# This file describes the alternative machine learning models
-# available to the dream script.
+# available to InvokeAI script.
 #
 # To add a new model, follow the examples below. Each
 # model requires a model config file, a weights file,
--- a/models/first_stage_models/kl-f16/config.yaml
+++ b/models/first_stage_models/kl-f16/config.yaml
@ -1,44 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.AutoencoderKL
-  params:
-    monitor: val/rec_loss
-    embed_dim: 16
-    lossconfig:
-      target: ldm.modules.losses.LPIPSWithDiscriminator
-      params:
-        disc_start: 50001
-        kl_weight: 1.0e-06
-        disc_weight: 0.5
-    ddconfig:
-      double_z: true
-      z_channels: 16
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 1
-      - 2
-      - 2
-      - 4
-      num_res_blocks: 2
-      attn_resolutions:
-      - 16
-      dropout: 0.0
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 6
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        size: 384
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        size: 384
-        crop_size: 256
--- a/models/first_stage_models/kl-f32/config.yaml
+++ b/models/first_stage_models/kl-f32/config.yaml
@ -1,46 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.AutoencoderKL
-  params:
-    monitor: val/rec_loss
-    embed_dim: 64
-    lossconfig:
-      target: ldm.modules.losses.LPIPSWithDiscriminator
-      params:
-        disc_start: 50001
-        kl_weight: 1.0e-06
-        disc_weight: 0.5
-    ddconfig:
-      double_z: true
-      z_channels: 64
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 1
-      - 2
-      - 2
-      - 4
-      - 4
-      num_res_blocks: 2
-      attn_resolutions:
-      - 16
-      - 8
-      dropout: 0.0
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 6
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        size: 384
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        size: 384
-        crop_size: 256
--- a/models/first_stage_models/kl-f4/config.yaml
+++ b/models/first_stage_models/kl-f4/config.yaml
@ -1,41 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.AutoencoderKL
-  params:
-    monitor: val/rec_loss
-    embed_dim: 3
-    lossconfig:
-      target: ldm.modules.losses.LPIPSWithDiscriminator
-      params:
-        disc_start: 50001
-        kl_weight: 1.0e-06
-        disc_weight: 0.5
-    ddconfig:
-      double_z: true
-      z_channels: 3
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 2
-      - 4
-      num_res_blocks: 2
-      attn_resolutions: []
-      dropout: 0.0
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 10
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        size: 384
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        size: 384
-        crop_size: 256
--- a/models/first_stage_models/kl-f8/config.yaml
+++ b/models/first_stage_models/kl-f8/config.yaml
@ -1,42 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.AutoencoderKL
-  params:
-    monitor: val/rec_loss
-    embed_dim: 4
-    lossconfig:
-      target: ldm.modules.losses.LPIPSWithDiscriminator
-      params:
-        disc_start: 50001
-        kl_weight: 1.0e-06
-        disc_weight: 0.5
-    ddconfig:
-      double_z: true
-      z_channels: 4
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 2
-      - 4
-      - 4
-      num_res_blocks: 2
-      attn_resolutions: []
-      dropout: 0.0
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 4
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        size: 384
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        size: 384
-        crop_size: 256
--- a/models/first_stage_models/vq-f16/config.yaml
+++ b/models/first_stage_models/vq-f16/config.yaml
@ -1,49 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.VQModel
-  params:
-    embed_dim: 8
-    n_embed: 16384
-    ddconfig:
-      double_z: false
-      z_channels: 8
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 1
-      - 2
-      - 2
-      - 4
-      num_res_blocks: 2
-      attn_resolutions:
-      - 16
-      dropout: 0.0
-    lossconfig:
-      target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
-      params:
-        disc_conditional: false
-        disc_in_channels: 3
-        disc_start: 250001
-        disc_weight: 0.75
-        disc_num_layers: 2
-        codebook_weight: 1.0
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 14
-    num_workers: 20
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        size: 384
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        size: 384
-        crop_size: 256
--- a/models/first_stage_models/vq-f4-noattn/config.yaml
+++ b/models/first_stage_models/vq-f4-noattn/config.yaml
@ -1,46 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.VQModel
-  params:
-    embed_dim: 3
-    n_embed: 8192
-    monitor: val/rec_loss
-
-    ddconfig:
-      attn_type: none
-      double_z: false
-      z_channels: 3
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 2
-      - 4
-      num_res_blocks: 2
-      attn_resolutions: []
-      dropout: 0.0
-    lossconfig:
-      target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
-      params:
-        disc_conditional: false
-        disc_in_channels: 3
-        disc_start: 11
-        disc_weight: 0.75
-        codebook_weight: 1.0
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 8
-    num_workers: 12
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        crop_size: 256
--- a/models/first_stage_models/vq-f4/config.yaml
+++ b/models/first_stage_models/vq-f4/config.yaml
@ -1,45 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.VQModel
-  params:
-    embed_dim: 3
-    n_embed: 8192
-    monitor: val/rec_loss
-
-    ddconfig:
-      double_z: false
-      z_channels: 3
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 2
-      - 4
-      num_res_blocks: 2
-      attn_resolutions: []
-      dropout: 0.0
-    lossconfig:
-      target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
-      params:
-        disc_conditional: false
-        disc_in_channels: 3
-        disc_start: 0
-        disc_weight: 0.75
-        codebook_weight: 1.0
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 8
-    num_workers: 16
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        crop_size: 256
--- a/models/first_stage_models/vq-f8-n256/config.yaml
+++ b/models/first_stage_models/vq-f8-n256/config.yaml
@ -1,48 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.VQModel
-  params:
-    embed_dim: 4
-    n_embed: 256
-    monitor: val/rec_loss
-    ddconfig:
-      double_z: false
-      z_channels: 4
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 2
-      - 2
-      - 4
-      num_res_blocks: 2
-      attn_resolutions:
-      - 32
-      dropout: 0.0
-    lossconfig:
-      target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
-      params:
-        disc_conditional: false
-        disc_in_channels: 3
-        disc_start: 250001
-        disc_weight: 0.75
-        codebook_weight: 1.0
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 10
-    num_workers: 20
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        size: 384
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        size: 384
-        crop_size: 256
--- a/models/first_stage_models/vq-f8/config.yaml
+++ b/models/first_stage_models/vq-f8/config.yaml
@ -1,48 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.VQModel
-  params:
-    embed_dim: 4
-    n_embed: 16384
-    monitor: val/rec_loss
-    ddconfig:
-      double_z: false
-      z_channels: 4
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 2
-      - 2
-      - 4
-      num_res_blocks: 2
-      attn_resolutions:
-      - 32
-      dropout: 0.0
-    lossconfig:
-      target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
-      params:
-        disc_conditional: false
-        disc_in_channels: 3
-        disc_num_layers: 2
-        disc_start: 1
-        disc_weight: 0.6
-        codebook_weight: 1.0
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 10
-    num_workers: 20
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        size: 384
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        size: 384
-        crop_size: 256
--- a/models/ldm/bsr_sr/config.yaml
+++ b/models/ldm/bsr_sr/config.yaml
@ -1,80 +0,0 @@
-model:
-  base_learning_rate: 1.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0155
-    log_every_t: 100
-    timesteps: 1000
-    loss_type: l2
-    first_stage_key: image
-    cond_stage_key: LR_image
-    image_size: 64
-    channels: 3
-    concat_mode: true
-    cond_stage_trainable: false
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 6
-        out_channels: 3
-        model_channels: 160
-        attention_resolutions:
-        - 16
-        - 8
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 2
-        - 4
-        num_head_channels: 32
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: torch.nn.Identity
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 64
-    wrap: false
-    num_workers: 12
-    train:
-      target: ldm.data.openimages.SuperresOpenImagesAdvancedTrain
-      params:
-        size: 256
-        degradation: bsrgan_light
-        downscale_f: 4
-        min_crop_f: 0.5
-        max_crop_f: 1.0
-        random_crop: true
-    validation:
-      target: ldm.data.openimages.SuperresOpenImagesAdvancedValidation
-      params:
-        size: 256
-        degradation: bsrgan_light
-        downscale_f: 4
-        min_crop_f: 0.5
-        max_crop_f: 1.0
-        random_crop: true
--- a/models/ldm/celeba256/config.yaml
+++ b/models/ldm/celeba256/config.yaml
@ -1,70 +0,0 @@
-model:
-  base_learning_rate: 2.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: class_label
-    image_size: 64
-    channels: 3
-    cond_stage_trainable: false
-    concat_mode: false
-    monitor: val/loss
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 224
-        attention_resolutions:
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        num_head_channels: 32
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config: __is_unconditional__
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 48
-    num_workers: 5
-    wrap: false
-    train:
-      target: ldm.data.faceshq.CelebAHQTrain
-      params:
-        size: 256
-    validation:
-      target: ldm.data.faceshq.CelebAHQValidation
-      params:
-        size: 256
--- a/models/ldm/cin256/config.yaml
+++ b/models/ldm/cin256/config.yaml
@ -1,80 +0,0 @@
-model:
-  base_learning_rate: 1.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: class_label
-    image_size: 32
-    channels: 4
-    cond_stage_trainable: true
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 32
-        in_channels: 4
-        out_channels: 4
-        model_channels: 256
-        attention_resolutions:
-        - 4
-        - 2
-        - 1
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 4
-        num_head_channels: 32
-        use_spatial_transformer: true
-        transformer_depth: 1
-        context_dim: 512
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 4
-        n_embed: 16384
-        ddconfig:
-          double_z: false
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions:
-          - 32
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.ClassEmbedder
-      params:
-        embed_dim: 512
-        key: class_label
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 64
-    num_workers: 12
-    wrap: false
-    train:
-      target: ldm.data.imagenet.ImageNetTrain
-      params:
-        config:
-          size: 256
-    validation:
-      target: ldm.data.imagenet.ImageNetValidation
-      params:
-        config:
-          size: 256
--- a/models/ldm/ffhq256/config.yaml
+++ b/models/ldm/ffhq256/config.yaml
@ -1,70 +0,0 @@
-model:
-  base_learning_rate: 2.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: class_label
-    image_size: 64
-    channels: 3
-    cond_stage_trainable: false
-    concat_mode: false
-    monitor: val/loss
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 224
-        attention_resolutions:
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        num_head_channels: 32
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config: __is_unconditional__
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 42
-    num_workers: 5
-    wrap: false
-    train:
-      target: ldm.data.faceshq.FFHQTrain
-      params:
-        size: 256
-    validation:
-      target: ldm.data.faceshq.FFHQValidation
-      params:
-        size: 256
--- a/models/ldm/inpainting_big/config.yaml
+++ b/models/ldm/inpainting_big/config.yaml
@ -1,67 +0,0 @@
-model:
-  base_learning_rate: 1.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0205
-    log_every_t: 100
-    timesteps: 1000
-    loss_type: l1
-    first_stage_key: image
-    cond_stage_key: masked_image
-    image_size: 64
-    channels: 3
-    concat_mode: true
-    monitor: val/loss
-    scheduler_config:
-      target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler
-      params:
-        verbosity_interval: 0
-        warm_up_steps: 1000
-        max_decay_steps: 50000
-        lr_start: 0.001
-        lr_max: 0.1
-        lr_min: 0.0001
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 7
-        out_channels: 3
-        model_channels: 256
-        attention_resolutions:
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        num_heads: 8
-        resblock_updown: true
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        monitor: val/rec_loss
-        ddconfig:
-          attn_type: none
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: ldm.modules.losses.contperceptual.DummyLoss
-    cond_stage_config: __is_first_stage__
--- a/models/ldm/layout2img-openimages256/config.yaml
+++ b/models/ldm/layout2img-openimages256/config.yaml
@ -1,81 +0,0 @@
-model:
-  base_learning_rate: 2.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0205
-    log_every_t: 100
-    timesteps: 1000
-    loss_type: l1
-    first_stage_key: image
-    cond_stage_key: coordinates_bbox
-    image_size: 64
-    channels: 3
-    conditioning_key: crossattn
-    cond_stage_trainable: true
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 128
-        attention_resolutions:
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        num_head_channels: 32
-        use_spatial_transformer: true
-        transformer_depth: 3
-        context_dim: 512
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.BERTEmbedder
-      params:
-        n_embed: 512
-        n_layer: 16
-        vocab_size: 8192
-        max_seq_len: 92
-        use_tokenizer: false
-    monitor: val/loss_simple_ema
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 24
-    wrap: false
-    num_workers: 10
-    train:
-      target: ldm.data.openimages.OpenImagesBBoxTrain
-      params:
-        size: 256
-    validation:
-      target: ldm.data.openimages.OpenImagesBBoxValidation
-      params:
-        size: 256
--- a/models/ldm/lsun_beds256/config.yaml
+++ b/models/ldm/lsun_beds256/config.yaml
@ -1,70 +0,0 @@
-model:
-  base_learning_rate: 2.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: class_label
-    image_size: 64
-    channels: 3
-    cond_stage_trainable: false
-    concat_mode: false
-    monitor: val/loss
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 224
-        attention_resolutions:
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        num_head_channels: 32
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config: __is_unconditional__
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 48
-    num_workers: 5
-    wrap: false
-    train:
-      target: ldm.data.lsun.LSUNBedroomsTrain
-      params:
-        size: 256
-    validation:
-      target: ldm.data.lsun.LSUNBedroomsValidation
-      params:
-        size: 256
--- a/models/ldm/lsun_churches256/config.yaml
+++ b/models/ldm/lsun_churches256/config.yaml
@ -1,92 +0,0 @@
-model:
-  base_learning_rate: 5.0e-05
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0155
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    loss_type: l1
-    first_stage_key: image
-    cond_stage_key: image
-    image_size: 32
-    channels: 4
-    cond_stage_trainable: false
-    concat_mode: false
-    scale_by_std: true
-    monitor: val/loss_simple_ema
-    scheduler_config:
-      target: ldm.lr_scheduler.LambdaLinearScheduler
-      params:
-        warm_up_steps:
-        - 10000
-        cycle_lengths:
-        - 10000000000000
-        f_start:
-        - 1.0e-06
-        f_max:
-        - 1.0
-        f_min:
-        - 1.0
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 32
-        in_channels: 4
-        out_channels: 4
-        model_channels: 192
-        attention_resolutions:
-        - 1
-        - 2
-        - 4
-        - 8
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 2
-        - 4
-        - 4
-        num_heads: 8
-        use_scale_shift_norm: true
-        resblock_updown: true
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config: '__is_unconditional__'
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 96
-    num_workers: 5
-    wrap: false
-    train:
-      target: ldm.data.lsun.LSUNChurchesTrain
-      params:
-        size: 256
-    validation:
-      target: ldm.data.lsun.LSUNChurchesValidation
-      params:
-        size: 256
--- a/models/ldm/semantic_synthesis256/config.yaml
+++ b/models/ldm/semantic_synthesis256/config.yaml
@ -1,59 +0,0 @@
-model:
-  base_learning_rate: 1.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0205
-    log_every_t: 100
-    timesteps: 1000
-    loss_type: l1
-    first_stage_key: image
-    cond_stage_key: segmentation
-    image_size: 64
-    channels: 3
-    concat_mode: true
-    cond_stage_trainable: true
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 6
-        out_channels: 3
-        model_channels: 128
-        attention_resolutions:
-        - 32
-        - 16
-        - 8
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 4
-        - 8
-        num_heads: 8
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.SpatialRescaler
-      params:
-        n_stages: 2
-        in_channels: 182
-        out_channels: 3
--- a/models/ldm/semantic_synthesis512/config.yaml
+++ b/models/ldm/semantic_synthesis512/config.yaml
@ -1,78 +0,0 @@
-model:
-  base_learning_rate: 1.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0205
-    log_every_t: 100
-    timesteps: 1000
-    loss_type: l1
-    first_stage_key: image
-    cond_stage_key: segmentation
-    image_size: 128
-    channels: 3
-    concat_mode: true
-    cond_stage_trainable: true
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 128
-        in_channels: 6
-        out_channels: 3
-        model_channels: 128
-        attention_resolutions:
-        - 32
-        - 16
-        - 8
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 4
-        - 8
-        num_heads: 8
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.SpatialRescaler
-      params:
-        n_stages: 2
-        in_channels: 182
-        out_channels: 3
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 8
-    wrap: false
-    num_workers: 10
-    train:
-      target: ldm.data.landscapes.RFWTrain
-      params:
-        size: 768
-        crop_size: 512
-        segmentation_to_float32: true
-    validation:
-      target: ldm.data.landscapes.RFWValidation
-      params:
-        size: 768
-        crop_size: 512
-        segmentation_to_float32: true
--- a/models/ldm/stable-diffusion-v1/place-ckpt-files-here.txt
+++ b/models/ldm/stable-diffusion-v1/place-ckpt-files-here.txt
@ -0,0 +1,2 @@
+See docs/installation/INSTALLING_MODELS.md for how to populate this
+directory with one or more Stable Diffusion model weight files.
--- a/models/ldm/text2img256/config.yaml
+++ b/models/ldm/text2img256/config.yaml
@ -1,77 +0,0 @@
-model:
-  base_learning_rate: 2.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: caption
-    image_size: 64
-    channels: 3
-    cond_stage_trainable: true
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 192
-        attention_resolutions:
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 5
-        num_head_channels: 32
-        use_spatial_transformer: true
-        transformer_depth: 1
-        context_dim: 640
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.BERTEmbedder
-      params:
-        n_embed: 640
-        n_layer: 32
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 28
-    num_workers: 10
-    wrap: false
-    train:
-      target: ldm.data.previews.pytorch_dataset.PreviewsTrain
-      params:
-        size: 256
-    validation:
-      target: ldm.data.previews.pytorch_dataset.PreviewsValidation
-      params:
-        size: 256
--- a/scripts/preload_models.py
+++ b/scripts/preload_models.py
@ -3,20 +3,369 @@
 # Before running stable-diffusion on an internet-isolated machine,
 # run this script from one with internet connectivity. The
 # two machines must share a common .cache directory.
-from transformers import CLIPTokenizer, CLIPTextModel
+#
+# Coauthor: Kevin Turner http://github.com/keturn
+#
+print('Loading Python libraries...\n')
+import argparse
 import clip
-from transformers import BertTokenizerFast, AutoFeatureExtractor
 import sys
 import transformers
 import os
 import warnings
 import torch
-import urllib.request
 import zipfile
 import traceback
+import getpass
+import requests
+from urllib import request
+from tqdm import tqdm
+from omegaconf import OmegaConf
+from pathlib import Path
+from transformers import CLIPTokenizer, CLIPTextModel
+from transformers import BertTokenizerFast, AutoFeatureExtractor
+from huggingface_hub import hf_hub_download, HfFolder, hf_hub_url

 transformers.logging.set_verbosity_error()

+#--------------------------globals--
+# Directory that downloaded Stable Diffusion weight (.ckpt) files are written to.
+Model_dir = './models/ldm/stable-diffusion-v1/'
+# YAML file listing the installed models; rewritten after a download.
+Config_file = './configs/models.yaml'
+# Directory containing the inference config files named by each 'config' entry.
+SD_Configs = './configs/stable-diffusion'
+# Catalog of downloadable weight files, keyed by model name.
+#   description  - text shown to the user and copied into the config stanza
+#   repo_id      - Hugging Face repository that hosts the file
+#   config       - config file name under SD_Configs, or the marker 'VAE'
+#                  for an autoencoder that is attached to the other models
+#   file         - name of the weights file inside the repository
+#   recommended  - True if the model belongs to the "recommended" set
+#   width/height - image dimensions written to the config stanza
+Datasets = {
+    'stable-diffusion-1.5':  {
+        'description': 'The newest Stable Diffusion version 1.5 weight file (4.27 GB)',
+        'repo_id': 'runwayml/stable-diffusion-v1-5',
+        'config': 'v1-inference.yaml',
+        'file': 'v1-5-pruned-emaonly.ckpt',
+        'recommended': True,
+        'width': 512,
+        'height': 512,
+    },
+    'inpainting-1.5': {
+        'description': 'RunwayML SD 1.5 model optimized for inpainting (4.27 GB)',
+        'repo_id': 'runwayml/stable-diffusion-inpainting',
+        'config': 'v1-inpainting-inference.yaml',
+        'file': 'sd-v1-5-inpainting.ckpt',
+        'recommended': True,
+        'width': 512,
+        'height': 512,
+    },
+    'stable-diffusion-1.4': {
+        'description': 'The original Stable Diffusion version 1.4 weight file (4.27 GB)',
+        'repo_id': 'CompVis/stable-diffusion-v-1-4-original',
+        'config': 'v1-inference.yaml',
+        'file': 'sd-v1-4.ckpt',
+        'recommended': False,
+        'width': 512,
+        'height': 512,
+    },
+    'waifu-diffusion-1.3': {
+        # size unit added so the text matches the other entries
+        'description': 'Stable Diffusion 1.4 fine tuned on anime-styled images (4.27 GB)',
+        'repo_id': 'hakurei/waifu-diffusion-v1-3',
+        'config': 'v1-inference.yaml',
+        'file': 'model-epoch09-float32.ckpt',
+        'recommended': False,
+        'width': 512,
+        'height': 512,
+    },
+    'ft-mse-improved-autoencoder-840000': {
+        'description': 'StabilityAI improved autoencoder fine-tuned for human faces (recommended; 335 MB)',
+        'repo_id': 'stabilityai/sd-vae-ft-mse-original',
+        'config': 'VAE',
+        'file': 'vae-ft-mse-840000-ema-pruned.ckpt',
+        'recommended': True,
+        'width': 512,
+        'height': 512,
+    },
+}
+# Comment header written at the top of the generated Config_file.
+Config_preamble = '''# This file describes the alternative machine learning models
+# available to InvokeAI script.
+#
+# To add a new model, follow the examples below. Each
+# model requires a model config file, a weights file,
+# and the width and height of the images it
+# was trained on.
+'''
+
+#---------------------------------------------
+def introduction():
+    '''Print the welcome banner shown at the start of an interactive run.'''
+    banner = (
+        'Welcome to InvokeAI. This script will help download the Stable Diffusion weight files\n'
+        'and other large models that are needed for text to image generation. At any point you may interrupt\n'
+        'this program and resume later.\n'
+    )
+    print(banner)
+
+#--------------------------------------------
+def postscript():
+    '''Print the closing message that explains how to launch InvokeAI.'''
+    # Built line by line so the trailing blank after "Web version:" stays visible.
+    farewell = (
+        "You're all set! You may now launch InvokeAI using one of these two commands:\n"
+        "Web version: \n"
+        "\n"
+        "    python scripts/invoke.py --web  (connect to http://localhost:9090)\n"
+        "\n"
+        "Command-line version:\n"
+        "\n"
+        "   python scripts/invoke.py\n"
+        "\n"
+        "Have fun!\n"
+    )
+    print(farewell)
+
+#---------------------------------------------
+def yes_or_no(prompt:str, default_yes=True):
+    '''
+    Ask a yes/no question on the console and return the answer as a bool.
+
+    prompt:      text shown to the user; the default answer is appended
+                 in square brackets
+    default_yes: answer assumed when the user just presses <enter>
+
+    Only the first non-blank character of the reply is examined. When the
+    default is "yes", anything but n/N counts as yes; when the default is
+    "no", anything but y/Y counts as no.
+    '''
+    default = "y" if default_yes else 'n'
+    # Strip the reply so that " n" is not mistaken for a yes and a
+    # whitespace-only reply falls back to the default.
+    response = input(f'{prompt} [{default}] ').strip() or default
+    first = response[0]
+    if default_yes:
+        return first not in ('n','N')
+    return first in ('y','Y')
+
+#---------------------------------------------
+def user_wants_to_download_weights()->str:
+    '''
+    Explain the download options and keep asking until the user picks one.
+    Returns one of "skip", "recommended" or "customized"
+    '''
+    print('''You can download and configure the weights files manually or let this
+script do it for you. Manual installation is described at:
+
+https://github.com/invoke-ai/InvokeAI/blob/main/docs/installation/INSTALLING_MODELS.md
+
+You may download the recommended models (about 10GB total), select a customized set, or
+completely skip this step.
+'''
+    )
+    # First character of the reply -> selection. An empty reply takes the
+    # default, "recommended". Anything else asks again.
+    by_first_char = {
+        '': 'recommended',
+        'r': 'recommended',
+        'R': 'recommended',
+        'c': 'customized',
+        'C': 'customized',
+        's': 'skip',
+        'S': 'skip',
+    }
+    while True:
+        reply = input('Download <r>ecommended models, <c>ustomize the list, or <s>kip this step? [r]: ')
+        first = reply[:1]
+        if first in by_first_char:
+            return by_first_char[first]
+
+#---------------------------------------------
+def select_datasets(action:str):
+    '''
+    Build the list of weight files to download and have the user confirm it.
+
+    action: 'customized' asks about every entry in Datasets in turn; any
+            other value selects the entries flagged as recommended.
+
+    Returns a dict mapping each selected model name to its 1-based position
+    in the selection, or None if the user declined to download.
+    The first selected model is marked with "*" as the default.
+    '''
+    done = False
+    while not done:
+        datasets = dict()
+        counter = 1   # position of the next selected model; TODO let user change the default
+
+        if action == 'customized':
+            print('''
+Choose the weight file(s) you wish to download. Before downloading you 
+will be given the option to view and change your selections.
+'''
+            )
+            # The menu number is independent of the selection counter so that
+            # numbers do not repeat after a declined entry.
+            for menu_index, ds in enumerate(Datasets, start=1):
+                recommended = '(recommended)' if Datasets[ds]['recommended'] else ''
+                print(f'[{menu_index}] {ds}:\n    {Datasets[ds]["description"]} {recommended}')
+                if yes_or_no('    Download?',default_yes=Datasets[ds]['recommended']):
+                    datasets[ds]=counter
+                    counter += 1
+        else:
+            for ds in Datasets:
+                if Datasets[ds]['recommended']:
+                    datasets[ds]=counter
+                    counter += 1
+
+        print('The following weight files will be downloaded:')
+        for position, ds in enumerate(datasets):
+            dflt = '*' if position == 0 else ''
+            print(f'   [{datasets[ds]}] {ds}{dflt}')
+        print("*default")
+        ok_to_download = yes_or_no('Ok to download?')
+        if ok_to_download:
+            done = True
+        elif yes_or_no('Change your selection?'):
+            # go round again, this time asking about every model
+            action = 'customized'
+        else:
+            done = True
+    return datasets if ok_to_download else None
+
+
+#-------------------------------Authenticate against Hugging Face
+def authenticate():
+    '''
+    Walk the user through accepting the model licenses and return a
+    Hugging Face access token.
+
+    A token already stored by HfFolder is reused. Otherwise the user is
+    asked to paste one (input is not echoed) and it is stored via
+    HfFolder.save_token for later runs.
+
+    Returns the token string; this is empty if the user entered nothing.
+    '''
+    print('''
+To download the Stable Diffusion weight files from the official Hugging Face 
+repository, you need to read and accept the CreativeML Responsible AI license.
+
+This involves a few easy steps.
+
+1. If you have not already done so, create an account on Hugging Face's web site
+   using the "Sign Up" button:
+
+   https://huggingface.co/join
+
+   You will need to verify your email address as part of the HuggingFace
+   registration process.
+
+2. Log into your Hugging Face account:
+
+    https://huggingface.co/login
+
+3. Accept the license terms located here:
+
+   https://huggingface.co/runwayml/stable-diffusion-v1-5
+
+   and here:
+
+   https://huggingface.co/runwayml/stable-diffusion-inpainting
+
+    (Yes, you have to accept two slightly different license agreements)
+'''
+    )
+    input('Press <enter> when you are ready to continue:')
+    access_token = HfFolder.get_token()
+    if access_token is None:
+        print('''
+4. Thank you! The last step is to enter your HuggingFace access token so that
+   this script is authorized to initiate the download. Go to the access tokens
+   page of your Hugging Face account and create a token by clicking the 
+   "New token" button:
+
+   https://huggingface.co/settings/tokens
+
+   (You can enter anything you like in the token creation field marked "Name". 
+   "Role" should be "read").
+
+   Now copy the token to your clipboard and paste it here: '''
+        )
+        # Pasted text often carries stray blanks or a newline; drop them so
+        # they do not end up in the Authorization header.
+        access_token = getpass.getpass().strip()
+        # Do not persist an empty entry.
+        # NOTE(review): a stored empty token is presumably returned as '' rather
+        # than None on the next run, which would skip this prompt - confirm
+        # against the installed huggingface_hub version.
+        if access_token:
+            HfFolder.save_token(access_token)
+    return access_token
+
+#---------------------------------------------
+# look for legacy model.ckpt in models directory and offer to
+# normalize its name
+def migrate_models_ckpt():
+    '''
+    Offer to rename a legacy "model.ckpt" in Model_dir to the file name
+    used for the Stable Diffusion 1.4 entry in Datasets.
+
+    Does nothing if no "model.ckpt" is present or the user declines.
+    '''
+    legacy_path = os.path.join(Model_dir,'model.ckpt')
+    if not os.path.exists(legacy_path):
+        return
+    new_name = Datasets['stable-diffusion-1.4']['file']
+    # version number corrected: the legacy file is mapped to the 1.4 entry
+    print('You seem to have the Stable Diffusion v1.4 "model.ckpt" already installed.')
+    rename = yes_or_no(f'Ok to rename it to "{new_name}" for future reference?')
+    if rename:
+        print(f'model.ckpt => {new_name}')
+        os.rename(legacy_path,os.path.join(Model_dir,new_name))
+            
+#---------------------------------------------
+def download_weight_datasets(models:dict, access_token:str):
+    '''
+    Download the weights file of every selected model.
+
+    models:       dict whose keys are model names from Datasets
+    access_token: Hugging Face token passed on to download_with_resume
+
+    Returns a dict with an entry (value True) for each model whose
+    download succeeded or was already complete.
+    '''
+    migrate_models_ckpt()
+    successful = dict()
+    for mod in models:
+        success = download_with_resume(
+            repo_id=Datasets[mod]['repo_id'],
+            model_name=Datasets[mod]['file'],
+            access_token=access_token
+        )
+        if success:
+            successful[mod] = True
+    # Only claim success when at least one model was installed; failures
+    # have already been reported by download_with_resume.
+    if successful:
+        keys = ', '.join(successful.keys())
+        print(f'Successfully installed {keys}')
+    return successful
+    
+#---------------------------------------------
+def download_with_resume(repo_id:str, model_name:str, access_token:str)->bool:
+    '''
+    Download one weights file from Hugging Face into Model_dir, resuming
+    a partial download when a file of the same name already exists.
+
+    repo_id:      Hugging Face repository that hosts the file
+    model_name:   name of the file in the repository and under Model_dir
+    access_token: token sent as a Bearer Authorization header
+
+    Returns True if the file is complete on disk afterwards, False if the
+    server refused the request or an error occurred during the transfer.
+    Errors are printed rather than raised.
+    '''
+    model_dest = os.path.join(Model_dir, model_name)
+    os.makedirs(os.path.dirname(model_dest), exist_ok=True)
+    url = hf_hub_url(repo_id, model_name)
+
+    header = {"Authorization": f'Bearer {access_token}'}
+    open_mode = 'wb'
+    exist_size = 0
+
+    if os.path.exists(model_dest):
+        # ask only for the bytes that are still missing and append them
+        exist_size = os.path.getsize(model_dest)
+        header['Range'] = f'bytes={exist_size}-'
+        open_mode = 'ab'
+
+    try:
+        # the context manager releases the connection on every exit path
+        with requests.get(url, headers=header, stream=True) as resp:
+            if resp.status_code==416:  # "range not satisfiable", which means nothing to return
+                print(f'* {model_name}: complete file found. Skipping.')
+                return True
+
+            if resp.status_code not in (200, 206):
+                # the server refused the request; show what it said
+                print(f'* {model_name}: {resp.text}')
+                return False
+
+            if resp.status_code==200 and exist_size > 0:
+                # The Range header was ignored and the whole file is being
+                # sent; appending would corrupt the copy on disk, so restart.
+                exist_size = 0
+                open_mode = 'wb'
+
+            total = int(resp.headers.get('content-length', 0))
+
+            # A complete response this small is treated as an error message
+            # rather than a weights file. A genuine resume may legitimately
+            # have only a few bytes left, so it is exempt.
+            if exist_size == 0 and total < 2000:
+                print(f'* {model_name}: {resp.text}')
+                return False
+
+            if exist_size > 0:
+                print(f'* {model_name}: partial file found. Resuming...')
+            else:
+                print(f'* {model_name}: Downloading...')
+
+            with open(model_dest, open_mode) as file, tqdm(
+                    desc=model_name,
+                    initial=exist_size,
+                    total=total+exist_size,
+                    unit='iB',
+                    unit_scale=True,
+                    unit_divisor=1024,   # binary prefixes, to match the 'iB' unit
+            ) as bar:
+                for data in resp.iter_content(chunk_size=1024):
+                    size = file.write(data)
+                    bar.update(size)
+    except Exception as e:
+        print(f'An error occurred while downloading {model_name}: {str(e)}')
+        return False
+    return True
+                             
+#---------------------------------------------
+def update_config_file(successfully_downloaded:dict):
+    '''
+    Rewrite Config_file so that it contains a stanza for every model in
+    successfully_downloaded.
+
+    The new contents are written to a temporary file in the same directory
+    and then moved over Config_file, so a failure part-way through leaves
+    the existing file untouched. Errors are printed, not raised.
+    '''
+    try:
+        yaml = new_config_file_contents(successfully_downloaded)
+        tmpfile = os.path.join(os.path.dirname(Config_file),'new_config.tmp')
+        with open(tmpfile, 'w') as outfile:
+            outfile.write(Config_preamble)
+            outfile.write(yaml)
+        # os.replace overwrites an existing destination on every platform;
+        # os.rename raises on Windows when Config_file already exists.
+        os.replace(tmpfile,Config_file)
+    except Exception as e:
+        print(f'**Error creating config file {Config_file}: {str(e)} **')
+        return
+    print(f'Successfully created new configuration file {Config_file}')
+
+    
+#---------------------------------------------    
+def new_config_file_contents(successfully_downloaded:dict)->str:
+    '''
+    Return the YAML text for Config_file after adding or refreshing a
+    stanza for each model in successfully_downloaded.
+
+    Stanzas already present in Config_file are kept. A downloaded entry
+    whose config is 'VAE' does not get a stanza of its own; its file is
+    recorded as the 'vae' of every stanza written here. The first stanza
+    written is flagged as the default.
+    '''
+    # On a fresh install Config_file does not exist yet; start from an
+    # empty configuration instead of failing.
+    if os.path.exists(Config_file):
+        conf = OmegaConf.load(Config_file)
+    else:
+        conf = OmegaConf.create()
+
+    # find the VAE file, if there is one
+    vae = None
+    default_selected = False
+
+    for model in successfully_downloaded:
+        if Datasets[model]['config'] == 'VAE':
+            vae = Datasets[model]['file']
+
+    for model in successfully_downloaded:
+        if Datasets[model]['config'] == 'VAE': # skip VAE entries
+            continue
+        stanza = conf[model] if model in conf else { }
+
+        stanza['description'] = Datasets[model]['description']
+        stanza['weights'] = os.path.join(Model_dir,Datasets[model]['file'])
+        stanza['config'] =os.path.join(SD_Configs, Datasets[model]['config'])
+        stanza['width'] = Datasets[model]['width']
+        stanza['height'] = Datasets[model]['height']
+        stanza.pop('default',None)  # this will be set later
+        if vae:
+            stanza['vae'] = os.path.join(Model_dir,vae)
+        # BUG - the first stanza is always the default. User should select.
+        if not default_selected:
+            stanza['default'] = True
+            default_selected = True
+        conf[model] = stanza
+    return OmegaConf.to_yaml(conf)
+    
 #---------------------------------------------
 # this will preload the Bert tokenizer fles
 def download_bert():
@ -66,7 +415,6 @@ def download_gfpgan():
        print(traceback.format_exc())

    print('Loading models from GFPGAN')
-    import urllib.request
    for model in (
            [
                'https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth',
@ -149,15 +497,46 @@ def download_safety_checker():
    safety_feature_extractor = AutoFeatureExtractor.from_pretrained(safety_model_id)
    safety_checker = StableDiffusionSafetyChecker.from_pretrained(safety_model_id)
    print('...success')
-    
+
 #-------------------------------------
 if __name__ == '__main__':
-    download_bert()
-    download_kornia()
-    download_clip()
-    download_gfpgan()
-    download_codeformer()
-    download_clipseg()
-    download_safety_checker()
+    # Script entry point.
+    #   1. In interactive mode (the default) offer to download the
+    #      Stable Diffusion weight files and write the models config file.
+    #   2. In both modes download the support models.
+    # Pass --no-interactive to skip step 1 (e.g. in CI).
+    parser = argparse.ArgumentParser(description='InvokeAI model downloader')
+    parser.add_argument('--interactive',
+                        dest='interactive',
+                        action=argparse.BooleanOptionalAction,
+                        default=True,
+                        help='run in interactive mode (default)')
+    opt = parser.parse_args()
+
+    try:
+        if opt.interactive:
+            introduction()
+            print('** WEIGHT SELECTION **')
+            choice = user_wants_to_download_weights()
+            if choice != 'skip':
+                models = select_datasets(choice)
+                if models is None:
+                    # Nothing was selected. sys.exit() raises SystemExit,
+                    # which the "except Exception" below does not catch.
+                    if yes_or_no('Quit?',default_yes=False):
+                        sys.exit(0)
+                    # User chose not to quit: carry on without weights
+                    # rather than passing None to the downloader.
+                else:
+                    print('** LICENSE AGREEMENT FOR WEIGHT FILES **')
+                    access_token = authenticate()
+                    print('\n** DOWNLOADING WEIGHTS **')
+                    successfully_downloaded = download_weight_datasets(models, access_token)
+                    update_config_file(successfully_downloaded)
+
+        # The support models are downloaded in both modes; previously they
+        # were only fetched with --no-interactive, so a default run of this
+        # script never downloaded them.
+        print('\n** DOWNLOADING SUPPORT MODELS **')
+        download_bert()
+        download_kornia()
+        download_clip()
+        download_gfpgan()
+        download_codeformer()
+        download_clipseg()
+        download_safety_checker()
+        postscript()
+    except KeyboardInterrupt:
+        print('\nGoodbye! Come back soon.')
+    except Exception as e:
+        # top-level boundary: report the failure instead of a traceback
+        print(f'\nA problem occurred during download.\nThe error was: "{str(e)}"')
+