preload_models.py script downloads the weight files

- The user can select which weight files to download, using the Hugging Face cache.
- The user must log in to Hugging Face, generate an access token, and accept the license terms the very first time this is run. After that, everything works automatically.
- Added a placeholder for the docs on installing models.
- Also removed unused config files. I don't think they were needed for textual inversion, but this has not been verified.
2024-08-30 20:32:17 +00:00 · 2022-10-29 01:02:45 -04:00
parent 104466f5c0
commit ef68a419f1
37 changed files with 286 additions and 2115 deletions
--- a/.gitignore
+++ b/.gitignore
@ -199,7 +199,13 @@ checkpoints
 .scratch/
 .vscode/
 gfpgan/
-models/ldm/stable-diffusion-v1/model.sha256
+models/ldm/stable-diffusion-v1/*.sha256

 # GFPGAN model files
 gfpgan/
+
+# config file (will be created by installer)
+configs/models.yaml
+
+# weights (will be created by installer)
+models/ldm/stable-diffusion-v1/*.ckpt
--- a/configs/autoencoder/autoencoder_kl_16x16x16.yaml
+++ b/configs/autoencoder/autoencoder_kl_16x16x16.yaml
@ -1,54 +0,0 @@
-model:
-  base_learning_rate: 4.5e-6
-  target: ldm.models.autoencoder.AutoencoderKL
-  params:
-    monitor: "val/rec_loss"
-    embed_dim: 16
-    lossconfig:
-      target: ldm.modules.losses.LPIPSWithDiscriminator
-      params:
-        disc_start: 50001
-        kl_weight: 0.000001
-        disc_weight: 0.5
-
-    ddconfig:
-      double_z: True
-      z_channels: 16
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult: [ 1,1,2,2,4]  # num_down = len(ch_mult)-1
-      num_res_blocks: 2
-      attn_resolutions: [16]
-      dropout: 0.0
-
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 12
-    wrap: True
-    train:
-      target: ldm.data.imagenet.ImageNetSRTrain
-      params:
-        size: 256
-        degradation: pil_nearest
-    validation:
-      target: ldm.data.imagenet.ImageNetSRValidation
-      params:
-        size: 256
-        degradation: pil_nearest
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 1000
-        max_images: 8
-        increase_log_steps: True
-
-  trainer:
-    benchmark: True
-    accumulate_grad_batches: 2
--- a/configs/autoencoder/autoencoder_kl_32x32x4.yaml
+++ b/configs/autoencoder/autoencoder_kl_32x32x4.yaml
@ -1,53 +0,0 @@
-model:
-  base_learning_rate: 4.5e-6
-  target: ldm.models.autoencoder.AutoencoderKL
-  params:
-    monitor: "val/rec_loss"
-    embed_dim: 4
-    lossconfig:
-      target: ldm.modules.losses.LPIPSWithDiscriminator
-      params:
-        disc_start: 50001
-        kl_weight: 0.000001
-        disc_weight: 0.5
-
-    ddconfig:
-      double_z: True
-      z_channels: 4
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult: [ 1,2,4,4 ]  # num_down = len(ch_mult)-1
-      num_res_blocks: 2
-      attn_resolutions: [ ]
-      dropout: 0.0
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 12
-    wrap: True
-    train:
-      target: ldm.data.imagenet.ImageNetSRTrain
-      params:
-        size: 256
-        degradation: pil_nearest
-    validation:
-      target: ldm.data.imagenet.ImageNetSRValidation
-      params:
-        size: 256
-        degradation: pil_nearest
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 1000
-        max_images: 8
-        increase_log_steps: True
-
-  trainer:
-    benchmark: True
-    accumulate_grad_batches: 2
--- a/configs/autoencoder/autoencoder_kl_64x64x3.yaml
+++ b/configs/autoencoder/autoencoder_kl_64x64x3.yaml
@ -1,54 +0,0 @@
-model:
-  base_learning_rate: 4.5e-6
-  target: ldm.models.autoencoder.AutoencoderKL
-  params:
-    monitor: "val/rec_loss"
-    embed_dim: 3
-    lossconfig:
-      target: ldm.modules.losses.LPIPSWithDiscriminator
-      params:
-        disc_start: 50001
-        kl_weight: 0.000001
-        disc_weight: 0.5
-
-    ddconfig:
-      double_z: True
-      z_channels: 3
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult: [ 1,2,4 ]  # num_down = len(ch_mult)-1
-      num_res_blocks: 2
-      attn_resolutions: [ ]
-      dropout: 0.0
-
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 12
-    wrap: True
-    train:
-      target: ldm.data.imagenet.ImageNetSRTrain
-      params:
-        size: 256
-        degradation: pil_nearest
-    validation:
-      target: ldm.data.imagenet.ImageNetSRValidation
-      params:
-        size: 256
-        degradation: pil_nearest
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 1000
-        max_images: 8
-        increase_log_steps: True
-
-  trainer:
-    benchmark: True
-    accumulate_grad_batches: 2
--- a/configs/autoencoder/autoencoder_kl_8x8x64.yaml
+++ b/configs/autoencoder/autoencoder_kl_8x8x64.yaml
@ -1,53 +0,0 @@
-model:
-  base_learning_rate: 4.5e-6
-  target: ldm.models.autoencoder.AutoencoderKL
-  params:
-    monitor: "val/rec_loss"
-    embed_dim: 64
-    lossconfig:
-      target: ldm.modules.losses.LPIPSWithDiscriminator
-      params:
-        disc_start: 50001
-        kl_weight: 0.000001
-        disc_weight: 0.5
-
-    ddconfig:
-      double_z: True
-      z_channels: 64
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult: [ 1,1,2,2,4,4]  # num_down = len(ch_mult)-1
-      num_res_blocks: 2
-      attn_resolutions: [16,8]
-      dropout: 0.0
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 12
-    wrap: True
-    train:
-      target: ldm.data.imagenet.ImageNetSRTrain
-      params:
-        size: 256
-        degradation: pil_nearest
-    validation:
-      target: ldm.data.imagenet.ImageNetSRValidation
-      params:
-        size: 256
-        degradation: pil_nearest
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 1000
-        max_images: 8
-        increase_log_steps: True
-
-  trainer:
-    benchmark: True
-    accumulate_grad_batches: 2
--- a/configs/latent-diffusion/celebahq-ldm-vq-4.yaml
+++ b/configs/latent-diffusion/celebahq-ldm-vq-4.yaml
@ -1,86 +0,0 @@
-model:
-  base_learning_rate: 2.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    image_size: 64
-    channels: 3
-    monitor: val/loss_simple_ema
-
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 224
-        attention_resolutions:
-        # note: this isn\t actually the resolution but
-        # the downsampling factor, i.e. this corresnponds to
-        # attention on spatial resolution 8,16,32, as the
-        # spatial reolution of the latents is 64 for f4
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        num_head_channels: 32
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        ckpt_path: models/first_stage_models/vq-f4/model.ckpt
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config: __is_unconditional__
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 48
-    num_workers: 5
-    wrap: false
-    train:
-      target: taming.data.faceshq.CelebAHQTrain
-      params:
-        size: 256
-    validation:
-      target: taming.data.faceshq.CelebAHQValidation
-      params:
-        size: 256
-
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 5000
-        max_images: 8
-        increase_log_steps: False
-
-  trainer:
-    benchmark: True
--- a/configs/latent-diffusion/cin-ldm-vq-f8.yaml
+++ b/configs/latent-diffusion/cin-ldm-vq-f8.yaml
@ -1,98 +0,0 @@
-model:
-  base_learning_rate: 1.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: class_label
-    image_size: 32
-    channels: 4
-    cond_stage_trainable: true
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 32
-        in_channels: 4
-        out_channels: 4
-        model_channels: 256
-        attention_resolutions:
-        #note: this isn\t actually the resolution but
-        # the downsampling factor, i.e. this corresnponds to
-        # attention on spatial resolution 8,16,32, as the
-        # spatial reolution of the latents is 32 for f8
-        - 4
-        - 2
-        - 1
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 4
-        num_head_channels: 32
-        use_spatial_transformer: true
-        transformer_depth: 1
-        context_dim: 512
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 4
-        n_embed: 16384
-        ckpt_path: configs/first_stage_models/vq-f8/model.yaml
-        ddconfig:
-          double_z: false
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions:
-          - 32
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.ClassEmbedder
-      params:
-        embed_dim: 512
-        key: class_label
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 64
-    num_workers: 12
-    wrap: false
-    train:
-      target: ldm.data.imagenet.ImageNetTrain
-      params:
-        config:
-          size: 256
-    validation:
-      target: ldm.data.imagenet.ImageNetValidation
-      params:
-        config:
-          size: 256
-
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 5000
-        max_images: 8
-        increase_log_steps: False
-
-  trainer:
-    benchmark: True
--- a/configs/latent-diffusion/cin256-v2.yaml
+++ b/configs/latent-diffusion/cin256-v2.yaml
@ -1,68 +0,0 @@
-model:
-  base_learning_rate: 0.0001
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: class_label
-    image_size: 64
-    channels: 3
-    cond_stage_trainable: true
-    conditioning_key: crossattn
-    monitor: val/loss
-    use_ema: False
-    
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 192
-        attention_resolutions:
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 5
-        num_heads: 1
-        use_spatial_transformer: true
-        transformer_depth: 1
-        context_dim: 512
-    
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.ClassEmbedder
-      params:
-        n_classes: 1001
-        embed_dim: 512
-        key: class_label
--- a/configs/latent-diffusion/ffhq-ldm-vq-4.yaml
+++ b/configs/latent-diffusion/ffhq-ldm-vq-4.yaml
@ -1,85 +0,0 @@
-model:
-  base_learning_rate: 2.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    image_size: 64
-    channels: 3
-    monitor: val/loss_simple_ema
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 224
-        attention_resolutions:
-        # note: this isn\t actually the resolution but
-        # the downsampling factor, i.e. this corresnponds to
-        # attention on spatial resolution 8,16,32, as the
-        # spatial reolution of the latents is 64 for f4
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        num_head_channels: 32
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        ckpt_path: configs/first_stage_models/vq-f4/model.yaml
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config: __is_unconditional__
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 42
-    num_workers: 5
-    wrap: false
-    train:
-      target: taming.data.faceshq.FFHQTrain
-      params:
-        size: 256
-    validation:
-      target: taming.data.faceshq.FFHQValidation
-      params:
-        size: 256
-
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 5000
-        max_images: 8
-        increase_log_steps: False
-
-  trainer:
-    benchmark: True
--- a/configs/latent-diffusion/lsun_bedrooms-ldm-vq-4.yaml
+++ b/configs/latent-diffusion/lsun_bedrooms-ldm-vq-4.yaml
@ -1,85 +0,0 @@
-model:
-  base_learning_rate: 2.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    image_size: 64
-    channels: 3
-    monitor: val/loss_simple_ema
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 224
-        attention_resolutions:
-        # note: this isn\t actually the resolution but
-        # the downsampling factor, i.e. this corresnponds to
-        # attention on spatial resolution 8,16,32, as the
-        # spatial reolution of the latents is 64 for f4
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        num_head_channels: 32
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        ckpt_path: configs/first_stage_models/vq-f4/model.yaml
-        embed_dim: 3
-        n_embed: 8192
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config: __is_unconditional__
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 48
-    num_workers: 5
-    wrap: false
-    train:
-      target: ldm.data.lsun.LSUNBedroomsTrain
-      params:
-        size: 256
-    validation:
-      target: ldm.data.lsun.LSUNBedroomsValidation
-      params:
-        size: 256
-
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 5000
-        max_images: 8
-        increase_log_steps: False
-
-  trainer:
-    benchmark: True
--- a/configs/latent-diffusion/lsun_churches-ldm-kl-8.yaml
+++ b/configs/latent-diffusion/lsun_churches-ldm-kl-8.yaml
@ -1,91 +0,0 @@
-model:
-  base_learning_rate: 5.0e-5   # set to target_lr by starting main.py with '--scale_lr False'
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0155
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    loss_type: l1
-    first_stage_key: "image"
-    cond_stage_key: "image"
-    image_size: 32
-    channels: 4
-    cond_stage_trainable: False
-    concat_mode: False
-    scale_by_std: True
-    monitor: 'val/loss_simple_ema'
-
-    scheduler_config: # 10000 warmup steps
-      target: ldm.lr_scheduler.LambdaLinearScheduler
-      params:
-        warm_up_steps: [10000]
-        cycle_lengths: [10000000000000]
-        f_start: [1.e-6]
-        f_max: [1.]
-        f_min: [ 1.]
-
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 32
-        in_channels: 4
-        out_channels: 4
-        model_channels: 192
-        attention_resolutions: [ 1, 2, 4, 8 ]   # 32, 16, 8, 4
-        num_res_blocks: 2
-        channel_mult: [ 1,2,2,4,4 ]  # 32, 16, 8, 4, 2
-        num_heads: 8
-        use_scale_shift_norm: True
-        resblock_updown: True
-
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: "val/rec_loss"
-        ckpt_path: "models/first_stage_models/kl-f8/model.ckpt"
-        ddconfig:
-          double_z: True
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult: [ 1,2,4,4 ]  # num_down = len(ch_mult)-1
-          num_res_blocks: 2
-          attn_resolutions: [ ]
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config: "__is_unconditional__"
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 96
-    num_workers: 5
-    wrap: False
-    train:
-      target: ldm.data.lsun.LSUNChurchesTrain
-      params:
-        size: 256
-    validation:
-      target: ldm.data.lsun.LSUNChurchesValidation
-      params:
-        size: 256
-
-lightning:
-  callbacks:
-    image_logger:
-      target: main.ImageLogger
-      params:
-        batch_frequency: 5000
-        max_images: 8
-        increase_log_steps: False
-
-
-  trainer:
-    benchmark: True
--- a/configs/latent-diffusion/txt2img-1p4B-eval.yaml
+++ b/configs/latent-diffusion/txt2img-1p4B-eval.yaml
@ -1,71 +0,0 @@
-model:
-  base_learning_rate: 5.0e-05
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.00085
-    linear_end: 0.012
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: caption
-    image_size: 32
-    channels: 4
-    cond_stage_trainable: true
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False
-
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 32
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions:
-        - 4
-        - 2
-        - 1
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 4
-        - 4
-        num_heads: 8
-        use_spatial_transformer: true
-        transformer_depth: 1
-        context_dim: 1280
-        use_checkpoint: true
-        legacy: False
-
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.BERTEmbedder
-      params:
-        n_embed: 1280
-        n_layer: 32
--- a/configs/models.yaml
+++ b/configs/models.yaml
@ -1,5 +1,5 @@
 # This file describes the alternative machine learning models
-# available to the dream script.
+# available to InvokeAI script.
 #
 # To add a new model, follow the examples below. Each
 # model requires a model config file, a weights file,
@ -8,22 +8,29 @@
 stable-diffusion-1.4:
  config: configs/stable-diffusion/v1-inference.yaml
  weights: models/ldm/stable-diffusion-v1/model.ckpt
-#  vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+  vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
  description: Stable Diffusion inference model version 1.4
  width: 512
  height: 512
+stable-diffusion-1.5:
+  description: The newest Stable Diffusion version 1.5 weight file (4.27 GB)
+  weights: ./models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt
+  config: ./configs/stable-diffusion/v1-inference.yaml
+  width: 512
+  height: 512
+  vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
  default: true
 inpainting-1.5:
-  description: runwayML tuned inpainting model v1.5
-  weights: models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt
-  config: configs/stable-diffusion/v1-inpainting-inference.yaml
-#  vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+  description: RunwayML SD 1.5 model optimized for inpainting (4.27 GB)
+  weights: ./models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt
+  config: ./configs/stable-diffusion/v1-inpainting-inference.yaml
  width: 512
  height: 512
-stable-diffusion-1.5:
-  config: configs/stable-diffusion/v1-inference.yaml
-  weights: models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt
-#  vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
-  description: Stable Diffusion inference model version 1.5
+  vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+waifu-diffusion-1.3:
+  description: Stable Diffusion 1.4 fine tuned on anime-styled images (4.27)
+  weights: ./models/ldm/stable-diffusion-v1/model-epoch09-float32.ckpt
+  config: ./configs/stable-diffusion/v1-inference.yaml
  width: 512
  height: 512
+  vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
--- a/configs/retrieval-augmented-diffusion/768x768.yaml
+++ b/configs/retrieval-augmented-diffusion/768x768.yaml
@ -1,68 +0,0 @@
-model:
-  base_learning_rate: 0.0001
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.015
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: jpg
-    cond_stage_key: nix
-    image_size: 48
-    channels: 16
-    cond_stage_trainable: false
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_by_std: false
-    scale_factor: 0.22765929
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 48
-        in_channels: 16
-        out_channels: 16
-        model_channels: 448
-        attention_resolutions:
-        - 4
-        - 2
-        - 1
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        use_scale_shift_norm: false
-        resblock_updown: false
-        num_head_channels: 32
-        use_spatial_transformer: true
-        transformer_depth: 1
-        context_dim: 768
-        use_checkpoint: true
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        monitor: val/rec_loss
-        embed_dim: 16
-        ddconfig:
-          double_z: true
-          z_channels: 16
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 1
-          - 2
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions:
-          - 16
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: torch.nn.Identity
--- a/docs/features/INSTALLING_MODELS.md
+++ b/docs/features/INSTALLING_MODELS.md
@ -0,0 +1,9 @@
+---
+title: Installing Models
+---
+
+# :octicons-paintbrush-16: Installing Models
+
+## TO COME
+
+
--- a/ldm/invoke/model_cache.py
+++ b/ldm/invoke/model_cache.py
@ -281,7 +281,7 @@ class ModelCache(object):
        Returns the preamble for the config file.
        '''
        return '''# This file describes the alternative machine learning models
-# available to the dream script.
+# available to InvokeAI script.
 #
 # To add a new model, follow the examples below. Each
 # model requires a model config file, a weights file,
--- a/models/first_stage_models/kl-f16/config.yaml
+++ b/models/first_stage_models/kl-f16/config.yaml
@ -1,44 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.AutoencoderKL
-  params:
-    monitor: val/rec_loss
-    embed_dim: 16
-    lossconfig:
-      target: ldm.modules.losses.LPIPSWithDiscriminator
-      params:
-        disc_start: 50001
-        kl_weight: 1.0e-06
-        disc_weight: 0.5
-    ddconfig:
-      double_z: true
-      z_channels: 16
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 1
-      - 2
-      - 2
-      - 4
-      num_res_blocks: 2
-      attn_resolutions:
-      - 16
-      dropout: 0.0
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 6
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        size: 384
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        size: 384
-        crop_size: 256
--- a/models/first_stage_models/kl-f32/config.yaml
+++ b/models/first_stage_models/kl-f32/config.yaml
@ -1,46 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.AutoencoderKL
-  params:
-    monitor: val/rec_loss
-    embed_dim: 64
-    lossconfig:
-      target: ldm.modules.losses.LPIPSWithDiscriminator
-      params:
-        disc_start: 50001
-        kl_weight: 1.0e-06
-        disc_weight: 0.5
-    ddconfig:
-      double_z: true
-      z_channels: 64
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 1
-      - 2
-      - 2
-      - 4
-      - 4
-      num_res_blocks: 2
-      attn_resolutions:
-      - 16
-      - 8
-      dropout: 0.0
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 6
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        size: 384
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        size: 384
-        crop_size: 256
--- a/models/first_stage_models/kl-f4/config.yaml
+++ b/models/first_stage_models/kl-f4/config.yaml
@ -1,41 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.AutoencoderKL
-  params:
-    monitor: val/rec_loss
-    embed_dim: 3
-    lossconfig:
-      target: ldm.modules.losses.LPIPSWithDiscriminator
-      params:
-        disc_start: 50001
-        kl_weight: 1.0e-06
-        disc_weight: 0.5
-    ddconfig:
-      double_z: true
-      z_channels: 3
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 2
-      - 4
-      num_res_blocks: 2
-      attn_resolutions: []
-      dropout: 0.0
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 10
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        size: 384
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        size: 384
-        crop_size: 256
--- a/models/first_stage_models/kl-f8/config.yaml
+++ b/models/first_stage_models/kl-f8/config.yaml
@ -1,42 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.AutoencoderKL
-  params:
-    monitor: val/rec_loss
-    embed_dim: 4
-    lossconfig:
-      target: ldm.modules.losses.LPIPSWithDiscriminator
-      params:
-        disc_start: 50001
-        kl_weight: 1.0e-06
-        disc_weight: 0.5
-    ddconfig:
-      double_z: true
-      z_channels: 4
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 2
-      - 4
-      - 4
-      num_res_blocks: 2
-      attn_resolutions: []
-      dropout: 0.0
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 4
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        size: 384
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        size: 384
-        crop_size: 256
--- a/models/first_stage_models/vq-f16/config.yaml
+++ b/models/first_stage_models/vq-f16/config.yaml
@ -1,49 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.VQModel
-  params:
-    embed_dim: 8
-    n_embed: 16384
-    ddconfig:
-      double_z: false
-      z_channels: 8
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 1
-      - 2
-      - 2
-      - 4
-      num_res_blocks: 2
-      attn_resolutions:
-      - 16
-      dropout: 0.0
-    lossconfig:
-      target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
-      params:
-        disc_conditional: false
-        disc_in_channels: 3
-        disc_start: 250001
-        disc_weight: 0.75
-        disc_num_layers: 2
-        codebook_weight: 1.0
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 14
-    num_workers: 20
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        size: 384
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        size: 384
-        crop_size: 256
--- a/models/first_stage_models/vq-f4-noattn/config.yaml
+++ b/models/first_stage_models/vq-f4-noattn/config.yaml
@ -1,46 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.VQModel
-  params:
-    embed_dim: 3
-    n_embed: 8192
-    monitor: val/rec_loss
-
-    ddconfig:
-      attn_type: none
-      double_z: false
-      z_channels: 3
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 2
-      - 4
-      num_res_blocks: 2
-      attn_resolutions: []
-      dropout: 0.0
-    lossconfig:
-      target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
-      params:
-        disc_conditional: false
-        disc_in_channels: 3
-        disc_start: 11
-        disc_weight: 0.75
-        codebook_weight: 1.0
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 8
-    num_workers: 12
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        crop_size: 256
--- a/models/first_stage_models/vq-f4/config.yaml
+++ b/models/first_stage_models/vq-f4/config.yaml
@ -1,45 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.VQModel
-  params:
-    embed_dim: 3
-    n_embed: 8192
-    monitor: val/rec_loss
-
-    ddconfig:
-      double_z: false
-      z_channels: 3
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 2
-      - 4
-      num_res_blocks: 2
-      attn_resolutions: []
-      dropout: 0.0
-    lossconfig:
-      target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
-      params:
-        disc_conditional: false
-        disc_in_channels: 3
-        disc_start: 0
-        disc_weight: 0.75
-        codebook_weight: 1.0
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 8
-    num_workers: 16
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        crop_size: 256
--- a/models/first_stage_models/vq-f8-n256/config.yaml
+++ b/models/first_stage_models/vq-f8-n256/config.yaml
@ -1,48 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.VQModel
-  params:
-    embed_dim: 4
-    n_embed: 256
-    monitor: val/rec_loss
-    ddconfig:
-      double_z: false
-      z_channels: 4
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 2
-      - 2
-      - 4
-      num_res_blocks: 2
-      attn_resolutions:
-      - 32
-      dropout: 0.0
-    lossconfig:
-      target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
-      params:
-        disc_conditional: false
-        disc_in_channels: 3
-        disc_start: 250001
-        disc_weight: 0.75
-        codebook_weight: 1.0
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 10
-    num_workers: 20
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        size: 384
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        size: 384
-        crop_size: 256
--- a/models/first_stage_models/vq-f8/config.yaml
+++ b/models/first_stage_models/vq-f8/config.yaml
@ -1,48 +0,0 @@
-model:
-  base_learning_rate: 4.5e-06
-  target: ldm.models.autoencoder.VQModel
-  params:
-    embed_dim: 4
-    n_embed: 16384
-    monitor: val/rec_loss
-    ddconfig:
-      double_z: false
-      z_channels: 4
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult:
-      - 1
-      - 2
-      - 2
-      - 4
-      num_res_blocks: 2
-      attn_resolutions:
-      - 32
-      dropout: 0.0
-    lossconfig:
-      target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
-      params:
-        disc_conditional: false
-        disc_in_channels: 3
-        disc_num_layers: 2
-        disc_start: 1
-        disc_weight: 0.6
-        codebook_weight: 1.0
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 10
-    num_workers: 20
-    wrap: true
-    train:
-      target: ldm.data.openimages.FullOpenImagesTrain
-      params:
-        size: 384
-        crop_size: 256
-    validation:
-      target: ldm.data.openimages.FullOpenImagesValidation
-      params:
-        size: 384
-        crop_size: 256
--- a/models/ldm/bsr_sr/config.yaml
+++ b/models/ldm/bsr_sr/config.yaml
@ -1,80 +0,0 @@
-model:
-  base_learning_rate: 1.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0155
-    log_every_t: 100
-    timesteps: 1000
-    loss_type: l2
-    first_stage_key: image
-    cond_stage_key: LR_image
-    image_size: 64
-    channels: 3
-    concat_mode: true
-    cond_stage_trainable: false
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 6
-        out_channels: 3
-        model_channels: 160
-        attention_resolutions:
-        - 16
-        - 8
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 2
-        - 4
-        num_head_channels: 32
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: torch.nn.Identity
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 64
-    wrap: false
-    num_workers: 12
-    train:
-      target: ldm.data.openimages.SuperresOpenImagesAdvancedTrain
-      params:
-        size: 256
-        degradation: bsrgan_light
-        downscale_f: 4
-        min_crop_f: 0.5
-        max_crop_f: 1.0
-        random_crop: true
-    validation:
-      target: ldm.data.openimages.SuperresOpenImagesAdvancedValidation
-      params:
-        size: 256
-        degradation: bsrgan_light
-        downscale_f: 4
-        min_crop_f: 0.5
-        max_crop_f: 1.0
-        random_crop: true
--- a/models/ldm/celeba256/config.yaml
+++ b/models/ldm/celeba256/config.yaml
@ -1,70 +0,0 @@
-model:
-  base_learning_rate: 2.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: class_label
-    image_size: 64
-    channels: 3
-    cond_stage_trainable: false
-    concat_mode: false
-    monitor: val/loss
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 224
-        attention_resolutions:
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        num_head_channels: 32
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config: __is_unconditional__
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 48
-    num_workers: 5
-    wrap: false
-    train:
-      target: ldm.data.faceshq.CelebAHQTrain
-      params:
-        size: 256
-    validation:
-      target: ldm.data.faceshq.CelebAHQValidation
-      params:
-        size: 256
--- a/models/ldm/cin256/config.yaml
+++ b/models/ldm/cin256/config.yaml
@ -1,80 +0,0 @@
-model:
-  base_learning_rate: 1.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: class_label
-    image_size: 32
-    channels: 4
-    cond_stage_trainable: true
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 32
-        in_channels: 4
-        out_channels: 4
-        model_channels: 256
-        attention_resolutions:
-        - 4
-        - 2
-        - 1
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 4
-        num_head_channels: 32
-        use_spatial_transformer: true
-        transformer_depth: 1
-        context_dim: 512
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 4
-        n_embed: 16384
-        ddconfig:
-          double_z: false
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions:
-          - 32
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.ClassEmbedder
-      params:
-        embed_dim: 512
-        key: class_label
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 64
-    num_workers: 12
-    wrap: false
-    train:
-      target: ldm.data.imagenet.ImageNetTrain
-      params:
-        config:
-          size: 256
-    validation:
-      target: ldm.data.imagenet.ImageNetValidation
-      params:
-        config:
-          size: 256
--- a/models/ldm/ffhq256/config.yaml
+++ b/models/ldm/ffhq256/config.yaml
@ -1,70 +0,0 @@
-model:
-  base_learning_rate: 2.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: class_label
-    image_size: 64
-    channels: 3
-    cond_stage_trainable: false
-    concat_mode: false
-    monitor: val/loss
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 224
-        attention_resolutions:
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        num_head_channels: 32
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config: __is_unconditional__
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 42
-    num_workers: 5
-    wrap: false
-    train:
-      target: ldm.data.faceshq.FFHQTrain
-      params:
-        size: 256
-    validation:
-      target: ldm.data.faceshq.FFHQValidation
-      params:
-        size: 256
--- a/models/ldm/inpainting_big/config.yaml
+++ b/models/ldm/inpainting_big/config.yaml
@ -1,67 +0,0 @@
-model:
-  base_learning_rate: 1.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0205
-    log_every_t: 100
-    timesteps: 1000
-    loss_type: l1
-    first_stage_key: image
-    cond_stage_key: masked_image
-    image_size: 64
-    channels: 3
-    concat_mode: true
-    monitor: val/loss
-    scheduler_config:
-      target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler
-      params:
-        verbosity_interval: 0
-        warm_up_steps: 1000
-        max_decay_steps: 50000
-        lr_start: 0.001
-        lr_max: 0.1
-        lr_min: 0.0001
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 7
-        out_channels: 3
-        model_channels: 256
-        attention_resolutions:
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        num_heads: 8
-        resblock_updown: true
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        monitor: val/rec_loss
-        ddconfig:
-          attn_type: none
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: ldm.modules.losses.contperceptual.DummyLoss
-    cond_stage_config: __is_first_stage__
--- a/models/ldm/layout2img-openimages256/config.yaml
+++ b/models/ldm/layout2img-openimages256/config.yaml
@ -1,81 +0,0 @@
-model:
-  base_learning_rate: 2.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0205
-    log_every_t: 100
-    timesteps: 1000
-    loss_type: l1
-    first_stage_key: image
-    cond_stage_key: coordinates_bbox
-    image_size: 64
-    channels: 3
-    conditioning_key: crossattn
-    cond_stage_trainable: true
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 128
-        attention_resolutions:
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        num_head_channels: 32
-        use_spatial_transformer: true
-        transformer_depth: 3
-        context_dim: 512
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.BERTEmbedder
-      params:
-        n_embed: 512
-        n_layer: 16
-        vocab_size: 8192
-        max_seq_len: 92
-        use_tokenizer: false
-    monitor: val/loss_simple_ema
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 24
-    wrap: false
-    num_workers: 10
-    train:
-      target: ldm.data.openimages.OpenImagesBBoxTrain
-      params:
-        size: 256
-    validation:
-      target: ldm.data.openimages.OpenImagesBBoxValidation
-      params:
-        size: 256
--- a/models/ldm/lsun_beds256/config.yaml
+++ b/models/ldm/lsun_beds256/config.yaml
@ -1,70 +0,0 @@
-model:
-  base_learning_rate: 2.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: class_label
-    image_size: 64
-    channels: 3
-    cond_stage_trainable: false
-    concat_mode: false
-    monitor: val/loss
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 224
-        attention_resolutions:
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 4
-        num_head_channels: 32
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config: __is_unconditional__
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 48
-    num_workers: 5
-    wrap: false
-    train:
-      target: ldm.data.lsun.LSUNBedroomsTrain
-      params:
-        size: 256
-    validation:
-      target: ldm.data.lsun.LSUNBedroomsValidation
-      params:
-        size: 256
--- a/models/ldm/lsun_churches256/config.yaml
+++ b/models/ldm/lsun_churches256/config.yaml
@ -1,92 +0,0 @@
-model:
-  base_learning_rate: 5.0e-05
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0155
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    loss_type: l1
-    first_stage_key: image
-    cond_stage_key: image
-    image_size: 32
-    channels: 4
-    cond_stage_trainable: false
-    concat_mode: false
-    scale_by_std: true
-    monitor: val/loss_simple_ema
-    scheduler_config:
-      target: ldm.lr_scheduler.LambdaLinearScheduler
-      params:
-        warm_up_steps:
-        - 10000
-        cycle_lengths:
-        - 10000000000000
-        f_start:
-        - 1.0e-06
-        f_max:
-        - 1.0
-        f_min:
-        - 1.0
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 32
-        in_channels: 4
-        out_channels: 4
-        model_channels: 192
-        attention_resolutions:
-        - 1
-        - 2
-        - 4
-        - 8
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 2
-        - 4
-        - 4
-        num_heads: 8
-        use_scale_shift_norm: true
-        resblock_updown: true
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config: '__is_unconditional__'
-
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 96
-    num_workers: 5
-    wrap: false
-    train:
-      target: ldm.data.lsun.LSUNChurchesTrain
-      params:
-        size: 256
-    validation:
-      target: ldm.data.lsun.LSUNChurchesValidation
-      params:
-        size: 256
--- a/models/ldm/semantic_synthesis256/config.yaml
+++ b/models/ldm/semantic_synthesis256/config.yaml
@ -1,59 +0,0 @@
-model:
-  base_learning_rate: 1.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0205
-    log_every_t: 100
-    timesteps: 1000
-    loss_type: l1
-    first_stage_key: image
-    cond_stage_key: segmentation
-    image_size: 64
-    channels: 3
-    concat_mode: true
-    cond_stage_trainable: true
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 6
-        out_channels: 3
-        model_channels: 128
-        attention_resolutions:
-        - 32
-        - 16
-        - 8
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 4
-        - 8
-        num_heads: 8
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.SpatialRescaler
-      params:
-        n_stages: 2
-        in_channels: 182
-        out_channels: 3
--- a/models/ldm/semantic_synthesis512/config.yaml
+++ b/models/ldm/semantic_synthesis512/config.yaml
@ -1,78 +0,0 @@
-model:
-  base_learning_rate: 1.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0205
-    log_every_t: 100
-    timesteps: 1000
-    loss_type: l1
-    first_stage_key: image
-    cond_stage_key: segmentation
-    image_size: 128
-    channels: 3
-    concat_mode: true
-    cond_stage_trainable: true
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 128
-        in_channels: 6
-        out_channels: 3
-        model_channels: 128
-        attention_resolutions:
-        - 32
-        - 16
-        - 8
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 4
-        - 8
-        num_heads: 8
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.SpatialRescaler
-      params:
-        n_stages: 2
-        in_channels: 182
-        out_channels: 3
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 8
-    wrap: false
-    num_workers: 10
-    train:
-      target: ldm.data.landscapes.RFWTrain
-      params:
-        size: 768
-        crop_size: 512
-        segmentation_to_float32: true
-    validation:
-      target: ldm.data.landscapes.RFWValidation
-      params:
-        size: 768
-        crop_size: 512
-        segmentation_to_float32: true
--- a/models/ldm/text2img256/config.yaml
+++ b/models/ldm/text2img256/config.yaml
@ -1,77 +0,0 @@
-model:
-  base_learning_rate: 2.0e-06
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.0015
-    linear_end: 0.0195
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: image
-    cond_stage_key: caption
-    image_size: 64
-    channels: 3
-    cond_stage_trainable: true
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 64
-        in_channels: 3
-        out_channels: 3
-        model_channels: 192
-        attention_resolutions:
-        - 8
-        - 4
-        - 2
-        num_res_blocks: 2
-        channel_mult:
-        - 1
-        - 2
-        - 3
-        - 5
-        num_head_channels: 32
-        use_spatial_transformer: true
-        transformer_depth: 1
-        context_dim: 640
-    first_stage_config:
-      target: ldm.models.autoencoder.VQModelInterface
-      params:
-        embed_dim: 3
-        n_embed: 8192
-        ddconfig:
-          double_z: false
-          z_channels: 3
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.BERTEmbedder
-      params:
-        n_embed: 640
-        n_layer: 32
-data:
-  target: main.DataModuleFromConfig
-  params:
-    batch_size: 28
-    num_workers: 10
-    wrap: false
-    train:
-      target: ldm.data.previews.pytorch_dataset.PreviewsTrain
-      params:
-        size: 256
-    validation:
-      target: ldm.data.previews.pytorch_dataset.PreviewsValidation
-      params:
-        size: 256
--- a/scripts/preload_models.py
+++ b/scripts/preload_models.py
@ -3,9 +3,11 @@
 # Before running stable-diffusion on an internet-isolated machine,
 # run this script from one with internet connectivity. The
 # two machines must share a common .cache directory.
-from transformers import CLIPTokenizer, CLIPTextModel
+#
+# Coauthor: Kevin Turner http://github.com/keturn
+#
+print('Loading Python libraries...\n')
 import clip
-from transformers import BertTokenizerFast, AutoFeatureExtractor
 import sys
 import transformers
 import os
@ -14,9 +16,247 @@ import torch
 import urllib.request
 import zipfile
 import traceback
+import getpass
+from omegaconf import OmegaConf
+from pathlib import Path
+from transformers import CLIPTokenizer, CLIPTextModel
+from transformers import BertTokenizerFast, AutoFeatureExtractor
+from huggingface_hub import hf_hub_download, HfFolder, hf_hub_url

 transformers.logging.set_verbosity_error()

+#--------------------------globals--
+# Directory in which the weight (.ckpt) files are installed.
+Model_dir = './models/ldm/stable-diffusion-v1/'
+# Model registry file that update_config_file() rewrites after a download.
+Config_file = './configs/models.yaml'
+# Directory holding the inference config files named by the 'config' keys below.
+SD_Configs = './configs/stable-diffusion'
+# Catalogue of downloadable weight files, keyed by the model name that is
+# written to Config_file. For each entry:
+#   description - text shown in the selection menu and stored in Config_file
+#   repo_id     - Hugging Face repository that hosts the file
+#   config      - config file name inside SD_Configs, or 'VAE' for an
+#                 autoencoder that is attached to the other models rather
+#                 than listed as a model of its own
+#   file        - file name in the repository and inside Model_dir
+#   recommended - whether "download" is the default answer in the menu
+#   width, height - image dimensions recorded in Config_file
+Datasets = {
+    'stable-diffusion-1.5':  {
+        'description': 'The newest Stable Diffusion version 1.5 weight file (4.27 GB)',
+        'repo_id': 'runwayml/stable-diffusion-v1-5',
+        'config': 'v1-inference.yaml',
+        'file': 'v1-5-pruned-emaonly.ckpt',
+        'recommended': True,
+        'width': 512,
+        'height': 512,
+    },
+    'inpainting-1.5': {
+        'description': 'RunwayML SD 1.5 model optimized for inpainting (4.27 GB)',
+        'repo_id': 'runwayml/stable-diffusion-inpainting',
+        'config': 'v1-inpainting-inference.yaml',
+        'file': 'sd-v1-5-inpainting.ckpt',
+        'recommended': True,
+        'width': 512,
+        'height': 512,
+    },
+    'stable-diffusion-1.4': {
+        'description': 'The original Stable Diffusion version 1.4 weight file (4.27 GB)',
+        'repo_id': 'CompVis/stable-diffusion-v-1-4-original',
+        'config': 'v1-inference.yaml',
+        'file': 'sd-v1-4.ckpt',
+        'recommended': False,
+        'width': 512,
+        'height': 512,
+    },
+    'waifu-diffusion-1.3': {
+        'description': 'Stable Diffusion 1.4 fine tuned on anime-styled images (4.27 GB)',
+        'repo_id': 'hakurei/waifu-diffusion-v1-3',
+        'config': 'v1-inference.yaml',
+        'file': 'model-epoch09-float32.ckpt',
+        'recommended': False,
+        'width': 512,
+        'height': 512,
+    },
+    'ft-mse-improved-autoencoder-840000': {
+        'description': 'StabilityAI improved autoencoder fine-tuned for human faces (recommended; 335 MB)',
+        'repo_id': 'stabilityai/sd-vae-ft-mse-original',
+        'config': 'VAE',
+        'file': 'vae-ft-mse-840000-ema-pruned.ckpt',
+        'recommended': True,
+        'width': 512,
+        'height': 512,
+    },
+}
+# Comment header written at the top of Config_file ahead of the generated YAML.
+Config_preamble = '''# This file describes the alternative machine learning models
+# available to InvokeAI script.
+#
+# To add a new model, follow the examples below. Each
+# model requires a model config file, a weights file,
+# and the width and height of the images it
+# was trained on.
+'''
+
+#---------------------------------------------
+def introduction():
+    """Print the welcome banner that explains what this script is about to do."""
+    banner_lines = (
+        'Welcome to InvokeAI. This script will help download the Stable Diffusion weight files',
+        'and other large models that are needed for text to image generation. At any point you may interrupt',
+        'this program and resume later.\n',
+    )
+    print('\n'.join(banner_lines))
+
+#---------------------------------------------
+def yes_or_no(prompt:str, default_yes=True):
+    """Ask a yes/no question on the terminal and return the answer as a bool.
+
+    prompt:      question text; the default answer is appended in brackets.
+    default_yes: answer assumed when the user just presses <enter>.
+
+    When the default is yes, only an answer starting with 'n' or 'N' returns
+    False; when the default is no, only an answer starting with 'y' or 'Y'
+    returns True.
+    """
+    default = "y" if default_yes else 'n'
+    # Strip surrounding whitespace so that an answer such as " n" is judged by
+    # its first real character; a blank answer falls back to the default.
+    response = input(f'{prompt} [{default}] ').strip() or default
+    if default_yes:
+        return response[0] not in ('n','N')
+    return response[0] in ('y','Y')
+
+#---------------------------------------------
+def user_wants_to_download_weights():
+    """Ask whether the weight files should be downloaded now; return the answer from yes_or_no()."""
+    question = 'Would you like to download the Stable Diffusion model weights now?'
+    wants_download = yes_or_no(question)
+    return wants_download
+
+#---------------------------------------------
+def select_datasets():
+    """Interactively choose which entries of Datasets to download.
+
+    Every entry is offered in turn, then the selection is listed for
+    confirmation. Returns a dict mapping each chosen model name to its menu
+    number, or None when the user rejects the selection and does not want to
+    choose again.
+    """
+    while True:
+        print('''
+Choose the weight file(s) you wish to download. Before downloading you 
+will be given the option to view and change your selections.
+''')
+        chosen = {}
+        for number, (name, info) in enumerate(Datasets.items(), start=1):
+            tag = '(recommended)' if info['recommended'] else ''
+            print(f'[{number}] {name}:\n    {info["description"]} {tag}')
+            if yes_or_no('    Download?',default_yes=info['recommended']):
+                chosen[name] = number
+
+        print('The following weight files will be downloaded:')
+        # the first model selected will be the default; TODO let user change
+        for position, name in enumerate(chosen):
+            marker = '*' if position == 0 else ''
+            print(f'   [{chosen[name]}] {name}{marker}')
+        print("*default")
+
+        if yes_or_no('Ok to download?'):
+            return chosen
+        # Selection rejected: either start over or give up.
+        if not yes_or_no('Change your selection?'):
+            return None
+    
+#-------------------------------Authenticate against Hugging Face
+def authenticate():
+    """Walk the user through the license step and return a Hugging Face access token.
+
+    The token already stored by HfFolder is returned when there is one;
+    otherwise the user is asked to paste a token, which is saved with
+    HfFolder.save_token() before being returned.
+    """
+    license_notice = '''
+To download the Stable Diffusion weight files you need to read and accept the
+CreativeML Responsible AI license. If you have not already done so, please 
+create an account at https://huggingface.co. Then login under your account and
+read and accept the license available at https://huggingface.co/CompVis/stable-diffusion-v-1-4-original.
+'''
+    print(license_notice)
+    input('Press <enter> when you are ready to continue:')
+
+    stored_token = HfFolder.get_token()
+    if stored_token is not None:
+        return stored_token
+
+    token_request = '''
+Thank you! Now you need to authenticate with your HuggingFace access token.
+Go to https://huggingface.co/settings/tokens and create a token. Copy it to the
+clipboard and paste it here: '''
+    print(token_request)
+    # getpass keeps the pasted token from being echoed to the terminal.
+    entered_token = getpass.getpass()
+    HfFolder.save_token(entered_token)
+    return entered_token
+
+#---------------------------------------------
+# look for legacy model.ckpt in models directory and offer to
+# normalize its name
+def migrate_models_ckpt():
+    """Offer to rename a legacy "model.ckpt" to the Stable Diffusion 1.4 file name.
+
+    Does nothing when Model_dir holds no "model.ckpt", or when a file with the
+    new name is already present there.
+    """
+    legacy_path = os.path.join(Model_dir,'model.ckpt')
+    if not os.path.exists(legacy_path):
+        return
+    new_name = Datasets['stable-diffusion-1.4']['file']
+    new_path = os.path.join(Model_dir,new_name)
+    # Never rename over an existing weights file: os.rename() would silently
+    # replace it on POSIX and raise FileExistsError on Windows.
+    if os.path.exists(new_path):
+        return
+    print('You seem to have the Stable Diffusion v1.4 "model.ckpt" already installed.')
+    rename = yes_or_no(f'Ok to rename it to "{new_name}" for future reference?')
+    if rename:
+        print(f'model.ckpt => {new_name}')
+        os.rename(legacy_path,new_path)
+            
+#---------------------------------------------
+def download_weight_datasets(models:dict, access_token:str):
+    """Install every selected weight file and report which ones succeeded.
+
+    models:       dict whose keys are model names from Datasets (the values
+                  are not used here).
+    access_token: Hugging Face token handed to conditional_download().
+
+    Returns a dict mapping each successfully installed model name to True.
+    """
+    migrate_models_ckpt()
+    successful = dict()
+    for mod in models:
+        repo_id = Datasets[mod]['repo_id']
+        filename = Datasets[mod]['file']
+        success = conditional_download(
+            repo_id=repo_id,
+            model_name=filename,
+            access_token=access_token
+        )
+        if success:
+            successful[mod] = True
+    # Only announce success when something was installed; failures have
+    # already been reported one by one by conditional_download().
+    if successful:
+        keys = ', '.join(successful.keys())
+        print(f'Successfully installed {keys}')
+    return successful
+    
+#---------------------------------------------
+def conditional_download(repo_id:str, model_name:str, access_token:str):
+    """Make sure model_name is present in Model_dir, downloading it if needed.
+
+    repo_id:      Hugging Face repository that hosts the file.
+    model_name:   file name, both in the repository and inside Model_dir.
+    access_token: Hugging Face token passed to hf_hub_download().
+
+    Returns True when the file already exists or was installed, False when
+    the download or the linking step raised (the error is printed).
+    """
+    model_dest = os.path.join(Model_dir, model_name)
+    if os.path.exists(model_dest):
+        print(f' * {model_name}: exists')
+        return True
+    os.makedirs(os.path.dirname(model_dest), exist_ok=True)
+
+    try:
+        print(f' * {model_name}: downloading or retrieving from cache...')
+        path = Path(hf_hub_download(repo_id, model_name, use_auth_token=access_token))
+        # Hard-link the downloaded file into Model_dir. os.link(src, dst) is
+        # the operation Path.link_to() performed; link_to() itself was
+        # deprecated in Python 3.10 and removed in 3.12.
+        os.link(path.resolve(strict=True), model_dest)
+    except Exception as e:
+        print(f'** Error downloading {model_name}: {str(e)} **')
+        return False
+    return True
+                             
+#---------------------------------------------
+def update_config_file(successfully_downloaded:dict):
+    """Rewrite Config_file so that it lists the newly installed models.
+
+    successfully_downloaded: dict whose keys are model names from Datasets.
+
+    The new contents are written to a temporary file next to Config_file and
+    then moved into place. Any error is printed and the function returns
+    without raising.
+    """
+    try:
+        new_contents = new_config_file_contents(successfully_downloaded)
+        tmpfile = os.path.join(os.path.dirname(Config_file),'new_config.tmp')
+        with open(tmpfile, 'w') as outfile:
+            outfile.write(Config_preamble)
+            outfile.write(new_contents)
+        # os.replace() overwrites an existing Config_file on every platform;
+        # os.rename() raises FileExistsError on Windows when it exists.
+        os.replace(tmpfile,Config_file)
+    except Exception as e:
+        print(f'**Error creating config file {Config_file}: {str(e)} **')
+        return
+    print(f'Successfully created new configuration file {Config_file}')
+
+    
+#---------------------------------------------    
+def new_config_file_contents(successfully_downloaded:dict)->str:
+    """Build the YAML text for Config_file after a download run.
+
+    successfully_downloaded: dict whose keys are model names from Datasets.
+
+    Starts from the existing Config_file when there is one, then adds or
+    updates one stanza per downloaded model. Entries whose config is 'VAE'
+    get no stanza of their own; instead the VAE file is attached to every
+    stanza written here. Returns the resulting configuration as YAML.
+    """
+    # Config_file may not exist yet (e.g. on a first install), and
+    # OmegaConf.load() would raise; start from an empty configuration then.
+    if os.path.exists(Config_file):
+        conf = OmegaConf.load(Config_file)
+    else:
+        conf = OmegaConf.create()
+
+    # find the VAE file, if there is one
+    vae = None
+    default_selected = False
+
+    for model in successfully_downloaded:
+        if Datasets[model]['config'] == 'VAE':
+            vae = Datasets[model]['file']
+
+    for model in successfully_downloaded:
+        if Datasets[model]['config'] == 'VAE': # skip VAE entries
+            continue
+        stanza = conf[model] if model in conf else { }
+
+        stanza['description'] = Datasets[model]['description']
+        stanza['weights'] = os.path.join(Model_dir,Datasets[model]['file'])
+        stanza['config'] =os.path.join(SD_Configs, Datasets[model]['config'])
+        stanza['width'] = Datasets[model]['width']
+        stanza['height'] = Datasets[model]['height']
+        stanza.pop('default',None)  # this will be set later
+        if vae:
+            stanza['vae'] = os.path.join(Model_dir,vae)
+        # BUG - the first stanza is always the default. User should select.
+        if not default_selected:
+            stanza['default'] = True
+            default_selected = True
+        conf[model] = stanza
+    return OmegaConf.to_yaml(conf)
+    
 #---------------------------------------------
 # this will preload the Bert tokenizer files
 def download_bert():
@ -66,7 +306,6 @@ def download_gfpgan():
        print(traceback.format_exc())

    print('Loading models from GFPGAN')
-    import urllib.request
    for model in (
            [
                'https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth',
@ -152,6 +391,15 @@ def download_safety_checker():
    
 #-------------------------------------
 if __name__ == '__main__':
+    introduction()
+    if user_wants_to_download_weights():
+        models = select_datasets()
+        if models is None:
+            # No selection was confirmed: either quit, or skip the weight
+            # download and carry on with the support models below.
+            if yes_or_no('Quit?',default_yes=False):
+                sys.exit(0)
+        else:
+            access_token = authenticate()
+            successfully_downloaded = download_weight_datasets(models, access_token)
+            update_config_file(successfully_downloaded)
    download_bert()
    download_kornia()
    download_clip()