Mirror of https://github.com/invoke-ai/InvokeAI (synced 2024-08-30 20:32:17 +00:00)
preload_models.py script downloads the weight files
- The user can select which weight files to download; files are stored via the Hugging Face cache.
- The user must log in to Hugging Face, generate an access token, and accept the license terms the very first time this is run. After that, everything works automatically.
- Added a placeholder for the docs on installing models.
- Also removed unused config files; hopefully they weren't needed for textual inversion, but I don't think they were.
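For readers unfamiliar with that flow, here is a minimal sketch of what such a preload step can look like, assuming the huggingface_hub package. The repository IDs, filenames, and the preload() helper are illustrative assumptions, not the actual preload_models.py.

```python
# Minimal sketch of the download flow described in the commit message above.
# NOT the actual preload_models.py: repo IDs, filenames, and helper names are
# illustrative assumptions. Run `huggingface-cli login` (and accept the model
# license on the Hugging Face website) once before the first run.
from pathlib import Path
from huggingface_hub import hf_hub_download

# model name -> (repo_id, filename); assumed values for illustration only
WEIGHTS = {
    "stable-diffusion-1.5": ("runwayml/stable-diffusion-v1-5", "v1-5-pruned-emaonly.ckpt"),
    "inpainting-1.5": ("runwayml/stable-diffusion-inpainting", "sd-v1-5-inpainting.ckpt"),
}

def preload(selected, dest=Path("models/ldm/stable-diffusion-v1")):
    """Download the selected weight files into the Hugging Face cache and
    link them where the models config expects to find them."""
    dest.mkdir(parents=True, exist_ok=True)
    for name in selected:
        repo_id, filename = WEIGHTS[name]
        cached = hf_hub_download(repo_id=repo_id, filename=filename)  # path inside the HF cache
        link = dest / filename
        if not link.exists():
            link.symlink_to(cached)

if __name__ == "__main__":
    preload(["stable-diffusion-1.5"])
```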
.gitignore (8 changed lines)

@@ -199,7 +199,13 @@ checkpoints
 .scratch/
 .vscode/
 gfpgan/
-models/ldm/stable-diffusion-v1/model.sha256
+models/ldm/stable-diffusion-v1/*.sha256

 # GFPGAN model files
 gfpgan/
+
+# config file (will be created by installer)
+configs/models.yaml
+
+# weights (will be created by installer)
+models/ldm/stable-diffusion-v1/*.ckpt
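The ignore-pattern change above (model.sha256 becomes *.sha256) implies one checksum sidecar per weights file. As a hedged illustration only (the sidecar naming and the write_sha256() helper are assumptions, not InvokeAI's actual code), such files could be produced like this:

```python
# Illustrative sketch: write a per-checkpoint .sha256 sidecar matching the new
# models/ldm/stable-diffusion-v1/*.sha256 ignore pattern. The sidecar naming
# convention and this helper are assumptions, not InvokeAI code.
import hashlib
from pathlib import Path

def write_sha256(ckpt: Path) -> Path:
    h = hashlib.sha256()
    with ckpt.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)
    sidecar = ckpt.with_suffix(".sha256")  # e.g. model.ckpt -> model.sha256 (assumed)
    sidecar.write_text(h.hexdigest() + "\n")
    return sidecar

# e.g. write_sha256(Path("models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt"))
```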
Deleted (unused training configs, per the commit message above): four AutoencoderKL first-stage training configs, with embed_dim/z_channels of 16, 4, 3, and 64 respectively (54, 53, 54, and 53 lines removed). All four use ldm.modules.losses.LPIPSWithDiscriminator, train on ldm.data.imagenet.ImageNetSRTrain/Validation at size 256 with pil_nearest degradation, log images via main.ImageLogger, and enable the trainer benchmark flag.
Deleted: seven unused LatentDiffusion training/eval configs (86, 98, 68, 85, 85, 91, and 71 lines removed) — unconditional CelebAHQ over a VQ-f4 first stage; class-conditional ImageNet over a VQ-f8 first stage with ldm.modules.encoders.modules.ClassEmbedder conditioning; a second class-conditional ImageNet config (n_classes 1001, use_ema false); unconditional FFHQ and LSUN bedrooms over VQ-f4 first stages; LSUN churches over a KL-f8 AutoencoderKL first stage; and a text-to-image eval config conditioned on captions via ldm.modules.encoders.modules.BERTEmbedder (context_dim 1280, scale_factor 0.18215).
Model registry config (presumably configs/models.yaml):

@@ -1,5 +1,5 @@
 # This file describes the alternative machine learning models
-# available to the dream script.
+# available to InvokeAI script.
 #
 # To add a new model, follow the examples below. Each
 # model requires a model config file, a weights file,
@@ -8,22 +8,29 @@
 stable-diffusion-1.4:
   config: configs/stable-diffusion/v1-inference.yaml
   weights: models/ldm/stable-diffusion-v1/model.ckpt
-  # vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+  vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
   description: Stable Diffusion inference model version 1.4
   width: 512
   height: 512
+stable-diffusion-1.5:
+  description: The newest Stable Diffusion version 1.5 weight file (4.27 GB)
+  weights: ./models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt
+  config: ./configs/stable-diffusion/v1-inference.yaml
+  width: 512
+  height: 512
+  vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
   default: true
 inpainting-1.5:
-  description: runwayML tuned inpainting model v1.5
-  weights: models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt
-  config: configs/stable-diffusion/v1-inpainting-inference.yaml
-  # vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+  description: RunwayML SD 1.5 model optimized for inpainting (4.27 GB)
+  weights: ./models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt
+  config: ./configs/stable-diffusion/v1-inpainting-inference.yaml
   width: 512
   height: 512
-stable-diffusion-1.5:
-  config: configs/stable-diffusion/v1-inference.yaml
-  weights: models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt
-  # vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
-  description: Stable Diffusion inference model version 1.5
+  vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+waifu-diffusion-1.3:
+  description: Stable Diffusion 1.4 fine tuned on anime-styled images (4.27)
+  weights: ./models/ldm/stable-diffusion-v1/model-epoch09-float32.ckpt
+  config: ./configs/stable-diffusion/v1-inference.yaml
+  width: 512
+  height: 512
+  vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
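The stanzas above are plain YAML, so a front end can resolve a model's config, weights, and optional VAE in a few lines. A minimal sketch, assuming PyYAML and a hypothetical resolve_model() helper (this is not InvokeAI's actual loader):

```python
# Minimal sketch (not InvokeAI's loader): look up a stanza from the registry
# shown above and return the paths needed to load that model.
import yaml

def resolve_model(registry_path, name=None):
    """Return the stanza for `name`, or the one marked `default: true`."""
    with open(registry_path) as f:
        registry = yaml.safe_load(f)
    if name is None:
        name = next(k for k, v in registry.items() if v.get("default"))
    stanza = registry[name]
    return {
        "name": name,
        "config": stanza["config"],
        "weights": stanza["weights"],
        "vae": stanza.get("vae"),  # optional per-model VAE override
        "size": (stanza.get("width", 512), stanza.get("height", 512)),
    }

# Example: resolve_model("configs/models.yaml") would return the
# stable-diffusion-1.5 entry, since that stanza now carries `default: true`.
```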
Deleted: a 68-line LatentDiffusion config with first_stage_key jpg and cond_stage_key nix (image_size 48, channels 16, scale_factor 0.22765929, model_channels 448, context_dim 768, AutoencoderKL first stage with embed_dim 16, torch.nn.Identity conditioning stage).
docs/features/INSTALLING_MODELS.md (new file, 9 lines):

---
title: Installing Models
---

# :octicons-paintbrush-16: Installing Models

## TO COME
ModelCache (model cache module):

@@ -281,7 +281,7 @@ class ModelCache(object):
         Returns the preamble for the config file.
         '''
         return '''# This file describes the alternative machine learning models
-# available to the dream script.
+# available to InvokeAI script.
 #
 # To add a new model, follow the examples below. Each
 # model requires a model config file, a weights file,
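Since configs/models.yaml is now gitignored and "will be created by installer", the preamble string above would be written out together with the model stanzas when that file is generated. A hedged sketch of that step (the write_models_yaml() function and its call shape are assumptions, not InvokeAI's actual installer code):

```python
# Sketch only: write a fresh models.yaml consisting of the comment preamble
# returned by ModelCache above, followed by the serialized model stanzas.
# write_models_yaml() and its arguments are assumptions for illustration.
import yaml

def write_models_yaml(path, preamble, stanzas):
    with open(path, "w") as f:
        f.write(preamble)  # leading comment block, e.g. the preamble shown above
        yaml.safe_dump(stanzas, f, sort_keys=False, default_flow_style=False)

# e.g. write_models_yaml("configs/models.yaml", preamble_text,
#                        {"stable-diffusion-1.5": {"weights": "...", "default": True}})
```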
Deleted: nine unused first-stage model configs trained on ldm.data.openimages.FullOpenImages — four AutoencoderKL configs with embed_dim 16, 64, 3, and 4 (44, 46, 41, and 42 lines removed), and five VQModel configs with embed_dim/n_embed of 8/16384, 3/8192 (attn_type none), 3/8192, 4/256, and 4/16384 (49, 46, 45, 48, and 48 lines removed), all using LPIPS-based discriminator losses.
@ -1,80 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 1.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0155
|
|
||||||
log_every_t: 100
|
|
||||||
timesteps: 1000
|
|
||||||
loss_type: l2
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: LR_image
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
concat_mode: true
|
|
||||||
cond_stage_trainable: false
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 6
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 160
|
|
||||||
attention_resolutions:
|
|
||||||
- 16
|
|
||||||
- 8
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
monitor: val/rec_loss
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 64
|
|
||||||
wrap: false
|
|
||||||
num_workers: 12
|
|
||||||
train:
|
|
||||||
target: ldm.data.openimages.SuperresOpenImagesAdvancedTrain
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
degradation: bsrgan_light
|
|
||||||
downscale_f: 4
|
|
||||||
min_crop_f: 0.5
|
|
||||||
max_crop_f: 1.0
|
|
||||||
random_crop: true
|
|
||||||
validation:
|
|
||||||
target: ldm.data.openimages.SuperresOpenImagesAdvancedValidation
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
degradation: bsrgan_light
|
|
||||||
downscale_f: 4
|
|
||||||
min_crop_f: 0.5
|
|
||||||
max_crop_f: 1.0
|
|
||||||
random_crop: true
|
|
@ -1,70 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 2.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0195
|
|
||||||
num_timesteps_cond: 1
|
|
||||||
log_every_t: 200
|
|
||||||
timesteps: 1000
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: class_label
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
cond_stage_trainable: false
|
|
||||||
concat_mode: false
|
|
||||||
monitor: val/loss
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 3
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 224
|
|
||||||
attention_resolutions:
|
|
||||||
- 8
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 3
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config: __is_unconditional__
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 48
|
|
||||||
num_workers: 5
|
|
||||||
wrap: false
|
|
||||||
train:
|
|
||||||
target: ldm.data.faceshq.CelebAHQTrain
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.faceshq.CelebAHQValidation
|
|
||||||
params:
|
|
||||||
size: 256
|
|
@ -1,80 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 1.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0195
|
|
||||||
num_timesteps_cond: 1
|
|
||||||
log_every_t: 200
|
|
||||||
timesteps: 1000
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: class_label
|
|
||||||
image_size: 32
|
|
||||||
channels: 4
|
|
||||||
cond_stage_trainable: true
|
|
||||||
conditioning_key: crossattn
|
|
||||||
monitor: val/loss_simple_ema
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 32
|
|
||||||
in_channels: 4
|
|
||||||
out_channels: 4
|
|
||||||
model_channels: 256
|
|
||||||
attention_resolutions:
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
- 1
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
use_spatial_transformer: true
|
|
||||||
transformer_depth: 1
|
|
||||||
context_dim: 512
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 4
|
|
||||||
n_embed: 16384
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 4
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions:
|
|
||||||
- 32
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config:
|
|
||||||
target: ldm.modules.encoders.modules.ClassEmbedder
|
|
||||||
params:
|
|
||||||
embed_dim: 512
|
|
||||||
key: class_label
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 64
|
|
||||||
num_workers: 12
|
|
||||||
wrap: false
|
|
||||||
train:
|
|
||||||
target: ldm.data.imagenet.ImageNetTrain
|
|
||||||
params:
|
|
||||||
config:
|
|
||||||
size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.imagenet.ImageNetValidation
|
|
||||||
params:
|
|
||||||
config:
|
|
||||||
size: 256
|
|
@ -1,70 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 2.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0195
|
|
||||||
num_timesteps_cond: 1
|
|
||||||
log_every_t: 200
|
|
||||||
timesteps: 1000
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: class_label
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
cond_stage_trainable: false
|
|
||||||
concat_mode: false
|
|
||||||
monitor: val/loss
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 3
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 224
|
|
||||||
attention_resolutions:
|
|
||||||
- 8
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 3
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config: __is_unconditional__
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 42
|
|
||||||
num_workers: 5
|
|
||||||
wrap: false
|
|
||||||
train:
|
|
||||||
target: ldm.data.faceshq.FFHQTrain
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.faceshq.FFHQValidation
|
|
||||||
params:
|
|
||||||
size: 256
|
|
@ -1,67 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 1.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0205
|
|
||||||
log_every_t: 100
|
|
||||||
timesteps: 1000
|
|
||||||
loss_type: l1
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: masked_image
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
concat_mode: true
|
|
||||||
monitor: val/loss
|
|
||||||
scheduler_config:
|
|
||||||
target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler
|
|
||||||
params:
|
|
||||||
verbosity_interval: 0
|
|
||||||
warm_up_steps: 1000
|
|
||||||
max_decay_steps: 50000
|
|
||||||
lr_start: 0.001
|
|
||||||
lr_max: 0.1
|
|
||||||
lr_min: 0.0001
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 7
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 256
|
|
||||||
attention_resolutions:
|
|
||||||
- 8
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 3
|
|
||||||
- 4
|
|
||||||
num_heads: 8
|
|
||||||
resblock_updown: true
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
monitor: val/rec_loss
|
|
||||||
ddconfig:
|
|
||||||
attn_type: none
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: ldm.modules.losses.contperceptual.DummyLoss
|
|
||||||
cond_stage_config: __is_first_stage__
|
|
@ -1,81 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 2.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0205
|
|
||||||
log_every_t: 100
|
|
||||||
timesteps: 1000
|
|
||||||
loss_type: l1
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: coordinates_bbox
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
conditioning_key: crossattn
|
|
||||||
cond_stage_trainable: true
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 3
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 128
|
|
||||||
attention_resolutions:
|
|
||||||
- 8
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 3
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
use_spatial_transformer: true
|
|
||||||
transformer_depth: 3
|
|
||||||
context_dim: 512
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
monitor: val/rec_loss
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config:
|
|
||||||
target: ldm.modules.encoders.modules.BERTEmbedder
|
|
||||||
params:
|
|
||||||
n_embed: 512
|
|
||||||
n_layer: 16
|
|
||||||
vocab_size: 8192
|
|
||||||
max_seq_len: 92
|
|
||||||
use_tokenizer: false
|
|
||||||
monitor: val/loss_simple_ema
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 24
|
|
||||||
wrap: false
|
|
||||||
num_workers: 10
|
|
||||||
train:
|
|
||||||
target: ldm.data.openimages.OpenImagesBBoxTrain
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.openimages.OpenImagesBBoxValidation
|
|
||||||
params:
|
|
||||||
size: 256
|
|
@ -1,70 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 2.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0195
|
|
||||||
num_timesteps_cond: 1
|
|
||||||
log_every_t: 200
|
|
||||||
timesteps: 1000
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: class_label
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
cond_stage_trainable: false
|
|
||||||
concat_mode: false
|
|
||||||
monitor: val/loss
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 3
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 224
|
|
||||||
attention_resolutions:
|
|
||||||
- 8
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 3
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config: __is_unconditional__
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 48
|
|
||||||
num_workers: 5
|
|
||||||
wrap: false
|
|
||||||
train:
|
|
||||||
target: ldm.data.lsun.LSUNBedroomsTrain
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.lsun.LSUNBedroomsValidation
|
|
||||||
params:
|
|
||||||
size: 256
|
|
@ -1,92 +0,0 @@
model:
  base_learning_rate: 5.0e-05
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.0015
    linear_end: 0.0155
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    loss_type: l1
    first_stage_key: image
    cond_stage_key: image
    image_size: 32
    channels: 4
    cond_stage_trainable: false
    concat_mode: false
    scale_by_std: true
    monitor: val/loss_simple_ema
    scheduler_config:
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps:
        - 10000
        cycle_lengths:
        - 10000000000000
        f_start:
        - 1.0e-06
        f_max:
        - 1.0
        f_min:
        - 1.0
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32
        in_channels: 4
        out_channels: 4
        model_channels: 192
        attention_resolutions:
        - 1
        - 2
        - 4
        - 8
        num_res_blocks: 2
        channel_mult:
        - 1
        - 2
        - 2
        - 4
        - 4
        num_heads: 8
        use_scale_shift_norm: true
        resblock_updown: true
    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config: '__is_unconditional__'

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 96
    num_workers: 5
    wrap: false
    train:
      target: ldm.data.lsun.LSUNChurchesTrain
      params:
        size: 256
    validation:
      target: ldm.data.lsun.LSUNChurchesValidation
      params:
        size: 256
@ -1,59 +0,0 @@
model:
  base_learning_rate: 1.0e-06
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.0015
    linear_end: 0.0205
    log_every_t: 100
    timesteps: 1000
    loss_type: l1
    first_stage_key: image
    cond_stage_key: segmentation
    image_size: 64
    channels: 3
    concat_mode: true
    cond_stage_trainable: true
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 64
        in_channels: 6
        out_channels: 3
        model_channels: 128
        attention_resolutions:
        - 32
        - 16
        - 8
        num_res_blocks: 2
        channel_mult:
        - 1
        - 4
        - 8
        num_heads: 8
    first_stage_config:
      target: ldm.models.autoencoder.VQModelInterface
      params:
        embed_dim: 3
        n_embed: 8192
        ddconfig:
          double_z: false
          z_channels: 3
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config:
      target: ldm.modules.encoders.modules.SpatialRescaler
      params:
        n_stages: 2
        in_channels: 182
        out_channels: 3
@ -1,78 +0,0 @@
model:
  base_learning_rate: 1.0e-06
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.0015
    linear_end: 0.0205
    log_every_t: 100
    timesteps: 1000
    loss_type: l1
    first_stage_key: image
    cond_stage_key: segmentation
    image_size: 128
    channels: 3
    concat_mode: true
    cond_stage_trainable: true
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 128
        in_channels: 6
        out_channels: 3
        model_channels: 128
        attention_resolutions:
        - 32
        - 16
        - 8
        num_res_blocks: 2
        channel_mult:
        - 1
        - 4
        - 8
        num_heads: 8
    first_stage_config:
      target: ldm.models.autoencoder.VQModelInterface
      params:
        embed_dim: 3
        n_embed: 8192
        monitor: val/rec_loss
        ddconfig:
          double_z: false
          z_channels: 3
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config:
      target: ldm.modules.encoders.modules.SpatialRescaler
      params:
        n_stages: 2
        in_channels: 182
        out_channels: 3
data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 8
    wrap: false
    num_workers: 10
    train:
      target: ldm.data.landscapes.RFWTrain
      params:
        size: 768
        crop_size: 512
        segmentation_to_float32: true
    validation:
      target: ldm.data.landscapes.RFWValidation
      params:
        size: 768
        crop_size: 512
        segmentation_to_float32: true
@ -1,77 +0,0 @@
model:
  base_learning_rate: 2.0e-06
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    image_size: 64
    channels: 3
    cond_stage_trainable: true
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 64
        in_channels: 3
        out_channels: 3
        model_channels: 192
        attention_resolutions:
        - 8
        - 4
        - 2
        num_res_blocks: 2
        channel_mult:
        - 1
        - 2
        - 3
        - 5
        num_head_channels: 32
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 640
    first_stage_config:
      target: ldm.models.autoencoder.VQModelInterface
      params:
        embed_dim: 3
        n_embed: 8192
        ddconfig:
          double_z: false
          z_channels: 3
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config:
      target: ldm.modules.encoders.modules.BERTEmbedder
      params:
        n_embed: 640
        n_layer: 32
data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 28
    num_workers: 10
    wrap: false
    train:
      target: ldm.data.previews.pytorch_dataset.PreviewsTrain
      params:
        size: 256
    validation:
      target: ldm.data.previews.pytorch_dataset.PreviewsValidation
      params:
        size: 256
@ -3,9 +3,11 @@
 # Before running stable-diffusion on an internet-isolated machine,
 # run this script from one with internet connectivity. The
 # two machines must share a common .cache directory.
-from transformers import CLIPTokenizer, CLIPTextModel
+#
+# Coauthor: Kevin Turner http://github.com/keturn
+#
+print('Loading Python libraries...\n')
 import clip
-from transformers import BertTokenizerFast, AutoFeatureExtractor
 import sys
 import transformers
 import os
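The header comments above describe preloading on a machine with connectivity and then sharing the Hugging Face cache with the internet-isolated box. As a rough illustration (not code from this commit), both machines simply need to resolve the same cache root, which is typically ~/.cache/huggingface and can be relocated with the HF_HOME environment variable before the script is launched:

    # Illustrative sketch only -- not part of preload_models.py.
    # Print the cache root this environment would use, so you can confirm
    # it points at the directory both machines share.
    import os
    print(os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface')))
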
@ -14,9 +16,247 @@ import torch
 import urllib.request
 import zipfile
 import traceback
+import getpass
+from omegaconf import OmegaConf
+from pathlib import Path
+from transformers import CLIPTokenizer, CLIPTextModel
+from transformers import BertTokenizerFast, AutoFeatureExtractor
+from huggingface_hub import hf_hub_download, HfFolder, hf_hub_url
+
 transformers.logging.set_verbosity_error()
+
+#--------------------------globals--
+Model_dir = './models/ldm/stable-diffusion-v1/'
+Config_file = './configs/models.yaml'
+SD_Configs = './configs/stable-diffusion'
+Datasets = {
+    'stable-diffusion-1.5': {
+        'description': 'The newest Stable Diffusion version 1.5 weight file (4.27 GB)',
+        'repo_id': 'runwayml/stable-diffusion-v1-5',
+        'config': 'v1-inference.yaml',
+        'file': 'v1-5-pruned-emaonly.ckpt',
+        'recommended': True,
+        'width': 512,
+        'height': 512,
+    },
+    'inpainting-1.5': {
+        'description': 'RunwayML SD 1.5 model optimized for inpainting (4.27 GB)',
+        'repo_id': 'runwayml/stable-diffusion-inpainting',
+        'config': 'v1-inpainting-inference.yaml',
+        'file': 'sd-v1-5-inpainting.ckpt',
+        'recommended': True,
+        'width': 512,
+        'height': 512,
+    },
+    'stable-diffusion-1.4': {
+        'description': 'The original Stable Diffusion version 1.4 weight file (4.27 GB)',
+        'repo_id': 'CompVis/stable-diffusion-v-1-4-original',
+        'config': 'v1-inference.yaml',
+        'file': 'sd-v1-4.ckpt',
+        'recommended': False,
+        'width': 512,
+        'height': 512,
+    },
+    'waifu-diffusion-1.3': {
+        'description': 'Stable Diffusion 1.4 fine tuned on anime-styled images (4.27 GB)',
+        'repo_id': 'hakurei/waifu-diffusion-v1-3',
+        'config': 'v1-inference.yaml',
+        'file': 'model-epoch09-float32.ckpt',
+        'recommended': False,
+        'width': 512,
+        'height': 512,
+    },
+    'ft-mse-improved-autoencoder-840000': {
+        'description': 'StabilityAI improved autoencoder fine-tuned for human faces (recommended; 335 MB)',
+        'repo_id': 'stabilityai/sd-vae-ft-mse-original',
+        'config': 'VAE',
+        'file': 'vae-ft-mse-840000-ema-pruned.ckpt',
+        'recommended': True,
+        'width': 512,
+        'height': 512,
+    },
+}
+Config_preamble = '''# This file describes the alternative machine learning models
+# available to InvokeAI script.
+#
+# To add a new model, follow the examples below. Each
+# model requires a model config file, a weights file,
+# and the width and height of the images it
+# was trained on.
+'''
+
+#---------------------------------------------
+def introduction():
+    print(
+        '''Welcome to InvokeAI. This script will help download the Stable Diffusion weight files
+and other large models that are needed for text to image generation. At any point you may interrupt
+this program and resume later.\n'''
+    )
+
+#---------------------------------------------
+def yes_or_no(prompt:str, default_yes=True):
+    default = "y" if default_yes else 'n'
+    response = input(f'{prompt} [{default}] ') or default
+    if default_yes:
+        return response[0] not in ('n','N')
+    else:
+        return response[0] in ('y','Y')
+
+#---------------------------------------------
+def user_wants_to_download_weights():
+    return yes_or_no('Would you like to download the Stable Diffusion model weights now?')
+
+#---------------------------------------------
+def select_datasets():
+    done = False
+    while not done:
+        print('''
+Choose the weight file(s) you wish to download. Before downloading you
+will be given the option to view and change your selections.
+'''
+        )
+        datasets = dict()
+
+        counter = 1
+        dflt = None   # the first model selected will be the default; TODO let user change
+        for ds in Datasets.keys():
+            recommended = '(recommended)' if Datasets[ds]['recommended'] else ''
+            print(f'[{counter}] {ds}:\n    {Datasets[ds]["description"]} {recommended}')
+            if yes_or_no('    Download?',default_yes=Datasets[ds]['recommended']):
+                datasets[ds]=counter
+            counter += 1
+
+        print('The following weight files will be downloaded:')
+        for ds in datasets:
+            dflt = '*' if dflt is None else ''
+            print(f'   [{datasets[ds]}] {ds}{dflt}')
+        print("*default")
+        ok_to_download = yes_or_no('Ok to download?')
+        if not ok_to_download:
+            if yes_or_no('Change your selection?'):
+                pass
+            else:
+                done = True
+        else:
+            done = True
+    return datasets if ok_to_download else None
+
+#-------------------------------Authenticate against Hugging Face
+def authenticate():
+    print('''
+To download the Stable Diffusion weight files you need to read and accept the
+CreativeML Responsible AI license. If you have not already done so, please
+create an account at https://huggingface.co. Then login under your account and
+read and accept the license available at https://huggingface.co/CompVis/stable-diffusion-v-1-4-original.
+'''
+    )
+    input('Press <enter> when you are ready to continue:')
+    access_token = HfFolder.get_token()
+    if access_token is None:
+        print('''
+Thank you! Now you need to authenticate with your HuggingFace access token.
+Go to https://huggingface.co/settings/tokens and create a token. Copy it to the
+clipboard and paste it here: '''
+        )
+        access_token = getpass.getpass()
+        HfFolder.save_token(access_token)
+    return access_token
+
+#---------------------------------------------
+# look for legacy model.ckpt in models directory and offer to
+# normalize its name
+def migrate_models_ckpt():
+    if not os.path.exists(os.path.join(Model_dir,'model.ckpt')):
+        return
+    new_name = Datasets['stable-diffusion-1.4']['file']
+    print('You seem to have the Stable Diffusion v1.4 "model.ckpt" already installed.')
+    rename = yes_or_no(f'Ok to rename it to "{new_name}" for future reference?')
+    if rename:
+        print(f'model.ckpt => {new_name}')
+        os.rename(os.path.join(Model_dir,'model.ckpt'),os.path.join(Model_dir,new_name))
+
+#---------------------------------------------
+def download_weight_datasets(models:dict, access_token:str):
+    migrate_models_ckpt()
+    successful = dict()
+    for mod in models.keys():
+        repo_id = Datasets[mod]['repo_id']
+        filename = Datasets[mod]['file']
+        success = conditional_download(
+            repo_id=repo_id,
+            model_name=filename,
+            access_token=access_token
+        )
+        if success:
+            successful[mod] = True
+    keys = ', '.join(successful.keys())
+    print(f'Successfully installed {keys}')
+    return successful
+
+#---------------------------------------------
+def conditional_download(repo_id:str, model_name:str, access_token:str):
+    model_dest = os.path.join(Model_dir, model_name)
+    if os.path.exists(model_dest):
+        print(f' * {model_name}: exists')
+        return True
+    os.makedirs(os.path.dirname(model_dest), exist_ok=True)
+
+    try:
+        print(f' * {model_name}: downloading or retrieving from cache...')
+        path = Path(hf_hub_download(repo_id, model_name, use_auth_token=access_token))
+        path.resolve(strict=True).link_to(model_dest)
+    except Exception as e:
+        print(f'** Error downloading {model_name}: {str(e)} **')
+        return False
+    return True
+
+#---------------------------------------------
+def update_config_file(successfully_downloaded:dict):
+    try:
+        yaml = new_config_file_contents(successfully_downloaded)
+        tmpfile = os.path.join(os.path.dirname(Config_file),'new_config.tmp')
+        with open(tmpfile, 'w') as outfile:
+            outfile.write(Config_preamble)
+            outfile.write(yaml)
+        os.rename(tmpfile,Config_file)
+    except Exception as e:
+        print(f'**Error creating config file {Config_file}: {str(e)} **')
+        return
+    print(f'Successfully created new configuration file {Config_file}')
+
+
+#---------------------------------------------
+def new_config_file_contents(successfully_downloaded:dict)->str:
+    conf = OmegaConf.load(Config_file)
+
+    # find the VAE file, if there is one
+    vae = None
+    default_selected = False
+
+    for model in successfully_downloaded:
+        if Datasets[model]['config'] == 'VAE':
+            vae = Datasets[model]['file']
+
+    for model in successfully_downloaded:
+        if Datasets[model]['config'] == 'VAE': # skip VAE entries
+            continue
+        stanza = conf[model] if model in conf else { }
+
+        stanza['description'] = Datasets[model]['description']
+        stanza['weights'] = os.path.join(Model_dir,Datasets[model]['file'])
+        stanza['config'] = os.path.join(SD_Configs, Datasets[model]['config'])
+        stanza['width'] = Datasets[model]['width']
+        stanza['height'] = Datasets[model]['height']
+        stanza.pop('default',None) # this will be set later
+        if vae:
+            stanza['vae'] = os.path.join(Model_dir,vae)
+        # BUG - the first stanza is always the default. User should select.
+        if not default_selected:
+            stanza['default'] = True
+            default_selected = True
+        conf[model] = stanza
+    return OmegaConf.to_yaml(conf)
+
 
 #---------------------------------------------
 # this will preload the Bert tokenizer files
 def download_bert():
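The Config_preamble above tells users that each entry in configs/models.yaml needs a config file, a weights file, and the training resolution. As a rough sketch only (not code from this commit), the stanza that new_config_file_contents() builds for the recommended 1.5 weights plus the downloaded VAE would look roughly like the dict below before OmegaConf renders it to YAML; the paths assume the default Model_dir and SD_Configs values:

    # Illustrative only -- approximates what new_config_file_contents() generates.
    stanza = {
        'description': 'The newest Stable Diffusion version 1.5 weight file (4.27 GB)',
        'weights': './models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt',
        'config': './configs/stable-diffusion/v1-inference.yaml',
        'width': 512,
        'height': 512,
        'vae': './models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt',
        'default': True,  # first downloaded model becomes the default (see the BUG note above)
    }
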
@ -66,7 +306,6 @@ def download_gfpgan():
         print(traceback.format_exc())
 
     print('Loading models from GFPGAN')
-    import urllib.request
     for model in (
         [
             'https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth',
@ -152,6 +391,15 @@ def download_safety_checker():
 
 #-------------------------------------
 if __name__ == '__main__':
+    introduction()
+    if user_wants_to_download_weights():
+        models = select_datasets()
+        if models is None:
+            if yes_or_no('Quit?',default_yes=False):
+                sys.exit(0)
+        access_token = authenticate()
+        successfully_downloaded = download_weight_datasets(models, access_token)
+        update_config_file(successfully_downloaded)
     download_bert()
     download_kornia()
     download_clip()
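The __main__ block above wires the flow together: introduce, select, authenticate, download, then write configs/models.yaml. As a hypothetical illustration of consuming the generated file (not code from this repository), OmegaConf can load it and pick out the stanza marked as default:

    # Hypothetical consumer sketch -- not part of this commit.
    from omegaconf import OmegaConf
    models = OmegaConf.load('./configs/models.yaml')
    default = next((name for name in models if models[name].get('default')), None)
    if default:
        print(f"default model: {default} -> {models[default]['weights']}")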