mirror of https://github.com/invoke-ai/InvokeAI
synced 2024-08-30 20:32:17 +00:00

commit e7368d7231: preload_models interactively downloads sd model files

.github/workflows/test-invoke-conda.yml (vendored, 4 lines changed)
@@ -84,7 +84,9 @@ jobs:
       - name: run preload_models.py
         id: run-preload-models
-        run: python scripts/preload_models.py
+        run: |
+          python scripts/preload_models.py \
+            --no-interactive
       - name: Run the tests
         id: run-tests
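The step above runs the model preloader with prompts disabled so the workflow cannot hang waiting for input. A minimal sketch of reproducing the same step outside CI, assuming a local InvokeAI checkout and the `--no-interactive` flag added by this commit:

```python
import subprocess
import sys

# Run the preloader exactly as the CI step does: no prompts, current
# Python interpreter, invoked from the root of the InvokeAI checkout.
result = subprocess.run(
    [sys.executable, "scripts/preload_models.py", "--no-interactive"],
    check=False,
)
print(f"preload_models.py exited with code {result.returncode}")
```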
.gitignore (vendored, 8 lines changed)
@@ -199,7 +199,13 @@ checkpoints
 .scratch/
 .vscode/
 gfpgan/
-models/ldm/stable-diffusion-v1/model.sha256
+models/ldm/stable-diffusion-v1/*.sha256
 
 # GFPGAN model files
 gfpgan/
+
+# config file (will be created by installer)
+configs/models.yaml
+
+# weights (will be created by installer)
+models/ldm/stable-diffusion-v1/*.ckpt
@@ -1,54 +0,0 @@
model:
  base_learning_rate: 4.5e-6
  target: ldm.models.autoencoder.AutoencoderKL
  params:
    monitor: "val/rec_loss"
    embed_dim: 16
    lossconfig:
      target: ldm.modules.losses.LPIPSWithDiscriminator
      params:
        disc_start: 50001
        kl_weight: 0.000001
        disc_weight: 0.5

    ddconfig:
      double_z: True
      z_channels: 16
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult: [ 1,1,2,2,4]  # num_down = len(ch_mult)-1
      num_res_blocks: 2
      attn_resolutions: [16]
      dropout: 0.0


data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 12
    wrap: True
    train:
      target: ldm.data.imagenet.ImageNetSRTrain
      params:
        size: 256
        degradation: pil_nearest
    validation:
      target: ldm.data.imagenet.ImageNetSRValidation
      params:
        size: 256
        degradation: pil_nearest

lightning:
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 1000
        max_images: 8
        increase_log_steps: True

  trainer:
    benchmark: True
    accumulate_grad_batches: 2
@@ -1,53 +0,0 @@
model:
  base_learning_rate: 4.5e-6
  target: ldm.models.autoencoder.AutoencoderKL
  params:
    monitor: "val/rec_loss"
    embed_dim: 4
    lossconfig:
      target: ldm.modules.losses.LPIPSWithDiscriminator
      params:
        disc_start: 50001
        kl_weight: 0.000001
        disc_weight: 0.5

    ddconfig:
      double_z: True
      z_channels: 4
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult: [ 1,2,4,4 ]  # num_down = len(ch_mult)-1
      num_res_blocks: 2
      attn_resolutions: [ ]
      dropout: 0.0

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 12
    wrap: True
    train:
      target: ldm.data.imagenet.ImageNetSRTrain
      params:
        size: 256
        degradation: pil_nearest
    validation:
      target: ldm.data.imagenet.ImageNetSRValidation
      params:
        size: 256
        degradation: pil_nearest

lightning:
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 1000
        max_images: 8
        increase_log_steps: True

  trainer:
    benchmark: True
    accumulate_grad_batches: 2
@@ -1,54 +0,0 @@
model:
  base_learning_rate: 4.5e-6
  target: ldm.models.autoencoder.AutoencoderKL
  params:
    monitor: "val/rec_loss"
    embed_dim: 3
    lossconfig:
      target: ldm.modules.losses.LPIPSWithDiscriminator
      params:
        disc_start: 50001
        kl_weight: 0.000001
        disc_weight: 0.5

    ddconfig:
      double_z: True
      z_channels: 3
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult: [ 1,2,4 ]  # num_down = len(ch_mult)-1
      num_res_blocks: 2
      attn_resolutions: [ ]
      dropout: 0.0


data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 12
    wrap: True
    train:
      target: ldm.data.imagenet.ImageNetSRTrain
      params:
        size: 256
        degradation: pil_nearest
    validation:
      target: ldm.data.imagenet.ImageNetSRValidation
      params:
        size: 256
        degradation: pil_nearest

lightning:
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 1000
        max_images: 8
        increase_log_steps: True

  trainer:
    benchmark: True
    accumulate_grad_batches: 2
@@ -1,53 +0,0 @@
model:
  base_learning_rate: 4.5e-6
  target: ldm.models.autoencoder.AutoencoderKL
  params:
    monitor: "val/rec_loss"
    embed_dim: 64
    lossconfig:
      target: ldm.modules.losses.LPIPSWithDiscriminator
      params:
        disc_start: 50001
        kl_weight: 0.000001
        disc_weight: 0.5

    ddconfig:
      double_z: True
      z_channels: 64
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult: [ 1,1,2,2,4,4]  # num_down = len(ch_mult)-1
      num_res_blocks: 2
      attn_resolutions: [16,8]
      dropout: 0.0

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 12
    wrap: True
    train:
      target: ldm.data.imagenet.ImageNetSRTrain
      params:
        size: 256
        degradation: pil_nearest
    validation:
      target: ldm.data.imagenet.ImageNetSRValidation
      params:
        size: 256
        degradation: pil_nearest

lightning:
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 1000
        max_images: 8
        increase_log_steps: True

  trainer:
    benchmark: True
    accumulate_grad_batches: 2
@@ -1,86 +0,0 @@
model:
  base_learning_rate: 2.0e-06
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    image_size: 64
    channels: 3
    monitor: val/loss_simple_ema

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 64
        in_channels: 3
        out_channels: 3
        model_channels: 224
        attention_resolutions:
        # note: this isn\t actually the resolution but
        # the downsampling factor, i.e. this corresnponds to
        # attention on spatial resolution 8,16,32, as the
        # spatial reolution of the latents is 64 for f4
        - 8
        - 4
        - 2
        num_res_blocks: 2
        channel_mult:
        - 1
        - 2
        - 3
        - 4
        num_head_channels: 32
    first_stage_config:
      target: ldm.models.autoencoder.VQModelInterface
      params:
        embed_dim: 3
        n_embed: 8192
        ckpt_path: models/first_stage_models/vq-f4/model.ckpt
        ddconfig:
          double_z: false
          z_channels: 3
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config: __is_unconditional__
data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 48
    num_workers: 5
    wrap: false
    train:
      target: taming.data.faceshq.CelebAHQTrain
      params:
        size: 256
    validation:
      target: taming.data.faceshq.CelebAHQValidation
      params:
        size: 256


lightning:
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 5000
        max_images: 8
        increase_log_steps: False

  trainer:
    benchmark: True
@@ -1,98 +0,0 @@
model:
  base_learning_rate: 1.0e-06
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: class_label
    image_size: 32
    channels: 4
    cond_stage_trainable: true
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32
        in_channels: 4
        out_channels: 4
        model_channels: 256
        attention_resolutions:
        #note: this isn\t actually the resolution but
        # the downsampling factor, i.e. this corresnponds to
        # attention on spatial resolution 8,16,32, as the
        # spatial reolution of the latents is 32 for f8
        - 4
        - 2
        - 1
        num_res_blocks: 2
        channel_mult:
        - 1
        - 2
        - 4
        num_head_channels: 32
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 512
    first_stage_config:
      target: ldm.models.autoencoder.VQModelInterface
      params:
        embed_dim: 4
        n_embed: 16384
        ckpt_path: configs/first_stage_models/vq-f8/model.yaml
        ddconfig:
          double_z: false
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 2
          - 4
          num_res_blocks: 2
          attn_resolutions:
          - 32
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config:
      target: ldm.modules.encoders.modules.ClassEmbedder
      params:
        embed_dim: 512
        key: class_label
data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 64
    num_workers: 12
    wrap: false
    train:
      target: ldm.data.imagenet.ImageNetTrain
      params:
        config:
          size: 256
    validation:
      target: ldm.data.imagenet.ImageNetValidation
      params:
        config:
          size: 256


lightning:
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 5000
        max_images: 8
        increase_log_steps: False

  trainer:
    benchmark: True
@@ -1,68 +0,0 @@
model:
  base_learning_rate: 0.0001
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: class_label
    image_size: 64
    channels: 3
    cond_stage_trainable: true
    conditioning_key: crossattn
    monitor: val/loss
    use_ema: False

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 64
        in_channels: 3
        out_channels: 3
        model_channels: 192
        attention_resolutions:
        - 8
        - 4
        - 2
        num_res_blocks: 2
        channel_mult:
        - 1
        - 2
        - 3
        - 5
        num_heads: 1
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 512

    first_stage_config:
      target: ldm.models.autoencoder.VQModelInterface
      params:
        embed_dim: 3
        n_embed: 8192
        ddconfig:
          double_z: false
          z_channels: 3
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.ClassEmbedder
      params:
        n_classes: 1001
        embed_dim: 512
        key: class_label
@@ -1,85 +0,0 @@
model:
  base_learning_rate: 2.0e-06
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    image_size: 64
    channels: 3
    monitor: val/loss_simple_ema
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 64
        in_channels: 3
        out_channels: 3
        model_channels: 224
        attention_resolutions:
        # note: this isn\t actually the resolution but
        # the downsampling factor, i.e. this corresnponds to
        # attention on spatial resolution 8,16,32, as the
        # spatial reolution of the latents is 64 for f4
        - 8
        - 4
        - 2
        num_res_blocks: 2
        channel_mult:
        - 1
        - 2
        - 3
        - 4
        num_head_channels: 32
    first_stage_config:
      target: ldm.models.autoencoder.VQModelInterface
      params:
        embed_dim: 3
        n_embed: 8192
        ckpt_path: configs/first_stage_models/vq-f4/model.yaml
        ddconfig:
          double_z: false
          z_channels: 3
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config: __is_unconditional__
data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 42
    num_workers: 5
    wrap: false
    train:
      target: taming.data.faceshq.FFHQTrain
      params:
        size: 256
    validation:
      target: taming.data.faceshq.FFHQValidation
      params:
        size: 256


lightning:
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 5000
        max_images: 8
        increase_log_steps: False

  trainer:
    benchmark: True
@@ -1,85 +0,0 @@
model:
  base_learning_rate: 2.0e-06
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    image_size: 64
    channels: 3
    monitor: val/loss_simple_ema
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 64
        in_channels: 3
        out_channels: 3
        model_channels: 224
        attention_resolutions:
        # note: this isn\t actually the resolution but
        # the downsampling factor, i.e. this corresnponds to
        # attention on spatial resolution 8,16,32, as the
        # spatial reolution of the latents is 64 for f4
        - 8
        - 4
        - 2
        num_res_blocks: 2
        channel_mult:
        - 1
        - 2
        - 3
        - 4
        num_head_channels: 32
    first_stage_config:
      target: ldm.models.autoencoder.VQModelInterface
      params:
        ckpt_path: configs/first_stage_models/vq-f4/model.yaml
        embed_dim: 3
        n_embed: 8192
        ddconfig:
          double_z: false
          z_channels: 3
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config: __is_unconditional__
data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 48
    num_workers: 5
    wrap: false
    train:
      target: ldm.data.lsun.LSUNBedroomsTrain
      params:
        size: 256
    validation:
      target: ldm.data.lsun.LSUNBedroomsValidation
      params:
        size: 256


lightning:
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 5000
        max_images: 8
        increase_log_steps: False

  trainer:
    benchmark: True
@@ -1,91 +0,0 @@
model:
  base_learning_rate: 5.0e-5   # set to target_lr by starting main.py with '--scale_lr False'
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.0015
    linear_end: 0.0155
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    loss_type: l1
    first_stage_key: "image"
    cond_stage_key: "image"
    image_size: 32
    channels: 4
    cond_stage_trainable: False
    concat_mode: False
    scale_by_std: True
    monitor: 'val/loss_simple_ema'

    scheduler_config: # 10000 warmup steps
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [10000]
        cycle_lengths: [10000000000000]
        f_start: [1.e-6]
        f_max: [1.]
        f_min: [ 1.]

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32
        in_channels: 4
        out_channels: 4
        model_channels: 192
        attention_resolutions: [ 1, 2, 4, 8 ]   # 32, 16, 8, 4
        num_res_blocks: 2
        channel_mult: [ 1,2,2,4,4 ]  # 32, 16, 8, 4, 2
        num_heads: 8
        use_scale_shift_norm: True
        resblock_updown: True

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: "val/rec_loss"
        ckpt_path: "models/first_stage_models/kl-f8/model.ckpt"
        ddconfig:
          double_z: True
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [ 1,2,4,4 ]  # num_down = len(ch_mult)-1
          num_res_blocks: 2
          attn_resolutions: [ ]
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config: "__is_unconditional__"

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 96
    num_workers: 5
    wrap: False
    train:
      target: ldm.data.lsun.LSUNChurchesTrain
      params:
        size: 256
    validation:
      target: ldm.data.lsun.LSUNChurchesValidation
      params:
        size: 256

lightning:
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 5000
        max_images: 8
        increase_log_steps: False


  trainer:
    benchmark: True
@@ -1,71 +0,0 @@
model:
  base_learning_rate: 5.0e-05
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.012
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    image_size: 32
    channels: 4
    cond_stage_trainable: true
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions:
        - 4
        - 2
        - 1
        num_res_blocks: 2
        channel_mult:
        - 1
        - 2
        - 4
        - 4
        num_heads: 8
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 1280
        use_checkpoint: true
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.BERTEmbedder
      params:
        n_embed: 1280
        n_layer: 32
@@ -1,29 +1,36 @@
 # This file describes the alternative machine learning models
-# available to the dream script.
+# available to InvokeAI script.
 #
 # To add a new model, follow the examples below. Each
 # model requires a model config file, a weights file,
 # and the width and height of the images it
 # was trained on.
 stable-diffusion-1.4:
-    config: configs/stable-diffusion/v1-inference.yaml
-    weights: models/ldm/stable-diffusion-v1/model.ckpt
-    # vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
-    description: Stable Diffusion inference model version 1.4
-    width: 512
-    height: 512
-    default: true
-inpainting-1.5:
-    description: runwayML tuned inpainting model v1.5
-    weights: models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt
-    config: configs/stable-diffusion/v1-inpainting-inference.yaml
-    # vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+    config: ./configs/stable-diffusion/v1-inference.yaml
+    weights: ./models/ldm/stable-diffusion-v1/sd-v1-4.ckpt
+    vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+    description: The original Stable Diffusion version 1.4 weight file (4.27 GB)
     width: 512
     height: 512
 stable-diffusion-1.5:
-    config: configs/stable-diffusion/v1-inference.yaml
-    weights: models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt
-    # vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
-    description: Stable Diffusion inference model version 1.5
+    description: The newest Stable Diffusion version 1.5 weight file (4.27 GB)
+    weights: ./models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt
+    config: ./configs/stable-diffusion/v1-inference.yaml
     width: 512
     height: 512
+    vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+    default: true
+inpainting-1.5:
+    description: RunwayML SD 1.5 model optimized for inpainting (4.27 GB)
+    weights: ./models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt
+    config: ./configs/stable-diffusion/v1-inpainting-inference.yaml
+    width: 512
+    height: 512
+    vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+waifu-diffusion-1.3:
+    description: Stable Diffusion 1.4 fine tuned on anime-styled images (4.27)
+    weights: ./models/ldm/stable-diffusion-v1/model-epoch09-float32.ckpt
+    config: ./configs/stable-diffusion/v1-inference.yaml
+    width: 512
+    height: 512
+    vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
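The reworked `configs/models.yaml` above is plain YAML keyed by model name, with exactly one stanza carrying `default: true`. A minimal sketch of picking out that default entry, assuming PyYAML is installed and the file lives at the path shown (the helper name is hypothetical, not part of InvokeAI):

```python
from pathlib import Path

import yaml  # PyYAML

def default_model(path="configs/models.yaml"):
    """Return (name, stanza) for the entry marked `default: true`,
    falling back to the first stanza if none is marked."""
    entries = yaml.safe_load(Path(path).read_text()) or {}
    for name, stanza in entries.items():
        if stanza.get("default"):
            return name, stanza
    return next(iter(entries.items()), (None, None))

name, stanza = default_model()
print(name, stanza.get("weights") if stanza else None)
```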
@@ -1,68 +0,0 @@
model:
  base_learning_rate: 0.0001
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.0015
    linear_end: 0.015
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: jpg
    cond_stage_key: nix
    image_size: 48
    channels: 16
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_by_std: false
    scale_factor: 0.22765929
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 48
        in_channels: 16
        out_channels: 16
        model_channels: 448
        attention_resolutions:
        - 4
        - 2
        - 1
        num_res_blocks: 2
        channel_mult:
        - 1
        - 2
        - 3
        - 4
        use_scale_shift_norm: false
        resblock_updown: false
        num_head_channels: 32
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: true
    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        monitor: val/rec_loss
        embed_dim: 16
        ddconfig:
          double_z: true
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 1
          - 2
          - 2
          - 4
          num_res_blocks: 2
          attn_resolutions:
          - 16
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config:
      target: torch.nn.Identity
@@ -385,7 +385,7 @@ automatically.
 Example:
 
 <pre>
-invoke> <b>!import_model models/ldm/stable-diffusion-v1/ model-epoch08-float16.ckpt</b>
+invoke> <b>!import_model models/ldm/stable-diffusion-v1/model-epoch08-float16.ckpt</b>
 >> Model import in process. Please enter the values needed to configure this model:
 
 Name for this model: <b>waifu-diffusion</b>
docs/installation/INSTALLING_MODELS.md (new file, 267 lines added)
@@ -0,0 +1,267 @@
---
title: Installing Models
---

# :octicons-paintbrush-16: Installing Models

## Model Weight Files

The model weight files ('*.ckpt') are the Stable Diffusion "secret
sauce". They are the product of training the AI on millions of
captioned images gathered from multiple sources.

Originally there was only a single Stable Diffusion weights file,
which many people named `model.ckpt`. Now there are dozens or more
that have been "fine tuned" to provide particular styles, genres, or
other features. InvokeAI allows you to install and run multiple model
weight files and switch between them quickly in the command-line and
web interfaces.

This manual will guide you through installing and configuring model
weight files.

## Base Models

InvokeAI comes with support for a good initial set of models listed in
the model configuration file `configs/models.yaml`. They are:

| Model                | Weight File                       | Description                                                | DOWNLOAD FROM |
| -------------------- | --------------------------------- | ---------------------------------------------------------- | ------------- |
| stable-diffusion-1.5 | v1-5-pruned-emaonly.ckpt          | Most recent version of base Stable Diffusion model         | https://huggingface.co/runwayml/stable-diffusion-v1-5 |
| stable-diffusion-1.4 | sd-v1-4.ckpt                      | Previous version of base Stable Diffusion model            | https://huggingface.co/CompVis/stable-diffusion-v-1-4-original |
| inpainting-1.5       | sd-v1-5-inpainting.ckpt           | Stable Diffusion 1.5 model specialized for inpainting      | https://huggingface.co/runwayml/stable-diffusion-inpainting |
| waifu-diffusion-1.3  | model-epoch09-float32.ckpt        | Stable Diffusion 1.4 trained to produce anime images       | https://huggingface.co/hakurei/waifu-diffusion-v1-3 |
| <all models>         | vae-ft-mse-840000-ema-pruned.ckpt | A fine-tune add-on file that improves face generation      | https://huggingface.co/stabilityai/sd-vae-ft-mse-original/ |

Note that these files are covered by an "Ethical AI" license which
forbids certain uses. You will need to create an account on the
Hugging Face website and accept the license terms before you can
access the files.

The predefined configuration file for InvokeAI (located at
`configs/models.yaml`) provides entries for each of these weights
files. `stable-diffusion-1.5` is the default model used, and we
strongly recommend that you install this weights file if nothing else.

## Community-Contributed Models

There are too many to list here and more are being contributed every
day. Hugging Face maintains a [fast-growing
repository](https://huggingface.co/sd-concepts-library) of fine-tune
(".bin") models that can be imported into InvokeAI by passing the
`--embedding_path` option to the `invoke.py` command.

[This page](https://rentry.org/sdmodels) hosts a large list of
official and unofficial Stable Diffusion models and where they can be
obtained.

## Installation

There are three ways to install weights files:

1. During InvokeAI installation, the `preload_models.py` script can
   download them for you.

2. You can use the command-line interface (CLI) to import, configure
   and modify new model files.

3. You can download the files manually and add the appropriate entries
   to `models.yaml`.

### Installation via `preload_models.py`

This is the most automatic way. Run `scripts/preload_models.py` from
the console. It will ask you to select which models to download and
lead you through the steps of setting up a Hugging Face account if you
haven't done so already.

To start, from within the InvokeAI directory run the command `python
scripts/preload_models.py` (Linux/MacOS) or `python
scripts\preload_models.py` (Windows):

```
Loading Python libraries...

** INTRODUCTION **
Welcome to InvokeAI. This script will help download the Stable Diffusion weight files
and other large models that are needed for text to image generation. At any point you may interrupt
this program and resume later.

** WEIGHT SELECTION **
Would you like to download the Stable Diffusion model weights now? [y]

Choose the weight file(s) you wish to download. Before downloading you
will be given the option to view and change your selections.

[1] stable-diffusion-1.5:
    The newest Stable Diffusion version 1.5 weight file (4.27 GB) (recommended)
    Download? [y]
[2] inpainting-1.5:
    RunwayML SD 1.5 model optimized for inpainting (4.27 GB) (recommended)
    Download? [y]
[3] stable-diffusion-1.4:
    The original Stable Diffusion version 1.4 weight file (4.27 GB)
    Download? [n] n
[4] waifu-diffusion-1.3:
    Stable Diffusion 1.4 fine tuned on anime-styled images (4.27)
    Download? [n] y
[5] ft-mse-improved-autoencoder-840000:
    StabilityAI improved autoencoder fine-tuned for human faces (recommended; 335 MB) (recommended)
    Download? [y] y
The following weight files will be downloaded:
   [1] stable-diffusion-1.5*
   [2] inpainting-1.5
   [4] waifu-diffusion-1.3
   [5] ft-mse-improved-autoencoder-840000
*default
Ok to download? [y]
** LICENSE AGREEMENT FOR WEIGHT FILES **

1. To download the Stable Diffusion weight files you need to read and accept the
   CreativeML Responsible AI license. If you have not already done so, please
   create an account using the "Sign Up" button:

   https://huggingface.co

   You will need to verify your email address as part of the HuggingFace
   registration process.

2. After creating the account, login under your account and accept
   the license terms located here:

   https://huggingface.co/CompVis/stable-diffusion-v-1-4-original

Press <enter> when you are ready to continue:
...
```

When the script is complete, you will find the downloaded weights
files in `models/ldm/stable-diffusion-v1` and a matching configuration
file in `configs/models.yaml`.

You can run the script again to add any models you didn't select the
first time. Note that as a safety measure the script will _never_
remove a previously-installed weights file. You will have to do this
manually.

### Installation via the CLI

You can install a new model, including any of the community-supported
ones, via the command-line client's `!import_model` command.

1. First download the desired model weights file and place it under
   `models/ldm/stable-diffusion-v1/`. You may rename the weights file to
   something more memorable if you wish. Record the path of the weights
   file (e.g. `models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt`)

2. Launch the `invoke.py` CLI with `python scripts/invoke.py`.

3. At the `invoke>` command-line, enter the command
   `!import_model <path to model>`. For example:

   `invoke> !import_model models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt`

   (Hint - the CLI supports file path autocompletion. Type a bit of the path
   name and hit <tab> in order to get a choice of possible completions.)

4. Follow the wizard's instructions to complete installation as shown in the example
   here:

```
invoke> <b>!import_model models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt</b>
>> Model import in process. Please enter the values needed to configure this model:

Name for this model: <b>arabian-nights</b>
Description of this model: <b>Arabian Nights Fine Tune v1.0</b>
Configuration file for this model: <b>configs/stable-diffusion/v1-inference.yaml</b>
Default image width: <b>512</b>
Default image height: <b>512</b>
>> New configuration:
arabian-nights:
    config: configs/stable-diffusion/v1-inference.yaml
    description: Arabian Nights Fine Tune v1.0
    height: 512
    weights: models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt
    width: 512
OK to import [n]? <b>y</b>
>> Caching model stable-diffusion-1.4 in system RAM
>> Loading waifu-diffusion from models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt
   | LatentDiffusion: Running in eps-prediction mode
   | DiffusionWrapper has 859.52 M params.
   | Making attention of type 'vanilla' with 512 in_channels
   | Working with z of shape (1, 4, 32, 32) = 4096 dimensions.
   | Making attention of type 'vanilla' with 512 in_channels
   | Using faster float16 precision
```

If you've previously installed the fine-tune VAE file `vae-ft-mse-840000-ema-pruned.ckpt`,
the wizard will also ask you if you want to add this VAE to the model.

The appropriate entry for this model will be added to `configs/models.yaml` and it will
be available to use in the CLI immediately.

The CLI has additional commands for switching among, viewing, editing, and
deleting the available models. These are described in [Command Line
Client](../features/CLI.md#model-selection-and-importation), but the two most
frequently-used are `!models` and `!switch <name of model>`. The first
prints a table of models that InvokeAI knows about and their load
status. The second will load the requested model and lets you switch
back and forth quickly among loaded models.

### Manually editing `configs/models.yaml`

If you are comfortable with a text editor then you may simply edit
`models.yaml` directly.

First you need to download the desired .ckpt file and place it in
`models/ldm/stable-diffusion-v1` as described in step #1 in the
previous section. Record the path to the weights file,
e.g. `models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt`

Then using a **text** editor (e.g. the Windows Notepad application),
open the file `configs/models.yaml`, and add a new stanza that follows
this example:

```
arabian-nights-1.0:
    description: A great fine-tune in Arabian Nights style
    weights: ./models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt
    config: ./configs/stable-diffusion/v1-inference.yaml
    width: 512
    height: 512
    vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
    default: false
```

* arabian-nights-1.0
  - This is the name of the model that you will refer to from within the
    CLI and the WebGUI when you need to load and use the model.

* description
  - Any description that you want to add to the model to remind you what
    it is.

* weights
  - Relative path to the .ckpt weights file for this model.

* config
  - This is the confusingly-named configuration file for the model itself.
    Use `./configs/stable-diffusion/v1-inference.yaml` unless the model happens
    to need a custom configuration, in which case the place you downloaded it
    from will tell you what to use instead. For example, the runwayML custom
    inpainting model requires the file `configs/stable-diffusion/v1-inpainting-inference.yaml`.
    This is already included in the InvokeAI distribution and is configured automatically
    for you by the `preload_models.py` script.

* vae
  - If you want to add a VAE file to the model, then enter its path here.

* width, height
  - This is the width and height of the images used to train the model.
    Currently they are always 512 and 512.

Save the `models.yaml` and relaunch InvokeAI. The new model should now be
available for your use.
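The field-by-field description above also translates into a quick sanity check before relaunching. A minimal sketch (a hypothetical helper, not part of InvokeAI; assumes PyYAML and paths relative to the InvokeAI root):

```python
from pathlib import Path

import yaml  # PyYAML

def check_models_yaml(path="configs/models.yaml", root="."):
    """Report obviously broken stanzas: missing weights/config/vae files
    and absent width/height fields."""
    entries = yaml.safe_load(Path(path).read_text()) or {}
    for name, stanza in entries.items():
        for key in ("weights", "config", "vae"):
            value = stanza.get(key)
            if value and not (Path(root) / value).exists():
                print(f"{name}: {key} file not found: {value}")
        if "width" not in stanza or "height" not in stanza:
            print(f"{name}: missing width/height (normally 512 and 512)")

if __name__ == "__main__":
    check_models_yaml()
```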
@@ -1,5 +1,5 @@
 ---
-title: Linux
+title: Manual Installation, Linux
 ---
 
 # :fontawesome-brands-linux: Linux
@@ -63,24 +63,16 @@ title: Linux
    model loading scheme to allow the script to work on GPU machines that are not
    internet connected. See [Preload Models](../features/OTHER.md#preload-models)
 
-7. Now you need to install the weights for the stable diffusion model.
+7. Install the weights for the stable diffusion model.
 
-   - For running with the released weights, you will first need to set up an acount
-     with [Hugging Face](https://huggingface.co).
-   - Use your credentials to log in, and then point your browser [here](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original).
-   - You may be asked to sign a license agreement at this point.
-   - Click on "Files and versions" near the top of the page, and then click on the
-     file named "sd-v1-4.ckpt". You'll be taken to a page that prompts you to click
-     the "download" link. Save the file somewhere safe on your local machine.
+   - Sign up at https://huggingface.co
+   - Go to the [Stable diffusion diffusion model page](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original)
+   - Accept the terms and click Access Repository
+   - Download [v1-5-pruned-emaonly.ckpt (4.27 GB)](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt)
+     and move it into this directory under `models/ldm/stable_diffusion_v1/v1-5-pruned-emaonly.ckpt`
 
-   Now run the following commands from within the stable-diffusion directory.
-   This will create a symbolic link from the stable-diffusion model.ckpt file, to
-   the true location of the `sd-v1-4.ckpt` file.
-
-   ```bash
-   (invokeai) ~/InvokeAI$ mkdir -p models/ldm/stable-diffusion-v1
-   (invokeai) ~/InvokeAI$ ln -sf /path/to/sd-v1-4.ckpt models/ldm/stable-diffusion-v1/model.ckpt
-   ```
+   There are many other models that you can use. Please see [../features/INSTALLING_MODELS.md]
+   for details.
 
 8. Start generating images!
 
@@ -1,5 +1,5 @@
 ---
-title: macOS
+title: Manual Installation, macOS
 ---
 
 # :fontawesome-brands-apple: macOS
@@ -24,9 +24,15 @@ First you need to download a large checkpoint file.
 1. Sign up at https://huggingface.co
 2. Go to the [Stable diffusion diffusion model page](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original)
 3. Accept the terms and click Access Repository
-4. Download [sd-v1-4.ckpt (4.27 GB)](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/blob/main/sd-v1-4.ckpt) and note where you have saved it (probably the Downloads folder). You may want to move it somewhere else for longer term storage - SD needs this file to run.
+4. Download [v1-5-pruned-emaonly.ckpt (4.27 GB)](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt)
+   and move it into this directory under `models/ldm/stable_diffusion_v1/v1-5-pruned-emaonly.ckpt`
 
-While that is downloading, open Terminal and run the following commands one at a time, reading the comments and taking care to run the appropriate command for your Mac's architecture (Intel or M1).
+There are many other models that you can try. Please see [../features/INSTALLING_MODELS.md]
+for details.
+
+While that is downloading, open Terminal and run the following
+commands one at a time, reading the comments and taking care to run
+the appropriate command for your Mac's architecture (Intel or M1).
 
 !!! todo "Homebrew"
 
@@ -1,5 +1,5 @@
 ---
-title: Windows
+title: Manual Installation, Windows
 ---
 
 # :fontawesome-brands-windows: Windows
@@ -83,23 +83,14 @@ in the wiki
 
 8. Now you need to install the weights for the big stable diffusion model.
 
-   1. For running with the released weights, you will first need to set up an acount with Hugging Face (https://huggingface.co).
-   2. Use your credentials to log in, and then point your browser at https://huggingface.co/CompVis/stable-diffusion-v-1-4-original.
-   3. You may be asked to sign a license agreement at this point.
-   4. Click on "Files and versions" near the top of the page, and then click on the file named `sd-v1-4.ckpt`. You'll be taken to a page that
      prompts you to click the "download" link. Now save the file somewhere safe on your local machine.
-   5. The weight file is >4 GB in size, so
      downloading may take a while.
+   - Sign up at https://huggingface.co
+   - Go to the [Stable diffusion diffusion model page](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original)
+   - Accept the terms and click Access Repository
+   - Download [v1-5-pruned-emaonly.ckpt (4.27 GB)](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt)
+     and move it into this directory under `models/ldm/stable_diffusion_v1/v1-5-pruned-emaonly.ckpt`
 
-   Now run the following commands from **within the InvokeAI directory** to copy the weights file to the right place:
-
-   ```batch
-   mkdir -p models\ldm\stable-diffusion-v1
-   copy C:\path\to\sd-v1-4.ckpt models\ldm\stable-diffusion-v1\model.ckpt
-   ```
-
-   Please replace `C:\path\to\sd-v1.4.ckpt` with the correct path to wherever you stashed this file. If you prefer not to copy or move the .ckpt file,
-   you may instead create a shortcut to it from within `models\ldm\stable-diffusion-v1\`.
+   There are many other models that you can use. Please see [../features/INSTALLING_MODELS.md]
+   for details.
 
 9. Start generating images!
 
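All three manual-installation pages above now come down to the same download-and-place step. A minimal sketch of scripting that step, assuming the `huggingface_hub` package, an access token exported as `HF_TOKEN`, and that the model license has already been accepted on the Hugging Face website (repo and filename are the ones named in the docs):

```python
import os
import shutil
from pathlib import Path

from huggingface_hub import hf_hub_download  # assumption: huggingface_hub is installed

# Fetch the checkpoint named in the installation docs and place it where
# configs/models.yaml expects to find it.
cached = hf_hub_download(
    repo_id="runwayml/stable-diffusion-v1-5",
    filename="v1-5-pruned-emaonly.ckpt",
    token=os.environ.get("HF_TOKEN"),
)
target = Path("models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt")
target.parent.mkdir(parents=True, exist_ok=True)
shutil.copy(cached, target)
print(f"weights installed at {target}")
```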
@@ -227,11 +227,14 @@ class ModelCache(object):
             print(' | Using more accurate float32 precision')
 
         # look and load a matching vae file. Code borrowed from AUTOMATIC1111 modules/sd_models.py
-        if vae and os.path.exists(vae):
+        if vae:
+            if os.path.exists(vae):
                 print(f' | Loading VAE weights from: {vae}')
                 vae_ckpt = torch.load(vae, map_location="cpu")
                 vae_dict = {k: v for k, v in vae_ckpt["state_dict"].items() if k[0:4] != "loss"}
                 model.first_stage_model.load_state_dict(vae_dict, strict=False)
+            else:
+                print(f' | VAE file {vae} not found. Skipping.')
 
         model.to(self.device)
         # model.to doesn't change the cond_stage_model.device used to move the tokenizer output, so set it here
@@ -281,7 +284,7 @@ class ModelCache(object):
         Returns the preamble for the config file.
         '''
         return '''# This file describes the alternative machine learning models
-# available to the dream script.
+# available to InvokeAI script.
 #
 # To add a new model, follow the examples below. Each
 # model requires a model config file, a weights file,
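The VAE-substitution logic in the first hunk above is self-contained enough to exercise on its own. A minimal sketch of the same pattern as a standalone function (hypothetical helper, assuming PyTorch and a loaded model whose `first_stage_model` accepts the VAE state dict):

```python
import os

import torch

def load_vae_weights(model, vae_path):
    """Overlay a fine-tuned VAE checkpoint onto model.first_stage_model,
    mirroring ModelCache: skip quietly when no path or a missing file is
    given, and drop the discriminator/loss keys before loading."""
    if not vae_path:
        return
    if not os.path.exists(vae_path):
        print(f' | VAE file {vae_path} not found. Skipping.')
        return
    print(f' | Loading VAE weights from: {vae_path}')
    vae_ckpt = torch.load(vae_path, map_location="cpu")
    vae_dict = {k: v for k, v in vae_ckpt["state_dict"].items() if not k.startswith("loss")}
    model.first_stage_model.load_state_dict(vae_dict, strict=False)
```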
@@ -1,44 +0,0 @@
model:
  base_learning_rate: 4.5e-06
  target: ldm.models.autoencoder.AutoencoderKL
  params:
    monitor: val/rec_loss
    embed_dim: 16
    lossconfig:
      target: ldm.modules.losses.LPIPSWithDiscriminator
      params:
        disc_start: 50001
        kl_weight: 1.0e-06
        disc_weight: 0.5
    ddconfig:
      double_z: true
      z_channels: 16
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult:
      - 1
      - 1
      - 2
      - 2
      - 4
      num_res_blocks: 2
      attn_resolutions:
      - 16
      dropout: 0.0
data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 6
    wrap: true
    train:
      target: ldm.data.openimages.FullOpenImagesTrain
      params:
        size: 384
        crop_size: 256
    validation:
      target: ldm.data.openimages.FullOpenImagesValidation
      params:
        size: 384
        crop_size: 256
@@ -1,46 +0,0 @@
model:
  base_learning_rate: 4.5e-06
  target: ldm.models.autoencoder.AutoencoderKL
  params:
    monitor: val/rec_loss
    embed_dim: 64
    lossconfig:
      target: ldm.modules.losses.LPIPSWithDiscriminator
      params:
        disc_start: 50001
        kl_weight: 1.0e-06
        disc_weight: 0.5
    ddconfig:
      double_z: true
      z_channels: 64
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult:
      - 1
      - 1
      - 2
      - 2
      - 4
      - 4
      num_res_blocks: 2
      attn_resolutions:
      - 16
      - 8
      dropout: 0.0
data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 6
    wrap: true
    train:
      target: ldm.data.openimages.FullOpenImagesTrain
      params:
        size: 384
        crop_size: 256
    validation:
      target: ldm.data.openimages.FullOpenImagesValidation
      params:
        size: 384
        crop_size: 256
@@ -1,41 +0,0 @@
model:
  base_learning_rate: 4.5e-06
  target: ldm.models.autoencoder.AutoencoderKL
  params:
    monitor: val/rec_loss
    embed_dim: 3
    lossconfig:
      target: ldm.modules.losses.LPIPSWithDiscriminator
      params:
        disc_start: 50001
        kl_weight: 1.0e-06
        disc_weight: 0.5
    ddconfig:
      double_z: true
      z_channels: 3
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult:
      - 1
      - 2
      - 4
      num_res_blocks: 2
      attn_resolutions: []
      dropout: 0.0
data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 10
    wrap: true
    train:
      target: ldm.data.openimages.FullOpenImagesTrain
      params:
        size: 384
        crop_size: 256
    validation:
      target: ldm.data.openimages.FullOpenImagesValidation
      params:
        size: 384
        crop_size: 256
@@ -1,42 +0,0 @@
model:
  base_learning_rate: 4.5e-06
  target: ldm.models.autoencoder.AutoencoderKL
  params:
    monitor: val/rec_loss
    embed_dim: 4
    lossconfig:
      target: ldm.modules.losses.LPIPSWithDiscriminator
      params:
        disc_start: 50001
        kl_weight: 1.0e-06
        disc_weight: 0.5
    ddconfig:
      double_z: true
      z_channels: 4
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult:
      - 1
      - 2
      - 4
      - 4
      num_res_blocks: 2
      attn_resolutions: []
      dropout: 0.0
data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 4
    wrap: true
    train:
      target: ldm.data.openimages.FullOpenImagesTrain
      params:
        size: 384
        crop_size: 256
    validation:
      target: ldm.data.openimages.FullOpenImagesValidation
      params:
        size: 384
        crop_size: 256
@@ -1,49 +0,0 @@
model:
  base_learning_rate: 4.5e-06
  target: ldm.models.autoencoder.VQModel
  params:
    embed_dim: 8
    n_embed: 16384
    ddconfig:
      double_z: false
      z_channels: 8
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult:
      - 1
      - 1
      - 2
      - 2
      - 4
      num_res_blocks: 2
      attn_resolutions:
      - 16
      dropout: 0.0
    lossconfig:
      target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
      params:
        disc_conditional: false
        disc_in_channels: 3
        disc_start: 250001
        disc_weight: 0.75
        disc_num_layers: 2
        codebook_weight: 1.0

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 14
    num_workers: 20
    wrap: true
    train:
      target: ldm.data.openimages.FullOpenImagesTrain
      params:
        size: 384
        crop_size: 256
    validation:
      target: ldm.data.openimages.FullOpenImagesValidation
      params:
        size: 384
        crop_size: 256
@ -1,46 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 4.5e-06
|
|
||||||
target: ldm.models.autoencoder.VQModel
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
monitor: val/rec_loss
|
|
||||||
|
|
||||||
ddconfig:
|
|
||||||
attn_type: none
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
|
|
||||||
params:
|
|
||||||
disc_conditional: false
|
|
||||||
disc_in_channels: 3
|
|
||||||
disc_start: 11
|
|
||||||
disc_weight: 0.75
|
|
||||||
codebook_weight: 1.0
|
|
||||||
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 8
|
|
||||||
num_workers: 12
|
|
||||||
wrap: true
|
|
||||||
train:
|
|
||||||
target: ldm.data.openimages.FullOpenImagesTrain
|
|
||||||
params:
|
|
||||||
crop_size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.openimages.FullOpenImagesValidation
|
|
||||||
params:
|
|
||||||
crop_size: 256
|
|
@ -1,45 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 4.5e-06
|
|
||||||
target: ldm.models.autoencoder.VQModel
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
monitor: val/rec_loss
|
|
||||||
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
|
|
||||||
params:
|
|
||||||
disc_conditional: false
|
|
||||||
disc_in_channels: 3
|
|
||||||
disc_start: 0
|
|
||||||
disc_weight: 0.75
|
|
||||||
codebook_weight: 1.0
|
|
||||||
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 8
|
|
||||||
num_workers: 16
|
|
||||||
wrap: true
|
|
||||||
train:
|
|
||||||
target: ldm.data.openimages.FullOpenImagesTrain
|
|
||||||
params:
|
|
||||||
crop_size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.openimages.FullOpenImagesValidation
|
|
||||||
params:
|
|
||||||
crop_size: 256
|
|
@ -1,48 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 4.5e-06
|
|
||||||
target: ldm.models.autoencoder.VQModel
|
|
||||||
params:
|
|
||||||
embed_dim: 4
|
|
||||||
n_embed: 256
|
|
||||||
monitor: val/rec_loss
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 4
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions:
|
|
||||||
- 32
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
|
|
||||||
params:
|
|
||||||
disc_conditional: false
|
|
||||||
disc_in_channels: 3
|
|
||||||
disc_start: 250001
|
|
||||||
disc_weight: 0.75
|
|
||||||
codebook_weight: 1.0
|
|
||||||
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 10
|
|
||||||
num_workers: 20
|
|
||||||
wrap: true
|
|
||||||
train:
|
|
||||||
target: ldm.data.openimages.FullOpenImagesTrain
|
|
||||||
params:
|
|
||||||
size: 384
|
|
||||||
crop_size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.openimages.FullOpenImagesValidation
|
|
||||||
params:
|
|
||||||
size: 384
|
|
||||||
crop_size: 256
|
|
@ -1,48 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 4.5e-06
|
|
||||||
target: ldm.models.autoencoder.VQModel
|
|
||||||
params:
|
|
||||||
embed_dim: 4
|
|
||||||
n_embed: 16384
|
|
||||||
monitor: val/rec_loss
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 4
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions:
|
|
||||||
- 32
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
|
|
||||||
params:
|
|
||||||
disc_conditional: false
|
|
||||||
disc_in_channels: 3
|
|
||||||
disc_num_layers: 2
|
|
||||||
disc_start: 1
|
|
||||||
disc_weight: 0.6
|
|
||||||
codebook_weight: 1.0
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 10
|
|
||||||
num_workers: 20
|
|
||||||
wrap: true
|
|
||||||
train:
|
|
||||||
target: ldm.data.openimages.FullOpenImagesTrain
|
|
||||||
params:
|
|
||||||
size: 384
|
|
||||||
crop_size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.openimages.FullOpenImagesValidation
|
|
||||||
params:
|
|
||||||
size: 384
|
|
||||||
crop_size: 256
|
|
@ -1,80 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 1.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0155
|
|
||||||
log_every_t: 100
|
|
||||||
timesteps: 1000
|
|
||||||
loss_type: l2
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: LR_image
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
concat_mode: true
|
|
||||||
cond_stage_trainable: false
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 6
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 160
|
|
||||||
attention_resolutions:
|
|
||||||
- 16
|
|
||||||
- 8
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
monitor: val/rec_loss
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 64
|
|
||||||
wrap: false
|
|
||||||
num_workers: 12
|
|
||||||
train:
|
|
||||||
target: ldm.data.openimages.SuperresOpenImagesAdvancedTrain
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
degradation: bsrgan_light
|
|
||||||
downscale_f: 4
|
|
||||||
min_crop_f: 0.5
|
|
||||||
max_crop_f: 1.0
|
|
||||||
random_crop: true
|
|
||||||
validation:
|
|
||||||
target: ldm.data.openimages.SuperresOpenImagesAdvancedValidation
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
degradation: bsrgan_light
|
|
||||||
downscale_f: 4
|
|
||||||
min_crop_f: 0.5
|
|
||||||
max_crop_f: 1.0
|
|
||||||
random_crop: true
|
|
@ -1,70 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 2.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0195
|
|
||||||
num_timesteps_cond: 1
|
|
||||||
log_every_t: 200
|
|
||||||
timesteps: 1000
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: class_label
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
cond_stage_trainable: false
|
|
||||||
concat_mode: false
|
|
||||||
monitor: val/loss
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 3
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 224
|
|
||||||
attention_resolutions:
|
|
||||||
- 8
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 3
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config: __is_unconditional__
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 48
|
|
||||||
num_workers: 5
|
|
||||||
wrap: false
|
|
||||||
train:
|
|
||||||
target: ldm.data.faceshq.CelebAHQTrain
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.faceshq.CelebAHQValidation
|
|
||||||
params:
|
|
||||||
size: 256
|
|
@ -1,80 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 1.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0195
|
|
||||||
num_timesteps_cond: 1
|
|
||||||
log_every_t: 200
|
|
||||||
timesteps: 1000
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: class_label
|
|
||||||
image_size: 32
|
|
||||||
channels: 4
|
|
||||||
cond_stage_trainable: true
|
|
||||||
conditioning_key: crossattn
|
|
||||||
monitor: val/loss_simple_ema
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 32
|
|
||||||
in_channels: 4
|
|
||||||
out_channels: 4
|
|
||||||
model_channels: 256
|
|
||||||
attention_resolutions:
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
- 1
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
use_spatial_transformer: true
|
|
||||||
transformer_depth: 1
|
|
||||||
context_dim: 512
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 4
|
|
||||||
n_embed: 16384
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 4
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions:
|
|
||||||
- 32
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config:
|
|
||||||
target: ldm.modules.encoders.modules.ClassEmbedder
|
|
||||||
params:
|
|
||||||
embed_dim: 512
|
|
||||||
key: class_label
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 64
|
|
||||||
num_workers: 12
|
|
||||||
wrap: false
|
|
||||||
train:
|
|
||||||
target: ldm.data.imagenet.ImageNetTrain
|
|
||||||
params:
|
|
||||||
config:
|
|
||||||
size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.imagenet.ImageNetValidation
|
|
||||||
params:
|
|
||||||
config:
|
|
||||||
size: 256
|
|
@ -1,70 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 2.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0195
|
|
||||||
num_timesteps_cond: 1
|
|
||||||
log_every_t: 200
|
|
||||||
timesteps: 1000
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: class_label
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
cond_stage_trainable: false
|
|
||||||
concat_mode: false
|
|
||||||
monitor: val/loss
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 3
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 224
|
|
||||||
attention_resolutions:
|
|
||||||
- 8
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 3
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config: __is_unconditional__
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 42
|
|
||||||
num_workers: 5
|
|
||||||
wrap: false
|
|
||||||
train:
|
|
||||||
target: ldm.data.faceshq.FFHQTrain
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.faceshq.FFHQValidation
|
|
||||||
params:
|
|
||||||
size: 256
|
|
@ -1,67 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 1.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0205
|
|
||||||
log_every_t: 100
|
|
||||||
timesteps: 1000
|
|
||||||
loss_type: l1
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: masked_image
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
concat_mode: true
|
|
||||||
monitor: val/loss
|
|
||||||
scheduler_config:
|
|
||||||
target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler
|
|
||||||
params:
|
|
||||||
verbosity_interval: 0
|
|
||||||
warm_up_steps: 1000
|
|
||||||
max_decay_steps: 50000
|
|
||||||
lr_start: 0.001
|
|
||||||
lr_max: 0.1
|
|
||||||
lr_min: 0.0001
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 7
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 256
|
|
||||||
attention_resolutions:
|
|
||||||
- 8
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 3
|
|
||||||
- 4
|
|
||||||
num_heads: 8
|
|
||||||
resblock_updown: true
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
monitor: val/rec_loss
|
|
||||||
ddconfig:
|
|
||||||
attn_type: none
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: ldm.modules.losses.contperceptual.DummyLoss
|
|
||||||
cond_stage_config: __is_first_stage__
|
|
@ -1,81 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 2.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0205
|
|
||||||
log_every_t: 100
|
|
||||||
timesteps: 1000
|
|
||||||
loss_type: l1
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: coordinates_bbox
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
conditioning_key: crossattn
|
|
||||||
cond_stage_trainable: true
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 3
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 128
|
|
||||||
attention_resolutions:
|
|
||||||
- 8
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 3
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
use_spatial_transformer: true
|
|
||||||
transformer_depth: 3
|
|
||||||
context_dim: 512
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
monitor: val/rec_loss
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config:
|
|
||||||
target: ldm.modules.encoders.modules.BERTEmbedder
|
|
||||||
params:
|
|
||||||
n_embed: 512
|
|
||||||
n_layer: 16
|
|
||||||
vocab_size: 8192
|
|
||||||
max_seq_len: 92
|
|
||||||
use_tokenizer: false
|
|
||||||
monitor: val/loss_simple_ema
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 24
|
|
||||||
wrap: false
|
|
||||||
num_workers: 10
|
|
||||||
train:
|
|
||||||
target: ldm.data.openimages.OpenImagesBBoxTrain
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.openimages.OpenImagesBBoxValidation
|
|
||||||
params:
|
|
||||||
size: 256
|
|
@ -1,70 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 2.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0195
|
|
||||||
num_timesteps_cond: 1
|
|
||||||
log_every_t: 200
|
|
||||||
timesteps: 1000
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: class_label
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
cond_stage_trainable: false
|
|
||||||
concat_mode: false
|
|
||||||
monitor: val/loss
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 3
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 224
|
|
||||||
attention_resolutions:
|
|
||||||
- 8
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 3
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config: __is_unconditional__
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 48
|
|
||||||
num_workers: 5
|
|
||||||
wrap: false
|
|
||||||
train:
|
|
||||||
target: ldm.data.lsun.LSUNBedroomsTrain
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.lsun.LSUNBedroomsValidation
|
|
||||||
params:
|
|
||||||
size: 256
|
|
@ -1,92 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 5.0e-05
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0155
|
|
||||||
num_timesteps_cond: 1
|
|
||||||
log_every_t: 200
|
|
||||||
timesteps: 1000
|
|
||||||
loss_type: l1
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: image
|
|
||||||
image_size: 32
|
|
||||||
channels: 4
|
|
||||||
cond_stage_trainable: false
|
|
||||||
concat_mode: false
|
|
||||||
scale_by_std: true
|
|
||||||
monitor: val/loss_simple_ema
|
|
||||||
scheduler_config:
|
|
||||||
target: ldm.lr_scheduler.LambdaLinearScheduler
|
|
||||||
params:
|
|
||||||
warm_up_steps:
|
|
||||||
- 10000
|
|
||||||
cycle_lengths:
|
|
||||||
- 10000000000000
|
|
||||||
f_start:
|
|
||||||
- 1.0e-06
|
|
||||||
f_max:
|
|
||||||
- 1.0
|
|
||||||
f_min:
|
|
||||||
- 1.0
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 32
|
|
||||||
in_channels: 4
|
|
||||||
out_channels: 4
|
|
||||||
model_channels: 192
|
|
||||||
attention_resolutions:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
- 8
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
- 4
|
|
||||||
num_heads: 8
|
|
||||||
use_scale_shift_norm: true
|
|
||||||
resblock_updown: true
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.AutoencoderKL
|
|
||||||
params:
|
|
||||||
embed_dim: 4
|
|
||||||
monitor: val/rec_loss
|
|
||||||
ddconfig:
|
|
||||||
double_z: true
|
|
||||||
z_channels: 4
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
|
|
||||||
cond_stage_config: '__is_unconditional__'
|
|
||||||
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 96
|
|
||||||
num_workers: 5
|
|
||||||
wrap: false
|
|
||||||
train:
|
|
||||||
target: ldm.data.lsun.LSUNChurchesTrain
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.lsun.LSUNChurchesValidation
|
|
||||||
params:
|
|
||||||
size: 256
|
|
@ -1,59 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 1.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0205
|
|
||||||
log_every_t: 100
|
|
||||||
timesteps: 1000
|
|
||||||
loss_type: l1
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: segmentation
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
concat_mode: true
|
|
||||||
cond_stage_trainable: true
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 6
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 128
|
|
||||||
attention_resolutions:
|
|
||||||
- 32
|
|
||||||
- 16
|
|
||||||
- 8
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 4
|
|
||||||
- 8
|
|
||||||
num_heads: 8
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config:
|
|
||||||
target: ldm.modules.encoders.modules.SpatialRescaler
|
|
||||||
params:
|
|
||||||
n_stages: 2
|
|
||||||
in_channels: 182
|
|
||||||
out_channels: 3
|
|
@ -1,78 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 1.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0205
|
|
||||||
log_every_t: 100
|
|
||||||
timesteps: 1000
|
|
||||||
loss_type: l1
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: segmentation
|
|
||||||
image_size: 128
|
|
||||||
channels: 3
|
|
||||||
concat_mode: true
|
|
||||||
cond_stage_trainable: true
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 128
|
|
||||||
in_channels: 6
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 128
|
|
||||||
attention_resolutions:
|
|
||||||
- 32
|
|
||||||
- 16
|
|
||||||
- 8
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 4
|
|
||||||
- 8
|
|
||||||
num_heads: 8
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
monitor: val/rec_loss
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config:
|
|
||||||
target: ldm.modules.encoders.modules.SpatialRescaler
|
|
||||||
params:
|
|
||||||
n_stages: 2
|
|
||||||
in_channels: 182
|
|
||||||
out_channels: 3
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 8
|
|
||||||
wrap: false
|
|
||||||
num_workers: 10
|
|
||||||
train:
|
|
||||||
target: ldm.data.landscapes.RFWTrain
|
|
||||||
params:
|
|
||||||
size: 768
|
|
||||||
crop_size: 512
|
|
||||||
segmentation_to_float32: true
|
|
||||||
validation:
|
|
||||||
target: ldm.data.landscapes.RFWValidation
|
|
||||||
params:
|
|
||||||
size: 768
|
|
||||||
crop_size: 512
|
|
||||||
segmentation_to_float32: true
|
|
2
models/ldm/stable-diffusion-v1/place-ckpt-files-here.txt
Normal file
@ -0,0 +1,2 @@
See docs/features/INSTALLING_MODELS.md for how to populate this
directory with one or more Stable Diffusion model weight files.
@ -1,77 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 2.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0195
|
|
||||||
num_timesteps_cond: 1
|
|
||||||
log_every_t: 200
|
|
||||||
timesteps: 1000
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: caption
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
cond_stage_trainable: true
|
|
||||||
conditioning_key: crossattn
|
|
||||||
monitor: val/loss_simple_ema
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 3
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 192
|
|
||||||
attention_resolutions:
|
|
||||||
- 8
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 3
|
|
||||||
- 5
|
|
||||||
num_head_channels: 32
|
|
||||||
use_spatial_transformer: true
|
|
||||||
transformer_depth: 1
|
|
||||||
context_dim: 640
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config:
|
|
||||||
target: ldm.modules.encoders.modules.BERTEmbedder
|
|
||||||
params:
|
|
||||||
n_embed: 640
|
|
||||||
n_layer: 32
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 28
|
|
||||||
num_workers: 10
|
|
||||||
wrap: false
|
|
||||||
train:
|
|
||||||
target: ldm.data.previews.pytorch_dataset.PreviewsTrain
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.previews.pytorch_dataset.PreviewsValidation
|
|
||||||
params:
|
|
||||||
size: 256
|
|
@ -3,20 +3,369 @@
|
|||||||
# Before running stable-diffusion on an internet-isolated machine,
# run this script from one with internet connectivity. The
# two machines must share a common .cache directory.
from transformers import CLIPTokenizer, CLIPTextModel
#
# Coauthor: Kevin Turner http://github.com/keturn
#
print('Loading Python libraries...\n')
import argparse
import clip
from transformers import BertTokenizerFast, AutoFeatureExtractor
import sys
import transformers
import os
import warnings
import torch
import urllib.request
import zipfile
import traceback
import getpass
import requests
from urllib import request
from tqdm import tqdm
from omegaconf import OmegaConf
from pathlib import Path
from transformers import CLIPTokenizer, CLIPTextModel
from transformers import BertTokenizerFast, AutoFeatureExtractor
from huggingface_hub import hf_hub_download, HfFolder, hf_hub_url

transformers.logging.set_verbosity_error()
#--------------------------globals--
Model_dir = './models/ldm/stable-diffusion-v1/'
Config_file = './configs/models.yaml'
SD_Configs = './configs/stable-diffusion'
Datasets = {
'stable-diffusion-1.5': {
'description': 'The newest Stable Diffusion version 1.5 weight file (4.27 GB)',
'repo_id': 'runwayml/stable-diffusion-v1-5',
'config': 'v1-inference.yaml',
'file': 'v1-5-pruned-emaonly.ckpt',
'recommended': True,
'width': 512,
'height': 512,
},
'inpainting-1.5': {
'description': 'RunwayML SD 1.5 model optimized for inpainting (4.27 GB)',
'repo_id': 'runwayml/stable-diffusion-inpainting',
'config': 'v1-inpainting-inference.yaml',
'file': 'sd-v1-5-inpainting.ckpt',
'recommended': True,
'width': 512,
'height': 512,
},
'stable-diffusion-1.4': {
'description': 'The original Stable Diffusion version 1.4 weight file (4.27 GB)',
'repo_id': 'CompVis/stable-diffusion-v-1-4-original',
'config': 'v1-inference.yaml',
'file': 'sd-v1-4.ckpt',
'recommended': False,
'width': 512,
'height': 512,
},
'waifu-diffusion-1.3': {
'description': 'Stable Diffusion 1.4 fine tuned on anime-styled images (4.27 GB)',
'repo_id': 'hakurei/waifu-diffusion-v1-3',
'config': 'v1-inference.yaml',
'file': 'model-epoch09-float32.ckpt',
'recommended': False,
'width': 512,
'height': 512,
},
'ft-mse-improved-autoencoder-840000': {
'description': 'StabilityAI improved autoencoder fine-tuned for human faces (recommended; 335 MB)',
'repo_id': 'stabilityai/sd-vae-ft-mse-original',
'config': 'VAE',
'file': 'vae-ft-mse-840000-ema-pruned.ckpt',
'recommended': True,
'width': 512,
'height': 512,
},
}
Config_preamble = '''# This file describes the alternative machine learning models
# available to InvokeAI script.
#
# To add a new model, follow the examples below. Each
# model requires a model config file, a weights file,
# and the width and height of the images it
# was trained on.
'''
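The `recommended` flags in the Datasets table above are what drive the default download set. A minimal sketch (illustrative, not part of the script) of how that set falls out of the table:

```python
# Mirrors the non-customized branch of select_datasets() further down:
# every entry flagged recommended=True is selected.
recommended = [name for name, info in Datasets.items() if info['recommended']]
# -> ['stable-diffusion-1.5', 'inpainting-1.5', 'ft-mse-improved-autoencoder-840000']
```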
#---------------------------------------------
def introduction():
print(
'''Welcome to InvokeAI. This script will help download the Stable Diffusion weight files
and other large models that are needed for text to image generation. At any point you may interrupt
this program and resume later.\n'''
)

#--------------------------------------------
def postscript():
print(
'''You're all set! You may now launch InvokeAI using one of these two commands:
Web version:

python scripts/invoke.py --web (connect to http://localhost:9090)

Command-line version:

python scripts/invoke.py

Have fun!
'''
)
#---------------------------------------------
def yes_or_no(prompt:str, default_yes=True):
default = "y" if default_yes else 'n'
response = input(f'{prompt} [{default}] ') or default
if default_yes:
return response[0] not in ('n','N')
else:
return response[0] in ('y','Y')
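A quick illustration of the default handling in yes_or_no (hypothetical prompts; an empty reply falls back to the default answer):

```python
yes_or_no('Download?')                 # <enter> -> True, 'n' -> False
yes_or_no('Quit?', default_yes=False)  # <enter> -> False, 'y' -> True
```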
#---------------------------------------------
def user_wants_to_download_weights()->str:
'''
Returns one of "skip", "recommended" or "customized"
'''
print('''You can download and configure the weights files manually or let this
script do it for you. Manual installation is described at:

https://github.com/invoke-ai/InvokeAI/blob/main/docs/installation/INSTALLING_MODELS.md

You may download the recommended models (about 10GB total), select a customized set, or
completely skip this step.
'''
)
selection = None
while selection is None:
choice = input('Download <r>ecommended models, <c>ustomize the list, or <s>kip this step? [r]: ')
if choice.startswith(('r','R')) or len(choice)==0:
selection = 'recommended'
elif choice.startswith(('c','C')):
selection = 'customized'
elif choice.startswith(('s','S')):
selection = 'skip'
return selection
#---------------------------------------------
def select_datasets(action:str):
done = False
while not done:
datasets = dict()
dflt = None # the first model selected will be the default; TODO let user change
counter = 1

if action == 'customized':
print('''
Choose the weight file(s) you wish to download. Before downloading you
will be given the option to view and change your selections.
'''
)
for ds in Datasets.keys():
recommended = '(recommended)' if Datasets[ds]['recommended'] else ''
print(f'[{counter}] {ds}:\n {Datasets[ds]["description"]} {recommended}')
if yes_or_no(' Download?',default_yes=Datasets[ds]['recommended']):
datasets[ds]=counter
counter += 1
else:
for ds in Datasets.keys():
if Datasets[ds]['recommended']:
datasets[ds]=counter
counter += 1

print('The following weight files will be downloaded:')
for ds in datasets:
dflt = '*' if dflt is None else ''
print(f' [{datasets[ds]}] {ds}{dflt}')
print("*default")
ok_to_download = yes_or_no('Ok to download?')
if not ok_to_download:
if yes_or_no('Change your selection?'):
action = 'customized'
pass
else:
done = True
else:
done = True
return datasets if ok_to_download else None
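For reference, assuming the recommended set is accepted as-is, the dict that select_datasets() hands on to download_weight_datasets() would look roughly like this (the counter only records menu position):

```python
models = {
    'stable-diffusion-1.5': 1,
    'inpainting-1.5': 2,
    'ft-mse-improved-autoencoder-840000': 3,
}
```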
#-------------------------------Authenticate against Hugging Face
def authenticate():
print('''
To download the Stable Diffusion weight files from the official Hugging Face
repository, you need to read and accept the CreativeML Responsible AI license.

This involves a few easy steps.

1. If you have not already done so, create an account on Hugging Face's web site
using the "Sign Up" button:

https://huggingface.co/join

You will need to verify your email address as part of the HuggingFace
registration process.

2. Log into your Hugging Face account:

https://huggingface.co/login

3. Accept the license terms located here:

https://huggingface.co/runwayml/stable-diffusion-v1-5

and here:

https://huggingface.co/runwayml/stable-diffusion-inpainting

(Yes, you have to accept two slightly different license agreements)
'''
)
input('Press <enter> when you are ready to continue:')
access_token = HfFolder.get_token()
if access_token is None:
print('''
4. Thank you! The last step is to enter your HuggingFace access token so that
this script is authorized to initiate the download. Go to the access tokens
page of your Hugging Face account and create a token by clicking the
"New token" button:

https://huggingface.co/settings/tokens

(You can enter anything you like in the token creation field marked "Name".
"Role" should be "read").

Now copy the token to your clipboard and paste it here: '''
)
access_token = getpass.getpass()
HfFolder.save_token(access_token)
return access_token
#---------------------------------------------
# look for legacy model.ckpt in models directory and offer to
# normalize its name
def migrate_models_ckpt():
if not os.path.exists(os.path.join(Model_dir,'model.ckpt')):
return
new_name = Datasets['stable-diffusion-1.4']['file']
print('You seem to have the Stable Diffusion v1.4 "model.ckpt" already installed.')
rename = yes_or_no(f'Ok to rename it to "{new_name}" for future reference?')
if rename:
print(f'model.ckpt => {new_name}')
os.rename(os.path.join(Model_dir,'model.ckpt'),os.path.join(Model_dir,new_name))

#---------------------------------------------
def download_weight_datasets(models:dict, access_token:str):
migrate_models_ckpt()
successful = dict()
for mod in models.keys():
repo_id = Datasets[mod]['repo_id']
filename = Datasets[mod]['file']
success = download_with_resume(
repo_id=repo_id,
model_name=filename,
access_token=access_token
)
if success:
successful[mod] = True
keys = ', '.join(successful.keys())
print(f'Successfully installed {keys}')
return successful

#---------------------------------------------
def download_with_resume(repo_id:str, model_name:str, access_token:str)->bool:

model_dest = os.path.join(Model_dir, model_name)
os.makedirs(os.path.dirname(model_dest), exist_ok=True)
url = hf_hub_url(repo_id, model_name)

header = {"Authorization": f'Bearer {access_token}'}
open_mode = 'wb'
exist_size = 0

if os.path.exists(model_dest):
exist_size = os.path.getsize(model_dest)
header['Range'] = f'bytes={exist_size}-'
open_mode = 'ab'

resp = requests.get(url, headers=header, stream=True)
total = int(resp.headers.get('content-length', 0))

if resp.status_code==416: # "range not satisfiable", which means nothing to return
print(f'* {model_name}: complete file found. Skipping.')
return True
elif exist_size > 0:
print(f'* {model_name}: partial file found. Resuming...')
else:
print(f'* {model_name}: Downloading...')

try:
if total < 2000:
print(f'* {model_name}: {resp.text}')
return False

with open(model_dest, open_mode) as file, tqdm(
desc=model_name,
initial=exist_size,
total=total+exist_size,
unit='iB',
unit_scale=True,
unit_divisor=1000,
) as bar:
for data in resp.iter_content(chunk_size=1024):
size = file.write(data)
bar.update(size)
except Exception as e:
print(f'An error occurred while downloading {model_name}: {str(e)}')
return False
return True
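A sketch of how the resume behaviour plays out (model names taken from the Datasets table; the token comes from authenticate()). If an earlier run left a partial file in Model_dir, the next call sends a `Range: bytes=<partial size>-` header and reopens the file in append mode, so the download continues instead of restarting:

```python
ok = download_with_resume(
    repo_id='runwayml/stable-diffusion-v1-5',
    model_name='v1-5-pruned-emaonly.ckpt',
    access_token=access_token,
)
```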
#---------------------------------------------
def update_config_file(successfully_downloaded:dict):
try:
yaml = new_config_file_contents(successfully_downloaded)
tmpfile = os.path.join(os.path.dirname(Config_file),'new_config.tmp')
with open(tmpfile, 'w') as outfile:
outfile.write(Config_preamble)
outfile.write(yaml)
os.rename(tmpfile,Config_file)
except Exception as e:
print(f'**Error creating config file {Config_file}: {str(e)} **')
return
print(f'Successfully created new configuration file {Config_file}')


#---------------------------------------------
def new_config_file_contents(successfully_downloaded:dict)->str:
conf = OmegaConf.load(Config_file)

# find the VAE file, if there is one
vae = None
default_selected = False

for model in successfully_downloaded:
if Datasets[model]['config'] == 'VAE':
vae = Datasets[model]['file']

for model in successfully_downloaded:
if Datasets[model]['config'] == 'VAE': # skip VAE entries
continue
stanza = conf[model] if model in conf else { }

stanza['description'] = Datasets[model]['description']
stanza['weights'] = os.path.join(Model_dir,Datasets[model]['file'])
stanza['config'] =os.path.join(SD_Configs, Datasets[model]['config'])
stanza['width'] = Datasets[model]['width']
stanza['height'] = Datasets[model]['height']
stanza.pop('default',None) # this will be set later
if vae:
stanza['vae'] = os.path.join(Model_dir,vae)
# BUG - the first stanza is always the default. User should select.
if not default_selected:
stanza['default'] = True
default_selected = True
conf[model] = stanza
return OmegaConf.to_yaml(conf)
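The stanza that new_config_file_contents() writes into configs/models.yaml for a downloaded model ends up with roughly this shape (an illustrative sketch using the stable-diffusion-1.5 entry plus the improved VAE; the actual text is produced by OmegaConf.to_yaml):

```python
conf['stable-diffusion-1.5'] = {
    'description': 'The newest Stable Diffusion version 1.5 weight file (4.27 GB)',
    'weights': './models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt',
    'config': './configs/stable-diffusion/v1-inference.yaml',
    'width': 512,
    'height': 512,
    'vae': './models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt',
    'default': True,  # the first stanza becomes the default (see the BUG note above)
}
```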
#---------------------------------------------
# this will preload the Bert tokenizer files
def download_bert():
@ -66,7 +415,6 @@ def download_gfpgan():
print(traceback.format_exc())

print('Loading models from GFPGAN')
import urllib.request
for model in (
[
'https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth',
@ -152,6 +500,31 @@ def download_safety_checker():

#-------------------------------------
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='InvokeAI model downloader')
parser.add_argument('--interactive',
dest='interactive',
action=argparse.BooleanOptionalAction,
default=True,
help='run in interactive mode (default)')
opt = parser.parse_args()

try:
if opt.interactive:
introduction()
print('** WEIGHT SELECTION **')
choice = user_wants_to_download_weights()
if choice != 'skip':
models = select_datasets(choice)
if models is None:
if yes_or_no('Quit?',default_yes=False):
sys.exit(0)
print('** LICENSE AGREEMENT FOR WEIGHT FILES **')
access_token = authenticate()
print('\n** DOWNLOADING WEIGHTS **')
successfully_downloaded = download_weight_datasets(models, access_token)
update_config_file(successfully_downloaded)
else:
print('\n** DOWNLOADING SUPPORT MODELS **')
download_bert()
download_kornia()
download_clip()
@ -159,5 +532,11 @@ if __name__ == '__main__':
download_codeformer()
download_clipseg()
download_safety_checker()
postscript()
except KeyboardInterrupt:
print('\nGoodbye! Come back soon.')
except Exception as e:
print(f'\nA problem occurred during download.\nThe error was: "{str(e)}"')
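Because `--interactive` is declared with argparse.BooleanOptionalAction and default=True, the parser also accepts the negated form, which is what a non-interactive environment (for example a CI job) would pass:

```python
opt = parser.parse_args([])                     # -> Namespace(interactive=True)
opt = parser.parse_args(['--no-interactive'])   # -> Namespace(interactive=False)
```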