diff --git a/.github/workflows/test-invoke-conda.yml b/.github/workflows/test-invoke-conda.yml index 5825d58c11..adac557382 100644 --- a/.github/workflows/test-invoke-conda.yml +++ b/.github/workflows/test-invoke-conda.yml @@ -84,7 +84,9 @@ jobs: - name: run preload_models.py id: run-preload-models - run: python scripts/preload_models.py + run: | + python scripts/preload_models.py \ + --no-interactive - name: Run the tests id: run-tests diff --git a/.gitignore b/.gitignore index ecef2713bc..33f0de4df2 100644 --- a/.gitignore +++ b/.gitignore @@ -199,7 +199,13 @@ checkpoints .scratch/ .vscode/ gfpgan/ -models/ldm/stable-diffusion-v1/model.sha256 +models/ldm/stable-diffusion-v1/*.sha256 # GFPGAN model files gfpgan/ + +# config file (will be created by installer) +configs/models.yaml + +# weights (will be created by installer) +models/ldm/stable-diffusion-v1/*.ckpt \ No newline at end of file diff --git a/configs/autoencoder/autoencoder_kl_16x16x16.yaml b/configs/autoencoder/autoencoder_kl_16x16x16.yaml deleted file mode 100644 index 5f1d10ec75..0000000000 --- a/configs/autoencoder/autoencoder_kl_16x16x16.yaml +++ /dev/null @@ -1,54 +0,0 @@ -model: - base_learning_rate: 4.5e-6 - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: "val/rec_loss" - embed_dim: 16 - lossconfig: - target: ldm.modules.losses.LPIPSWithDiscriminator - params: - disc_start: 50001 - kl_weight: 0.000001 - disc_weight: 0.5 - - ddconfig: - double_z: True - z_channels: 16 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: [ 1,1,2,2,4] # num_down = len(ch_mult)-1 - num_res_blocks: 2 - attn_resolutions: [16] - dropout: 0.0 - - -data: - target: main.DataModuleFromConfig - params: - batch_size: 12 - wrap: True - train: - target: ldm.data.imagenet.ImageNetSRTrain - params: - size: 256 - degradation: pil_nearest - validation: - target: ldm.data.imagenet.ImageNetSRValidation - params: - size: 256 - degradation: pil_nearest - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 1000 - max_images: 8 - increase_log_steps: True - - trainer: - benchmark: True - accumulate_grad_batches: 2 diff --git a/configs/autoencoder/autoencoder_kl_32x32x4.yaml b/configs/autoencoder/autoencoder_kl_32x32x4.yaml deleted file mode 100644 index ab8b36fe6e..0000000000 --- a/configs/autoencoder/autoencoder_kl_32x32x4.yaml +++ /dev/null @@ -1,53 +0,0 @@ -model: - base_learning_rate: 4.5e-6 - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: "val/rec_loss" - embed_dim: 4 - lossconfig: - target: ldm.modules.losses.LPIPSWithDiscriminator - params: - disc_start: 50001 - kl_weight: 0.000001 - disc_weight: 0.5 - - ddconfig: - double_z: True - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: [ 1,2,4,4 ] # num_down = len(ch_mult)-1 - num_res_blocks: 2 - attn_resolutions: [ ] - dropout: 0.0 - -data: - target: main.DataModuleFromConfig - params: - batch_size: 12 - wrap: True - train: - target: ldm.data.imagenet.ImageNetSRTrain - params: - size: 256 - degradation: pil_nearest - validation: - target: ldm.data.imagenet.ImageNetSRValidation - params: - size: 256 - degradation: pil_nearest - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 1000 - max_images: 8 - increase_log_steps: True - - trainer: - benchmark: True - accumulate_grad_batches: 2 diff --git a/configs/autoencoder/autoencoder_kl_64x64x3.yaml b/configs/autoencoder/autoencoder_kl_64x64x3.yaml deleted file mode 100644 index 
5e3db5c4e2..0000000000 --- a/configs/autoencoder/autoencoder_kl_64x64x3.yaml +++ /dev/null @@ -1,54 +0,0 @@ -model: - base_learning_rate: 4.5e-6 - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: "val/rec_loss" - embed_dim: 3 - lossconfig: - target: ldm.modules.losses.LPIPSWithDiscriminator - params: - disc_start: 50001 - kl_weight: 0.000001 - disc_weight: 0.5 - - ddconfig: - double_z: True - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1 - num_res_blocks: 2 - attn_resolutions: [ ] - dropout: 0.0 - - -data: - target: main.DataModuleFromConfig - params: - batch_size: 12 - wrap: True - train: - target: ldm.data.imagenet.ImageNetSRTrain - params: - size: 256 - degradation: pil_nearest - validation: - target: ldm.data.imagenet.ImageNetSRValidation - params: - size: 256 - degradation: pil_nearest - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 1000 - max_images: 8 - increase_log_steps: True - - trainer: - benchmark: True - accumulate_grad_batches: 2 diff --git a/configs/autoencoder/autoencoder_kl_8x8x64.yaml b/configs/autoencoder/autoencoder_kl_8x8x64.yaml deleted file mode 100644 index 5ccd09d38e..0000000000 --- a/configs/autoencoder/autoencoder_kl_8x8x64.yaml +++ /dev/null @@ -1,53 +0,0 @@ -model: - base_learning_rate: 4.5e-6 - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: "val/rec_loss" - embed_dim: 64 - lossconfig: - target: ldm.modules.losses.LPIPSWithDiscriminator - params: - disc_start: 50001 - kl_weight: 0.000001 - disc_weight: 0.5 - - ddconfig: - double_z: True - z_channels: 64 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: [ 1,1,2,2,4,4] # num_down = len(ch_mult)-1 - num_res_blocks: 2 - attn_resolutions: [16,8] - dropout: 0.0 - -data: - target: main.DataModuleFromConfig - params: - batch_size: 12 - wrap: True - train: - target: ldm.data.imagenet.ImageNetSRTrain - params: - size: 256 - degradation: pil_nearest - validation: - target: ldm.data.imagenet.ImageNetSRValidation - params: - size: 256 - degradation: pil_nearest - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 1000 - max_images: 8 - increase_log_steps: True - - trainer: - benchmark: True - accumulate_grad_batches: 2 diff --git a/configs/latent-diffusion/celebahq-ldm-vq-4.yaml b/configs/latent-diffusion/celebahq-ldm-vq-4.yaml deleted file mode 100644 index 89b3df4fe1..0000000000 --- a/configs/latent-diffusion/celebahq-ldm-vq-4.yaml +++ /dev/null @@ -1,86 +0,0 @@ -model: - base_learning_rate: 2.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - image_size: 64 - channels: 3 - monitor: val/loss_simple_ema - - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 224 - attention_resolutions: - # note: this isn\t actually the resolution but - # the downsampling factor, i.e. 
this corresnponds to - # attention on spatial resolution 8,16,32, as the - # spatial reolution of the latents is 64 for f4 - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - num_head_channels: 32 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - ckpt_path: models/first_stage_models/vq-f4/model.ckpt - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: __is_unconditional__ -data: - target: main.DataModuleFromConfig - params: - batch_size: 48 - num_workers: 5 - wrap: false - train: - target: taming.data.faceshq.CelebAHQTrain - params: - size: 256 - validation: - target: taming.data.faceshq.CelebAHQValidation - params: - size: 256 - - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 5000 - max_images: 8 - increase_log_steps: False - - trainer: - benchmark: True \ No newline at end of file diff --git a/configs/latent-diffusion/cin-ldm-vq-f8.yaml b/configs/latent-diffusion/cin-ldm-vq-f8.yaml deleted file mode 100644 index b8cd9e2ef5..0000000000 --- a/configs/latent-diffusion/cin-ldm-vq-f8.yaml +++ /dev/null @@ -1,98 +0,0 @@ -model: - base_learning_rate: 1.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: class_label - image_size: 32 - channels: 4 - cond_stage_trainable: true - conditioning_key: crossattn - monitor: val/loss_simple_ema - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 32 - in_channels: 4 - out_channels: 4 - model_channels: 256 - attention_resolutions: - #note: this isn\t actually the resolution but - # the downsampling factor, i.e. 
this corresnponds to - # attention on spatial resolution 8,16,32, as the - # spatial reolution of the latents is 32 for f8 - - 4 - - 2 - - 1 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 4 - num_head_channels: 32 - use_spatial_transformer: true - transformer_depth: 1 - context_dim: 512 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 4 - n_embed: 16384 - ckpt_path: configs/first_stage_models/vq-f8/model.yaml - ddconfig: - double_z: false - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: - - 32 - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: - target: ldm.modules.encoders.modules.ClassEmbedder - params: - embed_dim: 512 - key: class_label -data: - target: main.DataModuleFromConfig - params: - batch_size: 64 - num_workers: 12 - wrap: false - train: - target: ldm.data.imagenet.ImageNetTrain - params: - config: - size: 256 - validation: - target: ldm.data.imagenet.ImageNetValidation - params: - config: - size: 256 - - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 5000 - max_images: 8 - increase_log_steps: False - - trainer: - benchmark: True \ No newline at end of file diff --git a/configs/latent-diffusion/cin256-v2.yaml b/configs/latent-diffusion/cin256-v2.yaml deleted file mode 100644 index b7c1aa240c..0000000000 --- a/configs/latent-diffusion/cin256-v2.yaml +++ /dev/null @@ -1,68 +0,0 @@ -model: - base_learning_rate: 0.0001 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: class_label - image_size: 64 - channels: 3 - cond_stage_trainable: true - conditioning_key: crossattn - monitor: val/loss - use_ema: False - - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 192 - attention_resolutions: - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 5 - num_heads: 1 - use_spatial_transformer: true - transformer_depth: 1 - context_dim: 512 - - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - - cond_stage_config: - target: ldm.modules.encoders.modules.ClassEmbedder - params: - n_classes: 1001 - embed_dim: 512 - key: class_label diff --git a/configs/latent-diffusion/ffhq-ldm-vq-4.yaml b/configs/latent-diffusion/ffhq-ldm-vq-4.yaml deleted file mode 100644 index 1899e30f77..0000000000 --- a/configs/latent-diffusion/ffhq-ldm-vq-4.yaml +++ /dev/null @@ -1,85 +0,0 @@ -model: - base_learning_rate: 2.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - image_size: 64 - channels: 3 - monitor: val/loss_simple_ema - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 224 - attention_resolutions: - # note: this isn\t actually the resolution but - # the downsampling 
factor, i.e. this corresnponds to - # attention on spatial resolution 8,16,32, as the - # spatial reolution of the latents is 64 for f4 - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - num_head_channels: 32 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - ckpt_path: configs/first_stage_models/vq-f4/model.yaml - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: __is_unconditional__ -data: - target: main.DataModuleFromConfig - params: - batch_size: 42 - num_workers: 5 - wrap: false - train: - target: taming.data.faceshq.FFHQTrain - params: - size: 256 - validation: - target: taming.data.faceshq.FFHQValidation - params: - size: 256 - - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 5000 - max_images: 8 - increase_log_steps: False - - trainer: - benchmark: True \ No newline at end of file diff --git a/configs/latent-diffusion/lsun_bedrooms-ldm-vq-4.yaml b/configs/latent-diffusion/lsun_bedrooms-ldm-vq-4.yaml deleted file mode 100644 index c4ca66c16c..0000000000 --- a/configs/latent-diffusion/lsun_bedrooms-ldm-vq-4.yaml +++ /dev/null @@ -1,85 +0,0 @@ -model: - base_learning_rate: 2.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - image_size: 64 - channels: 3 - monitor: val/loss_simple_ema - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 224 - attention_resolutions: - # note: this isn\t actually the resolution but - # the downsampling factor, i.e. 
this corresnponds to - # attention on spatial resolution 8,16,32, as the - # spatial reolution of the latents is 64 for f4 - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - num_head_channels: 32 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - ckpt_path: configs/first_stage_models/vq-f4/model.yaml - embed_dim: 3 - n_embed: 8192 - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: __is_unconditional__ -data: - target: main.DataModuleFromConfig - params: - batch_size: 48 - num_workers: 5 - wrap: false - train: - target: ldm.data.lsun.LSUNBedroomsTrain - params: - size: 256 - validation: - target: ldm.data.lsun.LSUNBedroomsValidation - params: - size: 256 - - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 5000 - max_images: 8 - increase_log_steps: False - - trainer: - benchmark: True \ No newline at end of file diff --git a/configs/latent-diffusion/lsun_churches-ldm-kl-8.yaml b/configs/latent-diffusion/lsun_churches-ldm-kl-8.yaml deleted file mode 100644 index 18dc8c2d9c..0000000000 --- a/configs/latent-diffusion/lsun_churches-ldm-kl-8.yaml +++ /dev/null @@ -1,91 +0,0 @@ -model: - base_learning_rate: 5.0e-5 # set to target_lr by starting main.py with '--scale_lr False' - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0155 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - loss_type: l1 - first_stage_key: "image" - cond_stage_key: "image" - image_size: 32 - channels: 4 - cond_stage_trainable: False - concat_mode: False - scale_by_std: True - monitor: 'val/loss_simple_ema' - - scheduler_config: # 10000 warmup steps - target: ldm.lr_scheduler.LambdaLinearScheduler - params: - warm_up_steps: [10000] - cycle_lengths: [10000000000000] - f_start: [1.e-6] - f_max: [1.] - f_min: [ 1.] 
- - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 32 - in_channels: 4 - out_channels: 4 - model_channels: 192 - attention_resolutions: [ 1, 2, 4, 8 ] # 32, 16, 8, 4 - num_res_blocks: 2 - channel_mult: [ 1,2,2,4,4 ] # 32, 16, 8, 4, 2 - num_heads: 8 - use_scale_shift_norm: True - resblock_updown: True - - first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: "val/rec_loss" - ckpt_path: "models/first_stage_models/kl-f8/model.ckpt" - ddconfig: - double_z: True - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: [ 1,2,4,4 ] # num_down = len(ch_mult)-1 - num_res_blocks: 2 - attn_resolutions: [ ] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - - cond_stage_config: "__is_unconditional__" - -data: - target: main.DataModuleFromConfig - params: - batch_size: 96 - num_workers: 5 - wrap: False - train: - target: ldm.data.lsun.LSUNChurchesTrain - params: - size: 256 - validation: - target: ldm.data.lsun.LSUNChurchesValidation - params: - size: 256 - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 5000 - max_images: 8 - increase_log_steps: False - - - trainer: - benchmark: True \ No newline at end of file diff --git a/configs/latent-diffusion/txt2img-1p4B-eval.yaml b/configs/latent-diffusion/txt2img-1p4B-eval.yaml deleted file mode 100644 index 8e331cbfdf..0000000000 --- a/configs/latent-diffusion/txt2img-1p4B-eval.yaml +++ /dev/null @@ -1,71 +0,0 @@ -model: - base_learning_rate: 5.0e-05 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.00085 - linear_end: 0.012 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: caption - image_size: 32 - channels: 4 - cond_stage_trainable: true - conditioning_key: crossattn - monitor: val/loss_simple_ema - scale_factor: 0.18215 - use_ema: False - - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 32 - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: - - 4 - - 2 - - 1 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 4 - - 4 - num_heads: 8 - use_spatial_transformer: true - transformer_depth: 1 - context_dim: 1280 - use_checkpoint: true - legacy: False - - first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - - cond_stage_config: - target: ldm.modules.encoders.modules.BERTEmbedder - params: - n_embed: 1280 - n_layer: 32 diff --git a/configs/models.yaml b/configs/models.yaml index 162da38da2..aabe4b9ce2 100644 --- a/configs/models.yaml +++ b/configs/models.yaml @@ -1,29 +1,36 @@ # This file describes the alternative machine learning models -# available to the dream script. +# available to InvokeAI script. # # To add a new model, follow the examples below. Each # model requires a model config file, a weights file, # and the width and height of the images it # was trained on. 
stable-diffusion-1.4: - config: configs/stable-diffusion/v1-inference.yaml - weights: models/ldm/stable-diffusion-v1/model.ckpt -# vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt - description: Stable Diffusion inference model version 1.4 - width: 512 - height: 512 - default: true -inpainting-1.5: - description: runwayML tuned inpainting model v1.5 - weights: models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt - config: configs/stable-diffusion/v1-inpainting-inference.yaml -# vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt + config: ./configs/stable-diffusion/v1-inference.yaml + weights: ./models/ldm/stable-diffusion-v1/sd-v1-4.ckpt + vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt + description: The original Stable Diffusion version 1.4 weight file (4.27 GB) width: 512 height: 512 stable-diffusion-1.5: - config: configs/stable-diffusion/v1-inference.yaml - weights: models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt -# vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt - description: Stable Diffusion inference model version 1.5 + description: The newest Stable Diffusion version 1.5 weight file (4.27 GB) + weights: ./models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt + config: ./configs/stable-diffusion/v1-inference.yaml width: 512 height: 512 + vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt + default: true +inpainting-1.5: + description: RunwayML SD 1.5 model optimized for inpainting (4.27 GB) + weights: ./models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt + config: ./configs/stable-diffusion/v1-inpainting-inference.yaml + width: 512 + height: 512 + vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt +waifu-diffusion-1.3: + description: Stable Diffusion 1.4 fine tuned on anime-styled images (4.27) + weights: ./models/ldm/stable-diffusion-v1/model-epoch09-float32.ckpt + config: ./configs/stable-diffusion/v1-inference.yaml + width: 512 + height: 512 + vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt diff --git a/configs/retrieval-augmented-diffusion/768x768.yaml b/configs/retrieval-augmented-diffusion/768x768.yaml deleted file mode 100644 index b51b1d8373..0000000000 --- a/configs/retrieval-augmented-diffusion/768x768.yaml +++ /dev/null @@ -1,68 +0,0 @@ -model: - base_learning_rate: 0.0001 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.015 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: jpg - cond_stage_key: nix - image_size: 48 - channels: 16 - cond_stage_trainable: false - conditioning_key: crossattn - monitor: val/loss_simple_ema - scale_by_std: false - scale_factor: 0.22765929 - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 48 - in_channels: 16 - out_channels: 16 - model_channels: 448 - attention_resolutions: - - 4 - - 2 - - 1 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - use_scale_shift_norm: false - resblock_updown: false - num_head_channels: 32 - use_spatial_transformer: true - transformer_depth: 1 - context_dim: 768 - use_checkpoint: true - first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: val/rec_loss - embed_dim: 16 - ddconfig: - double_z: true - z_channels: 16 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 1 - - 2 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: - - 16 - dropout: 0.0 - 
lossconfig: - target: torch.nn.Identity - cond_stage_config: - target: torch.nn.Identity \ No newline at end of file diff --git a/docs/features/CLI.md b/docs/features/CLI.md index 8e7b5780f0..7ce06d1fea 100644 --- a/docs/features/CLI.md +++ b/docs/features/CLI.md @@ -385,7 +385,7 @@ automatically. Example:
-invoke> !import_model models/ldm/stable-diffusion-v1/ model-epoch08-float16.ckpt +invoke> !import_model models/ldm/stable-diffusion-v1/model-epoch08-float16.ckpt >> Model import in process. Please enter the values needed to configure this model: Name for this model: waifu-diffusion diff --git a/docs/installation/INSTALLING_MODELS.md b/docs/installation/INSTALLING_MODELS.md new file mode 100644 index 0000000000..b5d659b0d1 --- /dev/null +++ b/docs/installation/INSTALLING_MODELS.md @@ -0,0 +1,267 @@ +--- +title: Installing Models +--- + +# :octicons-paintbrush-16: Installing Models + +## Model Weight Files + +The model weight files ('*.ckpt') are the Stable Diffusion "secret +sauce". They are the product of training the AI on millions of +captioned images gathered from multiple sources. + +Originally there was only a single Stable Diffusion weights file, +which many people named `model.ckpt`. Now there are dozens or more +that have been "fine tuned" to provide particular styles, genres, or +other features. InvokeAI allows you to install and run multiple model +weight files and switch between them quickly in the command-line and +web interfaces. + +This manual will guide you through installing and configuring model +weight files. + +## Base Models + +InvokeAI comes with support for a good initial set of models listed in +the model configuration file `configs/models.yaml`. They are: + +| Model | Weight File | Description | DOWNLOAD FROM | +| ---------------------- | ----------------------------- |--------------------------------- | ----------------| +| stable-diffusion-1.5 | v1-5-pruned-emaonly.ckpt | Most recent version of base Stable Diffusion model| https://huggingface.co/runwayml/stable-diffusion-v1-5 | +| stable-diffusion-1.4 | sd-v1-4.ckpt | Previous version of base Stable Diffusion model | https://huggingface.co/CompVis/stable-diffusion-v-1-4-original | +| inpainting-1.5 | sd-v1-5-inpainting.ckpt | Stable Diffusion 1.5 model specialized for inpainting | https://huggingface.co/runwayml/stable-diffusion-inpainting | +| waifu-diffusion-1.3 | model-epoch09-float32.ckpt | Stable Diffusion 1.4 trained to produce anime images | https://huggingface.co/hakurei/waifu-diffusion-v1-3 | +|| vae-ft-mse-840000-ema-pruned.ckpt | A fine-tune add-on file that improves face generation | https://huggingface.co/stabilityai/sd-vae-ft-mse-original/ | + + +Note that these files are covered by an "Ethical AI" license which +forbids certain uses. You will need to create an account on the +Hugging Face website and accept the license terms before you can +access the files. + +The predefined configuration file for InvokeAI (located at +`configs/models.yaml`) provides entries for each of these weights +files. `stable-diffusion-1.5` is the default model used, and we +strongly recommend that you install this weights file if nothing else. + +## Community-Contributed Models + +There are too many to list here and more are being contributed every +day. Hugging Face maintains a [fast-growing +repository](https://huggingface.co/sd-concepts-library) of fine-tune +(".bin") models that can be imported into InvokeAI by passing the +`--embedding_path` option to the `invoke.py` command. + +[This page](https://rentry.org/sdmodels) hosts a large list of +official and unofficial Stable Diffusion models and where they can be +obtained. + +## Installation + +There are three ways to install weights files: + +1. During InvokeAI installation, the `preload_models.py` script can +download them for you. + +2.
You can use the command-line interface (CLI) to import, configure +and modify new model files. + +3. You can download the files manually and add the appropriate entries +to `models.yaml`. + +### Installation via `preload_models.py` + +This is the most automatic way. Run `scripts/preload_models.py` from +the console. It will ask you to select which models to download and +lead you through the steps of setting up a Hugging Face account if you +haven't done so already. + +To start, from within the InvokeAI directory run the command `python +scripts/preload_models.py` (Linux/MacOS) or `python +scripts\preload_models.py` (Windows): + +``` +Loading Python libraries... + +** INTRODUCTION ** +Welcome to InvokeAI. This script will help download the Stable Diffusion weight files +and other large models that are needed for text to image generation. At any point you may interrupt +this program and resume later. + +** WEIGHT SELECTION ** +Would you like to download the Stable Diffusion model weights now? [y] + +Choose the weight file(s) you wish to download. Before downloading you +will be given the option to view and change your selections. + +[1] stable-diffusion-1.5: + The newest Stable Diffusion version 1.5 weight file (4.27 GB) (recommended) + Download? [y] +[2] inpainting-1.5: + RunwayML SD 1.5 model optimized for inpainting (4.27 GB) (recommended) + Download? [y] +[3] stable-diffusion-1.4: + The original Stable Diffusion version 1.4 weight file (4.27 GB) + Download? [n] n +[4] waifu-diffusion-1.3: + Stable Diffusion 1.4 fine tuned on anime-styled images (4.27) + Download? [n] y +[5] ft-mse-improved-autoencoder-840000: + StabilityAI improved autoencoder fine-tuned for human faces (recommended; 335 MB) (recommended) + Download? [y] y +The following weight files will be downloaded: + [1] stable-diffusion-1.5* + [2] inpainting-1.5 + [4] waifu-diffusion-1.3 + [5] ft-mse-improved-autoencoder-840000 +*default +Ok to download? [y] +** LICENSE AGREEMENT FOR WEIGHT FILES ** + +1. To download the Stable Diffusion weight files you need to read and accept the + CreativeML Responsible AI license. If you have not already done so, please + create an account using the "Sign Up" button: + + https://huggingface.co + + You will need to verify your email address as part of the HuggingFace + registration process. + +2. After creating the account, login under your account and accept + the license terms located here: + + https://huggingface.co/CompVis/stable-diffusion-v-1-4-original + +Press <enter> when you are ready to continue: +... +``` + +When the script is complete, you will find the downloaded weights +files in `models/ldm/stable-diffusion-v1` and a matching configuration +file in `configs/models.yaml`. + +You can run the script again to add any models you didn't select the +first time. Note that as a safety measure the script will _never_ +remove a previously-installed weights file. You will have to do this +manually. + +### Installation via the CLI + +You can install a new model, including any of the community-supported +ones, via the command-line client's `!import_model` command. + +1. First download the desired model weights file and place it under `models/ldm/stable-diffusion-v1/`. + You may rename the weights file to something more memorable if you wish. Record the path of the + weights file (e.g. `models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt`) + +2. Launch the `invoke.py` CLI with `python scripts/invoke.py`. + +3. At the `invoke>` command-line, enter the command `!import_model <path to model>`.
+ For example: + + `invoke> !import_model models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt` + + (Hint - the CLI supports file path autocompletion. Type a bit of the path + name and hit <tab> in order to get a choice of possible completions.) + +4. Follow the wizard's instructions to complete installation as shown in the example + here: + +``` +invoke> !import_model models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt +>> Model import in process. Please enter the values needed to configure this model: + +Name for this model: arabian-nights +Description of this model: Arabian Nights Fine Tune v1.0 +Configuration file for this model: configs/stable-diffusion/v1-inference.yaml +Default image width: 512 +Default image height: 512 +>> New configuration: +arabian-nights: + config: configs/stable-diffusion/v1-inference.yaml + description: Arabian Nights Fine Tune v1.0 + height: 512 + weights: models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt + width: 512 +OK to import [n]? y +>> Caching model stable-diffusion-1.4 in system RAM +>> Loading arabian-nights from models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt + | LatentDiffusion: Running in eps-prediction mode + | DiffusionWrapper has 859.52 M params. + | Making attention of type 'vanilla' with 512 in_channels + | Working with z of shape (1, 4, 32, 32) = 4096 dimensions. + | Making attention of type 'vanilla' with 512 in_channels + | Using faster float16 precision + +``` + +If you've previously installed the fine-tune VAE file `vae-ft-mse-840000-ema-pruned.ckpt`, +the wizard will also ask you if you want to add this VAE to the model. + +The appropriate entry for this model will be added to `configs/models.yaml` and it will +be available to use in the CLI immediately. + +The CLI has additional commands for switching among, viewing, editing, and +deleting the available models. These are described in [Command Line +Client](../features/CLI.md#model-selection-and-importation), but the two most +frequently-used are `!models` and `!switch <model>`. The first +prints a table of models that InvokeAI knows about and their load +status. The second will load the requested model and lets you switch +back and forth quickly among loaded models. + +### Manual editing of `configs/models.yaml` + +If you are comfortable with a text editor then you may simply edit +`models.yaml` directly. + +First you need to download the desired .ckpt file and place it in +`models/ldm/stable-diffusion-v1` as described in step #1 in the +previous section. Record the path to the weights file, +e.g. `models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt` + +Then using a **text** editor (e.g. the Windows Notepad application), +open the file `configs/models.yaml`, and add a new stanza that follows +this example: + +``` +arabian-nights-1.0: + description: A great fine-tune in Arabian Nights style + weights: ./models/ldm/stable-diffusion-v1/arabian-nights-1.0.ckpt + config: ./configs/stable-diffusion/v1-inference.yaml + width: 512 + height: 512 + vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt + default: false +``` + +* arabian-nights-1.0 + - This is the name of the model that you will refer to from within the + CLI and the WebGUI when you need to load and use the model. + +* description + - Any description that you want to add to the model to remind you what + it is. + +* weights + - Relative path to the .ckpt weights file for this model. + +* config + - This is the confusingly-named configuration file for the model itself.
+ Use `./configs/stable-diffusion/v1-inference.yaml` unless the model happens + to need a custom configuration, in which case the place you downloaded it + from will tell you what to use instead. For example, the runwayML custom + inpainting model requires the file `configs/stable-diffusion/v1-inpainting-inference.yaml`. + This is already included in the InvokeAI distribution and is configured automatically + for you by the `preload_models.py` script. + +* vae + - If you want to add a VAE file to the model, then enter its path here. + +* width, height + - These are the width and height of the images used to train the model. + Currently they are always 512 and 512. + +Save the `models.yaml` file and relaunch InvokeAI. The new model should now be +available for your use. + + diff --git a/docs/installation/INSTALL_LINUX.md b/docs/installation/INSTALL_LINUX.md index 629175c3fa..174fdfde05 100644 --- a/docs/installation/INSTALL_LINUX.md +++ b/docs/installation/INSTALL_LINUX.md @@ -1,5 +1,5 @@ --- -title: Linux +title: Manual Installation, Linux --- # :fontawesome-brands-linux: Linux @@ -63,24 +63,16 @@ title: Linux model loading scheme to allow the script to work on GPU machines that are not internet connected. See [Preload Models](../features/OTHER.md#preload-models) -7. Now you need to install the weights for the stable diffusion model. +7. Install the weights for the stable diffusion model. - - For running with the released weights, you will first need to set up an acount - with [Hugging Face](https://huggingface.co). - - Use your credentials to log in, and then point your browser [here](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original). - - You may be asked to sign a license agreement at this point. - - Click on "Files and versions" near the top of the page, and then click on the - file named "sd-v1-4.ckpt". You'll be taken to a page that prompts you to click - the "download" link. Save the file somewhere safe on your local machine. +- Sign up at https://huggingface.co +- Go to the [Stable Diffusion model page](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original) +- Accept the terms and click Access Repository +- Download [v1-5-pruned-emaonly.ckpt (4.27 GB)](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt) +and move it into this directory under `models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt` - Now run the following commands from within the stable-diffusion directory. - This will create a symbolic link from the stable-diffusion model.ckpt file, to - the true location of the `sd-v1-4.ckpt` file. - - ```bash - (invokeai) ~/InvokeAI$ mkdir -p models/ldm/stable-diffusion-v1 - (invokeai) ~/InvokeAI$ ln -sf /path/to/sd-v1-4.ckpt models/ldm/stable-diffusion-v1/model.ckpt - ``` +There are many other models that you can use. Please see [Installing Models](INSTALLING_MODELS.md) +for details. 8. Start generating images! diff --git a/docs/installation/INSTALL_MAC.md b/docs/installation/INSTALL_MAC.md index 06624eb5e8..a458eaf43c 100644 --- a/docs/installation/INSTALL_MAC.md +++ b/docs/installation/INSTALL_MAC.md @@ -1,5 +1,5 @@ --- -title: macOS +title: Manual Installation, macOS --- # :fontawesome-brands-apple: macOS @@ -24,9 +24,15 @@ First you need to download a large checkpoint file. 1. Sign up at https://huggingface.co 2. Go to the [Stable diffusion diffusion model page](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original) 3. Accept the terms and click Access Repository -4.
Download [sd-v1-4.ckpt (4.27 GB)](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/blob/main/sd-v1-4.ckpt) and note where you have saved it (probably the Downloads folder). You may want to move it somewhere else for longer term storage - SD needs this file to run. +4. Download [v1-5-pruned-emaonly.ckpt (4.27 GB)](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt) +and move it into this directory under `models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt` -While that is downloading, open Terminal and run the following commands one at a time, reading the comments and taking care to run the appropriate command for your Mac's architecture (Intel or M1). +There are many other models that you can try. Please see [Installing Models](INSTALLING_MODELS.md) +for details. + +While that is downloading, open Terminal and run the following +commands one at a time, reading the comments and taking care to run +the appropriate command for your Mac's architecture (Intel or M1). !!! todo "Homebrew" diff --git a/docs/installation/INSTALL_WINDOWS.md b/docs/installation/INSTALL_WINDOWS.md index c7dc9065ea..c3cd6465f4 100644 --- a/docs/installation/INSTALL_WINDOWS.md +++ b/docs/installation/INSTALL_WINDOWS.md @@ -1,5 +1,5 @@ --- -title: Windows +title: Manual Installation, Windows --- # :fontawesome-brands-windows: Windows @@ -83,23 +83,14 @@ in the wiki 8. Now you need to install the weights for the big stable diffusion model. - 1. For running with the released weights, you will first need to set up an acount with Hugging Face (https://huggingface.co). - 2. Use your credentials to log in, and then point your browser at https://huggingface.co/CompVis/stable-diffusion-v-1-4-original. - 3. You may be asked to sign a license agreement at this point. - 4. Click on "Files and versions" near the top of the page, and then click on the file named `sd-v1-4.ckpt`. You'll be taken to a page that - prompts you to click the "download" link. Now save the file somewhere safe on your local machine. - 5. The weight file is >4 GB in size, so - downloading may take a while. + - Sign up at https://huggingface.co + - Go to the [Stable Diffusion model page](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original) + - Accept the terms and click Access Repository + - Download [v1-5-pruned-emaonly.ckpt (4.27 GB)](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt) + and move it into this directory under `models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt` - Now run the following commands from **within the InvokeAI directory** to copy the weights file to the right place: - - ```batch - mkdir -p models\ldm\stable-diffusion-v1 - copy C:\path\to\sd-v1-4.ckpt models\ldm\stable-diffusion-v1\model.ckpt - ``` - - Please replace `C:\path\to\sd-v1.4.ckpt` with the correct path to wherever you stashed this file. If you prefer not to copy or move the .ckpt file, - you may instead create a shortcut to it from within `models\ldm\stable-diffusion-v1\`. + There are many other models that you can use. Please see [Installing Models](INSTALLING_MODELS.md) + for details. 9. Start generating images! diff --git a/ldm/invoke/model_cache.py b/ldm/invoke/model_cache.py index f972a9eb16..ff72ce951f 100644 --- a/ldm/invoke/model_cache.py +++ b/ldm/invoke/model_cache.py @@ -227,11 +227,14 @@ class ModelCache(object): print(' | Using more accurate float32 precision') # look and load a matching vae file.
Code borrowed from AUTOMATIC1111 modules/sd_models.py - if vae and os.path.exists(vae): - print(f' | Loading VAE weights from: {vae}') - vae_ckpt = torch.load(vae, map_location="cpu") - vae_dict = {k: v for k, v in vae_ckpt["state_dict"].items() if k[0:4] != "loss"} - model.first_stage_model.load_state_dict(vae_dict, strict=False) + if vae: + if os.path.exists(vae): + print(f' | Loading VAE weights from: {vae}') + vae_ckpt = torch.load(vae, map_location="cpu") + vae_dict = {k: v for k, v in vae_ckpt["state_dict"].items() if k[0:4] != "loss"} + model.first_stage_model.load_state_dict(vae_dict, strict=False) + else: + print(f' | VAE file {vae} not found. Skipping.') model.to(self.device) # model.to doesn't change the cond_stage_model.device used to move the tokenizer output, so set it here @@ -281,7 +284,7 @@ class ModelCache(object): Returns the preamble for the config file. ''' return '''# This file describes the alternative machine learning models -# available to the dream script. +# available to InvokeAI script. # # To add a new model, follow the examples below. Each # model requires a model config file, a weights file, diff --git a/models/first_stage_models/kl-f16/config.yaml b/models/first_stage_models/kl-f16/config.yaml deleted file mode 100644 index 661921cf75..0000000000 --- a/models/first_stage_models/kl-f16/config.yaml +++ /dev/null @@ -1,44 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: val/rec_loss - embed_dim: 16 - lossconfig: - target: ldm.modules.losses.LPIPSWithDiscriminator - params: - disc_start: 50001 - kl_weight: 1.0e-06 - disc_weight: 0.5 - ddconfig: - double_z: true - z_channels: 16 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 1 - - 2 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: - - 16 - dropout: 0.0 -data: - target: main.DataModuleFromConfig - params: - batch_size: 6 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - size: 384 - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - size: 384 - crop_size: 256 diff --git a/models/first_stage_models/kl-f32/config.yaml b/models/first_stage_models/kl-f32/config.yaml deleted file mode 100644 index 7b642b136a..0000000000 --- a/models/first_stage_models/kl-f32/config.yaml +++ /dev/null @@ -1,46 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: val/rec_loss - embed_dim: 64 - lossconfig: - target: ldm.modules.losses.LPIPSWithDiscriminator - params: - disc_start: 50001 - kl_weight: 1.0e-06 - disc_weight: 0.5 - ddconfig: - double_z: true - z_channels: 64 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 1 - - 2 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: - - 16 - - 8 - dropout: 0.0 -data: - target: main.DataModuleFromConfig - params: - batch_size: 6 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - size: 384 - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - size: 384 - crop_size: 256 diff --git a/models/first_stage_models/kl-f4/config.yaml b/models/first_stage_models/kl-f4/config.yaml deleted file mode 100644 index 85cfb3e94e..0000000000 --- a/models/first_stage_models/kl-f4/config.yaml +++ /dev/null @@ -1,41 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: val/rec_loss - embed_dim: 3 - lossconfig: - 
target: ldm.modules.losses.LPIPSWithDiscriminator - params: - disc_start: 50001 - kl_weight: 1.0e-06 - disc_weight: 0.5 - ddconfig: - double_z: true - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 -data: - target: main.DataModuleFromConfig - params: - batch_size: 10 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - size: 384 - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - size: 384 - crop_size: 256 diff --git a/models/first_stage_models/kl-f8/config.yaml b/models/first_stage_models/kl-f8/config.yaml deleted file mode 100644 index 921aa42533..0000000000 --- a/models/first_stage_models/kl-f8/config.yaml +++ /dev/null @@ -1,42 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: val/rec_loss - embed_dim: 4 - lossconfig: - target: ldm.modules.losses.LPIPSWithDiscriminator - params: - disc_start: 50001 - kl_weight: 1.0e-06 - disc_weight: 0.5 - ddconfig: - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 -data: - target: main.DataModuleFromConfig - params: - batch_size: 4 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - size: 384 - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - size: 384 - crop_size: 256 diff --git a/models/first_stage_models/vq-f16/config.yaml b/models/first_stage_models/vq-f16/config.yaml deleted file mode 100644 index 91c7454906..0000000000 --- a/models/first_stage_models/vq-f16/config.yaml +++ /dev/null @@ -1,49 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.VQModel - params: - embed_dim: 8 - n_embed: 16384 - ddconfig: - double_z: false - z_channels: 8 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 1 - - 2 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: - - 16 - dropout: 0.0 - lossconfig: - target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator - params: - disc_conditional: false - disc_in_channels: 3 - disc_start: 250001 - disc_weight: 0.75 - disc_num_layers: 2 - codebook_weight: 1.0 - -data: - target: main.DataModuleFromConfig - params: - batch_size: 14 - num_workers: 20 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - size: 384 - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - size: 384 - crop_size: 256 diff --git a/models/first_stage_models/vq-f4-noattn/config.yaml b/models/first_stage_models/vq-f4-noattn/config.yaml deleted file mode 100644 index f8e499fa2a..0000000000 --- a/models/first_stage_models/vq-f4-noattn/config.yaml +++ /dev/null @@ -1,46 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.VQModel - params: - embed_dim: 3 - n_embed: 8192 - monitor: val/rec_loss - - ddconfig: - attn_type: none - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator - params: - disc_conditional: false - disc_in_channels: 3 - disc_start: 11 - disc_weight: 0.75 - codebook_weight: 1.0 - -data: - target: main.DataModuleFromConfig - 
params: - batch_size: 8 - num_workers: 12 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - crop_size: 256 diff --git a/models/first_stage_models/vq-f4/config.yaml b/models/first_stage_models/vq-f4/config.yaml deleted file mode 100644 index 7d8cef3252..0000000000 --- a/models/first_stage_models/vq-f4/config.yaml +++ /dev/null @@ -1,45 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.VQModel - params: - embed_dim: 3 - n_embed: 8192 - monitor: val/rec_loss - - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator - params: - disc_conditional: false - disc_in_channels: 3 - disc_start: 0 - disc_weight: 0.75 - codebook_weight: 1.0 - -data: - target: main.DataModuleFromConfig - params: - batch_size: 8 - num_workers: 16 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - crop_size: 256 diff --git a/models/first_stage_models/vq-f8-n256/config.yaml b/models/first_stage_models/vq-f8-n256/config.yaml deleted file mode 100644 index 8519e13d61..0000000000 --- a/models/first_stage_models/vq-f8-n256/config.yaml +++ /dev/null @@ -1,48 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.VQModel - params: - embed_dim: 4 - n_embed: 256 - monitor: val/rec_loss - ddconfig: - double_z: false - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: - - 32 - dropout: 0.0 - lossconfig: - target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator - params: - disc_conditional: false - disc_in_channels: 3 - disc_start: 250001 - disc_weight: 0.75 - codebook_weight: 1.0 - -data: - target: main.DataModuleFromConfig - params: - batch_size: 10 - num_workers: 20 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - size: 384 - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - size: 384 - crop_size: 256 diff --git a/models/first_stage_models/vq-f8/config.yaml b/models/first_stage_models/vq-f8/config.yaml deleted file mode 100644 index efd6801ca9..0000000000 --- a/models/first_stage_models/vq-f8/config.yaml +++ /dev/null @@ -1,48 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.VQModel - params: - embed_dim: 4 - n_embed: 16384 - monitor: val/rec_loss - ddconfig: - double_z: false - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: - - 32 - dropout: 0.0 - lossconfig: - target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator - params: - disc_conditional: false - disc_in_channels: 3 - disc_num_layers: 2 - disc_start: 1 - disc_weight: 0.6 - codebook_weight: 1.0 -data: - target: main.DataModuleFromConfig - params: - batch_size: 10 - num_workers: 20 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - size: 384 - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - size: 384 - crop_size: 256 diff --git 
a/models/ldm/bsr_sr/config.yaml b/models/ldm/bsr_sr/config.yaml deleted file mode 100644 index 861692a8d1..0000000000 --- a/models/ldm/bsr_sr/config.yaml +++ /dev/null @@ -1,80 +0,0 @@ -model: - base_learning_rate: 1.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0155 - log_every_t: 100 - timesteps: 1000 - loss_type: l2 - first_stage_key: image - cond_stage_key: LR_image - image_size: 64 - channels: 3 - concat_mode: true - cond_stage_trainable: false - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 6 - out_channels: 3 - model_channels: 160 - attention_resolutions: - - 16 - - 8 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 2 - - 4 - num_head_channels: 32 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - monitor: val/rec_loss - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: - target: torch.nn.Identity -data: - target: main.DataModuleFromConfig - params: - batch_size: 64 - wrap: false - num_workers: 12 - train: - target: ldm.data.openimages.SuperresOpenImagesAdvancedTrain - params: - size: 256 - degradation: bsrgan_light - downscale_f: 4 - min_crop_f: 0.5 - max_crop_f: 1.0 - random_crop: true - validation: - target: ldm.data.openimages.SuperresOpenImagesAdvancedValidation - params: - size: 256 - degradation: bsrgan_light - downscale_f: 4 - min_crop_f: 0.5 - max_crop_f: 1.0 - random_crop: true diff --git a/models/ldm/celeba256/config.yaml b/models/ldm/celeba256/config.yaml deleted file mode 100644 index a12f4e9d39..0000000000 --- a/models/ldm/celeba256/config.yaml +++ /dev/null @@ -1,70 +0,0 @@ -model: - base_learning_rate: 2.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: class_label - image_size: 64 - channels: 3 - cond_stage_trainable: false - concat_mode: false - monitor: val/loss - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 224 - attention_resolutions: - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - num_head_channels: 32 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: __is_unconditional__ -data: - target: main.DataModuleFromConfig - params: - batch_size: 48 - num_workers: 5 - wrap: false - train: - target: ldm.data.faceshq.CelebAHQTrain - params: - size: 256 - validation: - target: ldm.data.faceshq.CelebAHQValidation - params: - size: 256 diff --git a/models/ldm/cin256/config.yaml b/models/ldm/cin256/config.yaml deleted file mode 100644 index 9bc1b4566a..0000000000 --- a/models/ldm/cin256/config.yaml +++ /dev/null @@ -1,80 +0,0 @@ -model: - base_learning_rate: 1.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - 
num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: class_label - image_size: 32 - channels: 4 - cond_stage_trainable: true - conditioning_key: crossattn - monitor: val/loss_simple_ema - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 32 - in_channels: 4 - out_channels: 4 - model_channels: 256 - attention_resolutions: - - 4 - - 2 - - 1 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 4 - num_head_channels: 32 - use_spatial_transformer: true - transformer_depth: 1 - context_dim: 512 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 4 - n_embed: 16384 - ddconfig: - double_z: false - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: - - 32 - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: - target: ldm.modules.encoders.modules.ClassEmbedder - params: - embed_dim: 512 - key: class_label -data: - target: main.DataModuleFromConfig - params: - batch_size: 64 - num_workers: 12 - wrap: false - train: - target: ldm.data.imagenet.ImageNetTrain - params: - config: - size: 256 - validation: - target: ldm.data.imagenet.ImageNetValidation - params: - config: - size: 256 diff --git a/models/ldm/ffhq256/config.yaml b/models/ldm/ffhq256/config.yaml deleted file mode 100644 index 0ddfd1b93e..0000000000 --- a/models/ldm/ffhq256/config.yaml +++ /dev/null @@ -1,70 +0,0 @@ -model: - base_learning_rate: 2.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: class_label - image_size: 64 - channels: 3 - cond_stage_trainable: false - concat_mode: false - monitor: val/loss - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 224 - attention_resolutions: - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - num_head_channels: 32 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: __is_unconditional__ -data: - target: main.DataModuleFromConfig - params: - batch_size: 42 - num_workers: 5 - wrap: false - train: - target: ldm.data.faceshq.FFHQTrain - params: - size: 256 - validation: - target: ldm.data.faceshq.FFHQValidation - params: - size: 256 diff --git a/models/ldm/inpainting_big/config.yaml b/models/ldm/inpainting_big/config.yaml deleted file mode 100644 index da5fd5ea50..0000000000 --- a/models/ldm/inpainting_big/config.yaml +++ /dev/null @@ -1,67 +0,0 @@ -model: - base_learning_rate: 1.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0205 - log_every_t: 100 - timesteps: 1000 - loss_type: l1 - first_stage_key: image - cond_stage_key: masked_image - image_size: 64 - channels: 3 - concat_mode: true - monitor: val/loss - scheduler_config: - target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler - params: - verbosity_interval: 0 - warm_up_steps: 1000 - max_decay_steps: 50000 - lr_start: 0.001 - lr_max: 
0.1 - lr_min: 0.0001 - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 7 - out_channels: 3 - model_channels: 256 - attention_resolutions: - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - num_heads: 8 - resblock_updown: true - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - monitor: val/rec_loss - ddconfig: - attn_type: none - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: ldm.modules.losses.contperceptual.DummyLoss - cond_stage_config: __is_first_stage__ diff --git a/models/ldm/layout2img-openimages256/config.yaml b/models/ldm/layout2img-openimages256/config.yaml deleted file mode 100644 index 9e1dc15fe2..0000000000 --- a/models/ldm/layout2img-openimages256/config.yaml +++ /dev/null @@ -1,81 +0,0 @@ -model: - base_learning_rate: 2.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0205 - log_every_t: 100 - timesteps: 1000 - loss_type: l1 - first_stage_key: image - cond_stage_key: coordinates_bbox - image_size: 64 - channels: 3 - conditioning_key: crossattn - cond_stage_trainable: true - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 128 - attention_resolutions: - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - num_head_channels: 32 - use_spatial_transformer: true - transformer_depth: 3 - context_dim: 512 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - monitor: val/rec_loss - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: - target: ldm.modules.encoders.modules.BERTEmbedder - params: - n_embed: 512 - n_layer: 16 - vocab_size: 8192 - max_seq_len: 92 - use_tokenizer: false - monitor: val/loss_simple_ema -data: - target: main.DataModuleFromConfig - params: - batch_size: 24 - wrap: false - num_workers: 10 - train: - target: ldm.data.openimages.OpenImagesBBoxTrain - params: - size: 256 - validation: - target: ldm.data.openimages.OpenImagesBBoxValidation - params: - size: 256 diff --git a/models/ldm/lsun_beds256/config.yaml b/models/ldm/lsun_beds256/config.yaml deleted file mode 100644 index 1a50c766a5..0000000000 --- a/models/ldm/lsun_beds256/config.yaml +++ /dev/null @@ -1,70 +0,0 @@ -model: - base_learning_rate: 2.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: class_label - image_size: 64 - channels: 3 - cond_stage_trainable: false - concat_mode: false - monitor: val/loss - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 224 - attention_resolutions: - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - num_head_channels: 32 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 
8192 - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: __is_unconditional__ -data: - target: main.DataModuleFromConfig - params: - batch_size: 48 - num_workers: 5 - wrap: false - train: - target: ldm.data.lsun.LSUNBedroomsTrain - params: - size: 256 - validation: - target: ldm.data.lsun.LSUNBedroomsValidation - params: - size: 256 diff --git a/models/ldm/lsun_churches256/config.yaml b/models/ldm/lsun_churches256/config.yaml deleted file mode 100644 index 424d0914c9..0000000000 --- a/models/ldm/lsun_churches256/config.yaml +++ /dev/null @@ -1,92 +0,0 @@ -model: - base_learning_rate: 5.0e-05 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0155 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - loss_type: l1 - first_stage_key: image - cond_stage_key: image - image_size: 32 - channels: 4 - cond_stage_trainable: false - concat_mode: false - scale_by_std: true - monitor: val/loss_simple_ema - scheduler_config: - target: ldm.lr_scheduler.LambdaLinearScheduler - params: - warm_up_steps: - - 10000 - cycle_lengths: - - 10000000000000 - f_start: - - 1.0e-06 - f_max: - - 1.0 - f_min: - - 1.0 - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 32 - in_channels: 4 - out_channels: 4 - model_channels: 192 - attention_resolutions: - - 1 - - 2 - - 4 - - 8 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 2 - - 4 - - 4 - num_heads: 8 - use_scale_shift_norm: true - resblock_updown: true - first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - - cond_stage_config: '__is_unconditional__' - -data: - target: main.DataModuleFromConfig - params: - batch_size: 96 - num_workers: 5 - wrap: false - train: - target: ldm.data.lsun.LSUNChurchesTrain - params: - size: 256 - validation: - target: ldm.data.lsun.LSUNChurchesValidation - params: - size: 256 diff --git a/models/ldm/semantic_synthesis256/config.yaml b/models/ldm/semantic_synthesis256/config.yaml deleted file mode 100644 index 1a721cfffa..0000000000 --- a/models/ldm/semantic_synthesis256/config.yaml +++ /dev/null @@ -1,59 +0,0 @@ -model: - base_learning_rate: 1.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0205 - log_every_t: 100 - timesteps: 1000 - loss_type: l1 - first_stage_key: image - cond_stage_key: segmentation - image_size: 64 - channels: 3 - concat_mode: true - cond_stage_trainable: true - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 6 - out_channels: 3 - model_channels: 128 - attention_resolutions: - - 32 - - 16 - - 8 - num_res_blocks: 2 - channel_mult: - - 1 - - 4 - - 8 - num_heads: 8 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: 
torch.nn.Identity - cond_stage_config: - target: ldm.modules.encoders.modules.SpatialRescaler - params: - n_stages: 2 - in_channels: 182 - out_channels: 3 diff --git a/models/ldm/semantic_synthesis512/config.yaml b/models/ldm/semantic_synthesis512/config.yaml deleted file mode 100644 index 8faded2eec..0000000000 --- a/models/ldm/semantic_synthesis512/config.yaml +++ /dev/null @@ -1,78 +0,0 @@ -model: - base_learning_rate: 1.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0205 - log_every_t: 100 - timesteps: 1000 - loss_type: l1 - first_stage_key: image - cond_stage_key: segmentation - image_size: 128 - channels: 3 - concat_mode: true - cond_stage_trainable: true - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 128 - in_channels: 6 - out_channels: 3 - model_channels: 128 - attention_resolutions: - - 32 - - 16 - - 8 - num_res_blocks: 2 - channel_mult: - - 1 - - 4 - - 8 - num_heads: 8 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - monitor: val/rec_loss - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: - target: ldm.modules.encoders.modules.SpatialRescaler - params: - n_stages: 2 - in_channels: 182 - out_channels: 3 -data: - target: main.DataModuleFromConfig - params: - batch_size: 8 - wrap: false - num_workers: 10 - train: - target: ldm.data.landscapes.RFWTrain - params: - size: 768 - crop_size: 512 - segmentation_to_float32: true - validation: - target: ldm.data.landscapes.RFWValidation - params: - size: 768 - crop_size: 512 - segmentation_to_float32: true diff --git a/models/ldm/stable-diffusion-v1/place-ckpt-files-here.txt b/models/ldm/stable-diffusion-v1/place-ckpt-files-here.txt new file mode 100644 index 0000000000..a174e54540 --- /dev/null +++ b/models/ldm/stable-diffusion-v1/place-ckpt-files-here.txt @@ -0,0 +1,2 @@ +See docs/features/INSTALLING_MODELS.md for how to populate this +directory with one or more Stable Diffusion model weight files. 
diff --git a/models/ldm/text2img256/config.yaml b/models/ldm/text2img256/config.yaml deleted file mode 100644 index 3f54a01515..0000000000 --- a/models/ldm/text2img256/config.yaml +++ /dev/null @@ -1,77 +0,0 @@ -model: - base_learning_rate: 2.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: caption - image_size: 64 - channels: 3 - cond_stage_trainable: true - conditioning_key: crossattn - monitor: val/loss_simple_ema - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 192 - attention_resolutions: - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 5 - num_head_channels: 32 - use_spatial_transformer: true - transformer_depth: 1 - context_dim: 640 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: - target: ldm.modules.encoders.modules.BERTEmbedder - params: - n_embed: 640 - n_layer: 32 -data: - target: main.DataModuleFromConfig - params: - batch_size: 28 - num_workers: 10 - wrap: false - train: - target: ldm.data.previews.pytorch_dataset.PreviewsTrain - params: - size: 256 - validation: - target: ldm.data.previews.pytorch_dataset.PreviewsValidation - params: - size: 256 diff --git a/scripts/preload_models.py b/scripts/preload_models.py index bf0a5ffb99..6a1d5741fd 100644 --- a/scripts/preload_models.py +++ b/scripts/preload_models.py @@ -3,20 +3,369 @@ # Before running stable-diffusion on an internet-isolated machine, # run this script from one with internet connectivity. The # two machines must share a common .cache directory. 
-from transformers import CLIPTokenizer, CLIPTextModel +# +# Coauthor: Kevin Turner http://github.com/keturn +# +print('Loading Python libraries...\n') +import argparse import clip -from transformers import BertTokenizerFast, AutoFeatureExtractor import sys import transformers import os import warnings import torch -import urllib.request import zipfile import traceback +import getpass +import requests +from urllib import request +from tqdm import tqdm +from omegaconf import OmegaConf +from pathlib import Path +from transformers import CLIPTokenizer, CLIPTextModel +from transformers import BertTokenizerFast, AutoFeatureExtractor +from huggingface_hub import hf_hub_download, HfFolder, hf_hub_url transformers.logging.set_verbosity_error() +#--------------------------globals-- +Model_dir = './models/ldm/stable-diffusion-v1/' +Config_file = './configs/models.yaml' +SD_Configs = './configs/stable-diffusion' +Datasets = { + 'stable-diffusion-1.5': { + 'description': 'The newest Stable Diffusion version 1.5 weight file (4.27 GB)', + 'repo_id': 'runwayml/stable-diffusion-v1-5', + 'config': 'v1-inference.yaml', + 'file': 'v1-5-pruned-emaonly.ckpt', + 'recommended': True, + 'width': 512, + 'height': 512, + }, + 'inpainting-1.5': { + 'description': 'RunwayML SD 1.5 model optimized for inpainting (4.27 GB)', + 'repo_id': 'runwayml/stable-diffusion-inpainting', + 'config': 'v1-inpainting-inference.yaml', + 'file': 'sd-v1-5-inpainting.ckpt', + 'recommended': True, + 'width': 512, + 'height': 512, + }, + 'stable-diffusion-1.4': { + 'description': 'The original Stable Diffusion version 1.4 weight file (4.27 GB)', + 'repo_id': 'CompVis/stable-diffusion-v-1-4-original', + 'config': 'v1-inference.yaml', + 'file': 'sd-v1-4.ckpt', + 'recommended': False, + 'width': 512, + 'height': 512, + }, + 'waifu-diffusion-1.3': { + 'description': 'Stable Diffusion 1.4 fine-tuned on anime-styled images (4.27 GB)', + 'repo_id': 'hakurei/waifu-diffusion-v1-3', + 'config': 'v1-inference.yaml', + 'file': 'model-epoch09-float32.ckpt', + 'recommended': False, + 'width': 512, + 'height': 512, + }, + 'ft-mse-improved-autoencoder-840000': { + 'description': 'StabilityAI improved autoencoder fine-tuned for human faces (recommended; 335 MB)', + 'repo_id': 'stabilityai/sd-vae-ft-mse-original', + 'config': 'VAE', + 'file': 'vae-ft-mse-840000-ema-pruned.ckpt', + 'recommended': True, + 'width': 512, + 'height': 512, + }, +} +Config_preamble = '''# This file describes the alternative machine learning models +# available to the InvokeAI script. +# +# To add a new model, follow the examples below. Each +# model requires a model config file, a weights file, +# and the width and height of the images it +# was trained on. +''' + +#--------------------------------------------- +def introduction(): + print( + '''Welcome to InvokeAI. This script will help download the Stable Diffusion weight files +and other large models that are needed for text to image generation. At any point you may interrupt +this program and resume later.\n''' + ) + +#-------------------------------------------- +def postscript(): + print( + '''You're all set! You may now launch InvokeAI using one of these two commands: +Web version: + + python scripts/invoke.py --web (connect to http://localhost:9090) + +Command-line version: + + python scripts/invoke.py + +Have fun!
+''' +) + +#--------------------------------------------- +def yes_or_no(prompt:str, default_yes=True): + default = "y" if default_yes else 'n' + response = input(f'{prompt} [{default}] ') or default + if default_yes: + return response[0] not in ('n','N') + else: + return response[0] in ('y','Y') + +#--------------------------------------------- +def user_wants_to_download_weights()->str: + ''' + Returns one of "skip", "recommended" or "customized" + ''' + print('''You can download and configure the weight files manually or let this +script do it for you. Manual installation is described at: + +https://github.com/invoke-ai/InvokeAI/blob/main/docs/installation/INSTALLING_MODELS.md + +You may download the recommended models (about 10GB total), select a customized set, or +completely skip this step. +''' + ) + selection = None + while selection is None: + choice = input('Download <r>ecommended models, <c>ustomize the list, or <s>kip this step? [r]: ') + if choice.startswith(('r','R')) or len(choice)==0: + selection = 'recommended' + elif choice.startswith(('c','C')): + selection = 'customized' + elif choice.startswith(('s','S')): + selection = 'skip' + return selection + +#--------------------------------------------- +def select_datasets(action:str): + done = False + while not done: + datasets = dict() + dflt = None # the first model selected will be the default; TODO let user change + counter = 1 + + if action == 'customized': + print(''' +Choose the weight file(s) you wish to download. Before downloading you +will be given the option to view and change your selections. +''' + ) + for ds in Datasets.keys(): + recommended = '(recommended)' if Datasets[ds]['recommended'] else '' + print(f'[{counter}] {ds}:\n {Datasets[ds]["description"]} {recommended}') + if yes_or_no(' Download?',default_yes=Datasets[ds]['recommended']): + datasets[ds]=counter + counter += 1 + else: + for ds in Datasets.keys(): + if Datasets[ds]['recommended']: + datasets[ds]=counter + counter += 1 + + print('The following weight files will be downloaded:') + for ds in datasets: + dflt = '*' if dflt is None else '' + print(f' [{datasets[ds]}] {ds}{dflt}') + print("*default") + ok_to_download = yes_or_no('Ok to download?') + if not ok_to_download: + if yes_or_no('Change your selection?'): + action = 'customized' + pass + else: + done = True + else: + done = True + return datasets if ok_to_download else None + + +#-------------------------------Authenticate against Hugging Face +def authenticate(): + print(''' +To download the Stable Diffusion weight files from the official Hugging Face +repository, you need to read and accept the CreativeML Responsible AI license. + +This involves a few easy steps. + +1. If you have not already done so, create an account on Hugging Face's web site + using the "Sign Up" button: + + https://huggingface.co/join + + You will need to verify your email address as part of the HuggingFace + registration process. + +2. Log into your Hugging Face account: + + https://huggingface.co/login + +3. Accept the license terms located here: + + https://huggingface.co/runwayml/stable-diffusion-v1-5 + + and here: + + https://huggingface.co/runwayml/stable-diffusion-inpainting + + (Yes, you have to accept two slightly different license agreements) +''' + ) + input('Press <enter> when you are ready to continue:') + access_token = HfFolder.get_token() + if access_token is None: + print(''' +4. Thank you! The last step is to enter your HuggingFace access token so that + this script is authorized to initiate the download.
Go to the access tokens + page of your Hugging Face account and create a token by clicking the + "New token" button: + + https://huggingface.co/settings/tokens + + (You can enter anything you like in the token creation field marked "Name". + "Role" should be "read"). + + Now copy the token to your clipboard and paste it here: ''' + ) + access_token = getpass.getpass() + HfFolder.save_token(access_token) + return access_token + +#--------------------------------------------- +# look for legacy model.ckpt in models directory and offer to +# normalize its name +def migrate_models_ckpt(): + if not os.path.exists(os.path.join(Model_dir,'model.ckpt')): + return + new_name = Datasets['stable-diffusion-1.4']['file'] + print('You seem to have the Stable Diffusion v1.4 "model.ckpt" already installed.') + rename = yes_or_no(f'Ok to rename it to "{new_name}" for future reference?') + if rename: + print(f'model.ckpt => {new_name}') + os.rename(os.path.join(Model_dir,'model.ckpt'),os.path.join(Model_dir,new_name)) + +#--------------------------------------------- +def download_weight_datasets(models:dict, access_token:str): + migrate_models_ckpt() + successful = dict() + for mod in models.keys(): + repo_id = Datasets[mod]['repo_id'] + filename = Datasets[mod]['file'] + success = download_with_resume( + repo_id=repo_id, + model_name=filename, + access_token=access_token + ) + if success: + successful[mod] = True + keys = ', '.join(successful.keys()) + print(f'Successfully installed {keys}') + return successful + +#--------------------------------------------- +def download_with_resume(repo_id:str, model_name:str, access_token:str)->bool: + + model_dest = os.path.join(Model_dir, model_name) + os.makedirs(os.path.dirname(model_dest), exist_ok=True) + url = hf_hub_url(repo_id, model_name) + + header = {"Authorization": f'Bearer {access_token}'} + open_mode = 'wb' + exist_size = 0 + + if os.path.exists(model_dest): + exist_size = os.path.getsize(model_dest) + header['Range'] = f'bytes={exist_size}-' + open_mode = 'ab' + + resp = requests.get(url, headers=header, stream=True) + total = int(resp.headers.get('content-length', 0)) + + if resp.status_code==416: # "range not satisfiable", which means nothing to return + print(f'* {model_name}: complete file found. Skipping.') + return True + elif exist_size > 0: + print(f'* {model_name}: partial file found.
Resuming...') + else: + print(f'* {model_name}: Downloading...') + + try: + if total < 2000: + print(f'* {model_name}: {resp.text}') + return False + + with open(model_dest, open_mode) as file, tqdm( + desc=model_name, + initial=exist_size, + total=total+exist_size, + unit='iB', + unit_scale=True, + unit_divisor=1000, + ) as bar: + for data in resp.iter_content(chunk_size=1024): + size = file.write(data) + bar.update(size) + except Exception as e: + print(f'An error occurred while downloading {model_name}: {str(e)}') + return False + return True + +#--------------------------------------------- +def update_config_file(successfully_downloaded:dict): + try: + yaml = new_config_file_contents(successfully_downloaded) + tmpfile = os.path.join(os.path.dirname(Config_file),'new_config.tmp') + with open(tmpfile, 'w') as outfile: + outfile.write(Config_preamble) + outfile.write(yaml) + os.rename(tmpfile,Config_file) + except Exception as e: + print(f'**Error creating config file {Config_file}: {str(e)} **') + return + print(f'Successfully created new configuration file {Config_file}') + + +#--------------------------------------------- +def new_config_file_contents(successfully_downloaded:dict)->str: + conf = OmegaConf.load(Config_file) + + # find the VAE file, if there is one + vae = None + default_selected = False + + for model in successfully_downloaded: + if Datasets[model]['config'] == 'VAE': + vae = Datasets[model]['file'] + + for model in successfully_downloaded: + if Datasets[model]['config'] == 'VAE': # skip VAE entries + continue + stanza = conf[model] if model in conf else { } + + stanza['description'] = Datasets[model]['description'] + stanza['weights'] = os.path.join(Model_dir,Datasets[model]['file']) + stanza['config'] =os.path.join(SD_Configs, Datasets[model]['config']) + stanza['width'] = Datasets[model]['width'] + stanza['height'] = Datasets[model]['height'] + stanza.pop('default',None) # this will be set later + if vae: + stanza['vae'] = os.path.join(Model_dir,vae) + # BUG - the first stanza is always the default. User should select. 
+ if not default_selected: + stanza['default'] = True + default_selected = True + conf[model] = stanza + return OmegaConf.to_yaml(conf) + #--------------------------------------------- # this will preload the Bert tokenizer fles def download_bert(): @@ -66,7 +415,6 @@ def download_gfpgan(): print(traceback.format_exc()) print('Loading models from GFPGAN') - import urllib.request for model in ( [ 'https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth', @@ -149,15 +497,46 @@ def download_safety_checker(): safety_feature_extractor = AutoFeatureExtractor.from_pretrained(safety_model_id) safety_checker = StableDiffusionSafetyChecker.from_pretrained(safety_model_id) print('...success') - + #------------------------------------- if __name__ == '__main__': - download_bert() - download_kornia() - download_clip() - download_gfpgan() - download_codeformer() - download_clipseg() - download_safety_checker() + parser = argparse.ArgumentParser(description='InvokeAI model downloader') + parser.add_argument('--interactive', + dest='interactive', + action=argparse.BooleanOptionalAction, + default=True, + help='run in interactive mode (default)') + opt = parser.parse_args() + + try: + if opt.interactive: + introduction() + print('** WEIGHT SELECTION **') + choice = user_wants_to_download_weights() + if choice != 'skip': + models = select_datasets(choice) + if models is None: + if yes_or_no('Quit?',default_yes=False): + sys.exit(0) + print('** LICENSE AGREEMENT FOR WEIGHT FILES **') + access_token = authenticate() + print('\n** DOWNLOADING WEIGHTS **') + successfully_downloaded = download_weight_datasets(models, access_token) + update_config_file(successfully_downloaded) + else: + print('\n** DOWNLOADING SUPPORT MODELS **') + download_bert() + download_kornia() + download_clip() + download_gfpgan() + download_codeformer() + download_clipseg() + download_safety_checker() + postscript() + except KeyboardInterrupt: + print('\nGoodbye! Come back soon.') + except Exception as e: + print(f'\nA problem occurred during download.\nThe error was: "{str(e)}"') +