mirror of
https://github.com/invoke-ai/InvokeAI
synced 2024-08-30 20:32:17 +00:00
preload_models.py script downloads the weight files
- user can select which weight files to download using huggingface cache - user must log in to huggingface, generate an access token, and accept license terms the very first time this is run. After that, everything works automatically. - added placeholder for docs for installing models - also got rid of unused config files. hopefully they weren't needed for textual inversion, but I don't think so.
This commit is contained in:
parent
104466f5c0
commit
ef68a419f1
8
.gitignore
vendored
8
.gitignore
vendored
@ -199,7 +199,13 @@ checkpoints
|
||||
.scratch/
|
||||
.vscode/
|
||||
gfpgan/
|
||||
models/ldm/stable-diffusion-v1/model.sha256
|
||||
models/ldm/stable-diffusion-v1/*.sha256
|
||||
|
||||
# GFPGAN model files
|
||||
gfpgan/
|
||||
|
||||
# config file (will be created by installer)
|
||||
configs/models.yaml
|
||||
|
||||
# weights (will be created by installer)
|
||||
models/ldm/stable-diffusion-v1/*.ckpt
|
@ -1,54 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 4.5e-6
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
monitor: "val/rec_loss"
|
||||
embed_dim: 16
|
||||
lossconfig:
|
||||
target: ldm.modules.losses.LPIPSWithDiscriminator
|
||||
params:
|
||||
disc_start: 50001
|
||||
kl_weight: 0.000001
|
||||
disc_weight: 0.5
|
||||
|
||||
ddconfig:
|
||||
double_z: True
|
||||
z_channels: 16
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult: [ 1,1,2,2,4] # num_down = len(ch_mult)-1
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: [16]
|
||||
dropout: 0.0
|
||||
|
||||
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 12
|
||||
wrap: True
|
||||
train:
|
||||
target: ldm.data.imagenet.ImageNetSRTrain
|
||||
params:
|
||||
size: 256
|
||||
degradation: pil_nearest
|
||||
validation:
|
||||
target: ldm.data.imagenet.ImageNetSRValidation
|
||||
params:
|
||||
size: 256
|
||||
degradation: pil_nearest
|
||||
|
||||
lightning:
|
||||
callbacks:
|
||||
image_logger:
|
||||
target: main.ImageLogger
|
||||
params:
|
||||
batch_frequency: 1000
|
||||
max_images: 8
|
||||
increase_log_steps: True
|
||||
|
||||
trainer:
|
||||
benchmark: True
|
||||
accumulate_grad_batches: 2
|
@ -1,53 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 4.5e-6
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
monitor: "val/rec_loss"
|
||||
embed_dim: 4
|
||||
lossconfig:
|
||||
target: ldm.modules.losses.LPIPSWithDiscriminator
|
||||
params:
|
||||
disc_start: 50001
|
||||
kl_weight: 0.000001
|
||||
disc_weight: 0.5
|
||||
|
||||
ddconfig:
|
||||
double_z: True
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult: [ 1,2,4,4 ] # num_down = len(ch_mult)-1
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: [ ]
|
||||
dropout: 0.0
|
||||
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 12
|
||||
wrap: True
|
||||
train:
|
||||
target: ldm.data.imagenet.ImageNetSRTrain
|
||||
params:
|
||||
size: 256
|
||||
degradation: pil_nearest
|
||||
validation:
|
||||
target: ldm.data.imagenet.ImageNetSRValidation
|
||||
params:
|
||||
size: 256
|
||||
degradation: pil_nearest
|
||||
|
||||
lightning:
|
||||
callbacks:
|
||||
image_logger:
|
||||
target: main.ImageLogger
|
||||
params:
|
||||
batch_frequency: 1000
|
||||
max_images: 8
|
||||
increase_log_steps: True
|
||||
|
||||
trainer:
|
||||
benchmark: True
|
||||
accumulate_grad_batches: 2
|
@ -1,54 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 4.5e-6
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
monitor: "val/rec_loss"
|
||||
embed_dim: 3
|
||||
lossconfig:
|
||||
target: ldm.modules.losses.LPIPSWithDiscriminator
|
||||
params:
|
||||
disc_start: 50001
|
||||
kl_weight: 0.000001
|
||||
disc_weight: 0.5
|
||||
|
||||
ddconfig:
|
||||
double_z: True
|
||||
z_channels: 3
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: [ ]
|
||||
dropout: 0.0
|
||||
|
||||
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 12
|
||||
wrap: True
|
||||
train:
|
||||
target: ldm.data.imagenet.ImageNetSRTrain
|
||||
params:
|
||||
size: 256
|
||||
degradation: pil_nearest
|
||||
validation:
|
||||
target: ldm.data.imagenet.ImageNetSRValidation
|
||||
params:
|
||||
size: 256
|
||||
degradation: pil_nearest
|
||||
|
||||
lightning:
|
||||
callbacks:
|
||||
image_logger:
|
||||
target: main.ImageLogger
|
||||
params:
|
||||
batch_frequency: 1000
|
||||
max_images: 8
|
||||
increase_log_steps: True
|
||||
|
||||
trainer:
|
||||
benchmark: True
|
||||
accumulate_grad_batches: 2
|
@ -1,53 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 4.5e-6
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
monitor: "val/rec_loss"
|
||||
embed_dim: 64
|
||||
lossconfig:
|
||||
target: ldm.modules.losses.LPIPSWithDiscriminator
|
||||
params:
|
||||
disc_start: 50001
|
||||
kl_weight: 0.000001
|
||||
disc_weight: 0.5
|
||||
|
||||
ddconfig:
|
||||
double_z: True
|
||||
z_channels: 64
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult: [ 1,1,2,2,4,4] # num_down = len(ch_mult)-1
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: [16,8]
|
||||
dropout: 0.0
|
||||
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 12
|
||||
wrap: True
|
||||
train:
|
||||
target: ldm.data.imagenet.ImageNetSRTrain
|
||||
params:
|
||||
size: 256
|
||||
degradation: pil_nearest
|
||||
validation:
|
||||
target: ldm.data.imagenet.ImageNetSRValidation
|
||||
params:
|
||||
size: 256
|
||||
degradation: pil_nearest
|
||||
|
||||
lightning:
|
||||
callbacks:
|
||||
image_logger:
|
||||
target: main.ImageLogger
|
||||
params:
|
||||
batch_frequency: 1000
|
||||
max_images: 8
|
||||
increase_log_steps: True
|
||||
|
||||
trainer:
|
||||
benchmark: True
|
||||
accumulate_grad_batches: 2
|
@ -1,86 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 2.0e-06
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.0195
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: image
|
||||
image_size: 64
|
||||
channels: 3
|
||||
monitor: val/loss_simple_ema
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 64
|
||||
in_channels: 3
|
||||
out_channels: 3
|
||||
model_channels: 224
|
||||
attention_resolutions:
|
||||
# note: this isn\t actually the resolution but
|
||||
# the downsampling factor, i.e. this corresnponds to
|
||||
# attention on spatial resolution 8,16,32, as the
|
||||
# spatial reolution of the latents is 64 for f4
|
||||
- 8
|
||||
- 4
|
||||
- 2
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 3
|
||||
- 4
|
||||
num_head_channels: 32
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.VQModelInterface
|
||||
params:
|
||||
embed_dim: 3
|
||||
n_embed: 8192
|
||||
ckpt_path: models/first_stage_models/vq-f4/model.ckpt
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 3
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
cond_stage_config: __is_unconditional__
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 48
|
||||
num_workers: 5
|
||||
wrap: false
|
||||
train:
|
||||
target: taming.data.faceshq.CelebAHQTrain
|
||||
params:
|
||||
size: 256
|
||||
validation:
|
||||
target: taming.data.faceshq.CelebAHQValidation
|
||||
params:
|
||||
size: 256
|
||||
|
||||
|
||||
lightning:
|
||||
callbacks:
|
||||
image_logger:
|
||||
target: main.ImageLogger
|
||||
params:
|
||||
batch_frequency: 5000
|
||||
max_images: 8
|
||||
increase_log_steps: False
|
||||
|
||||
trainer:
|
||||
benchmark: True
|
@ -1,98 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 1.0e-06
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.0195
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: image
|
||||
cond_stage_key: class_label
|
||||
image_size: 32
|
||||
channels: 4
|
||||
cond_stage_trainable: true
|
||||
conditioning_key: crossattn
|
||||
monitor: val/loss_simple_ema
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 32
|
||||
in_channels: 4
|
||||
out_channels: 4
|
||||
model_channels: 256
|
||||
attention_resolutions:
|
||||
#note: this isn\t actually the resolution but
|
||||
# the downsampling factor, i.e. this corresnponds to
|
||||
# attention on spatial resolution 8,16,32, as the
|
||||
# spatial reolution of the latents is 32 for f8
|
||||
- 4
|
||||
- 2
|
||||
- 1
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_head_channels: 32
|
||||
use_spatial_transformer: true
|
||||
transformer_depth: 1
|
||||
context_dim: 512
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.VQModelInterface
|
||||
params:
|
||||
embed_dim: 4
|
||||
n_embed: 16384
|
||||
ckpt_path: configs/first_stage_models/vq-f8/model.yaml
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions:
|
||||
- 32
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.ClassEmbedder
|
||||
params:
|
||||
embed_dim: 512
|
||||
key: class_label
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 64
|
||||
num_workers: 12
|
||||
wrap: false
|
||||
train:
|
||||
target: ldm.data.imagenet.ImageNetTrain
|
||||
params:
|
||||
config:
|
||||
size: 256
|
||||
validation:
|
||||
target: ldm.data.imagenet.ImageNetValidation
|
||||
params:
|
||||
config:
|
||||
size: 256
|
||||
|
||||
|
||||
lightning:
|
||||
callbacks:
|
||||
image_logger:
|
||||
target: main.ImageLogger
|
||||
params:
|
||||
batch_frequency: 5000
|
||||
max_images: 8
|
||||
increase_log_steps: False
|
||||
|
||||
trainer:
|
||||
benchmark: True
|
@ -1,68 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 0.0001
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.0195
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: image
|
||||
cond_stage_key: class_label
|
||||
image_size: 64
|
||||
channels: 3
|
||||
cond_stage_trainable: true
|
||||
conditioning_key: crossattn
|
||||
monitor: val/loss
|
||||
use_ema: False
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 64
|
||||
in_channels: 3
|
||||
out_channels: 3
|
||||
model_channels: 192
|
||||
attention_resolutions:
|
||||
- 8
|
||||
- 4
|
||||
- 2
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 3
|
||||
- 5
|
||||
num_heads: 1
|
||||
use_spatial_transformer: true
|
||||
transformer_depth: 1
|
||||
context_dim: 512
|
||||
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.VQModelInterface
|
||||
params:
|
||||
embed_dim: 3
|
||||
n_embed: 8192
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 3
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.ClassEmbedder
|
||||
params:
|
||||
n_classes: 1001
|
||||
embed_dim: 512
|
||||
key: class_label
|
@ -1,85 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 2.0e-06
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.0195
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: image
|
||||
image_size: 64
|
||||
channels: 3
|
||||
monitor: val/loss_simple_ema
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 64
|
||||
in_channels: 3
|
||||
out_channels: 3
|
||||
model_channels: 224
|
||||
attention_resolutions:
|
||||
# note: this isn\t actually the resolution but
|
||||
# the downsampling factor, i.e. this corresnponds to
|
||||
# attention on spatial resolution 8,16,32, as the
|
||||
# spatial reolution of the latents is 64 for f4
|
||||
- 8
|
||||
- 4
|
||||
- 2
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 3
|
||||
- 4
|
||||
num_head_channels: 32
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.VQModelInterface
|
||||
params:
|
||||
embed_dim: 3
|
||||
n_embed: 8192
|
||||
ckpt_path: configs/first_stage_models/vq-f4/model.yaml
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 3
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
cond_stage_config: __is_unconditional__
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 42
|
||||
num_workers: 5
|
||||
wrap: false
|
||||
train:
|
||||
target: taming.data.faceshq.FFHQTrain
|
||||
params:
|
||||
size: 256
|
||||
validation:
|
||||
target: taming.data.faceshq.FFHQValidation
|
||||
params:
|
||||
size: 256
|
||||
|
||||
|
||||
lightning:
|
||||
callbacks:
|
||||
image_logger:
|
||||
target: main.ImageLogger
|
||||
params:
|
||||
batch_frequency: 5000
|
||||
max_images: 8
|
||||
increase_log_steps: False
|
||||
|
||||
trainer:
|
||||
benchmark: True
|
@ -1,85 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 2.0e-06
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.0195
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: image
|
||||
image_size: 64
|
||||
channels: 3
|
||||
monitor: val/loss_simple_ema
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 64
|
||||
in_channels: 3
|
||||
out_channels: 3
|
||||
model_channels: 224
|
||||
attention_resolutions:
|
||||
# note: this isn\t actually the resolution but
|
||||
# the downsampling factor, i.e. this corresnponds to
|
||||
# attention on spatial resolution 8,16,32, as the
|
||||
# spatial reolution of the latents is 64 for f4
|
||||
- 8
|
||||
- 4
|
||||
- 2
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 3
|
||||
- 4
|
||||
num_head_channels: 32
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.VQModelInterface
|
||||
params:
|
||||
ckpt_path: configs/first_stage_models/vq-f4/model.yaml
|
||||
embed_dim: 3
|
||||
n_embed: 8192
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 3
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
cond_stage_config: __is_unconditional__
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 48
|
||||
num_workers: 5
|
||||
wrap: false
|
||||
train:
|
||||
target: ldm.data.lsun.LSUNBedroomsTrain
|
||||
params:
|
||||
size: 256
|
||||
validation:
|
||||
target: ldm.data.lsun.LSUNBedroomsValidation
|
||||
params:
|
||||
size: 256
|
||||
|
||||
|
||||
lightning:
|
||||
callbacks:
|
||||
image_logger:
|
||||
target: main.ImageLogger
|
||||
params:
|
||||
batch_frequency: 5000
|
||||
max_images: 8
|
||||
increase_log_steps: False
|
||||
|
||||
trainer:
|
||||
benchmark: True
|
@ -1,91 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 5.0e-5 # set to target_lr by starting main.py with '--scale_lr False'
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.0155
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
loss_type: l1
|
||||
first_stage_key: "image"
|
||||
cond_stage_key: "image"
|
||||
image_size: 32
|
||||
channels: 4
|
||||
cond_stage_trainable: False
|
||||
concat_mode: False
|
||||
scale_by_std: True
|
||||
monitor: 'val/loss_simple_ema'
|
||||
|
||||
scheduler_config: # 10000 warmup steps
|
||||
target: ldm.lr_scheduler.LambdaLinearScheduler
|
||||
params:
|
||||
warm_up_steps: [10000]
|
||||
cycle_lengths: [10000000000000]
|
||||
f_start: [1.e-6]
|
||||
f_max: [1.]
|
||||
f_min: [ 1.]
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 32
|
||||
in_channels: 4
|
||||
out_channels: 4
|
||||
model_channels: 192
|
||||
attention_resolutions: [ 1, 2, 4, 8 ] # 32, 16, 8, 4
|
||||
num_res_blocks: 2
|
||||
channel_mult: [ 1,2,2,4,4 ] # 32, 16, 8, 4, 2
|
||||
num_heads: 8
|
||||
use_scale_shift_norm: True
|
||||
resblock_updown: True
|
||||
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
embed_dim: 4
|
||||
monitor: "val/rec_loss"
|
||||
ckpt_path: "models/first_stage_models/kl-f8/model.ckpt"
|
||||
ddconfig:
|
||||
double_z: True
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult: [ 1,2,4,4 ] # num_down = len(ch_mult)-1
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: [ ]
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
|
||||
cond_stage_config: "__is_unconditional__"
|
||||
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 96
|
||||
num_workers: 5
|
||||
wrap: False
|
||||
train:
|
||||
target: ldm.data.lsun.LSUNChurchesTrain
|
||||
params:
|
||||
size: 256
|
||||
validation:
|
||||
target: ldm.data.lsun.LSUNChurchesValidation
|
||||
params:
|
||||
size: 256
|
||||
|
||||
lightning:
|
||||
callbacks:
|
||||
image_logger:
|
||||
target: main.ImageLogger
|
||||
params:
|
||||
batch_frequency: 5000
|
||||
max_images: 8
|
||||
increase_log_steps: False
|
||||
|
||||
|
||||
trainer:
|
||||
benchmark: True
|
@ -1,71 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 5.0e-05
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.00085
|
||||
linear_end: 0.012
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: image
|
||||
cond_stage_key: caption
|
||||
image_size: 32
|
||||
channels: 4
|
||||
cond_stage_trainable: true
|
||||
conditioning_key: crossattn
|
||||
monitor: val/loss_simple_ema
|
||||
scale_factor: 0.18215
|
||||
use_ema: False
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 32
|
||||
in_channels: 4
|
||||
out_channels: 4
|
||||
model_channels: 320
|
||||
attention_resolutions:
|
||||
- 4
|
||||
- 2
|
||||
- 1
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 4
|
||||
num_heads: 8
|
||||
use_spatial_transformer: true
|
||||
transformer_depth: 1
|
||||
context_dim: 1280
|
||||
use_checkpoint: true
|
||||
legacy: False
|
||||
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
embed_dim: 4
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
double_z: true
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.BERTEmbedder
|
||||
params:
|
||||
n_embed: 1280
|
||||
n_layer: 32
|
@ -1,5 +1,5 @@
|
||||
# This file describes the alternative machine learning models
|
||||
# available to the dream script.
|
||||
# available to InvokeAI script.
|
||||
#
|
||||
# To add a new model, follow the examples below. Each
|
||||
# model requires a model config file, a weights file,
|
||||
@ -8,22 +8,29 @@
|
||||
stable-diffusion-1.4:
|
||||
config: configs/stable-diffusion/v1-inference.yaml
|
||||
weights: models/ldm/stable-diffusion-v1/model.ckpt
|
||||
# vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
|
||||
vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
|
||||
description: Stable Diffusion inference model version 1.4
|
||||
width: 512
|
||||
height: 512
|
||||
stable-diffusion-1.5:
|
||||
description: The newest Stable Diffusion version 1.5 weight file (4.27 GB)
|
||||
weights: ./models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt
|
||||
config: ./configs/stable-diffusion/v1-inference.yaml
|
||||
width: 512
|
||||
height: 512
|
||||
vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
|
||||
default: true
|
||||
inpainting-1.5:
|
||||
description: runwayML tuned inpainting model v1.5
|
||||
weights: models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt
|
||||
config: configs/stable-diffusion/v1-inpainting-inference.yaml
|
||||
# vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
|
||||
description: RunwayML SD 1.5 model optimized for inpainting (4.27 GB)
|
||||
weights: ./models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt
|
||||
config: ./configs/stable-diffusion/v1-inpainting-inference.yaml
|
||||
width: 512
|
||||
height: 512
|
||||
stable-diffusion-1.5:
|
||||
config: configs/stable-diffusion/v1-inference.yaml
|
||||
weights: models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt
|
||||
# vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
|
||||
description: Stable Diffusion inference model version 1.5
|
||||
vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
|
||||
waifu-diffusion-1.3:
|
||||
description: Stable Diffusion 1.4 fine tuned on anime-styled images (4.27)
|
||||
weights: ./models/ldm/stable-diffusion-v1/model-epoch09-float32.ckpt
|
||||
config: ./configs/stable-diffusion/v1-inference.yaml
|
||||
width: 512
|
||||
height: 512
|
||||
vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
|
||||
|
@ -1,68 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 0.0001
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.015
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: jpg
|
||||
cond_stage_key: nix
|
||||
image_size: 48
|
||||
channels: 16
|
||||
cond_stage_trainable: false
|
||||
conditioning_key: crossattn
|
||||
monitor: val/loss_simple_ema
|
||||
scale_by_std: false
|
||||
scale_factor: 0.22765929
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 48
|
||||
in_channels: 16
|
||||
out_channels: 16
|
||||
model_channels: 448
|
||||
attention_resolutions:
|
||||
- 4
|
||||
- 2
|
||||
- 1
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 3
|
||||
- 4
|
||||
use_scale_shift_norm: false
|
||||
resblock_updown: false
|
||||
num_head_channels: 32
|
||||
use_spatial_transformer: true
|
||||
transformer_depth: 1
|
||||
context_dim: 768
|
||||
use_checkpoint: true
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
monitor: val/rec_loss
|
||||
embed_dim: 16
|
||||
ddconfig:
|
||||
double_z: true
|
||||
z_channels: 16
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions:
|
||||
- 16
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
cond_stage_config:
|
||||
target: torch.nn.Identity
|
9
docs/features/INSTALLING_MODELS.md
Normal file
9
docs/features/INSTALLING_MODELS.md
Normal file
@ -0,0 +1,9 @@
|
||||
---
|
||||
title: Installing Models
|
||||
---
|
||||
|
||||
# :octicons-paintbrush-16: Installing Models
|
||||
|
||||
## TO COME
|
||||
|
||||
|
@ -281,7 +281,7 @@ class ModelCache(object):
|
||||
Returns the preamble for the config file.
|
||||
'''
|
||||
return '''# This file describes the alternative machine learning models
|
||||
# available to the dream script.
|
||||
# available to InvokeAI script.
|
||||
#
|
||||
# To add a new model, follow the examples below. Each
|
||||
# model requires a model config file, a weights file,
|
||||
|
@ -1,44 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 4.5e-06
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
monitor: val/rec_loss
|
||||
embed_dim: 16
|
||||
lossconfig:
|
||||
target: ldm.modules.losses.LPIPSWithDiscriminator
|
||||
params:
|
||||
disc_start: 50001
|
||||
kl_weight: 1.0e-06
|
||||
disc_weight: 0.5
|
||||
ddconfig:
|
||||
double_z: true
|
||||
z_channels: 16
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions:
|
||||
- 16
|
||||
dropout: 0.0
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 6
|
||||
wrap: true
|
||||
train:
|
||||
target: ldm.data.openimages.FullOpenImagesTrain
|
||||
params:
|
||||
size: 384
|
||||
crop_size: 256
|
||||
validation:
|
||||
target: ldm.data.openimages.FullOpenImagesValidation
|
||||
params:
|
||||
size: 384
|
||||
crop_size: 256
|
@ -1,46 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 4.5e-06
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
monitor: val/rec_loss
|
||||
embed_dim: 64
|
||||
lossconfig:
|
||||
target: ldm.modules.losses.LPIPSWithDiscriminator
|
||||
params:
|
||||
disc_start: 50001
|
||||
kl_weight: 1.0e-06
|
||||
disc_weight: 0.5
|
||||
ddconfig:
|
||||
double_z: true
|
||||
z_channels: 64
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 4
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions:
|
||||
- 16
|
||||
- 8
|
||||
dropout: 0.0
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 6
|
||||
wrap: true
|
||||
train:
|
||||
target: ldm.data.openimages.FullOpenImagesTrain
|
||||
params:
|
||||
size: 384
|
||||
crop_size: 256
|
||||
validation:
|
||||
target: ldm.data.openimages.FullOpenImagesValidation
|
||||
params:
|
||||
size: 384
|
||||
crop_size: 256
|
@ -1,41 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 4.5e-06
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
monitor: val/rec_loss
|
||||
embed_dim: 3
|
||||
lossconfig:
|
||||
target: ldm.modules.losses.LPIPSWithDiscriminator
|
||||
params:
|
||||
disc_start: 50001
|
||||
kl_weight: 1.0e-06
|
||||
disc_weight: 0.5
|
||||
ddconfig:
|
||||
double_z: true
|
||||
z_channels: 3
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 10
|
||||
wrap: true
|
||||
train:
|
||||
target: ldm.data.openimages.FullOpenImagesTrain
|
||||
params:
|
||||
size: 384
|
||||
crop_size: 256
|
||||
validation:
|
||||
target: ldm.data.openimages.FullOpenImagesValidation
|
||||
params:
|
||||
size: 384
|
||||
crop_size: 256
|
@ -1,42 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 4.5e-06
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
monitor: val/rec_loss
|
||||
embed_dim: 4
|
||||
lossconfig:
|
||||
target: ldm.modules.losses.LPIPSWithDiscriminator
|
||||
params:
|
||||
disc_start: 50001
|
||||
kl_weight: 1.0e-06
|
||||
disc_weight: 0.5
|
||||
ddconfig:
|
||||
double_z: true
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 4
|
||||
wrap: true
|
||||
train:
|
||||
target: ldm.data.openimages.FullOpenImagesTrain
|
||||
params:
|
||||
size: 384
|
||||
crop_size: 256
|
||||
validation:
|
||||
target: ldm.data.openimages.FullOpenImagesValidation
|
||||
params:
|
||||
size: 384
|
||||
crop_size: 256
|
@ -1,49 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 4.5e-06
|
||||
target: ldm.models.autoencoder.VQModel
|
||||
params:
|
||||
embed_dim: 8
|
||||
n_embed: 16384
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 8
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions:
|
||||
- 16
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
|
||||
params:
|
||||
disc_conditional: false
|
||||
disc_in_channels: 3
|
||||
disc_start: 250001
|
||||
disc_weight: 0.75
|
||||
disc_num_layers: 2
|
||||
codebook_weight: 1.0
|
||||
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 14
|
||||
num_workers: 20
|
||||
wrap: true
|
||||
train:
|
||||
target: ldm.data.openimages.FullOpenImagesTrain
|
||||
params:
|
||||
size: 384
|
||||
crop_size: 256
|
||||
validation:
|
||||
target: ldm.data.openimages.FullOpenImagesValidation
|
||||
params:
|
||||
size: 384
|
||||
crop_size: 256
|
@ -1,46 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 4.5e-06
|
||||
target: ldm.models.autoencoder.VQModel
|
||||
params:
|
||||
embed_dim: 3
|
||||
n_embed: 8192
|
||||
monitor: val/rec_loss
|
||||
|
||||
ddconfig:
|
||||
attn_type: none
|
||||
double_z: false
|
||||
z_channels: 3
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
|
||||
params:
|
||||
disc_conditional: false
|
||||
disc_in_channels: 3
|
||||
disc_start: 11
|
||||
disc_weight: 0.75
|
||||
codebook_weight: 1.0
|
||||
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 8
|
||||
num_workers: 12
|
||||
wrap: true
|
||||
train:
|
||||
target: ldm.data.openimages.FullOpenImagesTrain
|
||||
params:
|
||||
crop_size: 256
|
||||
validation:
|
||||
target: ldm.data.openimages.FullOpenImagesValidation
|
||||
params:
|
||||
crop_size: 256
|
@ -1,45 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 4.5e-06
|
||||
target: ldm.models.autoencoder.VQModel
|
||||
params:
|
||||
embed_dim: 3
|
||||
n_embed: 8192
|
||||
monitor: val/rec_loss
|
||||
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 3
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
|
||||
params:
|
||||
disc_conditional: false
|
||||
disc_in_channels: 3
|
||||
disc_start: 0
|
||||
disc_weight: 0.75
|
||||
codebook_weight: 1.0
|
||||
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 8
|
||||
num_workers: 16
|
||||
wrap: true
|
||||
train:
|
||||
target: ldm.data.openimages.FullOpenImagesTrain
|
||||
params:
|
||||
crop_size: 256
|
||||
validation:
|
||||
target: ldm.data.openimages.FullOpenImagesValidation
|
||||
params:
|
||||
crop_size: 256
|
@ -1,48 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 4.5e-06
|
||||
target: ldm.models.autoencoder.VQModel
|
||||
params:
|
||||
embed_dim: 4
|
||||
n_embed: 256
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions:
|
||||
- 32
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
|
||||
params:
|
||||
disc_conditional: false
|
||||
disc_in_channels: 3
|
||||
disc_start: 250001
|
||||
disc_weight: 0.75
|
||||
codebook_weight: 1.0
|
||||
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 10
|
||||
num_workers: 20
|
||||
wrap: true
|
||||
train:
|
||||
target: ldm.data.openimages.FullOpenImagesTrain
|
||||
params:
|
||||
size: 384
|
||||
crop_size: 256
|
||||
validation:
|
||||
target: ldm.data.openimages.FullOpenImagesValidation
|
||||
params:
|
||||
size: 384
|
||||
crop_size: 256
|
@ -1,48 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 4.5e-06
|
||||
target: ldm.models.autoencoder.VQModel
|
||||
params:
|
||||
embed_dim: 4
|
||||
n_embed: 16384
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions:
|
||||
- 32
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
|
||||
params:
|
||||
disc_conditional: false
|
||||
disc_in_channels: 3
|
||||
disc_num_layers: 2
|
||||
disc_start: 1
|
||||
disc_weight: 0.6
|
||||
codebook_weight: 1.0
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 10
|
||||
num_workers: 20
|
||||
wrap: true
|
||||
train:
|
||||
target: ldm.data.openimages.FullOpenImagesTrain
|
||||
params:
|
||||
size: 384
|
||||
crop_size: 256
|
||||
validation:
|
||||
target: ldm.data.openimages.FullOpenImagesValidation
|
||||
params:
|
||||
size: 384
|
||||
crop_size: 256
|
@ -1,80 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 1.0e-06
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.0155
|
||||
log_every_t: 100
|
||||
timesteps: 1000
|
||||
loss_type: l2
|
||||
first_stage_key: image
|
||||
cond_stage_key: LR_image
|
||||
image_size: 64
|
||||
channels: 3
|
||||
concat_mode: true
|
||||
cond_stage_trainable: false
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 64
|
||||
in_channels: 6
|
||||
out_channels: 3
|
||||
model_channels: 160
|
||||
attention_resolutions:
|
||||
- 16
|
||||
- 8
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 4
|
||||
num_head_channels: 32
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.VQModelInterface
|
||||
params:
|
||||
embed_dim: 3
|
||||
n_embed: 8192
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 3
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
cond_stage_config:
|
||||
target: torch.nn.Identity
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 64
|
||||
wrap: false
|
||||
num_workers: 12
|
||||
train:
|
||||
target: ldm.data.openimages.SuperresOpenImagesAdvancedTrain
|
||||
params:
|
||||
size: 256
|
||||
degradation: bsrgan_light
|
||||
downscale_f: 4
|
||||
min_crop_f: 0.5
|
||||
max_crop_f: 1.0
|
||||
random_crop: true
|
||||
validation:
|
||||
target: ldm.data.openimages.SuperresOpenImagesAdvancedValidation
|
||||
params:
|
||||
size: 256
|
||||
degradation: bsrgan_light
|
||||
downscale_f: 4
|
||||
min_crop_f: 0.5
|
||||
max_crop_f: 1.0
|
||||
random_crop: true
|
@ -1,70 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 2.0e-06
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.0195
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: image
|
||||
cond_stage_key: class_label
|
||||
image_size: 64
|
||||
channels: 3
|
||||
cond_stage_trainable: false
|
||||
concat_mode: false
|
||||
monitor: val/loss
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 64
|
||||
in_channels: 3
|
||||
out_channels: 3
|
||||
model_channels: 224
|
||||
attention_resolutions:
|
||||
- 8
|
||||
- 4
|
||||
- 2
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 3
|
||||
- 4
|
||||
num_head_channels: 32
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.VQModelInterface
|
||||
params:
|
||||
embed_dim: 3
|
||||
n_embed: 8192
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 3
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
cond_stage_config: __is_unconditional__
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 48
|
||||
num_workers: 5
|
||||
wrap: false
|
||||
train:
|
||||
target: ldm.data.faceshq.CelebAHQTrain
|
||||
params:
|
||||
size: 256
|
||||
validation:
|
||||
target: ldm.data.faceshq.CelebAHQValidation
|
||||
params:
|
||||
size: 256
|
@ -1,80 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 1.0e-06
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.0195
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: image
|
||||
cond_stage_key: class_label
|
||||
image_size: 32
|
||||
channels: 4
|
||||
cond_stage_trainable: true
|
||||
conditioning_key: crossattn
|
||||
monitor: val/loss_simple_ema
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 32
|
||||
in_channels: 4
|
||||
out_channels: 4
|
||||
model_channels: 256
|
||||
attention_resolutions:
|
||||
- 4
|
||||
- 2
|
||||
- 1
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_head_channels: 32
|
||||
use_spatial_transformer: true
|
||||
transformer_depth: 1
|
||||
context_dim: 512
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.VQModelInterface
|
||||
params:
|
||||
embed_dim: 4
|
||||
n_embed: 16384
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions:
|
||||
- 32
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.ClassEmbedder
|
||||
params:
|
||||
embed_dim: 512
|
||||
key: class_label
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 64
|
||||
num_workers: 12
|
||||
wrap: false
|
||||
train:
|
||||
target: ldm.data.imagenet.ImageNetTrain
|
||||
params:
|
||||
config:
|
||||
size: 256
|
||||
validation:
|
||||
target: ldm.data.imagenet.ImageNetValidation
|
||||
params:
|
||||
config:
|
||||
size: 256
|
@ -1,70 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 2.0e-06
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.0195
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: image
|
||||
cond_stage_key: class_label
|
||||
image_size: 64
|
||||
channels: 3
|
||||
cond_stage_trainable: false
|
||||
concat_mode: false
|
||||
monitor: val/loss
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 64
|
||||
in_channels: 3
|
||||
out_channels: 3
|
||||
model_channels: 224
|
||||
attention_resolutions:
|
||||
- 8
|
||||
- 4
|
||||
- 2
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 3
|
||||
- 4
|
||||
num_head_channels: 32
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.VQModelInterface
|
||||
params:
|
||||
embed_dim: 3
|
||||
n_embed: 8192
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 3
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
cond_stage_config: __is_unconditional__
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 42
|
||||
num_workers: 5
|
||||
wrap: false
|
||||
train:
|
||||
target: ldm.data.faceshq.FFHQTrain
|
||||
params:
|
||||
size: 256
|
||||
validation:
|
||||
target: ldm.data.faceshq.FFHQValidation
|
||||
params:
|
||||
size: 256
|
@ -1,67 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 1.0e-06
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.0205
|
||||
log_every_t: 100
|
||||
timesteps: 1000
|
||||
loss_type: l1
|
||||
first_stage_key: image
|
||||
cond_stage_key: masked_image
|
||||
image_size: 64
|
||||
channels: 3
|
||||
concat_mode: true
|
||||
monitor: val/loss
|
||||
scheduler_config:
|
||||
target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler
|
||||
params:
|
||||
verbosity_interval: 0
|
||||
warm_up_steps: 1000
|
||||
max_decay_steps: 50000
|
||||
lr_start: 0.001
|
||||
lr_max: 0.1
|
||||
lr_min: 0.0001
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 64
|
||||
in_channels: 7
|
||||
out_channels: 3
|
||||
model_channels: 256
|
||||
attention_resolutions:
|
||||
- 8
|
||||
- 4
|
||||
- 2
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 3
|
||||
- 4
|
||||
num_heads: 8
|
||||
resblock_updown: true
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.VQModelInterface
|
||||
params:
|
||||
embed_dim: 3
|
||||
n_embed: 8192
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
attn_type: none
|
||||
double_z: false
|
||||
z_channels: 3
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: ldm.modules.losses.contperceptual.DummyLoss
|
||||
cond_stage_config: __is_first_stage__
|
@ -1,81 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 2.0e-06
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.0205
|
||||
log_every_t: 100
|
||||
timesteps: 1000
|
||||
loss_type: l1
|
||||
first_stage_key: image
|
||||
cond_stage_key: coordinates_bbox
|
||||
image_size: 64
|
||||
channels: 3
|
||||
conditioning_key: crossattn
|
||||
cond_stage_trainable: true
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 64
|
||||
in_channels: 3
|
||||
out_channels: 3
|
||||
model_channels: 128
|
||||
attention_resolutions:
|
||||
- 8
|
||||
- 4
|
||||
- 2
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 3
|
||||
- 4
|
||||
num_head_channels: 32
|
||||
use_spatial_transformer: true
|
||||
transformer_depth: 3
|
||||
context_dim: 512
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.VQModelInterface
|
||||
params:
|
||||
embed_dim: 3
|
||||
n_embed: 8192
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 3
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.BERTEmbedder
|
||||
params:
|
||||
n_embed: 512
|
||||
n_layer: 16
|
||||
vocab_size: 8192
|
||||
max_seq_len: 92
|
||||
use_tokenizer: false
|
||||
monitor: val/loss_simple_ema
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 24
|
||||
wrap: false
|
||||
num_workers: 10
|
||||
train:
|
||||
target: ldm.data.openimages.OpenImagesBBoxTrain
|
||||
params:
|
||||
size: 256
|
||||
validation:
|
||||
target: ldm.data.openimages.OpenImagesBBoxValidation
|
||||
params:
|
||||
size: 256
|
@ -1,70 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 2.0e-06
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.0195
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: image
|
||||
cond_stage_key: class_label
|
||||
image_size: 64
|
||||
channels: 3
|
||||
cond_stage_trainable: false
|
||||
concat_mode: false
|
||||
monitor: val/loss
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 64
|
||||
in_channels: 3
|
||||
out_channels: 3
|
||||
model_channels: 224
|
||||
attention_resolutions:
|
||||
- 8
|
||||
- 4
|
||||
- 2
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 3
|
||||
- 4
|
||||
num_head_channels: 32
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.VQModelInterface
|
||||
params:
|
||||
embed_dim: 3
|
||||
n_embed: 8192
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 3
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
cond_stage_config: __is_unconditional__
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 48
|
||||
num_workers: 5
|
||||
wrap: false
|
||||
train:
|
||||
target: ldm.data.lsun.LSUNBedroomsTrain
|
||||
params:
|
||||
size: 256
|
||||
validation:
|
||||
target: ldm.data.lsun.LSUNBedroomsValidation
|
||||
params:
|
||||
size: 256
|
@ -1,92 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 5.0e-05
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.0155
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
loss_type: l1
|
||||
first_stage_key: image
|
||||
cond_stage_key: image
|
||||
image_size: 32
|
||||
channels: 4
|
||||
cond_stage_trainable: false
|
||||
concat_mode: false
|
||||
scale_by_std: true
|
||||
monitor: val/loss_simple_ema
|
||||
scheduler_config:
|
||||
target: ldm.lr_scheduler.LambdaLinearScheduler
|
||||
params:
|
||||
warm_up_steps:
|
||||
- 10000
|
||||
cycle_lengths:
|
||||
- 10000000000000
|
||||
f_start:
|
||||
- 1.0e-06
|
||||
f_max:
|
||||
- 1.0
|
||||
f_min:
|
||||
- 1.0
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 32
|
||||
in_channels: 4
|
||||
out_channels: 4
|
||||
model_channels: 192
|
||||
attention_resolutions:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 2
|
||||
- 4
|
||||
- 4
|
||||
num_heads: 8
|
||||
use_scale_shift_norm: true
|
||||
resblock_updown: true
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
embed_dim: 4
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
double_z: true
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
|
||||
cond_stage_config: '__is_unconditional__'
|
||||
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 96
|
||||
num_workers: 5
|
||||
wrap: false
|
||||
train:
|
||||
target: ldm.data.lsun.LSUNChurchesTrain
|
||||
params:
|
||||
size: 256
|
||||
validation:
|
||||
target: ldm.data.lsun.LSUNChurchesValidation
|
||||
params:
|
||||
size: 256
|
@ -1,59 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 1.0e-06
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.0205
|
||||
log_every_t: 100
|
||||
timesteps: 1000
|
||||
loss_type: l1
|
||||
first_stage_key: image
|
||||
cond_stage_key: segmentation
|
||||
image_size: 64
|
||||
channels: 3
|
||||
concat_mode: true
|
||||
cond_stage_trainable: true
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 64
|
||||
in_channels: 6
|
||||
out_channels: 3
|
||||
model_channels: 128
|
||||
attention_resolutions:
|
||||
- 32
|
||||
- 16
|
||||
- 8
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 4
|
||||
- 8
|
||||
num_heads: 8
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.VQModelInterface
|
||||
params:
|
||||
embed_dim: 3
|
||||
n_embed: 8192
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 3
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.SpatialRescaler
|
||||
params:
|
||||
n_stages: 2
|
||||
in_channels: 182
|
||||
out_channels: 3
|
@ -1,78 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 1.0e-06
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.0205
|
||||
log_every_t: 100
|
||||
timesteps: 1000
|
||||
loss_type: l1
|
||||
first_stage_key: image
|
||||
cond_stage_key: segmentation
|
||||
image_size: 128
|
||||
channels: 3
|
||||
concat_mode: true
|
||||
cond_stage_trainable: true
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 128
|
||||
in_channels: 6
|
||||
out_channels: 3
|
||||
model_channels: 128
|
||||
attention_resolutions:
|
||||
- 32
|
||||
- 16
|
||||
- 8
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 4
|
||||
- 8
|
||||
num_heads: 8
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.VQModelInterface
|
||||
params:
|
||||
embed_dim: 3
|
||||
n_embed: 8192
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 3
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.SpatialRescaler
|
||||
params:
|
||||
n_stages: 2
|
||||
in_channels: 182
|
||||
out_channels: 3
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 8
|
||||
wrap: false
|
||||
num_workers: 10
|
||||
train:
|
||||
target: ldm.data.landscapes.RFWTrain
|
||||
params:
|
||||
size: 768
|
||||
crop_size: 512
|
||||
segmentation_to_float32: true
|
||||
validation:
|
||||
target: ldm.data.landscapes.RFWValidation
|
||||
params:
|
||||
size: 768
|
||||
crop_size: 512
|
||||
segmentation_to_float32: true
|
@ -1,77 +0,0 @@
|
||||
model:
|
||||
base_learning_rate: 2.0e-06
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.0015
|
||||
linear_end: 0.0195
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: image
|
||||
cond_stage_key: caption
|
||||
image_size: 64
|
||||
channels: 3
|
||||
cond_stage_trainable: true
|
||||
conditioning_key: crossattn
|
||||
monitor: val/loss_simple_ema
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 64
|
||||
in_channels: 3
|
||||
out_channels: 3
|
||||
model_channels: 192
|
||||
attention_resolutions:
|
||||
- 8
|
||||
- 4
|
||||
- 2
|
||||
num_res_blocks: 2
|
||||
channel_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 3
|
||||
- 5
|
||||
num_head_channels: 32
|
||||
use_spatial_transformer: true
|
||||
transformer_depth: 1
|
||||
context_dim: 640
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.VQModelInterface
|
||||
params:
|
||||
embed_dim: 3
|
||||
n_embed: 8192
|
||||
ddconfig:
|
||||
double_z: false
|
||||
z_channels: 3
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.BERTEmbedder
|
||||
params:
|
||||
n_embed: 640
|
||||
n_layer: 32
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 28
|
||||
num_workers: 10
|
||||
wrap: false
|
||||
train:
|
||||
target: ldm.data.previews.pytorch_dataset.PreviewsTrain
|
||||
params:
|
||||
size: 256
|
||||
validation:
|
||||
target: ldm.data.previews.pytorch_dataset.PreviewsValidation
|
||||
params:
|
||||
size: 256
|
@ -3,9 +3,11 @@
|
||||
# Before running stable-diffusion on an internet-isolated machine,
|
||||
# run this script from one with internet connectivity. The
|
||||
# two machines must share a common .cache directory.
|
||||
from transformers import CLIPTokenizer, CLIPTextModel
|
||||
#
|
||||
# Coauthor: Kevin Turner http://github.com/keturn
|
||||
#
|
||||
print('Loading Python libraries...\n')
|
||||
import clip
|
||||
from transformers import BertTokenizerFast, AutoFeatureExtractor
|
||||
import sys
|
||||
import transformers
|
||||
import os
|
||||
@ -14,9 +16,247 @@ import torch
|
||||
import urllib.request
|
||||
import zipfile
|
||||
import traceback
|
||||
import getpass
|
||||
from omegaconf import OmegaConf
|
||||
from pathlib import Path
|
||||
from transformers import CLIPTokenizer, CLIPTextModel
|
||||
from transformers import BertTokenizerFast, AutoFeatureExtractor
|
||||
from huggingface_hub import hf_hub_download, HfFolder, hf_hub_url
|
||||
|
||||
transformers.logging.set_verbosity_error()
|
||||
|
||||
#--------------------------globals--
|
||||
Model_dir = './models/ldm/stable-diffusion-v1/'
|
||||
Config_file = './configs/models.yaml'
|
||||
SD_Configs = './configs/stable-diffusion'
|
||||
Datasets = {
|
||||
'stable-diffusion-1.5': {
|
||||
'description': 'The newest Stable Diffusion version 1.5 weight file (4.27 GB)',
|
||||
'repo_id': 'runwayml/stable-diffusion-v1-5',
|
||||
'config': 'v1-inference.yaml',
|
||||
'file': 'v1-5-pruned-emaonly.ckpt',
|
||||
'recommended': True,
|
||||
'width': 512,
|
||||
'height': 512,
|
||||
},
|
||||
'inpainting-1.5': {
|
||||
'description': 'RunwayML SD 1.5 model optimized for inpainting (4.27 GB)',
|
||||
'repo_id': 'runwayml/stable-diffusion-inpainting',
|
||||
'config': 'v1-inpainting-inference.yaml',
|
||||
'file': 'sd-v1-5-inpainting.ckpt',
|
||||
'recommended': True,
|
||||
'width': 512,
|
||||
'height': 512,
|
||||
},
|
||||
'stable-diffusion-1.4': {
|
||||
'description': 'The original Stable Diffusion version 1.4 weight file (4.27 GB)',
|
||||
'repo_id': 'CompVis/stable-diffusion-v-1-4-original',
|
||||
'config': 'v1-inference.yaml',
|
||||
'file': 'sd-v1-4.ckpt',
|
||||
'recommended': False,
|
||||
'width': 512,
|
||||
'height': 512,
|
||||
},
|
||||
'waifu-diffusion-1.3': {
|
||||
'description': 'Stable Diffusion 1.4 fine tuned on anime-styled images (4.27)',
|
||||
'repo_id': 'hakurei/waifu-diffusion-v1-3',
|
||||
'config': 'v1-inference.yaml',
|
||||
'file': 'model-epoch09-float32.ckpt',
|
||||
'recommended': False,
|
||||
'width': 512,
|
||||
'height': 512,
|
||||
},
|
||||
'ft-mse-improved-autoencoder-840000': {
|
||||
'description': 'StabilityAI improved autoencoder fine-tuned for human faces (recommended; 335 MB)',
|
||||
'repo_id': 'stabilityai/sd-vae-ft-mse-original',
|
||||
'config': 'VAE',
|
||||
'file': 'vae-ft-mse-840000-ema-pruned.ckpt',
|
||||
'recommended': True,
|
||||
'width': 512,
|
||||
'height': 512,
|
||||
},
|
||||
}
|
||||
Config_preamble = '''# This file describes the alternative machine learning models
|
||||
# available to InvokeAI script.
|
||||
#
|
||||
# To add a new model, follow the examples below. Each
|
||||
# model requires a model config file, a weights file,
|
||||
# and the width and height of the images it
|
||||
# was trained on.
|
||||
'''
|
||||
|
||||
#---------------------------------------------
|
||||
def introduction():
|
||||
print(
|
||||
'''Welcome to InvokeAI. This script will help download the Stable Diffusion weight files
|
||||
and other large models that are needed for text to image generation. At any point you may interrupt
|
||||
this program and resume later.\n'''
|
||||
)
|
||||
|
||||
#---------------------------------------------
|
||||
def yes_or_no(prompt:str, default_yes=True):
|
||||
default = "y" if default_yes else 'n'
|
||||
response = input(f'{prompt} [{default}] ') or default
|
||||
if default_yes:
|
||||
return response[0] not in ('n','N')
|
||||
else:
|
||||
return response[0] in ('y','Y')
|
||||
|
||||
#---------------------------------------------
|
||||
def user_wants_to_download_weights():
|
||||
return yes_or_no('Would you like to download the Stable Diffusion model weights now?')
|
||||
|
||||
#---------------------------------------------
|
||||
def select_datasets():
|
||||
done = False
|
||||
while not done:
|
||||
print('''
|
||||
Choose the weight file(s) you wish to download. Before downloading you
|
||||
will be given the option to view and change your selections.
|
||||
'''
|
||||
)
|
||||
datasets = dict()
|
||||
|
||||
counter = 1
|
||||
dflt = None # the first model selected will be the default; TODO let user change
|
||||
for ds in Datasets.keys():
|
||||
recommended = '(recommended)' if Datasets[ds]['recommended'] else ''
|
||||
print(f'[{counter}] {ds}:\n {Datasets[ds]["description"]} {recommended}')
|
||||
if yes_or_no(' Download?',default_yes=Datasets[ds]['recommended']):
|
||||
datasets[ds]=counter
|
||||
counter += 1
|
||||
|
||||
print('The following weight files will be downloaded:')
|
||||
for ds in datasets:
|
||||
dflt = '*' if dflt is None else ''
|
||||
print(f' [{datasets[ds]}] {ds}{dflt}')
|
||||
print("*default")
|
||||
ok_to_download = yes_or_no('Ok to download?')
|
||||
if not ok_to_download:
|
||||
if yes_or_no('Change your selection?'):
|
||||
pass
|
||||
else:
|
||||
done = True
|
||||
else:
|
||||
done = True
|
||||
return datasets if ok_to_download else None
|
||||
|
||||
#-------------------------------Authenticate against Hugging Face
|
||||
def authenticate():
|
||||
print('''
|
||||
To download the Stable Diffusion weight files you need to read and accept the
|
||||
CreativeML Responsible AI license. If you have not already done so, please
|
||||
create an account at https://huggingface.co. Then login under your account and
|
||||
read and accept the license available at https://huggingface.co/CompVis/stable-diffusion-v-1-4-original.
|
||||
'''
|
||||
)
|
||||
input('Press <enter> when you are ready to continue:')
|
||||
access_token = HfFolder.get_token()
|
||||
if access_token is None:
|
||||
print('''
|
||||
Thank you! Now you need to authenticate with your HuggingFace access token.
|
||||
Go to https://huggingface.co/settings/tokens and create a token. Copy it to the
|
||||
clipboard and paste it here: '''
|
||||
)
|
||||
access_token = getpass.getpass()
|
||||
HfFolder.save_token(access_token)
|
||||
return access_token
|
||||
|
||||
#---------------------------------------------
|
||||
# look for legacy model.ckpt in models directory and offer to
|
||||
# normalize its name
|
||||
def migrate_models_ckpt():
|
||||
if not os.path.exists(os.path.join(Model_dir,'model.ckpt')):
|
||||
return
|
||||
new_name = Datasets['stable-diffusion-1.4']['file']
|
||||
print('You seem to have the Stable Diffusion v4.1 "model.ckpt" already installed.')
|
||||
rename = yes_or_no(f'Ok to rename it to "{new_name}" for future reference?')
|
||||
if rename:
|
||||
print(f'model.ckpt => {new_name}')
|
||||
os.rename(os.path.join(Model_dir,'model.ckpt'),os.path.join(Model_dir,new_name))
|
||||
|
||||
#---------------------------------------------
|
||||
def download_weight_datasets(models:dict, access_token:str):
|
||||
migrate_models_ckpt()
|
||||
successful = dict()
|
||||
for mod in models.keys():
|
||||
repo_id = Datasets[mod]['repo_id']
|
||||
filename = Datasets[mod]['file']
|
||||
success = conditional_download(
|
||||
repo_id=repo_id,
|
||||
model_name=filename,
|
||||
access_token=access_token
|
||||
)
|
||||
if success:
|
||||
successful[mod] = True
|
||||
keys = ', '.join(successful.keys())
|
||||
print(f'Successfully installed {keys}')
|
||||
return successful
|
||||
|
||||
#---------------------------------------------
|
||||
def conditional_download(repo_id:str, model_name:str, access_token:str):
|
||||
model_dest = os.path.join(Model_dir, model_name)
|
||||
if os.path.exists(model_dest):
|
||||
print(f' * {model_name}: exists')
|
||||
return True
|
||||
os.makedirs(os.path.dirname(model_dest), exist_ok=True)
|
||||
|
||||
try:
|
||||
print(f' * {model_name}: downloading or retrieving from cache...')
|
||||
path = Path(hf_hub_download(repo_id, model_name, use_auth_token=access_token))
|
||||
path.resolve(strict=True).link_to(model_dest)
|
||||
except Exception as e:
|
||||
print(f'** Error downloading {model_name}: {str(e)} **')
|
||||
return False
|
||||
return True
|
||||
|
||||
#---------------------------------------------
|
||||
def update_config_file(successfully_downloaded:dict):
|
||||
try:
|
||||
yaml = new_config_file_contents(successfully_downloaded)
|
||||
tmpfile = os.path.join(os.path.dirname(Config_file),'new_config.tmp')
|
||||
with open(tmpfile, 'w') as outfile:
|
||||
outfile.write(Config_preamble)
|
||||
outfile.write(yaml)
|
||||
os.rename(tmpfile,Config_file)
|
||||
except Exception as e:
|
||||
print(f'**Error creating config file {Config_file}: {str(e)} **')
|
||||
return
|
||||
print(f'Successfully created new configuration file {Config_file}')
|
||||
|
||||
|
||||
#---------------------------------------------
|
||||
def new_config_file_contents(successfully_downloaded:dict)->str:
|
||||
conf = OmegaConf.load(Config_file)
|
||||
|
||||
# find the VAE file, if there is one
|
||||
vae = None
|
||||
default_selected = False
|
||||
|
||||
for model in successfully_downloaded:
|
||||
if Datasets[model]['config'] == 'VAE':
|
||||
vae = Datasets[model]['file']
|
||||
|
||||
for model in successfully_downloaded:
|
||||
if Datasets[model]['config'] == 'VAE': # skip VAE entries
|
||||
continue
|
||||
stanza = conf[model] if model in conf else { }
|
||||
|
||||
stanza['description'] = Datasets[model]['description']
|
||||
stanza['weights'] = os.path.join(Model_dir,Datasets[model]['file'])
|
||||
stanza['config'] =os.path.join(SD_Configs, Datasets[model]['config'])
|
||||
stanza['width'] = Datasets[model]['width']
|
||||
stanza['height'] = Datasets[model]['height']
|
||||
stanza.pop('default',None) # this will be set later
|
||||
if vae:
|
||||
stanza['vae'] = os.path.join(Model_dir,vae)
|
||||
# BUG - the first stanza is always the default. User should select.
|
||||
if not default_selected:
|
||||
stanza['default'] = True
|
||||
default_selected = True
|
||||
conf[model] = stanza
|
||||
return OmegaConf.to_yaml(conf)
|
||||
|
||||
#---------------------------------------------
|
||||
# this will preload the Bert tokenizer fles
|
||||
def download_bert():
|
||||
@ -66,7 +306,6 @@ def download_gfpgan():
|
||||
print(traceback.format_exc())
|
||||
|
||||
print('Loading models from GFPGAN')
|
||||
import urllib.request
|
||||
for model in (
|
||||
[
|
||||
'https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth',
|
||||
@ -152,6 +391,15 @@ def download_safety_checker():
|
||||
|
||||
#-------------------------------------
|
||||
if __name__ == '__main__':
|
||||
introduction()
|
||||
if user_wants_to_download_weights():
|
||||
models = select_datasets()
|
||||
if models is None:
|
||||
if yes_or_no('Quit?',default_yes=False):
|
||||
sys.exit(0)
|
||||
access_token = authenticate()
|
||||
successfully_downloaded = download_weight_datasets(models, access_token)
|
||||
update_config_file(successfully_downloaded)
|
||||
download_bert()
|
||||
download_kornia()
|
||||
download_clip()
|
||||
|
Loading…
Reference in New Issue
Block a user