Mirror of https://github.com/invoke-ai/InvokeAI (synced 2024-08-30 20:32:17 +00:00)
preload_models.py script downloads the weight files
- The user can select which weight files to download; files are stored via the Hugging Face cache.
- The user must log in to Hugging Face, generate an access token, and accept the license terms the very first time this is run. After that, everything works automatically.
- Added a placeholder for the docs on installing models.
- Also removed unused config files; hopefully they weren't needed for textual inversion, but I don't think they were.
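For readers unfamiliar with that flow, here is a minimal sketch of what such a preload step can look like, assuming the huggingface_hub package. The repository IDs, filenames, and the preload() helper are illustrative assumptions, not the actual preload_models.py.

```python
# Minimal sketch of the download flow described in the commit message above.
# NOT the actual preload_models.py: repo IDs, filenames, and helper names are
# illustrative assumptions. Run `huggingface-cli login` (and accept the model
# license on the Hugging Face website) once before the first run.
from pathlib import Path
from huggingface_hub import hf_hub_download

# model name -> (repo_id, filename); assumed values for illustration only
WEIGHTS = {
    "stable-diffusion-1.5": ("runwayml/stable-diffusion-v1-5", "v1-5-pruned-emaonly.ckpt"),
    "inpainting-1.5": ("runwayml/stable-diffusion-inpainting", "sd-v1-5-inpainting.ckpt"),
}

def preload(selected, dest=Path("models/ldm/stable-diffusion-v1")):
    """Download the selected weight files into the Hugging Face cache and
    link them where the models config expects to find them."""
    dest.mkdir(parents=True, exist_ok=True)
    for name in selected:
        repo_id, filename = WEIGHTS[name]
        cached = hf_hub_download(repo_id=repo_id, filename=filename)  # path inside the HF cache
        link = dest / filename
        if not link.exists():
            link.symlink_to(cached)

if __name__ == "__main__":
    preload(["stable-diffusion-1.5"])
```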
.gitignore (8 changed lines)

@@ -199,7 +199,13 @@ checkpoints
 .scratch/
 .vscode/
 gfpgan/
-models/ldm/stable-diffusion-v1/model.sha256
+models/ldm/stable-diffusion-v1/*.sha256

 # GFPGAN model files
 gfpgan/
+
+# config file (will be created by installer)
+configs/models.yaml
+
+# weights (will be created by installer)
+models/ldm/stable-diffusion-v1/*.ckpt
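The ignore-pattern change above (model.sha256 becomes *.sha256) implies one checksum sidecar per weights file. As a hedged illustration only (the sidecar naming and the write_sha256() helper are assumptions, not InvokeAI's actual code), such files could be produced like this:

```python
# Illustrative sketch: write a per-checkpoint .sha256 sidecar matching the new
# models/ldm/stable-diffusion-v1/*.sha256 ignore pattern. The sidecar naming
# convention and this helper are assumptions, not InvokeAI code.
import hashlib
from pathlib import Path

def write_sha256(ckpt: Path) -> Path:
    h = hashlib.sha256()
    with ckpt.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)
    sidecar = ckpt.with_suffix(".sha256")  # e.g. model.ckpt -> model.sha256 (assumed)
    sidecar.write_text(h.hexdigest() + "\n")
    return sidecar

# e.g. write_sha256(Path("models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt"))
```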
Deleted (unused training configs, per the commit message above): four AutoencoderKL first-stage training configs, with embed_dim/z_channels of 16, 4, 3, and 64 respectively (54, 53, 54, and 53 lines removed). All four use ldm.modules.losses.LPIPSWithDiscriminator, train on ldm.data.imagenet.ImageNetSRTrain/Validation at size 256 with pil_nearest degradation, log images via main.ImageLogger, and enable the trainer benchmark flag.
Deleted: seven unused LatentDiffusion training/eval configs (86, 98, 68, 85, 85, 91, and 71 lines removed) — unconditional CelebAHQ over a VQ-f4 first stage; class-conditional ImageNet over a VQ-f8 first stage with ldm.modules.encoders.modules.ClassEmbedder conditioning; a second class-conditional ImageNet config (n_classes 1001, use_ema false); unconditional FFHQ and LSUN bedrooms over VQ-f4 first stages; LSUN churches over a KL-f8 AutoencoderKL first stage; and a text-to-image eval config conditioned on captions via ldm.modules.encoders.modules.BERTEmbedder (context_dim 1280, scale_factor 0.18215).
Model registry config (presumably configs/models.yaml):

@@ -1,5 +1,5 @@
 # This file describes the alternative machine learning models
-# available to the dream script.
+# available to InvokeAI script.
 #
 # To add a new model, follow the examples below. Each
 # model requires a model config file, a weights file,
@@ -8,22 +8,29 @@
 stable-diffusion-1.4:
   config: configs/stable-diffusion/v1-inference.yaml
   weights: models/ldm/stable-diffusion-v1/model.ckpt
-  # vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+  vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
   description: Stable Diffusion inference model version 1.4
   width: 512
   height: 512
+stable-diffusion-1.5:
+  description: The newest Stable Diffusion version 1.5 weight file (4.27 GB)
+  weights: ./models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt
+  config: ./configs/stable-diffusion/v1-inference.yaml
+  width: 512
+  height: 512
+  vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
   default: true
 inpainting-1.5:
-  description: runwayML tuned inpainting model v1.5
-  weights: models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt
-  config: configs/stable-diffusion/v1-inpainting-inference.yaml
-  # vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+  description: RunwayML SD 1.5 model optimized for inpainting (4.27 GB)
+  weights: ./models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt
+  config: ./configs/stable-diffusion/v1-inpainting-inference.yaml
   width: 512
   height: 512
-stable-diffusion-1.5:
-  config: configs/stable-diffusion/v1-inference.yaml
-  weights: models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt
-  # vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
-  description: Stable Diffusion inference model version 1.5
+  vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+waifu-diffusion-1.3:
+  description: Stable Diffusion 1.4 fine tuned on anime-styled images (4.27)
+  weights: ./models/ldm/stable-diffusion-v1/model-epoch09-float32.ckpt
+  config: ./configs/stable-diffusion/v1-inference.yaml
+  width: 512
+  height: 512
+  vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
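The stanzas above are plain YAML, so a front end can resolve a model's config, weights, and optional VAE in a few lines. A minimal sketch, assuming PyYAML and a hypothetical resolve_model() helper (this is not InvokeAI's actual loader):

```python
# Minimal sketch (not InvokeAI's loader): look up a stanza from the registry
# shown above and return the paths needed to load that model.
import yaml

def resolve_model(registry_path, name=None):
    """Return the stanza for `name`, or the one marked `default: true`."""
    with open(registry_path) as f:
        registry = yaml.safe_load(f)
    if name is None:
        name = next(k for k, v in registry.items() if v.get("default"))
    stanza = registry[name]
    return {
        "name": name,
        "config": stanza["config"],
        "weights": stanza["weights"],
        "vae": stanza.get("vae"),  # optional per-model VAE override
        "size": (stanza.get("width", 512), stanza.get("height", 512)),
    }

# Example: resolve_model("configs/models.yaml") would return the
# stable-diffusion-1.5 entry, since that stanza now carries `default: true`.
```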
Deleted: a 68-line LatentDiffusion config with first_stage_key jpg and cond_stage_key nix (image_size 48, channels 16, scale_factor 0.22765929, model_channels 448, context_dim 768, AutoencoderKL first stage with embed_dim 16, torch.nn.Identity conditioning stage).
docs/features/INSTALLING_MODELS.md (new file, 9 lines):

---
title: Installing Models
---

# :octicons-paintbrush-16: Installing Models

## TO COME
ModelCache (model cache module):

@@ -281,7 +281,7 @@ class ModelCache(object):
         Returns the preamble for the config file.
         '''
         return '''# This file describes the alternative machine learning models
-# available to the dream script.
+# available to InvokeAI script.
 #
 # To add a new model, follow the examples below. Each
 # model requires a model config file, a weights file,
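Since configs/models.yaml is now gitignored and "will be created by installer", the preamble string above would be written out together with the model stanzas when that file is generated. A hedged sketch of that step (the write_models_yaml() function and its call shape are assumptions, not InvokeAI's actual installer code):

```python
# Sketch only: write a fresh models.yaml consisting of the comment preamble
# returned by ModelCache above, followed by the serialized model stanzas.
# write_models_yaml() and its arguments are assumptions for illustration.
import yaml

def write_models_yaml(path, preamble, stanzas):
    with open(path, "w") as f:
        f.write(preamble)  # leading comment block, e.g. the preamble shown above
        yaml.safe_dump(stanzas, f, sort_keys=False, default_flow_style=False)

# e.g. write_models_yaml("configs/models.yaml", preamble_text,
#                        {"stable-diffusion-1.5": {"weights": "...", "default": True}})
```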
Deleted: nine unused first-stage model configs trained on ldm.data.openimages.FullOpenImages — four AutoencoderKL configs with embed_dim 16, 64, 3, and 4 (44, 46, 41, and 42 lines removed), and five VQModel configs with embed_dim/n_embed of 8/16384, 3/8192 (attn_type none), 3/8192, 4/256, and 4/16384 (49, 46, 45, 48, and 48 lines removed), all using LPIPS-based discriminator losses.
@ -1,80 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 1.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0155
|
|
||||||
log_every_t: 100
|
|
||||||
timesteps: 1000
|
|
||||||
loss_type: l2
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: LR_image
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
concat_mode: true
|
|
||||||
cond_stage_trainable: false
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 6
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 160
|
|
||||||
attention_resolutions:
|
|
||||||
- 16
|
|
||||||
- 8
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
monitor: val/rec_loss
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 64
|
|
||||||
wrap: false
|
|
||||||
num_workers: 12
|
|
||||||
train:
|
|
||||||
target: ldm.data.openimages.SuperresOpenImagesAdvancedTrain
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
degradation: bsrgan_light
|
|
||||||
downscale_f: 4
|
|
||||||
min_crop_f: 0.5
|
|
||||||
max_crop_f: 1.0
|
|
||||||
random_crop: true
|
|
||||||
validation:
|
|
||||||
target: ldm.data.openimages.SuperresOpenImagesAdvancedValidation
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
degradation: bsrgan_light
|
|
||||||
downscale_f: 4
|
|
||||||
min_crop_f: 0.5
|
|
||||||
max_crop_f: 1.0
|
|
||||||
random_crop: true
|
|
@ -1,70 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 2.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0195
|
|
||||||
num_timesteps_cond: 1
|
|
||||||
log_every_t: 200
|
|
||||||
timesteps: 1000
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: class_label
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
cond_stage_trainable: false
|
|
||||||
concat_mode: false
|
|
||||||
monitor: val/loss
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 3
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 224
|
|
||||||
attention_resolutions:
|
|
||||||
- 8
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 3
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config: __is_unconditional__
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 48
|
|
||||||
num_workers: 5
|
|
||||||
wrap: false
|
|
||||||
train:
|
|
||||||
target: ldm.data.faceshq.CelebAHQTrain
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.faceshq.CelebAHQValidation
|
|
||||||
params:
|
|
||||||
size: 256
|
|
@ -1,80 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 1.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0195
|
|
||||||
num_timesteps_cond: 1
|
|
||||||
log_every_t: 200
|
|
||||||
timesteps: 1000
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: class_label
|
|
||||||
image_size: 32
|
|
||||||
channels: 4
|
|
||||||
cond_stage_trainable: true
|
|
||||||
conditioning_key: crossattn
|
|
||||||
monitor: val/loss_simple_ema
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 32
|
|
||||||
in_channels: 4
|
|
||||||
out_channels: 4
|
|
||||||
model_channels: 256
|
|
||||||
attention_resolutions:
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
- 1
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
use_spatial_transformer: true
|
|
||||||
transformer_depth: 1
|
|
||||||
context_dim: 512
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 4
|
|
||||||
n_embed: 16384
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 4
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions:
|
|
||||||
- 32
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config:
|
|
||||||
target: ldm.modules.encoders.modules.ClassEmbedder
|
|
||||||
params:
|
|
||||||
embed_dim: 512
|
|
||||||
key: class_label
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 64
|
|
||||||
num_workers: 12
|
|
||||||
wrap: false
|
|
||||||
train:
|
|
||||||
target: ldm.data.imagenet.ImageNetTrain
|
|
||||||
params:
|
|
||||||
config:
|
|
||||||
size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.imagenet.ImageNetValidation
|
|
||||||
params:
|
|
||||||
config:
|
|
||||||
size: 256
|
|
@ -1,70 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 2.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0195
|
|
||||||
num_timesteps_cond: 1
|
|
||||||
log_every_t: 200
|
|
||||||
timesteps: 1000
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: class_label
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
cond_stage_trainable: false
|
|
||||||
concat_mode: false
|
|
||||||
monitor: val/loss
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 3
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 224
|
|
||||||
attention_resolutions:
|
|
||||||
- 8
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 3
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config: __is_unconditional__
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 42
|
|
||||||
num_workers: 5
|
|
||||||
wrap: false
|
|
||||||
train:
|
|
||||||
target: ldm.data.faceshq.FFHQTrain
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.faceshq.FFHQValidation
|
|
||||||
params:
|
|
||||||
size: 256
|
|
@ -1,67 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 1.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0205
|
|
||||||
log_every_t: 100
|
|
||||||
timesteps: 1000
|
|
||||||
loss_type: l1
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: masked_image
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
concat_mode: true
|
|
||||||
monitor: val/loss
|
|
||||||
scheduler_config:
|
|
||||||
target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler
|
|
||||||
params:
|
|
||||||
verbosity_interval: 0
|
|
||||||
warm_up_steps: 1000
|
|
||||||
max_decay_steps: 50000
|
|
||||||
lr_start: 0.001
|
|
||||||
lr_max: 0.1
|
|
||||||
lr_min: 0.0001
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 7
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 256
|
|
||||||
attention_resolutions:
|
|
||||||
- 8
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 3
|
|
||||||
- 4
|
|
||||||
num_heads: 8
|
|
||||||
resblock_updown: true
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
monitor: val/rec_loss
|
|
||||||
ddconfig:
|
|
||||||
attn_type: none
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: ldm.modules.losses.contperceptual.DummyLoss
|
|
||||||
cond_stage_config: __is_first_stage__
|
|
@ -1,81 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 2.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0205
|
|
||||||
log_every_t: 100
|
|
||||||
timesteps: 1000
|
|
||||||
loss_type: l1
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: coordinates_bbox
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
conditioning_key: crossattn
|
|
||||||
cond_stage_trainable: true
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 3
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 128
|
|
||||||
attention_resolutions:
|
|
||||||
- 8
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 3
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
use_spatial_transformer: true
|
|
||||||
transformer_depth: 3
|
|
||||||
context_dim: 512
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
monitor: val/rec_loss
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config:
|
|
||||||
target: ldm.modules.encoders.modules.BERTEmbedder
|
|
||||||
params:
|
|
||||||
n_embed: 512
|
|
||||||
n_layer: 16
|
|
||||||
vocab_size: 8192
|
|
||||||
max_seq_len: 92
|
|
||||||
use_tokenizer: false
|
|
||||||
monitor: val/loss_simple_ema
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 24
|
|
||||||
wrap: false
|
|
||||||
num_workers: 10
|
|
||||||
train:
|
|
||||||
target: ldm.data.openimages.OpenImagesBBoxTrain
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.openimages.OpenImagesBBoxValidation
|
|
||||||
params:
|
|
||||||
size: 256
|
|
@ -1,70 +0,0 @@
|
|||||||
model:
|
|
||||||
base_learning_rate: 2.0e-06
|
|
||||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
|
||||||
params:
|
|
||||||
linear_start: 0.0015
|
|
||||||
linear_end: 0.0195
|
|
||||||
num_timesteps_cond: 1
|
|
||||||
log_every_t: 200
|
|
||||||
timesteps: 1000
|
|
||||||
first_stage_key: image
|
|
||||||
cond_stage_key: class_label
|
|
||||||
image_size: 64
|
|
||||||
channels: 3
|
|
||||||
cond_stage_trainable: false
|
|
||||||
concat_mode: false
|
|
||||||
monitor: val/loss
|
|
||||||
unet_config:
|
|
||||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
|
||||||
params:
|
|
||||||
image_size: 64
|
|
||||||
in_channels: 3
|
|
||||||
out_channels: 3
|
|
||||||
model_channels: 224
|
|
||||||
attention_resolutions:
|
|
||||||
- 8
|
|
||||||
- 4
|
|
||||||
- 2
|
|
||||||
num_res_blocks: 2
|
|
||||||
channel_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 3
|
|
||||||
- 4
|
|
||||||
num_head_channels: 32
|
|
||||||
first_stage_config:
|
|
||||||
target: ldm.models.autoencoder.VQModelInterface
|
|
||||||
params:
|
|
||||||
embed_dim: 3
|
|
||||||
n_embed: 8192
|
|
||||||
ddconfig:
|
|
||||||
double_z: false
|
|
||||||
z_channels: 3
|
|
||||||
resolution: 256
|
|
||||||
in_channels: 3
|
|
||||||
out_ch: 3
|
|
||||||
ch: 128
|
|
||||||
ch_mult:
|
|
||||||
- 1
|
|
||||||
- 2
|
|
||||||
- 4
|
|
||||||
num_res_blocks: 2
|
|
||||||
attn_resolutions: []
|
|
||||||
dropout: 0.0
|
|
||||||
lossconfig:
|
|
||||||
target: torch.nn.Identity
|
|
||||||
cond_stage_config: __is_unconditional__
|
|
||||||
data:
|
|
||||||
target: main.DataModuleFromConfig
|
|
||||||
params:
|
|
||||||
batch_size: 48
|
|
||||||
num_workers: 5
|
|
||||||
wrap: false
|
|
||||||
train:
|
|
||||||
target: ldm.data.lsun.LSUNBedroomsTrain
|
|
||||||
params:
|
|
||||||
size: 256
|
|
||||||
validation:
|
|
||||||
target: ldm.data.lsun.LSUNBedroomsValidation
|
|
||||||
params:
|
|
||||||
size: 256
|
|
@ -1,92 +0,0 @@
model:
  base_learning_rate: 5.0e-05
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.0015
    linear_end: 0.0155
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    loss_type: l1
    first_stage_key: image
    cond_stage_key: image
    image_size: 32
    channels: 4
    cond_stage_trainable: false
    concat_mode: false
    scale_by_std: true
    monitor: val/loss_simple_ema
    scheduler_config:
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps:
        - 10000
        cycle_lengths:
        - 10000000000000
        f_start:
        - 1.0e-06
        f_max:
        - 1.0
        f_min:
        - 1.0
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32
        in_channels: 4
        out_channels: 4
        model_channels: 192
        attention_resolutions:
        - 1
        - 2
        - 4
        - 8
        num_res_blocks: 2
        channel_mult:
        - 1
        - 2
        - 2
        - 4
        - 4
        num_heads: 8
        use_scale_shift_norm: true
        resblock_updown: true
    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config: '__is_unconditional__'

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 96
    num_workers: 5
    wrap: false
    train:
      target: ldm.data.lsun.LSUNChurchesTrain
      params:
        size: 256
    validation:
      target: ldm.data.lsun.LSUNChurchesValidation
      params:
        size: 256
@ -1,59 +0,0 @@
model:
  base_learning_rate: 1.0e-06
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.0015
    linear_end: 0.0205
    log_every_t: 100
    timesteps: 1000
    loss_type: l1
    first_stage_key: image
    cond_stage_key: segmentation
    image_size: 64
    channels: 3
    concat_mode: true
    cond_stage_trainable: true
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 64
        in_channels: 6
        out_channels: 3
        model_channels: 128
        attention_resolutions:
        - 32
        - 16
        - 8
        num_res_blocks: 2
        channel_mult:
        - 1
        - 4
        - 8
        num_heads: 8
    first_stage_config:
      target: ldm.models.autoencoder.VQModelInterface
      params:
        embed_dim: 3
        n_embed: 8192
        ddconfig:
          double_z: false
          z_channels: 3
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config:
      target: ldm.modules.encoders.modules.SpatialRescaler
      params:
        n_stages: 2
        in_channels: 182
        out_channels: 3
@ -1,78 +0,0 @@
model:
  base_learning_rate: 1.0e-06
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.0015
    linear_end: 0.0205
    log_every_t: 100
    timesteps: 1000
    loss_type: l1
    first_stage_key: image
    cond_stage_key: segmentation
    image_size: 128
    channels: 3
    concat_mode: true
    cond_stage_trainable: true
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 128
        in_channels: 6
        out_channels: 3
        model_channels: 128
        attention_resolutions:
        - 32
        - 16
        - 8
        num_res_blocks: 2
        channel_mult:
        - 1
        - 4
        - 8
        num_heads: 8
    first_stage_config:
      target: ldm.models.autoencoder.VQModelInterface
      params:
        embed_dim: 3
        n_embed: 8192
        monitor: val/rec_loss
        ddconfig:
          double_z: false
          z_channels: 3
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config:
      target: ldm.modules.encoders.modules.SpatialRescaler
      params:
        n_stages: 2
        in_channels: 182
        out_channels: 3
data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 8
    wrap: false
    num_workers: 10
    train:
      target: ldm.data.landscapes.RFWTrain
      params:
        size: 768
        crop_size: 512
        segmentation_to_float32: true
    validation:
      target: ldm.data.landscapes.RFWValidation
      params:
        size: 768
        crop_size: 512
        segmentation_to_float32: true
@ -1,77 +0,0 @@
model:
  base_learning_rate: 2.0e-06
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    image_size: 64
    channels: 3
    cond_stage_trainable: true
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 64
        in_channels: 3
        out_channels: 3
        model_channels: 192
        attention_resolutions:
        - 8
        - 4
        - 2
        num_res_blocks: 2
        channel_mult:
        - 1
        - 2
        - 3
        - 5
        num_head_channels: 32
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 640
    first_stage_config:
      target: ldm.models.autoencoder.VQModelInterface
      params:
        embed_dim: 3
        n_embed: 8192
        ddconfig:
          double_z: false
          z_channels: 3
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config:
      target: ldm.modules.encoders.modules.BERTEmbedder
      params:
        n_embed: 640
        n_layer: 32
data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 28
    num_workers: 10
    wrap: false
    train:
      target: ldm.data.previews.pytorch_dataset.PreviewsTrain
      params:
        size: 256
    validation:
      target: ldm.data.previews.pytorch_dataset.PreviewsValidation
      params:
        size: 256
@ -3,9 +3,11 @@
 # Before running stable-diffusion on an internet-isolated machine,
 # run this script from one with internet connectivity. The
 # two machines must share a common .cache directory.
-from transformers import CLIPTokenizer, CLIPTextModel
+#
+# Coauthor: Kevin Turner http://github.com/keturn
+#
+print('Loading Python libraries...\n')
 import clip
-from transformers import BertTokenizerFast, AutoFeatureExtractor
 import sys
 import transformers
 import os
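The header comments above describe preloading on a machine with connectivity and then sharing the Hugging Face cache with the internet-isolated box. As a rough illustration (not code from this commit), both machines simply need to resolve the same cache root, which is typically ~/.cache/huggingface and can be relocated with the HF_HOME environment variable before the script is launched:

    # Illustrative sketch only -- not part of preload_models.py.
    # Print the cache root this environment would use, so you can confirm
    # it points at the directory both machines share.
    import os
    print(os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface')))
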
@ -14,9 +16,247 @@ import torch
 import urllib.request
 import zipfile
 import traceback
+import getpass
+from omegaconf import OmegaConf
+from pathlib import Path
+from transformers import CLIPTokenizer, CLIPTextModel
+from transformers import BertTokenizerFast, AutoFeatureExtractor
+from huggingface_hub import hf_hub_download, HfFolder, hf_hub_url
+
 transformers.logging.set_verbosity_error()
+
+#--------------------------globals--
+Model_dir = './models/ldm/stable-diffusion-v1/'
+Config_file = './configs/models.yaml'
+SD_Configs = './configs/stable-diffusion'
+Datasets = {
+    'stable-diffusion-1.5': {
+        'description': 'The newest Stable Diffusion version 1.5 weight file (4.27 GB)',
+        'repo_id': 'runwayml/stable-diffusion-v1-5',
+        'config': 'v1-inference.yaml',
+        'file': 'v1-5-pruned-emaonly.ckpt',
+        'recommended': True,
+        'width': 512,
+        'height': 512,
+    },
+    'inpainting-1.5': {
+        'description': 'RunwayML SD 1.5 model optimized for inpainting (4.27 GB)',
+        'repo_id': 'runwayml/stable-diffusion-inpainting',
+        'config': 'v1-inpainting-inference.yaml',
+        'file': 'sd-v1-5-inpainting.ckpt',
+        'recommended': True,
+        'width': 512,
+        'height': 512,
+    },
+    'stable-diffusion-1.4': {
+        'description': 'The original Stable Diffusion version 1.4 weight file (4.27 GB)',
+        'repo_id': 'CompVis/stable-diffusion-v-1-4-original',
+        'config': 'v1-inference.yaml',
+        'file': 'sd-v1-4.ckpt',
+        'recommended': False,
+        'width': 512,
+        'height': 512,
+    },
+    'waifu-diffusion-1.3': {
+        'description': 'Stable Diffusion 1.4 fine tuned on anime-styled images (4.27 GB)',
+        'repo_id': 'hakurei/waifu-diffusion-v1-3',
+        'config': 'v1-inference.yaml',
+        'file': 'model-epoch09-float32.ckpt',
+        'recommended': False,
+        'width': 512,
+        'height': 512,
+    },
+    'ft-mse-improved-autoencoder-840000': {
+        'description': 'StabilityAI improved autoencoder fine-tuned for human faces (recommended; 335 MB)',
+        'repo_id': 'stabilityai/sd-vae-ft-mse-original',
+        'config': 'VAE',
+        'file': 'vae-ft-mse-840000-ema-pruned.ckpt',
+        'recommended': True,
+        'width': 512,
+        'height': 512,
+    },
+}
+Config_preamble = '''# This file describes the alternative machine learning models
+# available to InvokeAI script.
+#
+# To add a new model, follow the examples below. Each
+# model requires a model config file, a weights file,
+# and the width and height of the images it
+# was trained on.
+'''
+
+#---------------------------------------------
+def introduction():
+    print(
+        '''Welcome to InvokeAI. This script will help download the Stable Diffusion weight files
+and other large models that are needed for text to image generation. At any point you may interrupt
+this program and resume later.\n'''
+    )
+
+#---------------------------------------------
+def yes_or_no(prompt:str, default_yes=True):
+    default = "y" if default_yes else 'n'
+    response = input(f'{prompt} [{default}] ') or default
+    if default_yes:
+        return response[0] not in ('n','N')
+    else:
+        return response[0] in ('y','Y')
+
+#---------------------------------------------
+def user_wants_to_download_weights():
+    return yes_or_no('Would you like to download the Stable Diffusion model weights now?')
+
+#---------------------------------------------
+def select_datasets():
+    done = False
+    while not done:
+        print('''
+Choose the weight file(s) you wish to download. Before downloading you
+will be given the option to view and change your selections.
+'''
+        )
+        datasets = dict()
+
+        counter = 1
+        dflt = None   # the first model selected will be the default; TODO let user change
+        for ds in Datasets.keys():
+            recommended = '(recommended)' if Datasets[ds]['recommended'] else ''
+            print(f'[{counter}] {ds}:\n    {Datasets[ds]["description"]} {recommended}')
+            if yes_or_no('    Download?',default_yes=Datasets[ds]['recommended']):
+                datasets[ds]=counter
+            counter += 1
+
+        print('The following weight files will be downloaded:')
+        for ds in datasets:
+            dflt = '*' if dflt is None else ''
+            print(f'   [{datasets[ds]}] {ds}{dflt}')
+        print("*default")
+        ok_to_download = yes_or_no('Ok to download?')
+        if not ok_to_download:
+            if yes_or_no('Change your selection?'):
+                pass
+            else:
+                done = True
+        else:
+            done = True
+    return datasets if ok_to_download else None
+
+#-------------------------------Authenticate against Hugging Face
+def authenticate():
+    print('''
+To download the Stable Diffusion weight files you need to read and accept the
+CreativeML Responsible AI license. If you have not already done so, please
+create an account at https://huggingface.co. Then login under your account and
+read and accept the license available at https://huggingface.co/CompVis/stable-diffusion-v-1-4-original.
+'''
+    )
+    input('Press <enter> when you are ready to continue:')
+    access_token = HfFolder.get_token()
+    if access_token is None:
+        print('''
+Thank you! Now you need to authenticate with your HuggingFace access token.
+Go to https://huggingface.co/settings/tokens and create a token. Copy it to the
+clipboard and paste it here: '''
+        )
+        access_token = getpass.getpass()
+        HfFolder.save_token(access_token)
+    return access_token
+
+#---------------------------------------------
+# look for legacy model.ckpt in models directory and offer to
+# normalize its name
+def migrate_models_ckpt():
+    if not os.path.exists(os.path.join(Model_dir,'model.ckpt')):
+        return
+    new_name = Datasets['stable-diffusion-1.4']['file']
+    print('You seem to have the Stable Diffusion v1.4 "model.ckpt" already installed.')
+    rename = yes_or_no(f'Ok to rename it to "{new_name}" for future reference?')
+    if rename:
+        print(f'model.ckpt => {new_name}')
+        os.rename(os.path.join(Model_dir,'model.ckpt'),os.path.join(Model_dir,new_name))
+
+#---------------------------------------------
+def download_weight_datasets(models:dict, access_token:str):
+    migrate_models_ckpt()
+    successful = dict()
+    for mod in models.keys():
+        repo_id = Datasets[mod]['repo_id']
+        filename = Datasets[mod]['file']
+        success = conditional_download(
+            repo_id=repo_id,
+            model_name=filename,
+            access_token=access_token
+        )
+        if success:
+            successful[mod] = True
+    keys = ', '.join(successful.keys())
+    print(f'Successfully installed {keys}')
+    return successful
+
+#---------------------------------------------
+def conditional_download(repo_id:str, model_name:str, access_token:str):
+    model_dest = os.path.join(Model_dir, model_name)
+    if os.path.exists(model_dest):
+        print(f' * {model_name}: exists')
+        return True
+    os.makedirs(os.path.dirname(model_dest), exist_ok=True)
+
+    try:
+        print(f' * {model_name}: downloading or retrieving from cache...')
+        path = Path(hf_hub_download(repo_id, model_name, use_auth_token=access_token))
+        path.resolve(strict=True).link_to(model_dest)
+    except Exception as e:
+        print(f'** Error downloading {model_name}: {str(e)} **')
+        return False
+    return True
+
+#---------------------------------------------
+def update_config_file(successfully_downloaded:dict):
+    try:
+        yaml = new_config_file_contents(successfully_downloaded)
+        tmpfile = os.path.join(os.path.dirname(Config_file),'new_config.tmp')
+        with open(tmpfile, 'w') as outfile:
+            outfile.write(Config_preamble)
+            outfile.write(yaml)
+        os.rename(tmpfile,Config_file)
+    except Exception as e:
+        print(f'**Error creating config file {Config_file}: {str(e)} **')
+        return
+    print(f'Successfully created new configuration file {Config_file}')
+
+
+#---------------------------------------------
+def new_config_file_contents(successfully_downloaded:dict)->str:
+    conf = OmegaConf.load(Config_file)
+
+    # find the VAE file, if there is one
+    vae = None
+    default_selected = False
+
+    for model in successfully_downloaded:
+        if Datasets[model]['config'] == 'VAE':
+            vae = Datasets[model]['file']
+
+    for model in successfully_downloaded:
+        if Datasets[model]['config'] == 'VAE': # skip VAE entries
+            continue
+        stanza = conf[model] if model in conf else { }
+
+        stanza['description'] = Datasets[model]['description']
+        stanza['weights'] = os.path.join(Model_dir,Datasets[model]['file'])
+        stanza['config'] = os.path.join(SD_Configs, Datasets[model]['config'])
+        stanza['width'] = Datasets[model]['width']
+        stanza['height'] = Datasets[model]['height']
+        stanza.pop('default',None) # this will be set later
+        if vae:
+            stanza['vae'] = os.path.join(Model_dir,vae)
+        # BUG - the first stanza is always the default. User should select.
+        if not default_selected:
+            stanza['default'] = True
+            default_selected = True
+        conf[model] = stanza
+    return OmegaConf.to_yaml(conf)
+
 
 #---------------------------------------------
 # this will preload the Bert tokenizer files
 def download_bert():
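The Config_preamble above tells users that each entry in configs/models.yaml needs a config file, a weights file, and the training resolution. As a rough sketch only (not code from this commit), the stanza that new_config_file_contents() builds for the recommended 1.5 weights plus the downloaded VAE would look roughly like the dict below before OmegaConf renders it to YAML; the paths assume the default Model_dir and SD_Configs values:

    # Illustrative only -- approximates what new_config_file_contents() generates.
    stanza = {
        'description': 'The newest Stable Diffusion version 1.5 weight file (4.27 GB)',
        'weights': './models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt',
        'config': './configs/stable-diffusion/v1-inference.yaml',
        'width': 512,
        'height': 512,
        'vae': './models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt',
        'default': True,  # first downloaded model becomes the default (see the BUG note above)
    }
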
@ -66,7 +306,6 @@ def download_gfpgan():
         print(traceback.format_exc())
 
     print('Loading models from GFPGAN')
-    import urllib.request
     for model in (
         [
             'https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth',
@ -152,6 +391,15 @@ def download_safety_checker():
 
 #-------------------------------------
 if __name__ == '__main__':
+    introduction()
+    if user_wants_to_download_weights():
+        models = select_datasets()
+        if models is None:
+            if yes_or_no('Quit?',default_yes=False):
+                sys.exit(0)
+        access_token = authenticate()
+        successfully_downloaded = download_weight_datasets(models, access_token)
+        update_config_file(successfully_downloaded)
     download_bert()
     download_kornia()
     download_clip()
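The __main__ block above wires the flow together: introduce, select, authenticate, download, then write configs/models.yaml. As a hypothetical illustration of consuming the generated file (not code from this repository), OmegaConf can load it and pick out the stanza marked as default:

    # Hypothetical consumer sketch -- not part of this commit.
    from omegaconf import OmegaConf
    models = OmegaConf.load('./configs/models.yaml')
    default = next((name for name in models if models[name].get('default')), None)
    if default:
        print(f"default model: {default} -> {models[default]['weights']}")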