diff --git a/README.md b/README.md index 5d733d8af3..4947c0d8f3 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,7 @@ To train, prepare a folder that contains images sized at 512x512 and execute the -t \ --actual_resume ./models/ldm/stable-diffusion-v1/model.ckpt \ -n my_cat \ - -gpus 0, \ + --gpus 0, \ --data_root D:/textual-inversion/my_cat \ --init_word 'cat' ~~~~ diff --git a/configs/stable-diffusion/v1-finetune_style.yaml b/configs/stable-diffusion/v1-finetune_style.yaml new file mode 100644 index 0000000000..1964d925e1 --- /dev/null +++ b/configs/stable-diffusion/v1-finetune_style.yaml @@ -0,0 +1,103 @@ +model: + base_learning_rate: 5.0e-03 + target: ldm.models.diffusion.ddpm.LatentDiffusion + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: image + cond_stage_key: caption + image_size: 64 + channels: 4 + cond_stage_trainable: true # Note: different from the one we trained before + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + embedding_reg_weight: 0.0 + + personalization_config: + target: ldm.modules.embedding_manager.EmbeddingManager + params: + placeholder_strings: ["*"] + initializer_words: ["painting"] + per_image_tokens: false + num_vectors_per_token: 1 + + unet_config: + target: ldm.modules.diffusionmodules.openaimodel.UNetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder + +data: + target: main.DataModuleFromConfig + params: + batch_size: 2 + num_workers: 16 + wrap: false + train: + target: ldm.data.personalized_style.PersonalizedBase + params: + size: 512 + set: train + per_image_tokens: false + repeats: 100 + validation: + target: ldm.data.personalized_style.PersonalizedBase + params: + size: 512 + set: val + per_image_tokens: false + repeats: 10 + +lightning: + callbacks: + image_logger: + target: main.ImageLogger + params: + batch_frequency: 500 + max_images: 8 + increase_log_steps: False + + trainer: + benchmark: True \ No newline at end of file