mirror of https://github.com/invoke-ai/InvokeAI
synced 2024-08-30 20:32:17 +00:00

Merge branch 'main' of https://github.com/BaristaLabs/stable-diffusion-dream into add-simple-variant-mechanism
This commit is contained in: commit c6b5e930dc
.gitignore (vendored, new file, 175 lines added)
@@ -0,0 +1,175 @@
# ignore default image save location and model symbolic link
outputs/
models/ldm/stable-diffusion-v1/model.ckpt

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# emacs autosave and recovery files
*~
.#*

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

src/
logs/
**/__pycache__/
outputs
LICENSE (28 changed lines)
@@ -1,9 +1,27 @@
-All rights reserved by the authors.
-You must not distribute the weights provided to you directly or indirectly without explicit consent of the authors.
-You must not distribute harmful, offensive, dehumanizing content or otherwise harmful representations of people or their environments, cultures, religions, etc. produced with the model weights or other generated content described in the "Misuse and Malicious Use" section in the model card.
-The model weights are provided for research purposes only.
+MIT License
+
+Copyright (c) 2022 Lincoln D. Stein (https://github.com/lstein)
+
+This software is derived from a fork of the source code available from https://github.com/pesser/stable-diffusion and https://github.com/CompViz/stable-diffusion. They carry the following copyrights:
+
+Copyright (c) 2022 Machine Vision and Learning Group, LMU Munich
+Copyright (c) 2022 Robin Rombach and Patrick Esser and contributors
+
+Please see individual source code files for copyright and authorship attributions.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
LICENSE-ModelWeights.txt (new file, 294 lines added)
@@ -0,0 +1,294 @@
Copyright (c) 2022 Robin Rombach and Patrick Esser and contributors

CreativeML Open RAIL-M
dated August 22, 2022

Section I: PREAMBLE

Multimodal generative models are being widely adopted and used, and have the potential to transform the way artists, among other individuals, conceive and benefit from AI or ML technologies as a tool for content creation.

Notwithstanding the current and potential benefits that these artifacts can bring to society at large, there are also concerns about potential misuses of them, either due to their technical limitations or ethical considerations.

In short, this license strives for both the open and responsible downstream use of the accompanying model. When it comes to the open character, we took inspiration from open source permissive licenses regarding the grant of IP rights. Referring to the downstream responsible use, we added use-based restrictions not permitting the use of the Model in very specific scenarios, in order for the licensor to be able to enforce the license in case potential misuses of the Model may occur. At the same time, we strive to promote open and responsible research on generative models for art and content generation.

Even though downstream derivative versions of the model could be released under different licensing terms, the latter will always have to include - at minimum - the same use-based restrictions as the ones in the original license (this license). We believe in the intersection between open and responsible AI development; thus, this License aims to strike a balance between both in order to enable responsible open-science in the field of AI.

This License governs the use of the model (and its derivatives) and is informed by the model card associated with the model.

NOW THEREFORE, You and Licensor agree as follows:

1. Definitions

- "License" means the terms and conditions for use, reproduction, and Distribution as defined in this document.

- "Data" means a collection of information and/or content extracted from the dataset used with the Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not licensed under this License.

- "Output" means the results of operating a Model as embodied in informational content resulting therefrom.

- "Model" means any accompanying machine-learning based assemblies (including checkpoints), consisting of learnt weights, parameters (including optimizer states), corresponding to the model architecture as embodied in the Complementary Material, that have been trained or tuned, in whole or in part on the Data, using the Complementary Material.

- "Derivatives of the Model" means all modifications to the Model, works based on the Model, or any other model which is created or initialized by transfer of patterns of the weights, parameters, activations or output of the Model, to the other model, in order to cause the other model to perform similarly to the Model, including - but not limited to - distillation methods entailing the use of intermediate data representations or methods based on the generation of synthetic data by the Model for training the other model.

- "Complementary Material" means the accompanying source code and scripts used to define, run, load, benchmark or evaluate the Model, and used to prepare data for training or evaluation, if any. This includes any accompanying documentation, tutorials, examples, etc, if any.

- "Distribution" means any transmission, reproduction, publication or other sharing of the Model or Derivatives of the Model to a third party, including providing the Model as a hosted service made available by electronic or other remote means - e.g. API-based or web access.

- "Licensor" means the copyright owner or entity authorized by the copyright owner that is granting the License, including the persons or entities that may have rights in the Model and/or distributing the Model.

- "You" (or "Your") means an individual or Legal Entity exercising permissions granted by this License and/or making use of the Model for whichever purpose and in any field of use, including usage of the Model in an end-use application - e.g. chatbot, translator, image generator.

- "Third Parties" means individuals or legal entities that are not under common control with Licensor or You.

- "Contribution" means any work of authorship, including the original version of the Model and any modifications or additions to that Model or Derivatives of the Model thereof, that is intentionally submitted to Licensor for inclusion in the Model by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Model, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."

- "Contributor" means Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Model.

Section II: INTELLECTUAL PROPERTY RIGHTS

Both copyright and patent grants apply to the Model, Derivatives of the Model and Complementary Material. The Model and Derivatives of the Model are subject to additional terms as described in Section III.

2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare, publicly display, publicly perform, sublicense, and distribute the Complementary Material, the Model, and Derivatives of the Model.

3. Grant of Patent License. Subject to the terms and conditions of this License and where and as applicable, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this paragraph) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model and the Complementary Material, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Model to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model and/or Complementary Material or a Contribution incorporated within the Model and/or Complementary Material constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for the Model and/or Work shall terminate as of the date such litigation is asserted or filed.

Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION

4. Distribution and Redistribution. You may host for Third Party remote access purposes (e.g. software-as-a-service), reproduce and distribute copies of the Model or Derivatives of the Model thereof in any medium, with or without modifications, provided that You meet the following conditions: Use-based restrictions as referenced in paragraph 5 MUST be included as an enforceable provision by You in any type of legal agreement (e.g. a license) governing the use and/or distribution of the Model or Derivatives of the Model, and You shall give notice to subsequent users You Distribute to, that the Model or Derivatives of the Model are subject to paragraph 5. This provision does not apply to the use of Complementary Material. You must give any Third Party recipients of the Model or Derivatives of the Model a copy of this License; You must cause any modified files to carry prominent notices stating that You changed the files; You must retain all copyright, patent, trademark, and attribution notices excluding those notices that do not pertain to any part of the Model, Derivatives of the Model. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions - respecting paragraph 4.a. - for use, reproduction, or Distribution of Your modifications, or for any such Derivatives of the Model as a whole, provided Your use, reproduction, and Distribution of the Model otherwise complies with the conditions stated in this License.

5. Use-based restrictions. The restrictions set forth in Attachment A are considered Use-based restrictions. Therefore You cannot use the Model and the Derivatives of the Model for the specified restricted uses. You may use the Model subject to this License, including only for lawful purposes and in accordance with the License. Use may include creating any content with, finetuning, updating, running, training, evaluating and/or reparametrizing the Model. You shall require all of Your users who use the Model or a Derivative of the Model to comply with the terms of this paragraph (paragraph 5).

6. The Output You Generate. Except as set forth herein, Licensor claims no rights in the Output You generate using the Model. You are accountable for the Output you generate and its subsequent uses. No use of the output can contravene any provision as stated in the License.

Section IV: OTHER PROVISIONS

7. Updates and Runtime Restrictions. To the maximum extent permitted by law, Licensor reserves the right to restrict (remotely or otherwise) usage of the Model in violation of this License, update the Model through electronic means, or modify the Output of the Model based on updates. You shall undertake reasonable efforts to use the latest version of the Model.

8. Trademarks and related. Nothing in this License permits You to make use of Licensors’ trademarks, trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between the parties; and any rights not expressly granted herein are reserved by the Licensors.

9. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Model and the Complementary Material (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Model, Derivatives of the Model, and the Complementary Material and assume any risks associated with Your exercise of permissions under this License.

10. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model and the Complementary Material (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.

11. Accepting Warranty or Additional Liability. While redistributing the Model, Derivatives of the Model and the Complementary Material thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.

12. If any provision of this License is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.

END OF TERMS AND CONDITIONS


Attachment A

Use Restrictions

You agree not to use the Model or Derivatives of the Model:

- In any way that violates any applicable national, federal, state, local or international law or regulation;

- For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;

- To generate or disseminate verifiably false information and/or content with the purpose of harming others;

- To generate or disseminate personal identifiable information that can be used to harm an individual;

- To defame, disparage or otherwise harass others;

- For fully automated decision making that adversely impacts an individual’s legal rights or otherwise creates or modifies a binding, enforceable obligation;

- For any use intended to or which has the effect of discriminating against or harming individuals or groups based on online or offline social behavior or known or predicted personal or personality characteristics;

- To exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;

- For any use intended to or which has the effect of discriminating against individuals or groups based on legally protected characteristics or categories;

- To provide medical advice and medical results interpretation;

- To generate or disseminate information for the purpose to be used for administration of justice, law enforcement, immigration or asylum processes, such as predicting an individual will commit fraud/crime commitment (e.g. by text profiling, drawing causal relationships between assertions made in documents, indiscriminate and arbitrarily-targeted use).
README-CompViz.md (new file, 210 lines added)
@@ -0,0 +1,210 @@
# Original README from CompViz/stable-diffusion

*Stable Diffusion was made possible thanks to a collaboration with [Stability AI](https://stability.ai/) and [Runway](https://runwayml.com/) and builds upon our previous work:*

[**High-Resolution Image Synthesis with Latent Diffusion Models**](https://ommer-lab.com/research/latent-diffusion-models/)<br/>
[Robin Rombach](https://github.com/rromb)\*,
[Andreas Blattmann](https://github.com/ablattmann)\*,
[Dominik Lorenz](https://github.com/qp-qp)\,
[Patrick Esser](https://github.com/pesser),
[Björn Ommer](https://hci.iwr.uni-heidelberg.de/Staff/bommer)<br/>

**CVPR '22 Oral**

which is available on [GitHub](https://github.com/CompVis/latent-diffusion). PDF at [arXiv](https://arxiv.org/abs/2112.10752). Please also visit our [Project page](https://ommer-lab.com/research/latent-diffusion-models/).

![txt2img-stable2](assets/stable-samples/txt2img/merged-0006.png)
[Stable Diffusion](#stable-diffusion-v1) is a latent text-to-image diffusion model.
Thanks to a generous compute donation from [Stability AI](https://stability.ai/) and support from [LAION](https://laion.ai/), we were able to train a Latent Diffusion Model on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database.
Similar to Google's [Imagen](https://arxiv.org/abs/2205.11487), this model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts.
With its 860M UNet and 123M text encoder, the model is relatively lightweight and runs on a GPU with at least 10GB VRAM.
See [this section](#stable-diffusion-v1) below and the [model card](https://huggingface.co/CompVis/stable-diffusion).

## Requirements

A suitable [conda](https://conda.io/) environment named `ldm` can be created and activated with:

```
conda env create -f environment.yaml
conda activate ldm
```

You can also update an existing [latent diffusion](https://github.com/CompVis/latent-diffusion) environment by running

```
conda install pytorch torchvision -c pytorch
pip install transformers==4.19.2
pip install -e .
```
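Once the `ldm` environment is active, a quick sanity check (an illustrative sketch, not part of the original instructions) is to confirm that PyTorch can see a CUDA device:

```py
# Illustrative check after `conda activate ldm`; not from the original README.
import torch

print(torch.__version__)          # the PyTorch build installed into the ldm environment
print(torch.cuda.is_available())  # True on a working CUDA setup
```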
## Stable Diffusion v1

Stable Diffusion v1 refers to a specific configuration of the model architecture that uses a downsampling-factor 8 autoencoder with an 860M UNet and CLIP ViT-L/14 text encoder for the diffusion model. The model was pretrained on 256x256 images and then finetuned on 512x512 images.

*Note: Stable Diffusion v1 is a general text-to-image diffusion model and therefore mirrors biases and (mis-)conceptions that are present in its training data.
Details on the training procedure and data, as well as the intended use of the model can be found in the corresponding [model card](https://huggingface.co/CompVis/stable-diffusion).
Research into the safe deployment of general text-to-image models is an ongoing effort. To prevent misuse and harm, we currently provide access to the checkpoints only for [academic research purposes upon request](https://stability.ai/academia-access-form).
**This is an experiment in safe and community-driven publication of a capable and general text-to-image model. We are working on a public release with a more permissive license that also incorporates ethical considerations.***

[Request access to Stable Diffusion v1 checkpoints for academic research](https://stability.ai/academia-access-form)

### Weights

We currently provide three checkpoints, `sd-v1-1.ckpt`, `sd-v1-2.ckpt` and `sd-v1-3.ckpt`, which were trained as follows,

- `sd-v1-1.ckpt`: 237k steps at resolution `256x256` on [laion2B-en](https://huggingface.co/datasets/laion/laion2B-en). 194k steps at resolution `512x512` on [laion-high-resolution](https://huggingface.co/datasets/laion/laion-high-resolution) (170M examples from LAION-5B with resolution `>= 1024x1024`).
- `sd-v1-2.ckpt`: Resumed from `sd-v1-1.ckpt`. 515k steps at resolution `512x512` on "laion-improved-aesthetics" (a subset of laion2B-en, filtered to images with an original size `>= 512x512`, estimated aesthetics score `> 5.0`, and an estimated watermark probability `< 0.5`. The watermark estimate is from the LAION-5B metadata, the aesthetics score is estimated using an [improved aesthetics estimator](https://github.com/christophschuhmann/improved-aesthetic-predictor)).
- `sd-v1-3.ckpt`: Resumed from `sd-v1-2.ckpt`. 195k steps at resolution `512x512` on "laion-improved-aesthetics" and 10\% dropping of the text-conditioning to improve [classifier-free guidance sampling](https://arxiv.org/abs/2207.12598).

Evaluations with different classifier-free guidance scales (1.5, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0) and 50 PLMS sampling steps show the relative improvements of the checkpoints:
![sd evaluation results](assets/v1-variants-scores.jpg)


### Text-to-Image with Stable Diffusion
![txt2img-stable2](assets/stable-samples/txt2img/merged-0005.png)
![txt2img-stable2](assets/stable-samples/txt2img/merged-0007.png)

Stable Diffusion is a latent diffusion model conditioned on the (non-pooled) text embeddings of a CLIP ViT-L/14 text encoder.
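For a concrete sense of what "non-pooled text embeddings" means here, the per-token hidden states of the CLIP ViT-L/14 text encoder can be inspected with the Hugging Face `transformers` library (an illustrative sketch using the standard `openai/clip-vit-large-patch14` checkpoint; this is not code from this repository):

```py
# Sketch: the non-pooled (per-token) CLIP ViT-L/14 text embeddings used for conditioning.
from transformers import CLIPTokenizer, CLIPTextModel

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

tokens = tokenizer(
    "a photograph of an astronaut riding a horse",
    padding="max_length", max_length=77, truncation=True, return_tensors="pt",
)
# last_hidden_state has shape (1, 77, 768): one 768-dimensional vector per token.
# These per-token vectors are the "non-pooled" embeddings, as opposed to the
# single pooled vector per prompt.
embeddings = text_encoder(tokens.input_ids).last_hidden_state
print(embeddings.shape)
```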
#### Sampling Script

After [obtaining the weights](#weights), link them
```
mkdir -p models/ldm/stable-diffusion-v1/
ln -s <path/to/model.ckpt> models/ldm/stable-diffusion-v1/model.ckpt
```
and sample with
```
python scripts/txt2img.py --prompt "a photograph of an astronaut riding a horse" --plms
```

By default, this uses a guidance scale of `--scale 7.5`, [Katherine Crowson's implementation](https://github.com/CompVis/latent-diffusion/pull/51) of the [PLMS](https://arxiv.org/abs/2202.09778) sampler, and renders images of size 512x512 (which it was trained on) in 50 steps. All supported arguments are listed below (type `python scripts/txt2img.py --help`).

```commandline
usage: txt2img.py [-h] [--prompt [PROMPT]] [--outdir [OUTDIR]] [--skip_grid] [--skip_save] [--ddim_steps DDIM_STEPS] [--plms] [--laion400m] [--fixed_code] [--ddim_eta DDIM_ETA] [--n_iter N_ITER] [--H H] [--W W] [--C C] [--f F] [--n_samples N_SAMPLES] [--n_rows N_ROWS]
                  [--scale SCALE] [--from-file FROM_FILE] [--config CONFIG] [--ckpt CKPT] [--seed SEED] [--precision {full,autocast}]

optional arguments:
  -h, --help            show this help message and exit
  --prompt [PROMPT]     the prompt to render
  --outdir [OUTDIR]     dir to write results to
  --skip_grid           do not save a grid, only individual samples. Helpful when evaluating lots of samples
  --skip_save           do not save individual samples. For speed measurements.
  --ddim_steps DDIM_STEPS
                        number of ddim sampling steps
  --plms                use plms sampling
  --laion400m           uses the LAION400M model
  --fixed_code          if enabled, uses the same starting code across samples
  --ddim_eta DDIM_ETA   ddim eta (eta=0.0 corresponds to deterministic sampling)
  --n_iter N_ITER       sample this often
  --H H                 image height, in pixel space
  --W W                 image width, in pixel space
  --C C                 latent channels
  --f F                 downsampling factor
  --n_samples N_SAMPLES
                        how many samples to produce for each given prompt. A.k.a. batch size
                        (note that the seeds for each image in the batch will be unavailable)
  --n_rows N_ROWS       rows in the grid (default: n_samples)
  --scale SCALE         unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))
  --from-file FROM_FILE
                        if specified, load prompts from this file
  --config CONFIG       path to config which constructs model
  --ckpt CKPT           path to checkpoint of model
  --seed SEED           the seed (for reproducible sampling)
  --precision {full,autocast}
                        evaluate at this precision
```
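The `--scale` argument above is the classifier-free guidance weight; the formula in its help string amounts to the following combination of unconditional and prompt-conditioned noise predictions (a schematic restatement for clarity, not code taken from this repository):

```py
import torch

def guided_eps(eps_uncond: torch.Tensor, eps_cond: torch.Tensor, scale: float) -> torch.Tensor:
    # eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))
    return eps_uncond + scale * (eps_cond - eps_uncond)

# scale=1.0 reduces to the prompt-conditioned prediction; larger values
# (the script defaults to 7.5) push samples harder toward the prompt.
```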
Note: The inference config for all v1 versions is designed to be used with EMA-only checkpoints. For this reason `use_ema=False` is set in the configuration, otherwise the code will try to switch from non-EMA to EMA weights. If you want to examine the effect of EMA vs no EMA, we provide "full" checkpoints which contain both types of weights. For these, `use_ema=False` will load and use the non-EMA weights.


#### Diffusers Integration

Another way to download and sample Stable Diffusion is by using the [diffusers library](https://github.com/huggingface/diffusers/tree/main#new--stable-diffusion-is-now-fully-compatible-with-diffusers)
```py
# make sure you're logged in with `huggingface-cli login`
from torch import autocast
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler

pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-3-diffusers",
    use_auth_token=True
)

prompt = "a photo of an astronaut riding a horse on mars"
with autocast("cuda"):
    image = pipe(prompt)["sample"][0]

image.save("astronaut_rides_horse.png")
```


### Image Modification with Stable Diffusion

By using a diffusion-denoising mechanism as first proposed by [SDEdit](https://arxiv.org/abs/2108.01073), the model can be used for different tasks such as text-guided image-to-image translation and upscaling. Similar to the txt2img sampling script, we provide a script to perform image modification with Stable Diffusion.

The following describes an example where a rough sketch made in [Pinta](https://www.pinta-project.com/) is converted into a detailed artwork.
```
python scripts/img2img.py --prompt "A fantasy landscape, trending on artstation" --init-img <path-to-img.jpg> --strength 0.8
```
Here, strength is a value between 0.0 and 1.0 that controls the amount of noise that is added to the input image. Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input. See the following example.
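A rough sketch of how such a strength value is typically turned into a partial denoising schedule (an illustration of the idea only; the function name is hypothetical and the script's internals may differ):

```py
# Illustration only: map an img2img strength in [0.0, 1.0] to the number of
# denoising steps that are actually re-run on top of the init image.
def steps_from_strength(strength: float, ddim_steps: int = 50) -> int:
    assert 0.0 <= strength <= 1.0, "strength must lie between 0.0 and 1.0"
    return int(strength * ddim_steps)

# --strength 0.8 with 50 steps re-generates roughly the last 40 steps of the
# schedule, so the output keeps only the coarse layout of the input sketch.
print(steps_from_strength(0.8))  # 40
```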
**Input**

![sketch-in](assets/stable-samples/img2img/sketch-mountains-input.jpg)

**Outputs**

![out3](assets/stable-samples/img2img/mountains-3.png)
![out2](assets/stable-samples/img2img/mountains-2.png)

This procedure can, for example, also be used to upscale samples from the base model.


## Comments

- Our codebase for the diffusion models builds heavily on [OpenAI's ADM codebase](https://github.com/openai/guided-diffusion) and [https://github.com/lucidrains/denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch). Thanks for open-sourcing!

- The implementation of the transformer encoder is from [x-transformers](https://github.com/lucidrains/x-transformers) by [lucidrains](https://github.com/lucidrains?tab=repositories).


## BibTeX

```
@misc{rombach2021highresolution,
      title={High-Resolution Image Synthesis with Latent Diffusion Models},
      author={Robin Rombach and Andreas Blattmann and Dominik Lorenz and Patrick Esser and Björn Ommer},
      year={2021},
      eprint={2112.10752},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
```
README.md (305 changed lines)
@@ -100,8 +100,74 @@ cat aspect of the image and 75% on the white duck aspect
use any combination of integers and floating point numbers, and they do not need to add up to 1.
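Because the weights do not need to add up to 1, they are presumably normalized against their sum before use; a minimal sketch of that arithmetic (illustrative only, not the script's actual code):

~~~~
# Illustrative only; names are hypothetical and this is not the script's actual code.
def normalize_weights(weights):
    total = sum(weights)
    return [w / total for w in weights]

# Weights of 1 and 3 put 25% of the emphasis on the first subprompt and 75% on
# the second, matching the cat / white duck split mentioned above.
print(normalize_weights([1, 3]))  # [0.25, 0.75]
~~~~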
## Personalizing Text-to-Image Generation

You may personalize the generated images to provide your own styles or objects by training a new LDM checkpoint and introducing a new vocabulary to the fixed model.

To train, prepare a folder that contains images sized at 512x512 and execute the following:

~~~~
# As the default backend is not available on Windows, if you're using that platform, execute SET PL_TORCH_DISTRIBUTED_BACKEND=gloo
(ldm) ~/stable-diffusion$ python3 ./main.py --base ./configs/stable-diffusion/v1-finetune.yaml \
    -t \
    --actual_resume ./models/ldm/stable-diffusion-v1/model.ckpt \
    -n my_cat \
    --gpus 0, \
    --data_root D:/textual-inversion/my_cat \
    --init_word 'cat'
~~~~

During the training process, files will be created in /logs/[project][time][project]/ where you can see the process:

- conditioning* contains the training prompts
- inputs and reconstruction contain the input images for the training epoch
- samples and samples scaled contain generated samples, one for the prompt and one with the init word provided

On an RTX 3090, the process for SD will take ~1h at 1.6 iterations/sec.

Note: According to the associated paper, the optimal number of images is 3-5; with any more images than that, your model might not converge.

Training will run indefinitely, but you may wish to stop it before the heat death of the universe, when you find a low-loss epoch or after around 5000 iterations.

Once the model is trained, specify the trained .pt file when starting dream using

~~~~
(ldm) ~/stable-diffusion$ python3 ./scripts/dream.py --embedding_path /path/to/embedding.pt --full_precision
~~~~

Then, to utilize your subject at the dream prompt:

~~~
dream> "a photo of *"
~~~

This also works with image2image:
~~~~
dream> "waterfall and rainbow in the style of *" --init_img=./init-images/crude_drawing.png --strength=0.5 -s100 -n4
~~~~

It's also possible to train multiple tokens (modify the placeholder string in configs/stable-diffusion/v1-finetune.yaml) and combine LDM checkpoints using:

~~~~
(ldm) ~/stable-diffusion$ python3 ./scripts/merge_embeddings.py \
    --manager_ckpts /path/to/first/embedding.pt /path/to/second/embedding.pt [...] \
    --output_path /path/to/output/embedding.pt
~~~~

Credit goes to @rinongal and the repository located at https://github.com/rinongal/textual_inversion. Please see the repository and associated paper for details and limitations.

## Changes

* v1.08 (24 August 2022)
   * Escape single quotes on the dream> command before trying to parse. This avoids parse errors.
   * Removed instruction to get Python3.8 as first step in Windows install. Anaconda3 does it for you.
   * Added bounds checks for numeric arguments that could cause crashes.
   * Cleaned up the copyright and license agreement files.
* v1.07 (23 August 2022)
   * Image filenames will now never fill gaps in the sequence, but will be assigned the next higher name in the chosen directory. This ensures that the alphabetic and chronological
@@ -236,34 +302,31 @@ This will bring your local copy into sync with the remote one.

### Windows

-1. Install Python version 3.8.5 from here: https://www.python.org/downloads/windows/
-(note that several users have reported that later versions do not work properly)
+1. Install Anaconda3 (miniconda3 version) from here: https://docs.anaconda.com/anaconda/install/windows/

-2. Install Anaconda3 (miniconda3 version) from here: https://docs.anaconda.com/anaconda/install/windows/
+2. Install Git from here: https://git-scm.com/download/win

-3. Install Git from here: https://git-scm.com/download/win
+3. Launch Anaconda from the Windows Start menu. This will bring up a command window. Type all the remaining commands in this window.

-4. Launch Anaconda from the Windows Start menu. This will bring up a command window. Type all the remaining commands in this window.
+4. Run the command:

-5. Run the command:
```
git clone https://github.com/lstein/stable-diffusion.git
```
This will create stable-diffusion folder where you will follow the rest of the steps.

-6. Enter the newly-created stable-diffusion folder. From this step forward make sure that you are working in the stable-diffusion directory!
+5. Enter the newly-created stable-diffusion folder. From this step forward make sure that you are working in the stable-diffusion directory!
```
cd stable-diffusion
```

-7. Run the following two commands:
+6. Run the following two commands:
```
-conda env create -f environment.yaml (step 7a)
-conda activate ldm (step 7b)
+conda env create -f environment.yaml (step 6a)
+conda activate ldm (step 6b)
```
This will install all python requirements and activate the "ldm" environment which sets PATH and other environment variables properly.

-8. Run the command:
+7. Run the command:
```
python scripts\preload_models.py
```

@@ -273,7 +336,7 @@ requires. (Note that this step is required. I created it because some people
are using GPU systems that are behind a firewall and the models can't be
downloaded just-in-time)

-9. Now you need to install the weights for the big stable diffusion model.
+8. Now you need to install the weights for the big stable diffusion model.

For running with the released weights, you will first need to set up
an account with Hugging Face (https://huggingface.co). Use your

@@ -299,7 +362,7 @@ you stashed this file. If you prefer not to copy or move the .ckpt file,
you may instead create a shortcut to it from within
"models\ldm\stable-diffusion-v1\".

-10. Start generating images!
+9. Start generating images!
```
# for the pre-release weights
python scripts\dream.py -l

@@ -307,7 +370,7 @@ python scripts\dream.py -l
# for the post-release weights
python scripts\dream.py
```
-11. Subsequently, to relaunch the script, first activate the Anaconda command window (step 4), enter the stable-diffusion directory (step 6, "cd \path\to\stable-diffusion"), run "conda activate ldm" (step 7b), and then launch the dream script (step 10).
+10. Subsequently, to relaunch the script, first activate the Anaconda command window (step 3), enter the stable-diffusion directory (step 5, "cd \path\to\stable-diffusion"), run "conda activate ldm" (step 6b), and then launch the dream script (step 9).

#### Updating to newer versions of the script

@@ -378,213 +441,9 @@ to send me an email if you use and like the script.

*Contributions by:* [Peter Kowalczyk](https://github.com/slix), [Henry Harrison](https://github.com/hwharrison), [xraxra](https://github.com/xraxra), and [bmaltais](https://github.com/bmaltais)

-# Original README from CompViz/stable-diffusion
+Original portions of the software are Copyright (c) 2020 Lincoln D. Stein (https://github.com/lstein)
||||||
*Stable Diffusion was made possible thanks to a collaboration with [Stability AI](https://stability.ai/) and [Runway](https://runwayml.com/) and builds upon our previous work:*
|
|
||||||
|
|
||||||
[**High-Resolution Image Synthesis with Latent Diffusion Models**](https://ommer-lab.com/research/latent-diffusion-models/)<br/>
|
|
||||||
[Robin Rombach](https://github.com/rromb)\*,
|
|
||||||
[Andreas Blattmann](https://github.com/ablattmann)\*,
|
|
||||||
[Dominik Lorenz](https://github.com/qp-qp)\,
|
|
||||||
[Patrick Esser](https://github.com/pesser),
|
|
||||||
[Björn Ommer](https://hci.iwr.uni-heidelberg.de/Staff/bommer)<br/>
|
|
||||||
|
|
||||||
**CVPR '22 Oral**
|
|
||||||
|
|
||||||
which is available on [GitHub](https://github.com/CompVis/latent-diffusion). PDF at [arXiv](https://arxiv.org/abs/2112.10752). Please also visit our [Project page](https://ommer-lab.com/research/latent-diffusion-models/).
|
|
||||||
|
|
||||||
![txt2img-stable2](assets/stable-samples/txt2img/merged-0006.png)
|
|
||||||
[Stable Diffusion](#stable-diffusion-v1) is a latent text-to-image diffusion
|
|
||||||
model.
|
|
||||||
Thanks to a generous compute donation from [Stability AI](https://stability.ai/) and support from [LAION](https://laion.ai/), we were able to train a Latent Diffusion Model on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database.
|
|
||||||
Similar to Google's [Imagen](https://arxiv.org/abs/2205.11487),
|
|
||||||
this model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts.
|
|
||||||
With its 860M UNet and 123M text encoder, the model is relatively lightweight and runs on a GPU with at least 10GB VRAM.
|
|
||||||
See [this section](#stable-diffusion-v1) below and the [model card](https://huggingface.co/CompVis/stable-diffusion).
|
|
||||||
|
|
||||||
|
|
||||||
## Requirements
|
|
||||||
|
|
||||||
A suitable [conda](https://conda.io/) environment named `ldm` can be created
|
|
||||||
and activated with:
|
|
||||||
|
|
||||||
```
|
|
||||||
conda env create -f environment.yaml
|
|
||||||
conda activate ldm
|
|
||||||
```
|
|
||||||
|
|
||||||
You can also update an existing [latent diffusion](https://github.com/CompVis/latent-diffusion) environment by running
|
|
||||||
|
|
||||||
```
|
|
||||||
conda install pytorch torchvision -c pytorch
|
|
||||||
pip install transformers==4.19.2
|
|
||||||
pip install -e .
|
|
||||||
```
|
|
||||||
|
|
||||||
## Stable Diffusion v1
|
|
||||||
|
|
||||||
Stable Diffusion v1 refers to a specific configuration of the model
|
|
||||||
architecture that uses a downsampling-factor 8 autoencoder with an 860M UNet
|
|
||||||
and CLIP ViT-L/14 text encoder for the diffusion model. The model was pretrained on 256x256 images and
|
|
||||||
then finetuned on 512x512 images.
|
|
||||||
|
|
||||||
*Note: Stable Diffusion v1 is a general text-to-image diffusion model and therefore mirrors biases and (mis-)conceptions that are present
|
|
||||||
in its training data.
|
|
||||||
Details on the training procedure and data, as well as the intended use of the model can be found in the corresponding [model card](https://huggingface.co/CompVis/stable-diffusion).
|
|
||||||
Research into the safe deployment of general text-to-image models is an ongoing effort. To prevent misuse and harm, we currently provide access to the checkpoints only for [academic research purposes upon request](https://stability.ai/academia-access-form).
|
|
||||||
**This is an experiment in safe and community-driven publication of a capable and general text-to-image model. We are working on a public release with a more permissive license that also incorporates ethical considerations.***
|
|
||||||
|
|
||||||
[Request access to Stable Diffusion v1 checkpoints for academic research](https://stability.ai/academia-access-form)
|
|
||||||
|
|
||||||
### Weights
|
|
||||||
|
|
||||||
We currently provide three checkpoints, `sd-v1-1.ckpt`, `sd-v1-2.ckpt` and `sd-v1-3.ckpt`,
|
|
||||||
which were trained as follows,
|
|
||||||
|
|
||||||
- `sd-v1-1.ckpt`: 237k steps at resolution `256x256` on [laion2B-en](https://huggingface.co/datasets/laion/laion2B-en).
|
|
||||||
194k steps at resolution `512x512` on [laion-high-resolution](https://huggingface.co/datasets/laion/laion-high-resolution) (170M examples from LAION-5B with resolution `>= 1024x1024`).
|
|
||||||
- `sd-v1-2.ckpt`: Resumed from `sd-v1-1.ckpt`.
|
|
||||||
515k steps at resolution `512x512` on "laion-improved-aesthetics" (a subset of laion2B-en,
|
|
||||||
filtered to images with an original size `>= 512x512`, estimated aesthetics score `> 5.0`, and an estimated watermark probability `< 0.5`. The watermark estimate is from the LAION-5B metadata, the aesthetics score is estimated using an [improved aesthetics estimator](https://github.com/christophschuhmann/improved-aesthetic-predictor)).
|
|
||||||
- `sd-v1-3.ckpt`: Resumed from `sd-v1-2.ckpt`. 195k steps at resolution `512x512` on "laion-improved-aesthetics" and 10\% dropping of the text-conditioning to improve [classifier-free guidance sampling](https://arxiv.org/abs/2207.12598).
|
|
||||||
|
|
||||||
Evaluations with different classifier-free guidance scales (1.5, 2.0, 3.0, 4.0,
|
|
||||||
5.0, 6.0, 7.0, 8.0) and 50 PLMS sampling
|
|
||||||
steps show the relative improvements of the checkpoints:
|
|
||||||
![sd evaluation results](assets/v1-variants-scores.jpg)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### Text-to-Image with Stable Diffusion
|
|
||||||
![txt2img-stable2](assets/stable-samples/txt2img/merged-0005.png)
|
|
||||||
![txt2img-stable2](assets/stable-samples/txt2img/merged-0007.png)
|
|
||||||
|
|
||||||
Stable Diffusion is a latent diffusion model conditioned on the (non-pooled) text embeddings of a CLIP ViT-L/14 text encoder.
|
|
||||||
|
|
||||||
|
|
||||||
#### Sampling Script
|
|
||||||
|
|
||||||
After [obtaining the weights](#weights), link them
|
|
||||||
```
|
|
||||||
mkdir -p models/ldm/stable-diffusion-v1/
|
|
||||||
ln -s <path/to/model.ckpt> models/ldm/stable-diffusion-v1/model.ckpt
|
|
||||||
```
|
|
||||||
and sample with
|
|
||||||
```
|
|
||||||
python scripts/txt2img.py --prompt "a photograph of an astronaut riding a horse" --plms
|
|
||||||
```
|
|
||||||
|
|
||||||
By default, this uses a guidance scale of `--scale 7.5`, [Katherine Crowson's implementation](https://github.com/CompVis/latent-diffusion/pull/51) of the [PLMS](https://arxiv.org/abs/2202.09778) sampler,
|
|
||||||
and renders images of size 512x512 (which it was trained on) in 50 steps. All supported arguments are listed below (type `python scripts/txt2img.py --help`).
|
|
||||||
|
|
||||||
```commandline
|
|
||||||
usage: txt2img.py [-h] [--prompt [PROMPT]] [--outdir [OUTDIR]] [--skip_grid] [--skip_save] [--ddim_steps DDIM_STEPS] [--plms] [--laion400m] [--fixed_code] [--ddim_eta DDIM_ETA] [--n_iter N_ITER] [--H H] [--W W] [--C C] [--f F] [--n_samples N_SAMPLES] [--n_rows N_ROWS]
|
|
||||||
[--scale SCALE] [--from-file FROM_FILE] [--config CONFIG] [--ckpt CKPT] [--seed SEED] [--precision {full,autocast}]
|
|
||||||
|
|
||||||
optional arguments:
|
|
||||||
-h, --help show this help message and exit
|
|
||||||
--prompt [PROMPT] the prompt to render
|
|
||||||
--outdir [OUTDIR] dir to write results to
|
|
||||||
--skip_grid do not save a grid, only individual samples. Helpful when evaluating lots of samples
|
|
||||||
--skip_save do not save individual samples. For speed measurements.
|
|
||||||
--ddim_steps DDIM_STEPS
|
|
||||||
number of ddim sampling steps
|
|
||||||
--plms use plms sampling
|
|
||||||
--laion400m uses the LAION400M model
|
|
||||||
--fixed_code if enabled, uses the same starting code across samples
|
|
||||||
--ddim_eta DDIM_ETA ddim eta (eta=0.0 corresponds to deterministic sampling
|
|
||||||
--n_iter N_ITER sample this often
|
|
||||||
--H H image height, in pixel space
|
|
||||||
--W W image width, in pixel space
|
|
||||||
--C C latent channels
|
|
||||||
--f F downsampling factor
|
|
||||||
--n_samples N_SAMPLES
|
|
||||||
how many samples to produce for each given prompt. A.k.a. batch size
|
|
||||||
(note that the seeds for each image in the batch will be unavailable)
|
|
||||||
--n_rows N_ROWS rows in the grid (default: n_samples)
|
|
||||||
--scale SCALE unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))
|
|
||||||
--from-file FROM_FILE
|
|
||||||
if specified, load prompts from this file
|
|
||||||
--config CONFIG path to config which constructs model
|
|
||||||
--ckpt CKPT path to checkpoint of model
|
|
||||||
--seed SEED the seed (for reproducible sampling)
|
|
||||||
--precision {full,autocast}
|
|
||||||
evaluate at this precision
|
|
||||||
|
|
||||||
```
|
|
||||||
Note: The inference config for all v1 versions is designed to be used with EMA-only checkpoints.
For this reason `use_ema=False` is set in the configuration, otherwise the code will try to switch from
non-EMA to EMA weights. If you want to examine the effect of EMA vs no EMA, we provide "full" checkpoints
which contain both types of weights. For these, `use_ema=False` will load and use the non-EMA weights.
#### Diffusers Integration

Another way to download and sample Stable Diffusion is by using the [diffusers library](https://github.com/huggingface/diffusers/tree/main#new--stable-diffusion-is-now-fully-compatible-with-diffusers)

```py
# make sure you're logged in with `huggingface-cli login`
from torch import autocast
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler

pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-3-diffusers",
    use_auth_token=True
)

prompt = "a photo of an astronaut riding a horse on mars"
with autocast("cuda"):
    image = pipe(prompt)["sample"][0]

image.save("astronaut_rides_horse.png")
```
### Image Modification with Stable Diffusion

By using a diffusion-denoising mechanism as first proposed by [SDEdit](https://arxiv.org/abs/2108.01073), the model can be used for different
tasks such as text-guided image-to-image translation and upscaling. Similar to the txt2img sampling script,
we provide a script to perform image modification with Stable Diffusion.

The following describes an example where a rough sketch made in [Pinta](https://www.pinta-project.com/) is converted into a detailed artwork.
```
python scripts/img2img.py --prompt "A fantasy landscape, trending on artstation" --init-img <path-to-img.jpg> --strength 0.8
```
Here, strength is a value between 0.0 and 1.0 that controls the amount of noise added to the input image.
Values approaching 1.0 allow for many variations but will also produce images that are not semantically consistent with the input. See the following example.
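Concretely, strength decides how far down the noise schedule the init image is pushed before denoising begins. A minimal sketch of how SDEdit-style img2img code typically derives the partial schedule (illustrative only, not a quote of `scripts/img2img.py`):

```py
ddim_steps = 50   # full schedule length
strength = 0.8    # --strength from the command line

# Encode the init image up to timestep t_enc, then run t_enc denoising steps back to an image.
t_enc = int(strength * ddim_steps)
print(f"running {t_enc} of {ddim_steps} denoising steps over the init image")
```

At strength 1.0 the init image is fully noised and the result is essentially unconstrained; at small strengths only the last few denoising steps run, so the output stays close to the input.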
**Input**

![sketch-in](assets/stable-samples/img2img/sketch-mountains-input.jpg)

**Outputs**

![out3](assets/stable-samples/img2img/mountains-3.png)
![out2](assets/stable-samples/img2img/mountains-2.png)

This procedure can, for example, also be used to upscale samples from the base model.
## Comments

- Our codebase for the diffusion models builds heavily on [OpenAI's ADM codebase](https://github.com/openai/guided-diffusion)
  and [https://github.com/lucidrains/denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch).
  Thanks for open-sourcing!

- The implementation of the transformer encoder is from [x-transformers](https://github.com/lucidrains/x-transformers) by [lucidrains](https://github.com/lucidrains?tab=repositories).
## BibTeX

```
@misc{rombach2021highresolution,
      title={High-Resolution Image Synthesis with Latent Diffusion Models},
      author={Robin Rombach and Andreas Blattmann and Dominik Lorenz and Patrick Esser and Björn Ommer},
      year={2021},
      eprint={2112.10752},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
```
# Further Reading

Please see the original README for more information on this software
and underlying algorithm, located in the file README-CompViz.md.
105
configs/stable-diffusion/v1-finetune.yaml
Normal file
@@ -0,0 +1,105 @@
model:
  base_learning_rate: 5.0e-03
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    image_size: 64
    channels: 4
    cond_stage_trainable: true   # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    embedding_reg_weight: 0.0

    personalization_config:
      target: ldm.modules.embedding_manager.EmbeddingManager
      params:
        placeholder_strings: ["*"]
        initializer_words: ["sculpture"]
        per_image_tokens: false
        num_vectors_per_token: 1
        progressive_words: False

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 2
    num_workers: 16
    wrap: false
    train:
      target: ldm.data.personalized.PersonalizedBase
      params:
        size: 512
        set: train
        per_image_tokens: false
        repeats: 100
    validation:
      target: ldm.data.personalized.PersonalizedBase
      params:
        size: 512
        set: val
        per_image_tokens: false
        repeats: 10

lightning:
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 500
        max_images: 8
        increase_log_steps: False

  trainer:
    benchmark: True
    max_steps: 6100
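This config wires the new textual-inversion ("personalization") pieces together: an `EmbeddingManager` under `personalization_config`, a frozen UNet/CLIP with `use_ema: False`, and `PersonalizedBase` as the data source. A rough sketch of how such a config is consumed (the paths and the hand-set `data_root` are illustrative assumptions; in a real run the training entry point injects them from the command line):

```py
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config

config = OmegaConf.load("configs/stable-diffusion/v1-finetune.yaml")

# PersonalizedBase requires a data_root, which is not part of the YAML itself.
config.data.params.train.params.data_root = "/path/to/concept/images"
config.data.params.validation.params.data_root = "/path/to/concept/images"

model = instantiate_from_config(config.model)  # LatentDiffusion, which builds the EmbeddingManager
data = instantiate_from_config(config.data)    # DataModuleFromConfig over PersonalizedBase
```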
103
configs/stable-diffusion/v1-finetune_style.yaml
Normal file
@@ -0,0 +1,103 @@
model:
  base_learning_rate: 5.0e-03
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    image_size: 64
    channels: 4
    cond_stage_trainable: true   # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    embedding_reg_weight: 0.0

    personalization_config:
      target: ldm.modules.embedding_manager.EmbeddingManager
      params:
        placeholder_strings: ["*"]
        initializer_words: ["painting"]
        per_image_tokens: false
        num_vectors_per_token: 1

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 2
    num_workers: 16
    wrap: false
    train:
      target: ldm.data.personalized_style.PersonalizedBase
      params:
        size: 512
        set: train
        per_image_tokens: false
        repeats: 100
    validation:
      target: ldm.data.personalized_style.PersonalizedBase
      params:
        size: 512
        set: val
        per_image_tokens: false
        repeats: 10

lightning:
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 500
        max_images: 8
        increase_log_steps: False

  trainer:
    benchmark: True
@@ -26,6 +26,15 @@ model:
         f_max: [ 1. ]
         f_min: [ 1. ]
 
+    personalization_config:
+      target: ldm.modules.embedding_manager.EmbeddingManager
+      params:
+        placeholder_strings: ["*"]
+        initializer_words: ["sculpture"]
+        per_image_tokens: false
+        num_vectors_per_token: 1
+        progressive_words: False
+
     unet_config:
       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
       params:
@@ -19,6 +19,7 @@ dependencies:
   - omegaconf==2.1.1
   - test-tube>=0.7.5
   - streamlit>=0.73.1
+  - pillow==9.0.1
   - einops==0.3.0
   - torch-fidelity==0.3.0
   - transformers==4.19.2
160
ldm/data/personalized.py
Normal file
@@ -0,0 +1,160 @@
import os
import numpy as np
import PIL
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms

import random

imagenet_templates_smallest = [
    'a photo of a {}',
]

imagenet_templates_small = [
    'a photo of a {}',
    'a rendering of a {}',
    'a cropped photo of the {}',
    'the photo of a {}',
    'a photo of a clean {}',
    'a photo of a dirty {}',
    'a dark photo of the {}',
    'a photo of my {}',
    'a photo of the cool {}',
    'a close-up photo of a {}',
    'a bright photo of the {}',
    'a cropped photo of a {}',
    'a photo of the {}',
    'a good photo of the {}',
    'a photo of one {}',
    'a close-up photo of the {}',
    'a rendition of the {}',
    'a photo of the clean {}',
    'a rendition of a {}',
    'a photo of a nice {}',
    'a good photo of a {}',
    'a photo of the nice {}',
    'a photo of the small {}',
    'a photo of the weird {}',
    'a photo of the large {}',
    'a photo of a cool {}',
    'a photo of a small {}',
]

imagenet_dual_templates_small = [
    'a photo of a {} with {}',
    'a rendering of a {} with {}',
    'a cropped photo of the {} with {}',
    'the photo of a {} with {}',
    'a photo of a clean {} with {}',
    'a photo of a dirty {} with {}',
    'a dark photo of the {} with {}',
    'a photo of my {} with {}',
    'a photo of the cool {} with {}',
    'a close-up photo of a {} with {}',
    'a bright photo of the {} with {}',
    'a cropped photo of a {} with {}',
    'a photo of the {} with {}',
    'a good photo of the {} with {}',
    'a photo of one {} with {}',
    'a close-up photo of the {} with {}',
    'a rendition of the {} with {}',
    'a photo of the clean {} with {}',
    'a rendition of a {} with {}',
    'a photo of a nice {} with {}',
    'a good photo of a {} with {}',
    'a photo of the nice {} with {}',
    'a photo of the small {} with {}',
    'a photo of the weird {} with {}',
    'a photo of the large {} with {}',
    'a photo of a cool {} with {}',
    'a photo of a small {} with {}',
]

per_img_token_list = [
    'א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז', 'ח', 'ט', 'י', 'כ', 'ל', 'מ', 'נ', 'ס', 'ע', 'פ', 'צ', 'ק', 'ר', 'ש', 'ת',
]

class PersonalizedBase(Dataset):
    def __init__(self,
                 data_root,
                 size=None,
                 repeats=100,
                 interpolation="bicubic",
                 flip_p=0.5,
                 set="train",
                 placeholder_token="*",
                 per_image_tokens=False,
                 center_crop=False,
                 mixing_prob=0.25,
                 coarse_class_text=None,
                 ):

        self.data_root = data_root

        self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]

        # self._length = len(self.image_paths)
        self.num_images = len(self.image_paths)
        self._length = self.num_images

        self.placeholder_token = placeholder_token

        self.per_image_tokens = per_image_tokens
        self.center_crop = center_crop
        self.mixing_prob = mixing_prob

        self.coarse_class_text = coarse_class_text

        if per_image_tokens:
            assert self.num_images < len(per_img_token_list), f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} tokens. To enable larger sets, add more tokens to 'per_img_token_list'."

        if set == "train":
            self._length = self.num_images * repeats

        self.size = size
        self.interpolation = {"linear": PIL.Image.LINEAR,
                              "bilinear": PIL.Image.BILINEAR,
                              "bicubic": PIL.Image.BICUBIC,
                              "lanczos": PIL.Image.LANCZOS,
                              }[interpolation]
        self.flip = transforms.RandomHorizontalFlip(p=flip_p)

    def __len__(self):
        return self._length

    def __getitem__(self, i):
        example = {}
        image = Image.open(self.image_paths[i % self.num_images])

        if not image.mode == "RGB":
            image = image.convert("RGB")

        placeholder_string = self.placeholder_token
        if self.coarse_class_text:
            placeholder_string = f"{self.coarse_class_text} {placeholder_string}"

        if self.per_image_tokens and np.random.uniform() < self.mixing_prob:
            text = random.choice(imagenet_dual_templates_small).format(placeholder_string, per_img_token_list[i % self.num_images])
        else:
            text = random.choice(imagenet_templates_small).format(placeholder_string)

        example["caption"] = text

        # default to score-sde preprocessing
        img = np.array(image).astype(np.uint8)

        if self.center_crop:
            crop = min(img.shape[0], img.shape[1])
            h, w, = img.shape[0], img.shape[1]
            img = img[(h - crop) // 2:(h + crop) // 2,
                      (w - crop) // 2:(w + crop) // 2]

        image = Image.fromarray(img)
        if self.size is not None:
            image = image.resize((self.size, self.size), resample=self.interpolation)

        image = self.flip(image)
        image = np.array(image).astype(np.uint8)
        example["image"] = (image / 127.5 - 1.0).astype(np.float32)
        return example
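A minimal sketch of what `PersonalizedBase` yields (the path is illustrative): each item is a dict with a templated `"caption"` containing the placeholder token and an `"image"` array scaled to [-1, 1].

```py
from ldm.data.personalized import PersonalizedBase

ds = PersonalizedBase(data_root="/path/to/concept/images", size=512, set="train", repeats=100)
example = ds[0]
print(example["caption"])        # e.g. "a photo of a *"
print(example["image"].shape)    # (512, 512, 3), float32 in [-1, 1]
```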
129
ldm/data/personalized_style.py
Normal file
@@ -0,0 +1,129 @@
import os
import numpy as np
import PIL
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms

import random

imagenet_templates_small = [
    'a painting in the style of {}',
    'a rendering in the style of {}',
    'a cropped painting in the style of {}',
    'the painting in the style of {}',
    'a clean painting in the style of {}',
    'a dirty painting in the style of {}',
    'a dark painting in the style of {}',
    'a picture in the style of {}',
    'a cool painting in the style of {}',
    'a close-up painting in the style of {}',
    'a bright painting in the style of {}',
    'a cropped painting in the style of {}',
    'a good painting in the style of {}',
    'a close-up painting in the style of {}',
    'a rendition in the style of {}',
    'a nice painting in the style of {}',
    'a small painting in the style of {}',
    'a weird painting in the style of {}',
    'a large painting in the style of {}',
]

imagenet_dual_templates_small = [
    'a painting in the style of {} with {}',
    'a rendering in the style of {} with {}',
    'a cropped painting in the style of {} with {}',
    'the painting in the style of {} with {}',
    'a clean painting in the style of {} with {}',
    'a dirty painting in the style of {} with {}',
    'a dark painting in the style of {} with {}',
    'a cool painting in the style of {} with {}',
    'a close-up painting in the style of {} with {}',
    'a bright painting in the style of {} with {}',
    'a cropped painting in the style of {} with {}',
    'a good painting in the style of {} with {}',
    'a painting of one {} in the style of {}',
    'a nice painting in the style of {} with {}',
    'a small painting in the style of {} with {}',
    'a weird painting in the style of {} with {}',
    'a large painting in the style of {} with {}',
]

per_img_token_list = [
    'א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז', 'ח', 'ט', 'י', 'כ', 'ל', 'מ', 'נ', 'ס', 'ע', 'פ', 'צ', 'ק', 'ר', 'ש', 'ת',
]

class PersonalizedBase(Dataset):
    def __init__(self,
                 data_root,
                 size=None,
                 repeats=100,
                 interpolation="bicubic",
                 flip_p=0.5,
                 set="train",
                 placeholder_token="*",
                 per_image_tokens=False,
                 center_crop=False,
                 ):

        self.data_root = data_root

        self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]

        # self._length = len(self.image_paths)
        self.num_images = len(self.image_paths)
        self._length = self.num_images

        self.placeholder_token = placeholder_token

        self.per_image_tokens = per_image_tokens
        self.center_crop = center_crop

        if per_image_tokens:
            assert self.num_images < len(per_img_token_list), f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} tokens. To enable larger sets, add more tokens to 'per_img_token_list'."

        if set == "train":
            self._length = self.num_images * repeats

        self.size = size
        self.interpolation = {"linear": PIL.Image.LINEAR,
                              "bilinear": PIL.Image.BILINEAR,
                              "bicubic": PIL.Image.BICUBIC,
                              "lanczos": PIL.Image.LANCZOS,
                              }[interpolation]
        self.flip = transforms.RandomHorizontalFlip(p=flip_p)

    def __len__(self):
        return self._length

    def __getitem__(self, i):
        example = {}
        image = Image.open(self.image_paths[i % self.num_images])

        if not image.mode == "RGB":
            image = image.convert("RGB")

        if self.per_image_tokens and np.random.uniform() < 0.25:
            text = random.choice(imagenet_dual_templates_small).format(self.placeholder_token, per_img_token_list[i % self.num_images])
        else:
            text = random.choice(imagenet_templates_small).format(self.placeholder_token)

        example["caption"] = text

        # default to score-sde preprocessing
        img = np.array(image).astype(np.uint8)

        if self.center_crop:
            crop = min(img.shape[0], img.shape[1])
            h, w, = img.shape[0], img.shape[1]
            img = img[(h - crop) // 2:(h + crop) // 2,
                      (w - crop) // 2:(w + crop) // 2]

        image = Image.fromarray(img)
        if self.size is not None:
            image = image.resize((self.size, self.size), resample=self.interpolation)

        image = self.flip(image)
        image = np.array(image).astype(np.uint8)
        example["image"] = (image / 127.5 - 1.0).astype(np.float32)
        return example
@@ -17,9 +17,6 @@ class DDIMSampler(object):
         self.schedule = schedule
 
     def register_buffer(self, name, attr):
-        if type(attr) == torch.Tensor:
-            if attr.device != torch.device("cuda"):
-                attr = attr.to(torch.device("cuda"))
         setattr(self, name, attr)
 
     def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
@@ -7,7 +7,9 @@ https://github.com/CompVis/taming-transformers
 """
 
 import torch
 import torch.nn as nn
+import os
 import numpy as np
 import pytorch_lightning as pl
 from torch.optim.lr_scheduler import LambdaLR

@@ -64,6 +66,7 @@ class DDPM(pl.LightningModule):
                 cosine_s=8e-3,
                 given_betas=None,
                 original_elbo_weight=0.,
+                embedding_reg_weight=0.,
                 v_posterior=0.,  # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta
                 l_simple_weight=1.,
                 conditioning_key=None,

@@ -98,6 +101,7 @@ class DDPM(pl.LightningModule):
         self.v_posterior = v_posterior
         self.original_elbo_weight = original_elbo_weight
         self.l_simple_weight = l_simple_weight
+        self.embedding_reg_weight = embedding_reg_weight
 
         if monitor is not None:
             self.monitor = monitor

@@ -427,6 +431,7 @@ class LatentDiffusion(DDPM):
     def __init__(self,
                  first_stage_config,
                  cond_stage_config,
+                 personalization_config,
                  num_timesteps_cond=None,
                  cond_stage_key="image",
                  cond_stage_trainable=False,

@@ -436,6 +441,7 @@ class LatentDiffusion(DDPM):
                  scale_factor=1.0,
                  scale_by_std=False,
                  *args, **kwargs):
+
         self.num_timesteps_cond = default(num_timesteps_cond, 1)
         self.scale_by_std = scale_by_std
         assert self.num_timesteps_cond <= kwargs['timesteps']

@@ -450,6 +456,7 @@ class LatentDiffusion(DDPM):
         self.concat_mode = concat_mode
         self.cond_stage_trainable = cond_stage_trainable
         self.cond_stage_key = cond_stage_key
+
         try:
             self.num_downs = len(first_stage_config.params.ddconfig.ch_mult) - 1
         except:

@@ -460,6 +467,7 @@ class LatentDiffusion(DDPM):
         self.register_buffer('scale_factor', torch.tensor(scale_factor))
         self.instantiate_first_stage(first_stage_config)
         self.instantiate_cond_stage(cond_stage_config)
+
         self.cond_stage_forward = cond_stage_forward
         self.clip_denoised = False
         self.bbox_tokenizer = None

@@ -469,6 +477,25 @@ class LatentDiffusion(DDPM):
             self.init_from_ckpt(ckpt_path, ignore_keys)
             self.restarted_from_ckpt = True
 
+        self.cond_stage_model.train = disabled_train
+        for param in self.cond_stage_model.parameters():
+            param.requires_grad = False
+
+        self.model.eval()
+        self.model.train = disabled_train
+        for param in self.model.parameters():
+            param.requires_grad = False
+
+        self.embedding_manager = self.instantiate_embedding_manager(personalization_config, self.cond_stage_model)
+
+        self.emb_ckpt_counter = 0
+
+        # if self.embedding_manager.is_clip:
+        #     self.cond_stage_model.update_embedding_func(self.embedding_manager)
+
+        for param in self.embedding_manager.embedding_parameters():
+            param.requires_grad = True
+
     def make_cond_schedule(self, ):
         self.cond_ids = torch.full(size=(self.num_timesteps,), fill_value=self.num_timesteps - 1, dtype=torch.long)
         ids = torch.round(torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)).long()

@@ -531,6 +558,15 @@ class LatentDiffusion(DDPM):
                 raise SystemExit("* Couldn't load a dependency. Try running scripts/preload_models.py from an internet-conected machine.")
             self.cond_stage_model = model
 
+
+    def instantiate_embedding_manager(self, config, embedder):
+        model = instantiate_from_config(config, embedder=embedder)
+
+        if config.params.get("embedding_manager_ckpt", None):  # do not load if missing OR empty string
+            model.load(config.params.embedding_manager_ckpt)
+
+        return model
+
     def _get_denoise_row_from_list(self, samples, desc='', force_no_decoder_quantization=False):
         denoise_row = []
         for zd in tqdm(samples, desc=desc):

@@ -555,7 +591,7 @@ class LatentDiffusion(DDPM):
     def get_learned_conditioning(self, c):
         if self.cond_stage_forward is None:
             if hasattr(self.cond_stage_model, 'encode') and callable(self.cond_stage_model.encode):
-                c = self.cond_stage_model.encode(c)
+                c = self.cond_stage_model.encode(c, embedding_manager=self.embedding_manager)
                 if isinstance(c, DiagonalGaussianDistribution):
                     c = c.mode()
             else:

@@ -880,6 +916,7 @@ class LatentDiffusion(DDPM):
         if self.shorten_cond_schedule:  # TODO: drop this option
             tc = self.cond_ids[t].to(self.device)
             c = self.q_sample(x_start=c, t=tc, noise=torch.randn_like(c.float()))
+
         return self.p_losses(x, c, t, *args, **kwargs)
 
     def _rescale_annotations(self, bboxes, crop_coordinates):  # TODO: move to dataset

@@ -1046,6 +1083,14 @@ class LatentDiffusion(DDPM):
             loss += (self.original_elbo_weight * loss_vlb)
         loss_dict.update({f'{prefix}/loss': loss})
 
+        if self.embedding_reg_weight > 0:
+            loss_embedding_reg = self.embedding_manager.embedding_to_coarse_loss().mean()
+
+            loss_dict.update({f'{prefix}/loss_emb_reg': loss_embedding_reg})
+
+            loss += (self.embedding_reg_weight * loss_embedding_reg)
+            loss_dict.update({f'{prefix}/loss': loss})
+
         return loss, loss_dict
 
     def p_mean_variance(self, x, c, t, clip_denoised: bool, return_codebook_ids=False, quantize_denoised=False,

@@ -1250,11 +1295,10 @@ class LatentDiffusion(DDPM):
 
         return samples, intermediates
 
-
     @torch.no_grad()
     def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
-                   quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
-                   plot_diffusion_rows=True, **kwargs):
+                   quantize_denoised=True, inpaint=False, plot_denoise_rows=False, plot_progressive_rows=False,
+                   plot_diffusion_rows=False, **kwargs):
 
         use_ddim = ddim_steps is not None
 

@@ -1313,6 +1357,16 @@ class LatentDiffusion(DDPM):
             denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
             log["denoise_row"] = denoise_grid
 
+            uc = self.get_learned_conditioning(len(c) * [""])
+            sample_scaled, _ = self.sample_log(cond=c,
+                                               batch_size=N,
+                                               ddim=use_ddim,
+                                               ddim_steps=ddim_steps,
+                                               eta=ddim_eta,
+                                               unconditional_guidance_scale=5.0,
+                                               unconditional_conditioning=uc)
+            log["samples_scaled"] = self.decode_first_stage(sample_scaled)
+
         if quantize_denoised and not isinstance(self.first_stage_model, AutoencoderKL) and not isinstance(
                 self.first_stage_model, IdentityFirstStage):
             # also display when quantizing x0 while sampling

@@ -1364,6 +1418,11 @@ class LatentDiffusion(DDPM):
 
     def configure_optimizers(self):
         lr = self.learning_rate
-        params = list(self.model.parameters())
+
+        if self.embedding_manager is not None:
+            params = list(self.embedding_manager.embedding_parameters())
+            # params = list(self.cond_stage_model.transformer.text_model.embeddings.embedding_manager.embedding_parameters())
+        else:
+            params = list(self.model.parameters())
         if self.cond_stage_trainable:
             print(f"{self.__class__.__name__}: Also optimizing conditioner params!")

@@ -1395,6 +1454,18 @@ class LatentDiffusion(DDPM):
         x = 2. * (x - x.min()) / (x.max() - x.min()) - 1.
         return x
 
+    @rank_zero_only
+    def on_save_checkpoint(self, checkpoint):
+        checkpoint.clear()
+
+        if os.path.isdir(self.trainer.checkpoint_callback.dirpath):
+            self.embedding_manager.save(os.path.join(self.trainer.checkpoint_callback.dirpath, "embeddings.pt"))
+
+            if (self.global_step - self.emb_ckpt_counter) > 500:
+                self.embedding_manager.save(os.path.join(self.trainer.checkpoint_callback.dirpath, f"embeddings_gs-{self.global_step}.pt"))
+
+                self.emb_ckpt_counter += 500
+
 
 class DiffusionWrapper(pl.LightningModule):
     def __init__(self, diff_model_config, conditioning_key):
@@ -67,7 +67,7 @@ class KSampler(object):
         x = torch.randn([batch_size, *shape], device=self.device) * sigmas[0]  # for GPU draw
         model_wrap_cfg = CFGDenoiser(self.model)
         extra_args = {'cond': conditioning, 'uncond': unconditional_conditioning, 'cond_scale': unconditional_guidance_scale}
-        return (K.sampling.sample_lms(model_wrap_cfg, x, sigmas, extra_args=extra_args, disable=not self.accelerator.is_main_process),
+        return (K.sampling.__dict__[f'sample_{self.schedule}'](model_wrap_cfg, x, sigmas, extra_args=extra_args, disable=not self.accelerator.is_main_process),
                 None)
 
     def gather(samples_ddim):
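The one-line change above makes KSampler dispatch on its `schedule` name instead of always calling `sample_lms`. A minimal sketch of the same idiom (assuming the k-diffusion package is importable as `k_diffusion`, as the `K` alias suggests):

```py
import k_diffusion as K

def get_sampler_fn(schedule: str):
    # schedule names such as 'lms', 'euler', 'euler_ancestral', 'heun', 'dpm_2', 'dpm_2_ancestral'
    return K.sampling.__dict__[f'sample_{schedule}']

sample_fn = get_sampler_fn('lms')   # equivalent to K.sampling.sample_lms
```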
@@ -16,9 +16,6 @@ class PLMSSampler(object):
         self.schedule = schedule
 
     def register_buffer(self, name, attr):
-        if type(attr) == torch.Tensor:
-            if attr.device != torch.device("cuda"):
-                attr = attr.to(torch.device("cuda"))
         setattr(self, name, attr)
 
     def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
@@ -109,7 +109,7 @@ def checkpoint(func, inputs, params, flag):
     explicitly take as arguments.
     :param flag: if False, disable gradient checkpointing.
     """
-    if flag:
+    if False:  # disabled checkpointing to allow requires_grad = False for main model
         args = tuple(inputs) + tuple(params)
         return CheckpointFunction.apply(func, len(inputs), *args)
     else:
164
ldm/modules/embedding_manager.py
Normal file
@@ -0,0 +1,164 @@
from cmath import log
import torch
from torch import nn

import sys

from ldm.data.personalized import per_img_token_list
from transformers import CLIPTokenizer
from functools import partial

DEFAULT_PLACEHOLDER_TOKEN = ["*"]

PROGRESSIVE_SCALE = 2000

def get_clip_token_for_string(tokenizer, string):
    batch_encoding = tokenizer(string, truncation=True, max_length=77, return_length=True,
                               return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
    tokens = batch_encoding["input_ids"]
    assert torch.count_nonzero(tokens - 49407) == 2, f"String '{string}' maps to more than a single token. Please use another string"

    return tokens[0, 1]

def get_bert_token_for_string(tokenizer, string):
    token = tokenizer(string)
    # assert torch.count_nonzero(token) == 3, f"String '{string}' maps to more than a single token. Please use another string"

    token = token[0, 1]

    return token

def get_embedding_for_clip_token(embedder, token):
    return embedder(token.unsqueeze(0))[0, 0]


class EmbeddingManager(nn.Module):
    def __init__(
            self,
            embedder,
            placeholder_strings=None,
            initializer_words=None,
            per_image_tokens=False,
            num_vectors_per_token=1,
            progressive_words=False,
            **kwargs
    ):
        super().__init__()

        self.string_to_token_dict = {}

        self.string_to_param_dict = nn.ParameterDict()

        self.initial_embeddings = nn.ParameterDict()  # These should not be optimized

        self.progressive_words = progressive_words
        self.progressive_counter = 0

        self.max_vectors_per_token = num_vectors_per_token

        if hasattr(embedder, 'tokenizer'):  # using Stable Diffusion's CLIP encoder
            self.is_clip = True
            get_token_for_string = partial(get_clip_token_for_string, embedder.tokenizer)
            get_embedding_for_tkn = partial(get_embedding_for_clip_token, embedder.transformer.text_model.embeddings)
            token_dim = 1280
        else:  # using LDM's BERT encoder
            self.is_clip = False
            get_token_for_string = partial(get_bert_token_for_string, embedder.tknz_fn)
            get_embedding_for_tkn = embedder.transformer.token_emb
            token_dim = 1280

        if per_image_tokens:
            placeholder_strings.extend(per_img_token_list)

        for idx, placeholder_string in enumerate(placeholder_strings):

            token = get_token_for_string(placeholder_string)

            if initializer_words and idx < len(initializer_words):
                init_word_token = get_token_for_string(initializer_words[idx])

                with torch.no_grad():
                    init_word_embedding = get_embedding_for_tkn(init_word_token.cpu())

                token_params = torch.nn.Parameter(init_word_embedding.unsqueeze(0).repeat(num_vectors_per_token, 1), requires_grad=True)
                self.initial_embeddings[placeholder_string] = torch.nn.Parameter(init_word_embedding.unsqueeze(0).repeat(num_vectors_per_token, 1), requires_grad=False)
            else:
                token_params = torch.nn.Parameter(torch.rand(size=(num_vectors_per_token, token_dim), requires_grad=True))

            self.string_to_token_dict[placeholder_string] = token
            self.string_to_param_dict[placeholder_string] = token_params

    def forward(
            self,
            tokenized_text,
            embedded_text,
    ):
        b, n, device = *tokenized_text.shape, tokenized_text.device

        for placeholder_string, placeholder_token in self.string_to_token_dict.items():

            placeholder_embedding = self.string_to_param_dict[placeholder_string].to(device)

            if self.max_vectors_per_token == 1:  # If there's only one vector per token, we can do a simple replacement
                placeholder_idx = torch.where(tokenized_text == placeholder_token.to(device))
                embedded_text[placeholder_idx] = placeholder_embedding
            else:  # otherwise, need to insert and keep track of changing indices
                if self.progressive_words:
                    self.progressive_counter += 1
                    max_step_tokens = 1 + self.progressive_counter // PROGRESSIVE_SCALE
                else:
                    max_step_tokens = self.max_vectors_per_token

                num_vectors_for_token = min(placeholder_embedding.shape[0], max_step_tokens)

                placeholder_rows, placeholder_cols = torch.where(tokenized_text == placeholder_token.to(device))

                if placeholder_rows.nelement() == 0:
                    continue

                sorted_cols, sort_idx = torch.sort(placeholder_cols, descending=True)
                sorted_rows = placeholder_rows[sort_idx]

                for idx in range(len(sorted_rows)):
                    row = sorted_rows[idx]
                    col = sorted_cols[idx]

                    new_token_row = torch.cat([tokenized_text[row][:col], placeholder_token.repeat(num_vectors_for_token).to(device), tokenized_text[row][col + 1:]], axis=0)[:n]
                    new_embed_row = torch.cat([embedded_text[row][:col], placeholder_embedding[:num_vectors_for_token], embedded_text[row][col + 1:]], axis=0)[:n]

                    embedded_text[row] = new_embed_row
                    tokenized_text[row] = new_token_row

        return embedded_text

    def save(self, ckpt_path):
        torch.save({"string_to_token": self.string_to_token_dict,
                    "string_to_param": self.string_to_param_dict}, ckpt_path)

    def load(self, ckpt_path):
        ckpt = torch.load(ckpt_path, map_location='cpu')

        self.string_to_token_dict = ckpt["string_to_token"]
        self.string_to_param_dict = ckpt["string_to_param"]

    def get_embedding_norms_squared(self):
        all_params = torch.cat(list(self.string_to_param_dict.values()), axis=0)  # num_placeholders x embedding_dim
        param_norm_squared = (all_params * all_params).sum(axis=-1)  # num_placeholders

        return param_norm_squared

    def embedding_parameters(self):
        return self.string_to_param_dict.parameters()

    def embedding_to_coarse_loss(self):

        loss = 0.
        num_embeddings = len(self.initial_embeddings)

        for key in self.initial_embeddings:
            optimized = self.string_to_param_dict[key]
            coarse = self.initial_embeddings[key].clone().to(optimized.device)

            loss = loss + (optimized - coarse) @ (optimized - coarse).T / num_embeddings

        return loss
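`EmbeddingManager` is the heart of the personalization mechanism: before the text transformer runs, it swaps the embedding of the placeholder token ("*") for a learned vector. A minimal usage sketch (the `clip_embedder` variable and prompt are illustrative; in this commit the manager is normally built via `instantiate_embedding_manager` in ddpm.py):

```py
from ldm.modules.embedding_manager import EmbeddingManager

# `clip_embedder` is assumed to be a FrozenCLIPEmbedder instance: it exposes
# .tokenizer and .transformer, which is how EmbeddingManager picks the CLIP code path.
manager = EmbeddingManager(
    embedder=clip_embedder,
    placeholder_strings=["*"],
    initializer_words=["sculpture"],   # "*" starts out as the embedding of "sculpture"
)

# During training the manager is called inside the patched CLIP embedding forward:
#   inputs_embeds = manager(input_ids, inputs_embeds)
# and the learned vectors are checkpointed separately from the model weights:
manager.save("embeddings.pt")
manager.load("embeddings.pt")
```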
@@ -8,6 +8,27 @@ import kornia
 
 from ldm.modules.x_transformer import Encoder, TransformerWrapper  # TODO: can we directly rely on lucidrains code and simply add this as a reuirement? --> test
 
+def _expand_mask(mask, dtype, tgt_len = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+def _build_causal_attention_mask(bsz, seq_len, dtype):
+    # lazily create causal attention mask, with full attention between the vision tokens
+    # pytorch uses additive attention mask; fill with -inf
+    mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype)
+    mask.fill_(torch.tensor(torch.finfo(dtype).min))
+    mask.triu_(1)  # zero out the lower diagonal
+    mask = mask.unsqueeze(1)  # expand mask
+    return mask
 
 class AbstractEncoder(nn.Module):
     def __init__(self):

@@ -98,18 +119,17 @@ class BERTEmbedder(AbstractEncoder):
                               attn_layers=Encoder(dim=n_embed, depth=n_layer),
                               emb_dropout=embedding_dropout)
 
-    def forward(self, text):
+    def forward(self, text, embedding_manager=None):
         if self.use_tknz_fn:
             tokens = self.tknz_fn(text)#.to(self.device)
         else:
             tokens = text
-        z = self.transformer(tokens, return_embeddings=True)
+        z = self.transformer(tokens, return_embeddings=True, embedding_manager=embedding_manager)
         return z
 
-    def encode(self, text):
+    def encode(self, text, **kwargs):
         # output of length 77
-        return self(text)
+        return self(text, **kwargs)
 
 
 class SpatialRescaler(nn.Module):
     def __init__(self,

@@ -152,22 +172,165 @@ class FrozenCLIPEmbedder(AbstractEncoder):
         self.max_length = max_length
         self.freeze()
 
+        def embedding_forward(
+                self,
+                input_ids = None,
+                position_ids = None,
+                inputs_embeds = None,
+                embedding_manager = None,
+        ) -> torch.Tensor:
+
+            seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
+
+            if position_ids is None:
+                position_ids = self.position_ids[:, :seq_length]
+
+            if inputs_embeds is None:
+                inputs_embeds = self.token_embedding(input_ids)
+
+            if embedding_manager is not None:
+                inputs_embeds = embedding_manager(input_ids, inputs_embeds)
+
+            position_embeddings = self.position_embedding(position_ids)
+            embeddings = inputs_embeds + position_embeddings
+
+            return embeddings
+
+        self.transformer.text_model.embeddings.forward = embedding_forward.__get__(self.transformer.text_model.embeddings)
+
+        def encoder_forward(
+                self,
+                inputs_embeds,
+                attention_mask = None,
+                causal_attention_mask = None,
+                output_attentions = None,
+                output_hidden_states = None,
+                return_dict = None,
+        ):
+            output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+            output_hidden_states = (
+                output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+            )
+            return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+            encoder_states = () if output_hidden_states else None
+            all_attentions = () if output_attentions else None
+
+            hidden_states = inputs_embeds
+            for idx, encoder_layer in enumerate(self.layers):
+                if output_hidden_states:
+                    encoder_states = encoder_states + (hidden_states,)
+
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    causal_attention_mask,
+                    output_attentions=output_attentions,
+                )
+
+                hidden_states = layer_outputs[0]
+
+                if output_attentions:
+                    all_attentions = all_attentions + (layer_outputs[1],)
+
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+
+            return hidden_states
+
+        self.transformer.text_model.encoder.forward = encoder_forward.__get__(self.transformer.text_model.encoder)
+
+        def text_encoder_forward(
+                self,
+                input_ids = None,
+                attention_mask = None,
+                position_ids = None,
+                output_attentions = None,
+                output_hidden_states = None,
+                return_dict = None,
+                embedding_manager = None,
+        ):
+            output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+            output_hidden_states = (
+                output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+            )
+            return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+            if input_ids is None:
+                raise ValueError("You have to specify either input_ids")
+
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+
+            hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids, embedding_manager=embedding_manager)
+
+            bsz, seq_len = input_shape
+            # CLIP's text model uses causal mask, prepare it here.
+            # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+            causal_attention_mask = _build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to(
+                hidden_states.device
+            )
+
+            # expand attention_mask
+            if attention_mask is not None:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
+
+            last_hidden_state = self.encoder(
+                inputs_embeds=hidden_states,
+                attention_mask=attention_mask,
+                causal_attention_mask=causal_attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+
+            last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+            return last_hidden_state
+
+        self.transformer.text_model.forward = text_encoder_forward.__get__(self.transformer.text_model)
+
+        def transformer_forward(
+                self,
+                input_ids = None,
+                attention_mask = None,
+                position_ids = None,
+                output_attentions = None,
+                output_hidden_states = None,
+                return_dict = None,
+                embedding_manager = None,
+        ):
+            return self.text_model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                embedding_manager = embedding_manager
+            )
+
+        self.transformer.forward = transformer_forward.__get__(self.transformer)
+
     def freeze(self):
         self.transformer = self.transformer.eval()
         for param in self.parameters():
             param.requires_grad = False
 
-    def forward(self, text):
+    def forward(self, text, **kwargs):
         batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
                                         return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
         tokens = batch_encoding["input_ids"].to(self.device)
-        outputs = self.transformer(input_ids=tokens)
-
-        z = outputs.last_hidden_state
+        z = self.transformer(input_ids=tokens, **kwargs)
         return z
 
-    def encode(self, text):
-        return self(text)
+    def encode(self, text, **kwargs):
+        return self(text, **kwargs)
 
 
 class FrozenCLIPTextEmbedder(nn.Module):
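The FrozenCLIPEmbedder changes above work by defining replacement forward functions inside `__init__` and binding them onto the already-constructed Hugging Face submodules with `__get__`. A minimal, self-contained sketch of that binding idiom (the class and method here are illustrative, not from the diff):

```py
class Greeter:
    def forward(self):
        return "hello"

def patched_forward(self):
    # `self` is the Greeter instance the function gets bound to
    return "hello, with an embedding manager in the loop"

g = Greeter()
# function.__get__(instance) produces a bound method; this is exactly how the
# embedding/encoder/text_model forwards are swapped out above.
g.forward = patched_forward.__get__(g)
print(g.forward())  # -> "hello, with an embedding manager in the loop"
```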
@@ -485,7 +485,8 @@ class AttentionLayers(nn.Module):
         mask=None,
         context_mask=None,
         mems=None,
-        return_hiddens=False
+        return_hiddens=False,
+        **kwargs
     ):
         hiddens = []
         intermediates = []

@@ -603,11 +604,19 @@ class TransformerWrapper(nn.Module):
         return_mems=False,
         return_attn=False,
         mems=None,
+        embedding_manager=None,
         **kwargs
     ):
         b, n, device, num_mem = *x.shape, x.device, self.num_memory_tokens
-        x = self.token_emb(x)
-        x += self.pos_emb(x)
+
+        embedded_x = self.token_emb(x)
+
+        if embedding_manager:
+            x = embedding_manager(x, embedded_x)
+        else:
+            x = embedded_x
+
+        x = x + self.pos_emb(x)
         x = self.emb_dropout(x)
 
         x = self.project_emb(x)
@@ -1,3 +1,10 @@
+# Copyright (c) 2022 Lincoln D. Stein (https://github.com/lstein)
+
+# Derived from source code carrying the following copyrights
+# Copyright (c) 2022 Machine Vision and Learning Group, LMU Munich
+# Copyright (c) 2022 Robin Rombach and Patrick Esser and contributors
+
+
 """Simplified text to image API for stable diffusion/latent diffusion

 Example Usage:

@@ -11,7 +18,7 @@ t2i = T2I(outdir = <path> // outputs/txt2img-samples
           batch_size   = <integer>   // how many images to generate per sampling (1)
           steps        = <integer>   // 50
           seed         = <integer>   // current system time
-          sampler_name = ['ddim','plms','klms']  // klms
+          sampler_name = ['ddim', 'k_dpm_2_a', 'k_dpm_2', 'k_euler_a', 'k_euler', 'k_heun', 'k_lms', 'plms']  // k_lms
           grid         = <boolean>   // false
           width        = <integer>   // image width, multiple of 64 (512)
           height       = <integer>   // image height, multiple of 64 (512)
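
A hedged usage sketch based on the docstring above, showing the renamed k_lms sampler and the new embedding_path argument; the import path and file locations are assumptions drawn from this diff, not guaranteed by it:

from ldm.simplet2i import T2I  # module path assumed from this fork's layout

t2i = T2I(
    outdir='outputs/txt2img-samples',
    sampler_name='k_lms',                    # default k-diffusion sampler after this change
    embedding_path='/path/to/embedding.pt',  # illustrative path to a trained embedding manager checkpoint
)
results = t2i.txt2img('a photo of *', steps=50)

The "*" placeholder in the prompt mirrors the default placeholder token used by the embedding manager elsewhere in this change.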
@@ -51,6 +58,7 @@ import sys
 import os
 from omegaconf import OmegaConf
 from PIL import Image
+import PIL
 from tqdm import tqdm, trange
 from itertools import islice
 from einops import rearrange, repeat

@@ -89,6 +97,7 @@ class T2I:
     downsampling_factor
     precision
     strength
+    embedding_path

 The vast majority of these arguments default to reasonable values.
 """

@@ -113,7 +122,9 @@ The vast majority of these arguments default to reasonable values.
                  precision='autocast',
                  full_precision=False,
                  strength=0.75, # default in scripts/img2img.py
-                 latent_diffusion_weights=False # just to keep track of this parameter when regenerating prompt
+                 embedding_path=None,
+                 latent_diffusion_weights=False, # just to keep track of this parameter when regenerating prompt
+                 device='cuda'
     ):
         self.outdir = outdir
         self.batch_size = batch_size

@@ -133,17 +144,20 @@ The vast majority of these arguments default to reasonable values.
         self.precision = precision
         self.full_precision = full_precision
         self.strength = strength
+        self.embedding_path = embedding_path
         self.model = None # empty for now
         self.sampler = None
         self.latent_diffusion_weights=latent_diffusion_weights
+        self.device = device
         if seed is None:
             self.seed = self._new_seed()
         else:
             self.seed = seed

+    @torch.no_grad()
     def txt2img(self,prompt,outdir=None,batch_size=None,iterations=None,
                 steps=None,seed=None,grid=None,individual=None,width=None,height=None,
-                cfg_scale=None,ddim_eta=None,strength=None,init_img=None,
+                cfg_scale=None,ddim_eta=None,strength=None,embedding_path=None,init_img=None,
                 skip_normalize=False,variants=None):
         """
         Generate an image from the prompt, writing iteration images into the outdir

@@ -159,9 +173,13 @@ The vast majority of these arguments default to reasonable values.
         batch_size = batch_size or self.batch_size
         iterations = iterations or self.iterations
         strength = strength or self.strength # not actually used here, but preserved for code refactoring
+        embedding_path = embedding_path or self.embedding_path

         model = self.load_model()  # will instantiate the model or return it from cache

+        assert strength<1.0 and strength>=0.0, "strength (-f) must be >=0.0 and <1.0"
+        assert cfg_scale>1.0, "CFG_Scale (-C) must be >1.0"
+
         # grid and individual are mutually exclusive, with individual taking priority.
         # not necessary, but needed for compatability with dream bot
         if (grid is None):
@@ -192,9 +210,7 @@ The vast majority of these arguments default to reasonable values.

         # Gawd. Too many levels of indent here. Need to refactor into smaller routines!
         try:
-            with torch.no_grad():
-                with precision_scope("cuda"):
-                    with model.ema_scope():
+            with precision_scope(self.device.type), model.ema_scope():
                 all_samples = list()
                 for n in trange(iterations, desc="Sampling"):
                     seed_everything(seed)
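
The collapsed with-statement above relies on Python allowing several context managers in one statement (the dropped torch.no_grad() block is covered by the @torch.no_grad() decorator added to txt2img and img2img in this diff). A minimal sketch of the equivalence, with illustrative names that are not from this repository:

from contextlib import contextmanager

@contextmanager
def scope(name):
    print(f"enter {name}")
    yield
    print(f"exit {name}")

# Both forms enter and exit the managers in the same order.
with scope("precision"):
    with scope("ema"):
        pass

with scope("precision"), scope("ema"):
    pass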
@@ -267,9 +283,10 @@ The vast majority of these arguments default to reasonable values.
         return images

     # There is lots of shared code between this and txt2img and should be refactored.
+    @torch.no_grad()
     def img2img(self,prompt,outdir=None,init_img=None,batch_size=None,iterations=None,
                 steps=None,seed=None,grid=None,individual=None,width=None,height=None,
-                cfg_scale=None,ddim_eta=None,strength=None,skip_normalize=False,variants=None):
+                cfg_scale=None,ddim_eta=None,strength=None,skip_normalize=False):
         """
         Generate an image from the prompt and the initial image, writing iteration images into the outdir
         The output is a list of lists in the format: [[filename1,seed1], [filename2,seed2],...]

@@ -282,6 +299,10 @@ The vast majority of these arguments default to reasonable values.
         batch_size = batch_size or self.batch_size
         iterations = iterations or self.iterations
         strength = strength or self.strength
+        embedding_path = embedding_path or self.embedding_path
+
+        assert strength<1.0 and strength>=0.0, "strength (-f) must be >=0.0 and <1.0"
+        assert cfg_scale>1.0, "CFG_Scale (-C) must be >1.0"

         if init_img is None:
             print("no init_img provided!")

@@ -313,7 +334,7 @@ The vast majority of these arguments default to reasonable values.
         assert os.path.isfile(init_img)
         init_image = self._load_img(init_img).to(self.device)
         init_image = repeat(init_image, '1 ... -> b ...', b=batch_size)
-        with precision_scope("cuda"):
+        with precision_scope(self.device.type):
             init_latent = model.get_first_stage_encoding(model.encode_first_stage(init_image))  # move to latent space

         sampler.make_schedule(ddim_num_steps=steps, ddim_eta=ddim_eta, verbose=False)

@@ -335,9 +356,7 @@ The vast majority of these arguments default to reasonable values.

         # Gawd. Too many levels of indent here. Need to refactor into smaller routines!
         try:
-            with torch.no_grad():
-                with precision_scope("cuda"):
-                    with model.ema_scope():
+            with precision_scope(self.device.type), model.ema_scope():
                 all_samples = list()
                 for n in trange(iterations, desc="Sampling"):
                     seed_everything(seed)
@@ -430,25 +449,39 @@ The vast majority of these arguments default to reasonable values.
         seed_everything(self.seed)
         try:
             config = OmegaConf.load(self.config)
-            self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+            self.device = torch.device(self.device) if torch.cuda.is_available() else torch.device("cpu")
             model = self._load_model_from_config(config,self.weights)
+            if self.embedding_path is not None:
+                model.embedding_manager.load(self.embedding_path)
             self.model = model.to(self.device)
+            # model.to doesn't change the cond_stage_model.device used to move the tokenizer output, so set it here
+            self.model.cond_stage_model.device = self.device
         except AttributeError:
             raise SystemExit

+        msg = f'setting sampler to {self.sampler_name}'
         if self.sampler_name=='plms':
-            print("setting sampler to plms")
             self.sampler = PLMSSampler(self.model)
         elif self.sampler_name == 'ddim':
-            print("setting sampler to ddim")
             self.sampler = DDIMSampler(self.model)
-        elif self.sampler_name == 'klms':
-            print("setting sampler to klms")
+        elif self.sampler_name == 'k_dpm_2_a':
+            self.sampler = KSampler(self.model,'dpm_2_ancestral')
+        elif self.sampler_name == 'k_dpm_2':
+            self.sampler = KSampler(self.model,'dpm_2')
+        elif self.sampler_name == 'k_euler_a':
+            self.sampler = KSampler(self.model,'euler_ancestral')
+        elif self.sampler_name == 'k_euler':
+            self.sampler = KSampler(self.model,'euler')
+        elif self.sampler_name == 'k_heun':
+            self.sampler = KSampler(self.model,'heun')
+        elif self.sampler_name == 'k_lms':
             self.sampler = KSampler(self.model,'lms')
         else:
-            print(f"unsupported sampler {self.sampler_name}, defaulting to plms")
+            msg = f'unsupported sampler {self.sampler_name}, defaulting to plms'
             self.sampler = PLMSSampler(self.model)

+        print(msg)
+
         return self.model

     def _load_model_from_config(self, config, ckpt):
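
The chain of elif branches above maps the user-facing sampler names onto k-diffusion sampler identifiers. A table-driven equivalent is sketched below, assuming the same PLMSSampler, DDIMSampler and KSampler interfaces shown in the diff (the helper itself is not part of this commit):

# Sketch: CLI sampler name -> k-diffusion sampler id, with a PLMS fallback.
K_SAMPLERS = {
    'k_dpm_2_a': 'dpm_2_ancestral',
    'k_dpm_2':   'dpm_2',
    'k_euler_a': 'euler_ancestral',
    'k_euler':   'euler',
    'k_heun':    'heun',
    'k_lms':     'lms',
}

def make_sampler(model, name):
    if name == 'plms':
        return PLMSSampler(model)
    if name == 'ddim':
        return DDIMSampler(model)
    if name in K_SAMPLERS:
        return KSampler(model, K_SAMPLERS[name])
    print(f'unsupported sampler {name}, defaulting to plms')
    return PLMSSampler(model)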
@@ -459,7 +492,6 @@ The vast majority of these arguments default to reasonable values.
         sd = pl_sd["state_dict"]
         model = instantiate_from_config(config.model)
         m, u = model.load_state_dict(sd, strict=False)
-        model.cuda()
         model.eval()
         if self.full_precision:
             print('Using slower but more accurate full-precision math (--full_precision)')

@@ -473,7 +505,7 @@ The vast majority of these arguments default to reasonable values.
         w, h = image.size
         print(f"loaded input image of size ({w}, {h}) from {path}")
         w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-        image = image.resize((w, h), resample=Image.Resampling.LANCZOS)
+        image = image.resize((w, h), resample=PIL.Image.LANCZOS)
         image = np.array(image).astype(np.float32) / 255.0
         image = image[None].transpose(0, 3, 1, 2)
         image = torch.from_numpy(image)
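
The switch from Image.Resampling.LANCZOS to PIL.Image.LANCZOS trades the newer Pillow 9.1+ enum for the older module-level constant. If both Pillow generations need to be supported, a version-tolerant lookup is possible; this is only a sketch, not part of this commit:

from PIL import Image

# Pillow >= 9.1 exposes Image.Resampling.LANCZOS; older releases expose Image.LANCZOS.
LANCZOS = getattr(Image, "Resampling", Image).LANCZOS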
@@ -12,6 +12,7 @@ from queue import Queue

 from inspect import isfunction
 from PIL import Image, ImageDraw, ImageFont

 def log_txt_as_img(wh, xc, size=10):
     # wh a tuple of (width, height)
     # xc a list of captions to plot

@@ -20,7 +21,7 @@ def log_txt_as_img(wh, xc, size=10):
     for bi in range(b):
         txt = Image.new("RGB", wh, color="white")
         draw = ImageDraw.Draw(txt)
-        font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
+        font = ImageFont.load_default()
         nc = int(40 * (wh[0] / 256))
         lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))

@@ -73,14 +74,14 @@ def count_params(model, verbose=False):
     return total_params


-def instantiate_from_config(config):
+def instantiate_from_config(config, **kwargs):
     if not "target" in config:
         if config == '__is_first_stage__':
             return None
         elif config == "__is_unconditional__":
             return None
         raise KeyError("Expected key `target` to instantiate.")
-    return get_obj_from_str(config["target"])(**config.get("params", dict()))
+    return get_obj_from_str(config["target"])(**config.get("params", dict()), **kwargs)


 def get_obj_from_str(string, reload=False):
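
With the extra **kwargs, callers can forward constructor arguments that are not present in the config itself. A small self-contained illustration, assuming ldm.util is importable; the Linear target is an arbitrary example, not a config used by this repository:

from ldm.util import instantiate_from_config

config = {
    "target": "torch.nn.Linear",
    "params": {"in_features": 4, "out_features": 2},
}

# 'bias' is not in the config; it is forwarded through **kwargs.
layer = instantiate_from_config(config, bias=False)
print(layer)  # Linear(in_features=4, out_features=2, bias=False)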
62 main.py

@@ -2,6 +2,7 @@ import argparse, os, sys, datetime, glob, importlib, csv
 import numpy as np
 import time
 import torch

 import torchvision
 import pytorch_lightning as pl

@@ -20,6 +21,22 @@ from pytorch_lightning.utilities import rank_zero_info
 from ldm.data.base import Txt2ImgIterableBaseDataset
 from ldm.util import instantiate_from_config

+def load_model_from_config(config, ckpt, verbose=False):
+    print(f"Loading model from {ckpt}")
+    pl_sd = torch.load(ckpt, map_location="cpu")
+    sd = pl_sd["state_dict"]
+    config.model.params.ckpt_path = ckpt
+    model = instantiate_from_config(config.model)
+    m, u = model.load_state_dict(sd, strict=False)
+    if len(m) > 0 and verbose:
+        print("missing keys:")
+        print(m)
+    if len(u) > 0 and verbose:
+        print("unexpected keys:")
+        print(u)
+
+    model.cuda()
+    return model

 def get_parser(**parser_kwargs):
     def str2bool(v):
@@ -120,6 +137,23 @@ def get_parser(**parser_kwargs):
         default=True,
         help="scale base-lr by ngpu * batch_size * n_accumulate",
     )
+
+    parser.add_argument(
+        "--datadir_in_name",
+        type=str2bool,
+        nargs="?",
+        const=True,
+        default=True,
+        help="Prepend the final directory in the data_root to the output directory name")
+
+    parser.add_argument("--actual_resume", type=str, default="", help="Path to model to actually resume from")
+    parser.add_argument("--data_root", type=str, required=True, help="Path to directory with training images")
+
+    parser.add_argument("--embedding_manager_ckpt", type=str, default="", help="Initialize embedding manager from a checkpoint")
+    parser.add_argument("--placeholder_tokens", type=str, nargs="+", default=["*"])
+
+    parser.add_argument("--init_word", type=str, help="Word to use as source for initial token embedding.")
+
     return parser


@@ -502,6 +536,10 @@ if __name__ == "__main__":
             name = "_" + cfg_name
         else:
             name = ""
+
+        if opt.datadir_in_name:
+            now = os.path.basename(os.path.normpath(opt.data_root)) + now
+
         nowname = now + name + opt.postfix
         logdir = os.path.join(opt.logdir, nowname)

@@ -532,6 +570,17 @@ if __name__ == "__main__":
         lightning_config.trainer = trainer_config

         # model
+
+        # config.model.params.personalization_config.params.init_word = opt.init_word
+        config.model.params.personalization_config.params.embedding_manager_ckpt = opt.embedding_manager_ckpt
+        config.model.params.personalization_config.params.placeholder_tokens = opt.placeholder_tokens
+
+        if opt.init_word:
+            config.model.params.personalization_config.params.initializer_words[0] = opt.init_word
+
+        if opt.actual_resume:
+            model = load_model_from_config(config, opt.actual_resume)
+        else:
             model = instantiate_from_config(config.model)

         # trainer and callbacks

@@ -655,11 +704,16 @@ if __name__ == "__main__":
                 del callbacks_cfg['ignore_keys_callback']

         trainer_kwargs["callbacks"] = [instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg]
+        trainer_kwargs["max_steps"] = opt.max_steps
+
         trainer = Trainer.from_argparse_args(trainer_opt, **trainer_kwargs)
         trainer.logdir = logdir  ###

         # data
+        config.data.params.train.params.data_root = opt.data_root
+        config.data.params.validation.params.data_root = opt.data_root
+        data = instantiate_from_config(config.data)
+
         data = instantiate_from_config(config.data)
         # NOTE according to https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
         # calling these ourselves should not be necessary but it is.
@@ -710,8 +764,8 @@ if __name__ == "__main__":

         import signal

-        signal.signal(signal.SIGUSR1, melk)
-        signal.signal(signal.SIGUSR2, divein)
+        signal.signal(signal.SIGTERM, melk)
+        signal.signal(signal.SIGTERM, divein)

         # run
         if opt.train:

@@ -737,5 +791,5 @@ if __name__ == "__main__":
             dst = os.path.join(dst, "debug_runs", name)
             os.makedirs(os.path.split(dst)[0], exist_ok=True)
             os.rename(logdir, dst)
-        if trainer.global_rank == 0:
-            print(trainer.profiler.summary())
+        # if trainer.global_rank == 0:
+        #     print(trainer.profiler.summary())
@@ -9,6 +9,7 @@ kornia==0.6.0
 numpy==1.19.2
 omegaconf==2.1.1
 opencv-python==4.1.2.30
+pillow==9.0.1
 pudb==2019.2
 pytorch
 pytorch-lightning==1.4.2

@@ -1,4 +1,6 @@
 #!/usr/bin/env python3
+# Copyright (c) 2022 Lincoln D. Stein (https://github.com/lstein)
+
 import argparse
 import shlex
 import atexit

@@ -58,7 +60,9 @@ def main():
                weights=weights,
                full_precision=opt.full_precision,
                config=config,
-               latent_diffusion_weights=opt.laion400m  # this is solely for recreating the prompt
+               latent_diffusion_weights=opt.laion400m,  # this is solely for recreating the prompt
+               embedding_path=opt.embedding_path,
+               device=opt.device
     )

     # make sure the output directory exists
@@ -108,6 +112,10 @@ def main_loop(t2i,parser,log,infile):
         if command.startswith(('#','//')):
             continue

+        # before splitting, escape single quotes so as not to mess
+        # up the parser
+        command = command.replace("'","\\'")
+
         try:
             elements = shlex.split(command)
         except ValueError as e:
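
The escaping matters because shlex treats an unmatched apostrophe as an open quotation. A quick illustration of the failure mode and of the escaped form (the prompt text is just an example):

import shlex

try:
    shlex.split("a portrait of dad's cat -n2")
except ValueError as e:
    print(e)                 # No closing quotation

escaped = "a portrait of dad's cat -n2".replace("'", "\\'")
print(shlex.split(escaped))  # ['a', 'portrait', 'of', "dad's", 'cat', '-n2']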
@@ -159,10 +167,16 @@ def main_loop(t2i,parser,log,infile):
             print("Try again with a prompt!")
             continue

+        try:
             if opt.init_img is None:
                 results = t2i.txt2img(**vars(opt))
             else:
+                assert os.path.exists(opt.init_img),f"No file found at {opt.init_img}. On Linux systems, pressing <tab> after -I will autocomplete a list of possible image files."
                 results = t2i.img2img(**vars(opt))
+        except AssertionError as e:
+            print(e)
+            continue
+
         allVariantResults = []
         if opt.variants is not None:

@@ -175,7 +189,11 @@ def main_loop(t2i,parser,log,infile):
             for j in range(0, opt.variants):
                 newopt.init_img = resultPath
                 print(f"{newopt.init_img}")
+                try:
                     variantResults = t2i.img2img(**vars(newopt))
+                except AssertionError as e:
+                    print(e)
+                    continue
                 allVariantResults.append([newopt,variantResults])
             print(f"{opt.variants} Variants generated!")

@@ -242,6 +260,7 @@ def _reconstruct_switches(t2i,opt):
     switches.append(f'-W{opt.width or t2i.width}')
     switches.append(f'-H{opt.height or t2i.height}')
     switches.append(f'-C{opt.cfg_scale or t2i.cfg_scale}')
+    switches.append(f'-m{t2i.sampler_name}')
     if opt.init_img:
         switches.append(f'-I{opt.init_img}')
     if opt.strength and opt.init_img is not None:
@@ -282,14 +301,22 @@ def create_argv_parser():
                         help="number of images to produce per iteration (faster, but doesn't generate individual seeds")
     parser.add_argument('--sampler','-m',
                         dest="sampler_name",
-                        choices=['plms','ddim', 'klms'],
-                        default='klms',
-                        help="which sampler to use (klms) - can only be set on command line")
+                        choices=['ddim', 'k_dpm_2_a', 'k_dpm_2', 'k_euler_a', 'k_euler', 'k_heun', 'k_lms', 'plms'],
+                        default='k_lms',
+                        help="which sampler to use (k_lms) - can only be set on command line")
     parser.add_argument('--outdir',
                         '-o',
                         type=str,
                         default="outputs/img-samples",
                         help="directory in which to place generated images and a log of prompts and seeds")
+    parser.add_argument('--embedding_path',
+                        type=str,
+                        help="Path to a pre-trained embedding manager checkpoint - can only be set on command line")
+    parser.add_argument('--device',
+                        '-d',
+                        type=str,
+                        default="cuda",
+                        help="device to run stable diffusion on. defaults to cuda `torch.cuda.current_device()` if avalible")
     return parser


@@ -397,3 +424,4 @@ if readline_available:

 if __name__ == "__main__":
     main()

83 scripts/merge_embeddings.py Normal file

@@ -0,0 +1,83 @@
+from ldm.modules.encoders.modules import BERTTokenizer
+from ldm.modules.embedding_manager import EmbeddingManager
+
+import argparse, os
+from functools import partial
+
+import torch
+
+def get_placeholder_loop(placeholder_string, tokenizer):
+
+    new_placeholder = None
+
+    while True:
+        if new_placeholder is None:
+            new_placeholder = input(f"Placeholder string {placeholder_string} was already used. Please enter a replacement string: ")
+        else:
+            new_placeholder = input(f"Placeholder string '{new_placeholder}' maps to more than a single token. Please enter another string: ")
+
+        token = tokenizer(new_placeholder)
+
+        if torch.count_nonzero(token) == 3:
+            return new_placeholder, token[0, 1]
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--manager_ckpts",
+        type=str,
+        nargs="+",
+        required=True,
+        help="Paths to a set of embedding managers to be merged."
+    )
+
+    parser.add_argument(
+        "--output_path",
+        type=str,
+        required=True,
+        help="Output path for the merged manager",
+    )
+
+    args = parser.parse_args()
+
+    tokenizer = BERTTokenizer(vq_interface=False, max_length=77)
+    EmbeddingManager = partial(EmbeddingManager, tokenizer, ["*"])
+
+    string_to_token_dict = {}
+    string_to_param_dict = torch.nn.ParameterDict()
+
+    placeholder_to_src = {}
+
+    for manager_ckpt in args.manager_ckpts:
+        print(f"Parsing {manager_ckpt}...")
+
+        manager = EmbeddingManager()
+        manager.load(manager_ckpt)
+
+        for placeholder_string in manager.string_to_token_dict:
+            if not placeholder_string in string_to_token_dict:
+                string_to_token_dict[placeholder_string] = manager.string_to_token_dict[placeholder_string]
+                string_to_param_dict[placeholder_string] = manager.string_to_param_dict[placeholder_string]
+
+                placeholder_to_src[placeholder_string] = manager_ckpt
+            else:
+                new_placeholder, new_token = get_placeholder_loop(placeholder_string, tokenizer)
+                string_to_token_dict[new_placeholder] = new_token
+                string_to_param_dict[new_placeholder] = manager.string_to_param_dict[placeholder_string]
+
+                placeholder_to_src[new_placeholder] = manager_ckpt
+
+    print("Saving combined manager...")
+    merged_manager = EmbeddingManager()
+    merged_manager.string_to_param_dict = string_to_param_dict
+    merged_manager.string_to_token_dict = string_to_token_dict
+    merged_manager.save(args.output_path)
+
+    print("Managers merged. Final list of placeholders: ")
+    print(placeholder_to_src)
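
For reference, the new script would typically be invoked with two or more trained embedding-manager checkpoints, for example (paths are illustrative):

python scripts/merge_embeddings.py --manager_ckpts /path/to/first.pt /path/to/second.pt --output_path /path/to/merged.pt

Each placeholder string is carried over unless it collides with one already merged, in which case the script prompts for a replacement string via get_placeholder_loop above.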
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# Copyright (c) 2022 Lincoln D. Stein (https://github.com/lstein)
 # Before running stable-diffusion on an internet-isolated machine,
 # run this script from one with internet connectivity. The
 # two machines must share a common .cache directory.

@@ -30,3 +31,4 @@ tokenizer =CLIPTokenizer.from_pretrained(version)
 transformer=CLIPTextModel.from_pretrained(version)
 print('\n\n...success')

1 src/clip Submodule
@@ -0,0 +1 @@
+Subproject commit d50d76daa670286dd6cacf3bcd80b5e4823fc8e1

1 src/k-diffusion Submodule
@@ -0,0 +1 @@
+Subproject commit db5799068749bf3a6d5845120ed32df16b7d883b

1 src/taming-transformers Submodule
@@ -0,0 +1 @@
+Subproject commit 24268930bf1dce879235a7fddd0b2355b84d7ea6