mirror of https://github.com/invoke-ai/InvokeAI
synced 2024-08-30 20:32:17 +00:00

Merge branch 'main' of https://github.com/BaristaLabs/stable-diffusion-dream into add-simple-variant-mechanism
This commit is contained in: commit c6b5e930dc
.gitignore (vendored, new file, 175 lines added)
@@ -0,0 +1,175 @@
# ignore default image save location and model symbolic link
outputs/
models/ldm/stable-diffusion-v1/model.ckpt

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# emacs autosave and recovery files
*~
.#*

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

src/
logs/
**/__pycache__/
outputs
LICENSE (28 changed lines)
@@ -1,9 +1,27 @@
-All rights reserved by the authors.
-You must not distribute the weights provided to you directly or indirectly without explicit consent of the authors.
-You must not distribute harmful, offensive, dehumanizing content or otherwise harmful representations of people or their environments, cultures, religions, etc. produced with the model weights or other generated content described in the "Misuse and Malicious Use" section in the model card.
-The model weights are provided for research purposes only.
+MIT License
+
+Copyright (c) 2022 Lincoln D. Stein (https://github.com/lstein)
+
+This software is derived from a fork of the source code available from https://github.com/pesser/stable-diffusion and https://github.com/CompViz/stable-diffusion. They carry the following copyrights:
+
+Copyright (c) 2022 Machine Vision and Learning Group, LMU Munich
+Copyright (c) 2022 Robin Rombach and Patrick Esser and contributors
+
+Please see individual source code files for copyright and authorship attributions.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
LICENSE-ModelWeights.txt (new file, 294 lines added)
@@ -0,0 +1,294 @@
Copyright (c) 2022 Robin Rombach and Patrick Esser and contributors

CreativeML Open RAIL-M
dated August 22, 2022

Section I: PREAMBLE

Multimodal generative models are being widely adopted and used, and have the potential to transform the way artists, among other individuals, conceive and benefit from AI or ML technologies as a tool for content creation.

Notwithstanding the current and potential benefits that these artifacts can bring to society at large, there are also concerns about potential misuses of them, either due to their technical limitations or ethical considerations.

In short, this license strives for both the open and responsible downstream use of the accompanying model. When it comes to the open character, we took inspiration from open source permissive licenses regarding the grant of IP rights. Referring to the downstream responsible use, we added use-based restrictions not permitting the use of the Model in very specific scenarios, in order for the licensor to be able to enforce the license in case potential misuses of the Model may occur. At the same time, we strive to promote open and responsible research on generative models for art and content generation.

Even though downstream derivative versions of the model could be released under different licensing terms, the latter will always have to include - at minimum - the same use-based restrictions as the ones in the original license (this license). We believe in the intersection between open and responsible AI development; thus, this License aims to strike a balance between both in order to enable responsible open-science in the field of AI.

This License governs the use of the model (and its derivatives) and is informed by the model card associated with the model.

NOW THEREFORE, You and Licensor agree as follows:

1. Definitions

- "License" means the terms and conditions for use, reproduction, and Distribution as defined in this document.

- "Data" means a collection of information and/or content extracted from the dataset used with the Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not licensed under this License.

- "Output" means the results of operating a Model as embodied in informational content resulting therefrom.

- "Model" means any accompanying machine-learning based assemblies (including checkpoints), consisting of learnt weights, parameters (including optimizer states), corresponding to the model architecture as embodied in the Complementary Material, that have been trained or tuned, in whole or in part on the Data, using the Complementary Material.

- "Derivatives of the Model" means all modifications to the Model, works based on the Model, or any other model which is created or initialized by transfer of patterns of the weights, parameters, activations or output of the Model, to the other model, in order to cause the other model to perform similarly to the Model, including - but not limited to - distillation methods entailing the use of intermediate data representations or methods based on the generation of synthetic data by the Model for training the other model.

- "Complementary Material" means the accompanying source code and scripts used to define, run, load, benchmark or evaluate the Model, and used to prepare data for training or evaluation, if any. This includes any accompanying documentation, tutorials, examples, etc, if any.

- "Distribution" means any transmission, reproduction, publication or other sharing of the Model or Derivatives of the Model to a third party, including providing the Model as a hosted service made available by electronic or other remote means - e.g. API-based or web access.

- "Licensor" means the copyright owner or entity authorized by the copyright owner that is granting the License, including the persons or entities that may have rights in the Model and/or distributing the Model.

- "You" (or "Your") means an individual or Legal Entity exercising permissions granted by this License and/or making use of the Model for whichever purpose and in any field of use, including usage of the Model in an end-use application - e.g. chatbot, translator, image generator.

- "Third Parties" means individuals or legal entities that are not under common control with Licensor or You.

- "Contribution" means any work of authorship, including the original version of the Model and any modifications or additions to that Model or Derivatives of the Model thereof, that is intentionally submitted to Licensor for inclusion in the Model by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Model, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."

- "Contributor" means Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Model.

Section II: INTELLECTUAL PROPERTY RIGHTS

Both copyright and patent grants apply to the Model, Derivatives of the Model and Complementary Material. The Model and Derivatives of the Model are subject to additional terms as described in Section III.

2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare, publicly display, publicly perform, sublicense, and distribute the Complementary Material, the Model, and Derivatives of the Model.

3. Grant of Patent License. Subject to the terms and conditions of this License and where and as applicable, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this paragraph) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model and the Complementary Material, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Model to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model and/or Complementary Material or a Contribution incorporated within the Model and/or Complementary Material constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for the Model and/or Work shall terminate as of the date such litigation is asserted or filed.

Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION

4. Distribution and Redistribution. You may host for Third Party remote access purposes (e.g. software-as-a-service), reproduce and distribute copies of the Model or Derivatives of the Model thereof in any medium, with or without modifications, provided that You meet the following conditions: Use-based restrictions as referenced in paragraph 5 MUST be included as an enforceable provision by You in any type of legal agreement (e.g. a license) governing the use and/or distribution of the Model or Derivatives of the Model, and You shall give notice to subsequent users You Distribute to, that the Model or Derivatives of the Model are subject to paragraph 5. This provision does not apply to the use of Complementary Material. You must give any Third Party recipients of the Model or Derivatives of the Model a copy of this License; You must cause any modified files to carry prominent notices stating that You changed the files; You must retain all copyright, patent, trademark, and attribution notices excluding those notices that do not pertain to any part of the Model, Derivatives of the Model. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions - respecting paragraph 4.a. - for use, reproduction, or Distribution of Your modifications, or for any such Derivatives of the Model as a whole, provided Your use, reproduction, and Distribution of the Model otherwise complies with the conditions stated in this License.

5. Use-based restrictions. The restrictions set forth in Attachment A are considered Use-based restrictions. Therefore You cannot use the Model and the Derivatives of the Model for the specified restricted uses. You may use the Model subject to this License, including only for lawful purposes and in accordance with the License. Use may include creating any content with, finetuning, updating, running, training, evaluating and/or reparametrizing the Model. You shall require all of Your users who use the Model or a Derivative of the Model to comply with the terms of this paragraph (paragraph 5).

6. The Output You Generate. Except as set forth herein, Licensor claims no rights in the Output You generate using the Model. You are accountable for the Output you generate and its subsequent uses. No use of the output can contravene any provision as stated in the License.

Section IV: OTHER PROVISIONS

7. Updates and Runtime Restrictions. To the maximum extent permitted by law, Licensor reserves the right to restrict (remotely or otherwise) usage of the Model in violation of this License, update the Model through electronic means, or modify the Output of the Model based on updates. You shall undertake reasonable efforts to use the latest version of the Model.

8. Trademarks and related. Nothing in this License permits You to make use of Licensors’ trademarks, trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between the parties; and any rights not expressly granted herein are reserved by the Licensors.

9. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Model and the Complementary Material (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Model, Derivatives of the Model, and the Complementary Material and assume any risks associated with Your exercise of permissions under this License.

10. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model and the Complementary Material (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.

11. Accepting Warranty or Additional Liability. While redistributing the Model, Derivatives of the Model and the Complementary Material thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.

12. If any provision of this License is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.

END OF TERMS AND CONDITIONS


Attachment A

Use Restrictions

You agree not to use the Model or Derivatives of the Model:

- In any way that violates any applicable national, federal, state, local or international law or regulation;

- For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;

- To generate or disseminate verifiably false information and/or content with the purpose of harming others;

- To generate or disseminate personal identifiable information that can be used to harm an individual;

- To defame, disparage or otherwise harass others;

- For fully automated decision making that adversely impacts an individual’s legal rights or otherwise creates or modifies a binding, enforceable obligation;

- For any use intended to or which has the effect of discriminating against or harming individuals or groups based on online or offline social behavior or known or predicted personal or personality characteristics;

- To exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;

- For any use intended to or which has the effect of discriminating against individuals or groups based on legally protected characteristics or categories;

- To provide medical advice and medical results interpretation;

- To generate or disseminate information for the purpose to be used for administration of justice, law enforcement, immigration or asylum processes, such as predicting an individual will commit fraud/crime commitment (e.g. by text profiling, drawing causal relationships between assertions made in documents, indiscriminate and arbitrarily-targeted use).
README-CompViz.md (new file, 210 lines added)
@@ -0,0 +1,210 @@
# Original README from CompViz/stable-diffusion

*Stable Diffusion was made possible thanks to a collaboration with [Stability AI](https://stability.ai/) and [Runway](https://runwayml.com/) and builds upon our previous work:*

[**High-Resolution Image Synthesis with Latent Diffusion Models**](https://ommer-lab.com/research/latent-diffusion-models/)<br/>
[Robin Rombach](https://github.com/rromb)\*,
[Andreas Blattmann](https://github.com/ablattmann)\*,
[Dominik Lorenz](https://github.com/qp-qp)\,
[Patrick Esser](https://github.com/pesser),
[Björn Ommer](https://hci.iwr.uni-heidelberg.de/Staff/bommer)<br/>

**CVPR '22 Oral**

which is available on [GitHub](https://github.com/CompVis/latent-diffusion). PDF at [arXiv](https://arxiv.org/abs/2112.10752). Please also visit our [Project page](https://ommer-lab.com/research/latent-diffusion-models/).

![txt2img-stable2](assets/stable-samples/txt2img/merged-0006.png)
[Stable Diffusion](#stable-diffusion-v1) is a latent text-to-image diffusion model.
Thanks to a generous compute donation from [Stability AI](https://stability.ai/) and support from [LAION](https://laion.ai/), we were able to train a Latent Diffusion Model on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database.
Similar to Google's [Imagen](https://arxiv.org/abs/2205.11487), this model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts.
With its 860M UNet and 123M text encoder, the model is relatively lightweight and runs on a GPU with at least 10GB VRAM.
See [this section](#stable-diffusion-v1) below and the [model card](https://huggingface.co/CompVis/stable-diffusion).

## Requirements

A suitable [conda](https://conda.io/) environment named `ldm` can be created and activated with:

```
conda env create -f environment.yaml
conda activate ldm
```

You can also update an existing [latent diffusion](https://github.com/CompVis/latent-diffusion) environment by running

```
conda install pytorch torchvision -c pytorch
pip install transformers==4.19.2
pip install -e .
```
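Once the `ldm` environment is active, a quick sanity check (an illustrative sketch, not part of the original instructions) is to confirm that PyTorch can see a CUDA device:

```py
# Illustrative check after `conda activate ldm`; not from the original README.
import torch

print(torch.__version__)          # the PyTorch build installed into the ldm environment
print(torch.cuda.is_available())  # True on a working CUDA setup
```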
## Stable Diffusion v1

Stable Diffusion v1 refers to a specific configuration of the model architecture that uses a downsampling-factor 8 autoencoder with an 860M UNet and CLIP ViT-L/14 text encoder for the diffusion model. The model was pretrained on 256x256 images and then finetuned on 512x512 images.

*Note: Stable Diffusion v1 is a general text-to-image diffusion model and therefore mirrors biases and (mis-)conceptions that are present in its training data.
Details on the training procedure and data, as well as the intended use of the model can be found in the corresponding [model card](https://huggingface.co/CompVis/stable-diffusion).
Research into the safe deployment of general text-to-image models is an ongoing effort. To prevent misuse and harm, we currently provide access to the checkpoints only for [academic research purposes upon request](https://stability.ai/academia-access-form).
**This is an experiment in safe and community-driven publication of a capable and general text-to-image model. We are working on a public release with a more permissive license that also incorporates ethical considerations.***

[Request access to Stable Diffusion v1 checkpoints for academic research](https://stability.ai/academia-access-form)

### Weights

We currently provide three checkpoints, `sd-v1-1.ckpt`, `sd-v1-2.ckpt` and `sd-v1-3.ckpt`, which were trained as follows,

- `sd-v1-1.ckpt`: 237k steps at resolution `256x256` on [laion2B-en](https://huggingface.co/datasets/laion/laion2B-en). 194k steps at resolution `512x512` on [laion-high-resolution](https://huggingface.co/datasets/laion/laion-high-resolution) (170M examples from LAION-5B with resolution `>= 1024x1024`).
- `sd-v1-2.ckpt`: Resumed from `sd-v1-1.ckpt`. 515k steps at resolution `512x512` on "laion-improved-aesthetics" (a subset of laion2B-en, filtered to images with an original size `>= 512x512`, estimated aesthetics score `> 5.0`, and an estimated watermark probability `< 0.5`. The watermark estimate is from the LAION-5B metadata, the aesthetics score is estimated using an [improved aesthetics estimator](https://github.com/christophschuhmann/improved-aesthetic-predictor)).
- `sd-v1-3.ckpt`: Resumed from `sd-v1-2.ckpt`. 195k steps at resolution `512x512` on "laion-improved-aesthetics" and 10\% dropping of the text-conditioning to improve [classifier-free guidance sampling](https://arxiv.org/abs/2207.12598).

Evaluations with different classifier-free guidance scales (1.5, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0) and 50 PLMS sampling steps show the relative improvements of the checkpoints:
![sd evaluation results](assets/v1-variants-scores.jpg)


### Text-to-Image with Stable Diffusion
![txt2img-stable2](assets/stable-samples/txt2img/merged-0005.png)
![txt2img-stable2](assets/stable-samples/txt2img/merged-0007.png)

Stable Diffusion is a latent diffusion model conditioned on the (non-pooled) text embeddings of a CLIP ViT-L/14 text encoder.
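For a concrete sense of what "non-pooled text embeddings" means here, the per-token hidden states of the CLIP ViT-L/14 text encoder can be inspected with the Hugging Face `transformers` library (an illustrative sketch using the standard `openai/clip-vit-large-patch14` checkpoint; this is not code from this repository):

```py
# Sketch: the non-pooled (per-token) CLIP ViT-L/14 text embeddings used for conditioning.
from transformers import CLIPTokenizer, CLIPTextModel

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

tokens = tokenizer(
    "a photograph of an astronaut riding a horse",
    padding="max_length", max_length=77, truncation=True, return_tensors="pt",
)
# last_hidden_state has shape (1, 77, 768): one 768-dimensional vector per token.
# These per-token vectors are the "non-pooled" embeddings, as opposed to the
# single pooled vector per prompt.
embeddings = text_encoder(tokens.input_ids).last_hidden_state
print(embeddings.shape)
```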
#### Sampling Script

After [obtaining the weights](#weights), link them
```
mkdir -p models/ldm/stable-diffusion-v1/
ln -s <path/to/model.ckpt> models/ldm/stable-diffusion-v1/model.ckpt
```
and sample with
```
python scripts/txt2img.py --prompt "a photograph of an astronaut riding a horse" --plms
```

By default, this uses a guidance scale of `--scale 7.5`, [Katherine Crowson's implementation](https://github.com/CompVis/latent-diffusion/pull/51) of the [PLMS](https://arxiv.org/abs/2202.09778) sampler, and renders images of size 512x512 (which it was trained on) in 50 steps. All supported arguments are listed below (type `python scripts/txt2img.py --help`).

```commandline
usage: txt2img.py [-h] [--prompt [PROMPT]] [--outdir [OUTDIR]] [--skip_grid] [--skip_save] [--ddim_steps DDIM_STEPS] [--plms] [--laion400m] [--fixed_code] [--ddim_eta DDIM_ETA] [--n_iter N_ITER] [--H H] [--W W] [--C C] [--f F] [--n_samples N_SAMPLES] [--n_rows N_ROWS]
                  [--scale SCALE] [--from-file FROM_FILE] [--config CONFIG] [--ckpt CKPT] [--seed SEED] [--precision {full,autocast}]

optional arguments:
  -h, --help            show this help message and exit
  --prompt [PROMPT]     the prompt to render
  --outdir [OUTDIR]     dir to write results to
  --skip_grid           do not save a grid, only individual samples. Helpful when evaluating lots of samples
  --skip_save           do not save individual samples. For speed measurements.
  --ddim_steps DDIM_STEPS
                        number of ddim sampling steps
  --plms                use plms sampling
  --laion400m           uses the LAION400M model
  --fixed_code          if enabled, uses the same starting code across samples
  --ddim_eta DDIM_ETA   ddim eta (eta=0.0 corresponds to deterministic sampling)
  --n_iter N_ITER       sample this often
  --H H                 image height, in pixel space
  --W W                 image width, in pixel space
  --C C                 latent channels
  --f F                 downsampling factor
  --n_samples N_SAMPLES
                        how many samples to produce for each given prompt. A.k.a. batch size
                        (note that the seeds for each image in the batch will be unavailable)
  --n_rows N_ROWS       rows in the grid (default: n_samples)
  --scale SCALE         unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))
  --from-file FROM_FILE
                        if specified, load prompts from this file
  --config CONFIG       path to config which constructs model
  --ckpt CKPT           path to checkpoint of model
  --seed SEED           the seed (for reproducible sampling)
  --precision {full,autocast}
                        evaluate at this precision
```
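The `--scale` argument above is the classifier-free guidance weight; the formula in its help string amounts to the following combination of unconditional and prompt-conditioned noise predictions (a schematic restatement for clarity, not code taken from this repository):

```py
import torch

def guided_eps(eps_uncond: torch.Tensor, eps_cond: torch.Tensor, scale: float) -> torch.Tensor:
    # eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))
    return eps_uncond + scale * (eps_cond - eps_uncond)

# scale=1.0 reduces to the prompt-conditioned prediction; larger values
# (the script defaults to 7.5) push samples harder toward the prompt.
```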
Note: The inference config for all v1 versions is designed to be used with EMA-only checkpoints. For this reason `use_ema=False` is set in the configuration, otherwise the code will try to switch from non-EMA to EMA weights. If you want to examine the effect of EMA vs no EMA, we provide "full" checkpoints which contain both types of weights. For these, `use_ema=False` will load and use the non-EMA weights.


#### Diffusers Integration

Another way to download and sample Stable Diffusion is by using the [diffusers library](https://github.com/huggingface/diffusers/tree/main#new--stable-diffusion-is-now-fully-compatible-with-diffusers)
```py
# make sure you're logged in with `huggingface-cli login`
from torch import autocast
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler

pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-3-diffusers",
    use_auth_token=True
)

prompt = "a photo of an astronaut riding a horse on mars"
with autocast("cuda"):
    image = pipe(prompt)["sample"][0]

image.save("astronaut_rides_horse.png")
```


### Image Modification with Stable Diffusion

By using a diffusion-denoising mechanism as first proposed by [SDEdit](https://arxiv.org/abs/2108.01073), the model can be used for different tasks such as text-guided image-to-image translation and upscaling. Similar to the txt2img sampling script, we provide a script to perform image modification with Stable Diffusion.

The following describes an example where a rough sketch made in [Pinta](https://www.pinta-project.com/) is converted into a detailed artwork.
```
python scripts/img2img.py --prompt "A fantasy landscape, trending on artstation" --init-img <path-to-img.jpg> --strength 0.8
```
Here, strength is a value between 0.0 and 1.0 that controls the amount of noise that is added to the input image. Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input. See the following example.
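A rough sketch of how such a strength value is typically turned into a partial denoising schedule (an illustration of the idea only; the function name is hypothetical and the script's internals may differ):

```py
# Illustration only: map an img2img strength in [0.0, 1.0] to the number of
# denoising steps that are actually re-run on top of the init image.
def steps_from_strength(strength: float, ddim_steps: int = 50) -> int:
    assert 0.0 <= strength <= 1.0, "strength must lie between 0.0 and 1.0"
    return int(strength * ddim_steps)

# --strength 0.8 with 50 steps re-generates roughly the last 40 steps of the
# schedule, so the output keeps only the coarse layout of the input sketch.
print(steps_from_strength(0.8))  # 40
```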
**Input**

![sketch-in](assets/stable-samples/img2img/sketch-mountains-input.jpg)

**Outputs**

![out3](assets/stable-samples/img2img/mountains-3.png)
![out2](assets/stable-samples/img2img/mountains-2.png)

This procedure can, for example, also be used to upscale samples from the base model.


## Comments

- Our codebase for the diffusion models builds heavily on [OpenAI's ADM codebase](https://github.com/openai/guided-diffusion) and [https://github.com/lucidrains/denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch). Thanks for open-sourcing!

- The implementation of the transformer encoder is from [x-transformers](https://github.com/lucidrains/x-transformers) by [lucidrains](https://github.com/lucidrains?tab=repositories).


## BibTeX

```
@misc{rombach2021highresolution,
      title={High-Resolution Image Synthesis with Latent Diffusion Models},
      author={Robin Rombach and Andreas Blattmann and Dominik Lorenz and Patrick Esser and Björn Ommer},
      year={2021},
      eprint={2112.10752},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
```
README.md (305 changed lines)
@@ -100,8 +100,74 @@ cat aspect of the image and 75% on the white duck aspect
use any combination of integers and floating point numbers, and they do not need to add up to 1.
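Because the weights do not need to add up to 1, they are presumably normalized against their sum before use; a minimal sketch of that arithmetic (illustrative only, not the script's actual code):

~~~~
# Illustrative only; names are hypothetical and this is not the script's actual code.
def normalize_weights(weights):
    total = sum(weights)
    return [w / total for w in weights]

# Weights of 1 and 3 put 25% of the emphasis on the first subprompt and 75% on
# the second, matching the cat / white duck split mentioned above.
print(normalize_weights([1, 3]))  # [0.25, 0.75]
~~~~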
## Personalizing Text-to-Image Generation

You may personalize the generated images to provide your own styles or objects by training a new LDM checkpoint and introducing a new vocabulary to the fixed model.

To train, prepare a folder that contains images sized at 512x512 and execute the following:

~~~~
# As the default backend is not available on Windows, if you're using that platform, execute SET PL_TORCH_DISTRIBUTED_BACKEND=gloo
(ldm) ~/stable-diffusion$ python3 ./main.py --base ./configs/stable-diffusion/v1-finetune.yaml \
    -t \
    --actual_resume ./models/ldm/stable-diffusion-v1/model.ckpt \
    -n my_cat \
    --gpus 0, \
    --data_root D:/textual-inversion/my_cat \
    --init_word 'cat'
~~~~

During the training process, files will be created in /logs/[project][time][project]/ where you can see the process:

- conditioning* contains the training prompts
- inputs and reconstruction contain the input images for the training epoch
- samples and samples scaled contain generated samples, one for the prompt and one with the init word provided

On an RTX 3090, the process for SD will take ~1h at 1.6 iterations/sec.

Note: According to the associated paper, the optimal number of images is 3-5; with any more images than that, your model might not converge.

Training will run indefinitely, but you may wish to stop it before the heat death of the universe, when you find a low-loss epoch or after around 5000 iterations.

Once the model is trained, specify the trained .pt file when starting dream using

~~~~
(ldm) ~/stable-diffusion$ python3 ./scripts/dream.py --embedding_path /path/to/embedding.pt --full_precision
~~~~

Then, to utilize your subject at the dream prompt:

~~~
dream> "a photo of *"
~~~

This also works with image2image:
~~~~
dream> "waterfall and rainbow in the style of *" --init_img=./init-images/crude_drawing.png --strength=0.5 -s100 -n4
~~~~

It's also possible to train multiple tokens (modify the placeholder string in configs/stable-diffusion/v1-finetune.yaml) and combine LDM checkpoints using:

~~~~
(ldm) ~/stable-diffusion$ python3 ./scripts/merge_embeddings.py \
    --manager_ckpts /path/to/first/embedding.pt /path/to/second/embedding.pt [...] \
    --output_path /path/to/output/embedding.pt
~~~~

Credit goes to @rinongal and the repository located at https://github.com/rinongal/textual_inversion. Please see the repository and associated paper for details and limitations.

## Changes

* v1.08 (24 August 2022)
   * Escape single quotes on the dream> command before trying to parse. This avoids parse errors.
   * Removed instruction to get Python3.8 as first step in Windows install. Anaconda3 does it for you.
   * Added bounds checks for numeric arguments that could cause crashes.
   * Cleaned up the copyright and license agreement files.
* v1.07 (23 August 2022)
   * Image filenames will now never fill gaps in the sequence, but will be assigned the next higher name in the chosen directory. This ensures that the alphabetic and chronological
@@ -236,34 +302,31 @@ This will bring your local copy into sync with the remote one.

### Windows

-1. Install Python version 3.8.5 from here: https://www.python.org/downloads/windows/
-(note that several users have reported that later versions do not work properly)
+1. Install Anaconda3 (miniconda3 version) from here: https://docs.anaconda.com/anaconda/install/windows/

-2. Install Anaconda3 (miniconda3 version) from here: https://docs.anaconda.com/anaconda/install/windows/
+2. Install Git from here: https://git-scm.com/download/win

-3. Install Git from here: https://git-scm.com/download/win
+3. Launch Anaconda from the Windows Start menu. This will bring up a command window. Type all the remaining commands in this window.

-4. Launch Anaconda from the Windows Start menu. This will bring up a command window. Type all the remaining commands in this window.
+4. Run the command:

-5. Run the command:
```
git clone https://github.com/lstein/stable-diffusion.git
```
This will create stable-diffusion folder where you will follow the rest of the steps.

-6. Enter the newly-created stable-diffusion folder. From this step forward make sure that you are working in the stable-diffusion directory!
+5. Enter the newly-created stable-diffusion folder. From this step forward make sure that you are working in the stable-diffusion directory!
```
cd stable-diffusion
```

-7. Run the following two commands:
+6. Run the following two commands:
```
-conda env create -f environment.yaml (step 7a)
-conda activate ldm (step 7b)
+conda env create -f environment.yaml (step 6a)
+conda activate ldm (step 6b)
```
This will install all python requirements and activate the "ldm" environment which sets PATH and other environment variables properly.

-8. Run the command:
+7. Run the command:
```
python scripts\preload_models.py
```

@@ -273,7 +336,7 @@ requires. (Note that this step is required. I created it because some people
are using GPU systems that are behind a firewall and the models can't be
downloaded just-in-time)

-9. Now you need to install the weights for the big stable diffusion model.
+8. Now you need to install the weights for the big stable diffusion model.

For running with the released weights, you will first need to set up
an account with Hugging Face (https://huggingface.co). Use your

@@ -299,7 +362,7 @@ you stashed this file. If you prefer not to copy or move the .ckpt file,
you may instead create a shortcut to it from within
"models\ldm\stable-diffusion-v1\".

-10. Start generating images!
+9. Start generating images!
```
# for the pre-release weights
python scripts\dream.py -l

@@ -307,7 +370,7 @@ python scripts\dream.py -l
# for the post-release weights
python scripts\dream.py
```
-11. Subsequently, to relaunch the script, first activate the Anaconda command window (step 4), enter the stable-diffusion directory (step 6, "cd \path\to\stable-diffusion"), run "conda activate ldm" (step 7b), and then launch the dream script (step 10).
+10. Subsequently, to relaunch the script, first activate the Anaconda command window (step 3), enter the stable-diffusion directory (step 5, "cd \path\to\stable-diffusion"), run "conda activate ldm" (step 6b), and then launch the dream script (step 9).

#### Updating to newer versions of the script

@@ -378,213 +441,9 @@ to send me an email if you use and like the script.

*Contributions by:* [Peter Kowalczyk](https://github.com/slix), [Henry Harrison](https://github.com/hwharrison), [xraxra](https://github.com/xraxra), and [bmaltais](https://github.com/bmaltais)

-# Original README from CompViz/stable-diffusion
+Original portions of the software are Copyright (c) 2020 Lincoln D. Stein (https://github.com/lstein)
||||||
*Stable Diffusion was made possible thanks to a collaboration with [Stability AI](https://stability.ai/) and [Runway](https://runwayml.com/) and builds upon our previous work:*
|
|
||||||
|
|
||||||
[**High-Resolution Image Synthesis with Latent Diffusion Models**](https://ommer-lab.com/research/latent-diffusion-models/)<br/>
|
|
||||||
[Robin Rombach](https://github.com/rromb)\*,
|
|
||||||
[Andreas Blattmann](https://github.com/ablattmann)\*,
|
|
||||||
[Dominik Lorenz](https://github.com/qp-qp)\,
|
|
||||||
[Patrick Esser](https://github.com/pesser),
|
|
||||||
[Björn Ommer](https://hci.iwr.uni-heidelberg.de/Staff/bommer)<br/>
|
|
||||||
|
|
||||||
**CVPR '22 Oral**
|
|
||||||
|
|
||||||
which is available on [GitHub](https://github.com/CompVis/latent-diffusion). PDF at [arXiv](https://arxiv.org/abs/2112.10752). Please also visit our [Project page](https://ommer-lab.com/research/latent-diffusion-models/).
|
|
||||||
|
|
||||||
![txt2img-stable2](assets/stable-samples/txt2img/merged-0006.png)
|
|
||||||
[Stable Diffusion](#stable-diffusion-v1) is a latent text-to-image diffusion
|
|
||||||
model.
|
|
||||||
Thanks to a generous compute donation from [Stability AI](https://stability.ai/) and support from [LAION](https://laion.ai/), we were able to train a Latent Diffusion Model on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database.
|
|
||||||
Similar to Google's [Imagen](https://arxiv.org/abs/2205.11487),
|
|
||||||
this model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts.
|
|
||||||
With its 860M UNet and 123M text encoder, the model is relatively lightweight and runs on a GPU with at least 10GB VRAM.
|
|
||||||
See [this section](#stable-diffusion-v1) below and the [model card](https://huggingface.co/CompVis/stable-diffusion).
|
|
||||||
|
|
||||||
|
|
||||||
## Requirements
|
|
||||||
|
|
||||||
A suitable [conda](https://conda.io/) environment named `ldm` can be created
|
|
||||||
and activated with:
|
|
||||||
|
|
||||||
```
|
|
||||||
conda env create -f environment.yaml
|
|
||||||
conda activate ldm
|
|
||||||
```
|
|
||||||
|
|
||||||
You can also update an existing [latent diffusion](https://github.com/CompVis/latent-diffusion) environment by running
|
|
||||||
|
|
||||||
```
|
|
||||||
conda install pytorch torchvision -c pytorch
|
|
||||||
pip install transformers==4.19.2
|
|
||||||
pip install -e .
|
|
||||||
```
|
|
||||||
|
|
||||||
## Stable Diffusion v1
|
|
||||||
|
|
||||||
Stable Diffusion v1 refers to a specific configuration of the model
|
|
||||||
architecture that uses a downsampling-factor 8 autoencoder with an 860M UNet
|
|
||||||
and CLIP ViT-L/14 text encoder for the diffusion model. The model was pretrained on 256x256 images and
|
|
||||||
then finetuned on 512x512 images.
|
|
||||||
|
|
||||||
*Note: Stable Diffusion v1 is a general text-to-image diffusion model and therefore mirrors biases and (mis-)conceptions that are present
|
|
||||||
in its training data.
|
|
||||||
Details on the training procedure and data, as well as the intended use of the model can be found in the corresponding [model card](https://huggingface.co/CompVis/stable-diffusion).
|
|
||||||
Research into the safe deployment of general text-to-image models is an ongoing effort. To prevent misuse and harm, we currently provide access to the checkpoints only for [academic research purposes upon request](https://stability.ai/academia-access-form).
|
|
||||||
**This is an experiment in safe and community-driven publication of a capable and general text-to-image model. We are working on a public release with a more permissive license that also incorporates ethical considerations.***
|
|
||||||
|
|
||||||
[Request access to Stable Diffusion v1 checkpoints for academic research](https://stability.ai/academia-access-form)
|
|
||||||
|
|
||||||
### Weights
|
|
||||||
|
|
||||||
We currently provide three checkpoints, `sd-v1-1.ckpt`, `sd-v1-2.ckpt` and `sd-v1-3.ckpt`,
|
|
||||||
which were trained as follows,
|
|
||||||
|
|
||||||
- `sd-v1-1.ckpt`: 237k steps at resolution `256x256` on [laion2B-en](https://huggingface.co/datasets/laion/laion2B-en).
|
|
||||||
194k steps at resolution `512x512` on [laion-high-resolution](https://huggingface.co/datasets/laion/laion-high-resolution) (170M examples from LAION-5B with resolution `>= 1024x1024`).
|
|
||||||
- `sd-v1-2.ckpt`: Resumed from `sd-v1-1.ckpt`.
|
|
||||||
515k steps at resolution `512x512` on "laion-improved-aesthetics" (a subset of laion2B-en,
|
|
||||||
filtered to images with an original size `>= 512x512`, estimated aesthetics score `> 5.0`, and an estimated watermark probability `< 0.5`. The watermark estimate is from the LAION-5B metadata, the aesthetics score is estimated using an [improved aesthetics estimator](https://github.com/christophschuhmann/improved-aesthetic-predictor)).
|
|
||||||
- `sd-v1-3.ckpt`: Resumed from `sd-v1-2.ckpt`. 195k steps at resolution `512x512` on "laion-improved-aesthetics" and 10\% dropping of the text-conditioning to improve [classifier-free guidance sampling](https://arxiv.org/abs/2207.12598).
|
|
||||||
|
|
||||||
Evaluations with different classifier-free guidance scales (1.5, 2.0, 3.0, 4.0,
|
|
||||||
5.0, 6.0, 7.0, 8.0) and 50 PLMS sampling
|
|
||||||
steps show the relative improvements of the checkpoints:
|
|
||||||
![sd evaluation results](assets/v1-variants-scores.jpg)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### Text-to-Image with Stable Diffusion
|
|
||||||
![txt2img-stable2](assets/stable-samples/txt2img/merged-0005.png)
|
|
||||||
![txt2img-stable2](assets/stable-samples/txt2img/merged-0007.png)
|
|
||||||
|
|
||||||
Stable Diffusion is a latent diffusion model conditioned on the (non-pooled) text embeddings of a CLIP ViT-L/14 text encoder.
|
|
||||||
|
|
||||||
|
|
||||||
#### Sampling Script
|
|
||||||
|
|
||||||
After [obtaining the weights](#weights), link them
|
|
||||||
```
|
|
||||||
mkdir -p models/ldm/stable-diffusion-v1/
|
|
||||||
ln -s <path/to/model.ckpt> models/ldm/stable-diffusion-v1/model.ckpt
|
|
||||||
```
|
|
||||||
and sample with
|
|
||||||
```
|
|
||||||
python scripts/txt2img.py --prompt "a photograph of an astronaut riding a horse" --plms
|
|
||||||
```
|
|
||||||
|
|
||||||
By default, this uses a guidance scale of `--scale 7.5`, [Katherine Crowson's implementation](https://github.com/CompVis/latent-diffusion/pull/51) of the [PLMS](https://arxiv.org/abs/2202.09778) sampler,
|
|
||||||
and renders images of size 512x512 (which it was trained on) in 50 steps. All supported arguments are listed below (type `python scripts/txt2img.py --help`).
|
|
||||||
|
|
||||||
```commandline
|
|
||||||
usage: txt2img.py [-h] [--prompt [PROMPT]] [--outdir [OUTDIR]] [--skip_grid] [--skip_save] [--ddim_steps DDIM_STEPS] [--plms] [--laion400m] [--fixed_code] [--ddim_eta DDIM_ETA] [--n_iter N_ITER] [--H H] [--W W] [--C C] [--f F] [--n_samples N_SAMPLES] [--n_rows N_ROWS]
|
|
||||||
[--scale SCALE] [--from-file FROM_FILE] [--config CONFIG] [--ckpt CKPT] [--seed SEED] [--precision {full,autocast}]
|
|
||||||
|
|
||||||
optional arguments:
|
|
||||||
-h, --help show this help message and exit
|
|
||||||
--prompt [PROMPT] the prompt to render
|
|
||||||
--outdir [OUTDIR] dir to write results to
|
|
||||||
--skip_grid do not save a grid, only individual samples. Helpful when evaluating lots of samples
|
|
||||||
--skip_save do not save individual samples. For speed measurements.
|
|
||||||
--ddim_steps DDIM_STEPS
|
|
||||||
number of ddim sampling steps
|
|
||||||
--plms use plms sampling
|
|
||||||
--laion400m uses the LAION400M model
|
|
||||||
--fixed_code if enabled, uses the same starting code across samples
|
|
||||||
--ddim_eta DDIM_ETA ddim eta (eta=0.0 corresponds to deterministic sampling
|
|
||||||
--n_iter N_ITER sample this often
|
|
||||||
--H H image height, in pixel space
|
|
||||||
--W W image width, in pixel space
|
|
||||||
--C C latent channels
|
|
||||||
--f F downsampling factor
|
|
||||||
--n_samples N_SAMPLES
|
|
||||||
how many samples to produce for each given prompt. A.k.a. batch size
|
|
||||||
(note that the seeds for each image in the batch will be unavailable)
|
|
||||||
--n_rows N_ROWS rows in the grid (default: n_samples)
|
|
||||||
--scale SCALE unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))
|
|
||||||
--from-file FROM_FILE
|
|
||||||
if specified, load prompts from this file
|
|
||||||
--config CONFIG path to config which constructs model
|
|
||||||
--ckpt CKPT path to checkpoint of model
|
|
||||||
--seed SEED the seed (for reproducible sampling)
|
|
||||||
--precision {full,autocast}
|
|
||||||
evaluate at this precision
|
|
||||||
|
|
||||||
```
|
|
||||||
Note: The inference config for all v1 versions is designed to be used with EMA-only checkpoints.
For this reason `use_ema=False` is set in the configuration, otherwise the code will try to switch from
non-EMA to EMA weights. If you want to examine the effect of EMA vs no EMA, we provide "full" checkpoints
which contain both types of weights. For these, `use_ema=False` will load and use the non-EMA weights.
#### Diffusers Integration

Another way to download and sample Stable Diffusion is by using the [diffusers library](https://github.com/huggingface/diffusers/tree/main#new--stable-diffusion-is-now-fully-compatible-with-diffusers)

```py
# make sure you're logged in with `huggingface-cli login`
from torch import autocast
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler

pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-3-diffusers",
    use_auth_token=True
)

prompt = "a photo of an astronaut riding a horse on mars"
with autocast("cuda"):
    image = pipe(prompt)["sample"][0]

image.save("astronaut_rides_horse.png")
```
### Image Modification with Stable Diffusion

By using a diffusion-denoising mechanism as first proposed by [SDEdit](https://arxiv.org/abs/2108.01073), the model can be used for different
tasks such as text-guided image-to-image translation and upscaling. Similar to the txt2img sampling script,
we provide a script to perform image modification with Stable Diffusion.

The following describes an example where a rough sketch made in [Pinta](https://www.pinta-project.com/) is converted into a detailed artwork.
```
python scripts/img2img.py --prompt "A fantasy landscape, trending on artstation" --init-img <path-to-img.jpg> --strength 0.8
```
Here, strength is a value between 0.0 and 1.0 that controls the amount of noise added to the input image.
Values approaching 1.0 allow for many variations but will also produce images that are not semantically consistent with the input. See the following example.
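Concretely, strength decides how far down the noise schedule the init image is pushed before denoising begins. A minimal sketch of how SDEdit-style img2img code typically derives the partial schedule (illustrative only, not a quote of `scripts/img2img.py`):

```py
ddim_steps = 50   # full schedule length
strength = 0.8    # --strength from the command line

# Encode the init image up to timestep t_enc, then run t_enc denoising steps back to an image.
t_enc = int(strength * ddim_steps)
print(f"running {t_enc} of {ddim_steps} denoising steps over the init image")
```

At strength 1.0 the init image is fully noised and the result is essentially unconstrained; at small strengths only the last few denoising steps run, so the output stays close to the input.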
**Input**

![sketch-in](assets/stable-samples/img2img/sketch-mountains-input.jpg)

**Outputs**

![out3](assets/stable-samples/img2img/mountains-3.png)
![out2](assets/stable-samples/img2img/mountains-2.png)

This procedure can, for example, also be used to upscale samples from the base model.
## Comments

- Our codebase for the diffusion models builds heavily on [OpenAI's ADM codebase](https://github.com/openai/guided-diffusion)
  and [https://github.com/lucidrains/denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch).
  Thanks for open-sourcing!

- The implementation of the transformer encoder is from [x-transformers](https://github.com/lucidrains/x-transformers) by [lucidrains](https://github.com/lucidrains?tab=repositories).
## BibTeX

```
@misc{rombach2021highresolution,
      title={High-Resolution Image Synthesis with Latent Diffusion Models},
      author={Robin Rombach and Andreas Blattmann and Dominik Lorenz and Patrick Esser and Björn Ommer},
      year={2021},
      eprint={2112.10752},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
```
# Further Reading

Please see the original README for more information on this software
and underlying algorithm, located in the file README-CompViz.md.
105
configs/stable-diffusion/v1-finetune.yaml
Normal file
@@ -0,0 +1,105 @@
model:
  base_learning_rate: 5.0e-03
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    image_size: 64
    channels: 4
    cond_stage_trainable: true   # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    embedding_reg_weight: 0.0

    personalization_config:
      target: ldm.modules.embedding_manager.EmbeddingManager
      params:
        placeholder_strings: ["*"]
        initializer_words: ["sculpture"]
        per_image_tokens: false
        num_vectors_per_token: 1
        progressive_words: False

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 2
    num_workers: 16
    wrap: false
    train:
      target: ldm.data.personalized.PersonalizedBase
      params:
        size: 512
        set: train
        per_image_tokens: false
        repeats: 100
    validation:
      target: ldm.data.personalized.PersonalizedBase
      params:
        size: 512
        set: val
        per_image_tokens: false
        repeats: 10

lightning:
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 500
        max_images: 8
        increase_log_steps: False

  trainer:
    benchmark: True
    max_steps: 6100
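This config wires the new textual-inversion ("personalization") pieces together: an `EmbeddingManager` under `personalization_config`, a frozen UNet/CLIP with `use_ema: False`, and `PersonalizedBase` as the data source. A rough sketch of how such a config is consumed (the paths and the hand-set `data_root` are illustrative assumptions; in a real run the training entry point injects them from the command line):

```py
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config

config = OmegaConf.load("configs/stable-diffusion/v1-finetune.yaml")

# PersonalizedBase requires a data_root, which is not part of the YAML itself.
config.data.params.train.params.data_root = "/path/to/concept/images"
config.data.params.validation.params.data_root = "/path/to/concept/images"

model = instantiate_from_config(config.model)  # LatentDiffusion, which builds the EmbeddingManager
data = instantiate_from_config(config.data)    # DataModuleFromConfig over PersonalizedBase
```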
103
configs/stable-diffusion/v1-finetune_style.yaml
Normal file
@@ -0,0 +1,103 @@
model:
  base_learning_rate: 5.0e-03
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    image_size: 64
    channels: 4
    cond_stage_trainable: true   # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    embedding_reg_weight: 0.0

    personalization_config:
      target: ldm.modules.embedding_manager.EmbeddingManager
      params:
        placeholder_strings: ["*"]
        initializer_words: ["painting"]
        per_image_tokens: false
        num_vectors_per_token: 1

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 2
    num_workers: 16
    wrap: false
    train:
      target: ldm.data.personalized_style.PersonalizedBase
      params:
        size: 512
        set: train
        per_image_tokens: false
        repeats: 100
    validation:
      target: ldm.data.personalized_style.PersonalizedBase
      params:
        size: 512
        set: val
        per_image_tokens: false
        repeats: 10

lightning:
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 500
        max_images: 8
        increase_log_steps: False

  trainer:
    benchmark: True
@@ -26,6 +26,15 @@ model:
         f_max: [ 1. ]
         f_min: [ 1. ]
 
+    personalization_config:
+      target: ldm.modules.embedding_manager.EmbeddingManager
+      params:
+        placeholder_strings: ["*"]
+        initializer_words: ["sculpture"]
+        per_image_tokens: false
+        num_vectors_per_token: 1
+        progressive_words: False
+
     unet_config:
       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
       params:
@@ -19,6 +19,7 @@ dependencies:
   - omegaconf==2.1.1
   - test-tube>=0.7.5
   - streamlit>=0.73.1
+  - pillow==9.0.1
   - einops==0.3.0
   - torch-fidelity==0.3.0
   - transformers==4.19.2
160
ldm/data/personalized.py
Normal file
@@ -0,0 +1,160 @@
import os
import numpy as np
import PIL
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms

import random

imagenet_templates_smallest = [
    'a photo of a {}',
]

imagenet_templates_small = [
    'a photo of a {}',
    'a rendering of a {}',
    'a cropped photo of the {}',
    'the photo of a {}',
    'a photo of a clean {}',
    'a photo of a dirty {}',
    'a dark photo of the {}',
    'a photo of my {}',
    'a photo of the cool {}',
    'a close-up photo of a {}',
    'a bright photo of the {}',
    'a cropped photo of a {}',
    'a photo of the {}',
    'a good photo of the {}',
    'a photo of one {}',
    'a close-up photo of the {}',
    'a rendition of the {}',
    'a photo of the clean {}',
    'a rendition of a {}',
    'a photo of a nice {}',
    'a good photo of a {}',
    'a photo of the nice {}',
    'a photo of the small {}',
    'a photo of the weird {}',
    'a photo of the large {}',
    'a photo of a cool {}',
    'a photo of a small {}',
]

imagenet_dual_templates_small = [
    'a photo of a {} with {}',
    'a rendering of a {} with {}',
    'a cropped photo of the {} with {}',
    'the photo of a {} with {}',
    'a photo of a clean {} with {}',
    'a photo of a dirty {} with {}',
    'a dark photo of the {} with {}',
    'a photo of my {} with {}',
    'a photo of the cool {} with {}',
    'a close-up photo of a {} with {}',
    'a bright photo of the {} with {}',
    'a cropped photo of a {} with {}',
    'a photo of the {} with {}',
    'a good photo of the {} with {}',
    'a photo of one {} with {}',
    'a close-up photo of the {} with {}',
    'a rendition of the {} with {}',
    'a photo of the clean {} with {}',
    'a rendition of a {} with {}',
    'a photo of a nice {} with {}',
    'a good photo of a {} with {}',
    'a photo of the nice {} with {}',
    'a photo of the small {} with {}',
    'a photo of the weird {} with {}',
    'a photo of the large {} with {}',
    'a photo of a cool {} with {}',
    'a photo of a small {} with {}',
]

per_img_token_list = [
    'א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז', 'ח', 'ט', 'י', 'כ', 'ל', 'מ', 'נ', 'ס', 'ע', 'פ', 'צ', 'ק', 'ר', 'ש', 'ת',
]

class PersonalizedBase(Dataset):
    def __init__(self,
                 data_root,
                 size=None,
                 repeats=100,
                 interpolation="bicubic",
                 flip_p=0.5,
                 set="train",
                 placeholder_token="*",
                 per_image_tokens=False,
                 center_crop=False,
                 mixing_prob=0.25,
                 coarse_class_text=None,
                 ):

        self.data_root = data_root

        self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]

        # self._length = len(self.image_paths)
        self.num_images = len(self.image_paths)
        self._length = self.num_images

        self.placeholder_token = placeholder_token

        self.per_image_tokens = per_image_tokens
        self.center_crop = center_crop
        self.mixing_prob = mixing_prob

        self.coarse_class_text = coarse_class_text

        if per_image_tokens:
            assert self.num_images < len(per_img_token_list), f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} tokens. To enable larger sets, add more tokens to 'per_img_token_list'."

        if set == "train":
            self._length = self.num_images * repeats

        self.size = size
        self.interpolation = {"linear": PIL.Image.LINEAR,
                              "bilinear": PIL.Image.BILINEAR,
                              "bicubic": PIL.Image.BICUBIC,
                              "lanczos": PIL.Image.LANCZOS,
                              }[interpolation]
        self.flip = transforms.RandomHorizontalFlip(p=flip_p)

    def __len__(self):
        return self._length

    def __getitem__(self, i):
        example = {}
        image = Image.open(self.image_paths[i % self.num_images])

        if not image.mode == "RGB":
            image = image.convert("RGB")

        placeholder_string = self.placeholder_token
        if self.coarse_class_text:
            placeholder_string = f"{self.coarse_class_text} {placeholder_string}"

        if self.per_image_tokens and np.random.uniform() < self.mixing_prob:
            text = random.choice(imagenet_dual_templates_small).format(placeholder_string, per_img_token_list[i % self.num_images])
        else:
            text = random.choice(imagenet_templates_small).format(placeholder_string)

        example["caption"] = text

        # default to score-sde preprocessing
        img = np.array(image).astype(np.uint8)

        if self.center_crop:
            crop = min(img.shape[0], img.shape[1])
            h, w, = img.shape[0], img.shape[1]
            img = img[(h - crop) // 2:(h + crop) // 2,
                      (w - crop) // 2:(w + crop) // 2]

        image = Image.fromarray(img)
        if self.size is not None:
            image = image.resize((self.size, self.size), resample=self.interpolation)

        image = self.flip(image)
        image = np.array(image).astype(np.uint8)
        example["image"] = (image / 127.5 - 1.0).astype(np.float32)
        return example
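A minimal sketch of what `PersonalizedBase` yields (the path is illustrative): each item is a dict with a templated `"caption"` containing the placeholder token and an `"image"` array scaled to [-1, 1].

```py
from ldm.data.personalized import PersonalizedBase

ds = PersonalizedBase(data_root="/path/to/concept/images", size=512, set="train", repeats=100)
example = ds[0]
print(example["caption"])        # e.g. "a photo of a *"
print(example["image"].shape)    # (512, 512, 3), float32 in [-1, 1]
```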
129
ldm/data/personalized_style.py
Normal file
@@ -0,0 +1,129 @@
import os
import numpy as np
import PIL
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms

import random

imagenet_templates_small = [
    'a painting in the style of {}',
    'a rendering in the style of {}',
    'a cropped painting in the style of {}',
    'the painting in the style of {}',
    'a clean painting in the style of {}',
    'a dirty painting in the style of {}',
    'a dark painting in the style of {}',
    'a picture in the style of {}',
    'a cool painting in the style of {}',
    'a close-up painting in the style of {}',
    'a bright painting in the style of {}',
    'a cropped painting in the style of {}',
    'a good painting in the style of {}',
    'a close-up painting in the style of {}',
    'a rendition in the style of {}',
    'a nice painting in the style of {}',
    'a small painting in the style of {}',
    'a weird painting in the style of {}',
    'a large painting in the style of {}',
]

imagenet_dual_templates_small = [
    'a painting in the style of {} with {}',
    'a rendering in the style of {} with {}',
    'a cropped painting in the style of {} with {}',
    'the painting in the style of {} with {}',
    'a clean painting in the style of {} with {}',
    'a dirty painting in the style of {} with {}',
    'a dark painting in the style of {} with {}',
    'a cool painting in the style of {} with {}',
    'a close-up painting in the style of {} with {}',
    'a bright painting in the style of {} with {}',
    'a cropped painting in the style of {} with {}',
    'a good painting in the style of {} with {}',
    'a painting of one {} in the style of {}',
    'a nice painting in the style of {} with {}',
    'a small painting in the style of {} with {}',
    'a weird painting in the style of {} with {}',
    'a large painting in the style of {} with {}',
]

per_img_token_list = [
    'א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז', 'ח', 'ט', 'י', 'כ', 'ל', 'מ', 'נ', 'ס', 'ע', 'פ', 'צ', 'ק', 'ר', 'ש', 'ת',
]

class PersonalizedBase(Dataset):
    def __init__(self,
                 data_root,
                 size=None,
                 repeats=100,
                 interpolation="bicubic",
                 flip_p=0.5,
                 set="train",
                 placeholder_token="*",
                 per_image_tokens=False,
                 center_crop=False,
                 ):

        self.data_root = data_root

        self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]

        # self._length = len(self.image_paths)
        self.num_images = len(self.image_paths)
        self._length = self.num_images

        self.placeholder_token = placeholder_token

        self.per_image_tokens = per_image_tokens
        self.center_crop = center_crop

        if per_image_tokens:
            assert self.num_images < len(per_img_token_list), f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} tokens. To enable larger sets, add more tokens to 'per_img_token_list'."

        if set == "train":
            self._length = self.num_images * repeats

        self.size = size
        self.interpolation = {"linear": PIL.Image.LINEAR,
                              "bilinear": PIL.Image.BILINEAR,
                              "bicubic": PIL.Image.BICUBIC,
                              "lanczos": PIL.Image.LANCZOS,
                              }[interpolation]
        self.flip = transforms.RandomHorizontalFlip(p=flip_p)

    def __len__(self):
        return self._length

    def __getitem__(self, i):
        example = {}
        image = Image.open(self.image_paths[i % self.num_images])

        if not image.mode == "RGB":
            image = image.convert("RGB")

        if self.per_image_tokens and np.random.uniform() < 0.25:
            text = random.choice(imagenet_dual_templates_small).format(self.placeholder_token, per_img_token_list[i % self.num_images])
        else:
            text = random.choice(imagenet_templates_small).format(self.placeholder_token)

        example["caption"] = text

        # default to score-sde preprocessing
        img = np.array(image).astype(np.uint8)

        if self.center_crop:
            crop = min(img.shape[0], img.shape[1])
            h, w, = img.shape[0], img.shape[1]
            img = img[(h - crop) // 2:(h + crop) // 2,
                      (w - crop) // 2:(w + crop) // 2]

        image = Image.fromarray(img)
        if self.size is not None:
            image = image.resize((self.size, self.size), resample=self.interpolation)

        image = self.flip(image)
        image = np.array(image).astype(np.uint8)
        example["image"] = (image / 127.5 - 1.0).astype(np.float32)
        return example
@@ -17,9 +17,6 @@ class DDIMSampler(object):
         self.schedule = schedule
 
     def register_buffer(self, name, attr):
-        if type(attr) == torch.Tensor:
-            if attr.device != torch.device("cuda"):
-                attr = attr.to(torch.device("cuda"))
         setattr(self, name, attr)
 
     def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
@@ -7,7 +7,9 @@ https://github.com/CompVis/taming-transformers
 """
 
 import torch
 import torch.nn as nn
+import os
 import numpy as np
 import pytorch_lightning as pl
 from torch.optim.lr_scheduler import LambdaLR

@@ -64,6 +66,7 @@ class DDPM(pl.LightningModule):
                 cosine_s=8e-3,
                 given_betas=None,
                 original_elbo_weight=0.,
+                embedding_reg_weight=0.,
                 v_posterior=0.,  # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta
                 l_simple_weight=1.,
                 conditioning_key=None,

@@ -98,6 +101,7 @@ class DDPM(pl.LightningModule):
         self.v_posterior = v_posterior
         self.original_elbo_weight = original_elbo_weight
         self.l_simple_weight = l_simple_weight
+        self.embedding_reg_weight = embedding_reg_weight
 
         if monitor is not None:
             self.monitor = monitor

@@ -427,6 +431,7 @@ class LatentDiffusion(DDPM):
     def __init__(self,
                  first_stage_config,
                  cond_stage_config,
+                 personalization_config,
                  num_timesteps_cond=None,
                  cond_stage_key="image",
                  cond_stage_trainable=False,

@@ -436,6 +441,7 @@ class LatentDiffusion(DDPM):
                  scale_factor=1.0,
                  scale_by_std=False,
                  *args, **kwargs):
+
         self.num_timesteps_cond = default(num_timesteps_cond, 1)
         self.scale_by_std = scale_by_std
         assert self.num_timesteps_cond <= kwargs['timesteps']

@@ -450,6 +456,7 @@ class LatentDiffusion(DDPM):
         self.concat_mode = concat_mode
         self.cond_stage_trainable = cond_stage_trainable
         self.cond_stage_key = cond_stage_key
+
         try:
             self.num_downs = len(first_stage_config.params.ddconfig.ch_mult) - 1
         except:

@@ -460,6 +467,7 @@ class LatentDiffusion(DDPM):
         self.register_buffer('scale_factor', torch.tensor(scale_factor))
         self.instantiate_first_stage(first_stage_config)
         self.instantiate_cond_stage(cond_stage_config)
+
         self.cond_stage_forward = cond_stage_forward
         self.clip_denoised = False
         self.bbox_tokenizer = None

@@ -469,6 +477,25 @@ class LatentDiffusion(DDPM):
             self.init_from_ckpt(ckpt_path, ignore_keys)
             self.restarted_from_ckpt = True
 
+        self.cond_stage_model.train = disabled_train
+        for param in self.cond_stage_model.parameters():
+            param.requires_grad = False
+
+        self.model.eval()
+        self.model.train = disabled_train
+        for param in self.model.parameters():
+            param.requires_grad = False
+
+        self.embedding_manager = self.instantiate_embedding_manager(personalization_config, self.cond_stage_model)
+
+        self.emb_ckpt_counter = 0
+
+        # if self.embedding_manager.is_clip:
+        #     self.cond_stage_model.update_embedding_func(self.embedding_manager)
+
+        for param in self.embedding_manager.embedding_parameters():
+            param.requires_grad = True
+
     def make_cond_schedule(self, ):
         self.cond_ids = torch.full(size=(self.num_timesteps,), fill_value=self.num_timesteps - 1, dtype=torch.long)
         ids = torch.round(torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)).long()

@@ -531,6 +558,15 @@ class LatentDiffusion(DDPM):
                 raise SystemExit("* Couldn't load a dependency. Try running scripts/preload_models.py from an internet-conected machine.")
             self.cond_stage_model = model
 
+
+    def instantiate_embedding_manager(self, config, embedder):
+        model = instantiate_from_config(config, embedder=embedder)
+
+        if config.params.get("embedding_manager_ckpt", None):  # do not load if missing OR empty string
+            model.load(config.params.embedding_manager_ckpt)
+
+        return model
+
     def _get_denoise_row_from_list(self, samples, desc='', force_no_decoder_quantization=False):
         denoise_row = []
         for zd in tqdm(samples, desc=desc):

@@ -555,7 +591,7 @@ class LatentDiffusion(DDPM):
     def get_learned_conditioning(self, c):
         if self.cond_stage_forward is None:
             if hasattr(self.cond_stage_model, 'encode') and callable(self.cond_stage_model.encode):
-                c = self.cond_stage_model.encode(c)
+                c = self.cond_stage_model.encode(c, embedding_manager=self.embedding_manager)
                 if isinstance(c, DiagonalGaussianDistribution):
                     c = c.mode()
             else:

@@ -880,6 +916,7 @@ class LatentDiffusion(DDPM):
         if self.shorten_cond_schedule:  # TODO: drop this option
             tc = self.cond_ids[t].to(self.device)
             c = self.q_sample(x_start=c, t=tc, noise=torch.randn_like(c.float()))
+
         return self.p_losses(x, c, t, *args, **kwargs)
 
     def _rescale_annotations(self, bboxes, crop_coordinates):  # TODO: move to dataset

@@ -1046,6 +1083,14 @@ class LatentDiffusion(DDPM):
             loss += (self.original_elbo_weight * loss_vlb)
         loss_dict.update({f'{prefix}/loss': loss})
 
+        if self.embedding_reg_weight > 0:
+            loss_embedding_reg = self.embedding_manager.embedding_to_coarse_loss().mean()
+
+            loss_dict.update({f'{prefix}/loss_emb_reg': loss_embedding_reg})
+
+            loss += (self.embedding_reg_weight * loss_embedding_reg)
+            loss_dict.update({f'{prefix}/loss': loss})
+
         return loss, loss_dict
 
     def p_mean_variance(self, x, c, t, clip_denoised: bool, return_codebook_ids=False, quantize_denoised=False,

@@ -1250,11 +1295,10 @@ class LatentDiffusion(DDPM):
 
         return samples, intermediates
 
-
     @torch.no_grad()
     def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
-                   quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
-                   plot_diffusion_rows=True, **kwargs):
+                   quantize_denoised=True, inpaint=False, plot_denoise_rows=False, plot_progressive_rows=False,
+                   plot_diffusion_rows=False, **kwargs):
 
         use_ddim = ddim_steps is not None
 

@@ -1313,6 +1357,16 @@ class LatentDiffusion(DDPM):
             denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
             log["denoise_row"] = denoise_grid
 
+            uc = self.get_learned_conditioning(len(c) * [""])
+            sample_scaled, _ = self.sample_log(cond=c,
+                                               batch_size=N,
+                                               ddim=use_ddim,
+                                               ddim_steps=ddim_steps,
+                                               eta=ddim_eta,
+                                               unconditional_guidance_scale=5.0,
+                                               unconditional_conditioning=uc)
+            log["samples_scaled"] = self.decode_first_stage(sample_scaled)
+
         if quantize_denoised and not isinstance(self.first_stage_model, AutoencoderKL) and not isinstance(
                 self.first_stage_model, IdentityFirstStage):
             # also display when quantizing x0 while sampling

@@ -1364,6 +1418,11 @@ class LatentDiffusion(DDPM):
 
     def configure_optimizers(self):
         lr = self.learning_rate
-        params = list(self.model.parameters())
+
+        if self.embedding_manager is not None:
+            params = list(self.embedding_manager.embedding_parameters())
+            # params = list(self.cond_stage_model.transformer.text_model.embeddings.embedding_manager.embedding_parameters())
+        else:
+            params = list(self.model.parameters())
         if self.cond_stage_trainable:
             print(f"{self.__class__.__name__}: Also optimizing conditioner params!")

@@ -1395,6 +1454,18 @@ class LatentDiffusion(DDPM):
         x = 2. * (x - x.min()) / (x.max() - x.min()) - 1.
         return x
 
+    @rank_zero_only
+    def on_save_checkpoint(self, checkpoint):
+        checkpoint.clear()
+
+        if os.path.isdir(self.trainer.checkpoint_callback.dirpath):
+            self.embedding_manager.save(os.path.join(self.trainer.checkpoint_callback.dirpath, "embeddings.pt"))
+
+            if (self.global_step - self.emb_ckpt_counter) > 500:
+                self.embedding_manager.save(os.path.join(self.trainer.checkpoint_callback.dirpath, f"embeddings_gs-{self.global_step}.pt"))
+
+                self.emb_ckpt_counter += 500
+
 
 class DiffusionWrapper(pl.LightningModule):
     def __init__(self, diff_model_config, conditioning_key):
@@ -67,7 +67,7 @@ class KSampler(object):
         x = torch.randn([batch_size, *shape], device=self.device) * sigmas[0]  # for GPU draw
         model_wrap_cfg = CFGDenoiser(self.model)
         extra_args = {'cond': conditioning, 'uncond': unconditional_conditioning, 'cond_scale': unconditional_guidance_scale}
-        return (K.sampling.sample_lms(model_wrap_cfg, x, sigmas, extra_args=extra_args, disable=not self.accelerator.is_main_process),
+        return (K.sampling.__dict__[f'sample_{self.schedule}'](model_wrap_cfg, x, sigmas, extra_args=extra_args, disable=not self.accelerator.is_main_process),
                 None)
 
     def gather(samples_ddim):
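The one-line change above makes KSampler dispatch on its `schedule` name instead of always calling `sample_lms`. A minimal sketch of the same idiom (assuming the k-diffusion package is importable as `k_diffusion`, as the `K` alias suggests):

```py
import k_diffusion as K

def get_sampler_fn(schedule: str):
    # schedule names such as 'lms', 'euler', 'euler_ancestral', 'heun', 'dpm_2', 'dpm_2_ancestral'
    return K.sampling.__dict__[f'sample_{schedule}']

sample_fn = get_sampler_fn('lms')   # equivalent to K.sampling.sample_lms
```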
@@ -16,9 +16,6 @@ class PLMSSampler(object):
         self.schedule = schedule
 
     def register_buffer(self, name, attr):
-        if type(attr) == torch.Tensor:
-            if attr.device != torch.device("cuda"):
-                attr = attr.to(torch.device("cuda"))
         setattr(self, name, attr)
 
     def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
@@ -109,7 +109,7 @@ def checkpoint(func, inputs, params, flag):
     explicitly take as arguments.
     :param flag: if False, disable gradient checkpointing.
     """
-    if flag:
+    if False:  # disabled checkpointing to allow requires_grad = False for main model
         args = tuple(inputs) + tuple(params)
         return CheckpointFunction.apply(func, len(inputs), *args)
     else:
164
ldm/modules/embedding_manager.py
Normal file
@@ -0,0 +1,164 @@
from cmath import log
import torch
from torch import nn

import sys

from ldm.data.personalized import per_img_token_list
from transformers import CLIPTokenizer
from functools import partial

DEFAULT_PLACEHOLDER_TOKEN = ["*"]

PROGRESSIVE_SCALE = 2000

def get_clip_token_for_string(tokenizer, string):
    batch_encoding = tokenizer(string, truncation=True, max_length=77, return_length=True,
                               return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
    tokens = batch_encoding["input_ids"]
    assert torch.count_nonzero(tokens - 49407) == 2, f"String '{string}' maps to more than a single token. Please use another string"

    return tokens[0, 1]

def get_bert_token_for_string(tokenizer, string):
    token = tokenizer(string)
    # assert torch.count_nonzero(token) == 3, f"String '{string}' maps to more than a single token. Please use another string"

    token = token[0, 1]

    return token

def get_embedding_for_clip_token(embedder, token):
    return embedder(token.unsqueeze(0))[0, 0]


class EmbeddingManager(nn.Module):
    def __init__(
            self,
            embedder,
            placeholder_strings=None,
            initializer_words=None,
            per_image_tokens=False,
            num_vectors_per_token=1,
            progressive_words=False,
            **kwargs
    ):
        super().__init__()

        self.string_to_token_dict = {}

        self.string_to_param_dict = nn.ParameterDict()

        self.initial_embeddings = nn.ParameterDict()  # These should not be optimized

        self.progressive_words = progressive_words
        self.progressive_counter = 0

        self.max_vectors_per_token = num_vectors_per_token

        if hasattr(embedder, 'tokenizer'):  # using Stable Diffusion's CLIP encoder
            self.is_clip = True
            get_token_for_string = partial(get_clip_token_for_string, embedder.tokenizer)
            get_embedding_for_tkn = partial(get_embedding_for_clip_token, embedder.transformer.text_model.embeddings)
            token_dim = 1280
        else:  # using LDM's BERT encoder
            self.is_clip = False
            get_token_for_string = partial(get_bert_token_for_string, embedder.tknz_fn)
            get_embedding_for_tkn = embedder.transformer.token_emb
            token_dim = 1280

        if per_image_tokens:
            placeholder_strings.extend(per_img_token_list)

        for idx, placeholder_string in enumerate(placeholder_strings):

            token = get_token_for_string(placeholder_string)

            if initializer_words and idx < len(initializer_words):
                init_word_token = get_token_for_string(initializer_words[idx])

                with torch.no_grad():
                    init_word_embedding = get_embedding_for_tkn(init_word_token.cpu())

                token_params = torch.nn.Parameter(init_word_embedding.unsqueeze(0).repeat(num_vectors_per_token, 1), requires_grad=True)
                self.initial_embeddings[placeholder_string] = torch.nn.Parameter(init_word_embedding.unsqueeze(0).repeat(num_vectors_per_token, 1), requires_grad=False)
            else:
                token_params = torch.nn.Parameter(torch.rand(size=(num_vectors_per_token, token_dim), requires_grad=True))

            self.string_to_token_dict[placeholder_string] = token
            self.string_to_param_dict[placeholder_string] = token_params

    def forward(
            self,
            tokenized_text,
            embedded_text,
    ):
        b, n, device = *tokenized_text.shape, tokenized_text.device

        for placeholder_string, placeholder_token in self.string_to_token_dict.items():

            placeholder_embedding = self.string_to_param_dict[placeholder_string].to(device)

            if self.max_vectors_per_token == 1:  # If there's only one vector per token, we can do a simple replacement
                placeholder_idx = torch.where(tokenized_text == placeholder_token.to(device))
                embedded_text[placeholder_idx] = placeholder_embedding
            else:  # otherwise, need to insert and keep track of changing indices
                if self.progressive_words:
                    self.progressive_counter += 1
                    max_step_tokens = 1 + self.progressive_counter // PROGRESSIVE_SCALE
                else:
                    max_step_tokens = self.max_vectors_per_token

                num_vectors_for_token = min(placeholder_embedding.shape[0], max_step_tokens)

                placeholder_rows, placeholder_cols = torch.where(tokenized_text == placeholder_token.to(device))

                if placeholder_rows.nelement() == 0:
                    continue

                sorted_cols, sort_idx = torch.sort(placeholder_cols, descending=True)
                sorted_rows = placeholder_rows[sort_idx]

                for idx in range(len(sorted_rows)):
                    row = sorted_rows[idx]
                    col = sorted_cols[idx]

                    new_token_row = torch.cat([tokenized_text[row][:col], placeholder_token.repeat(num_vectors_for_token).to(device), tokenized_text[row][col + 1:]], axis=0)[:n]
                    new_embed_row = torch.cat([embedded_text[row][:col], placeholder_embedding[:num_vectors_for_token], embedded_text[row][col + 1:]], axis=0)[:n]

                    embedded_text[row] = new_embed_row
                    tokenized_text[row] = new_token_row

        return embedded_text

    def save(self, ckpt_path):
        torch.save({"string_to_token": self.string_to_token_dict,
                    "string_to_param": self.string_to_param_dict}, ckpt_path)

    def load(self, ckpt_path):
        ckpt = torch.load(ckpt_path, map_location='cpu')

        self.string_to_token_dict = ckpt["string_to_token"]
        self.string_to_param_dict = ckpt["string_to_param"]

    def get_embedding_norms_squared(self):
        all_params = torch.cat(list(self.string_to_param_dict.values()), axis=0)  # num_placeholders x embedding_dim
        param_norm_squared = (all_params * all_params).sum(axis=-1)  # num_placeholders

        return param_norm_squared

    def embedding_parameters(self):
        return self.string_to_param_dict.parameters()

    def embedding_to_coarse_loss(self):

        loss = 0.
        num_embeddings = len(self.initial_embeddings)

        for key in self.initial_embeddings:
            optimized = self.string_to_param_dict[key]
            coarse = self.initial_embeddings[key].clone().to(optimized.device)

            loss = loss + (optimized - coarse) @ (optimized - coarse).T / num_embeddings

        return loss
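`EmbeddingManager` is the heart of the personalization mechanism: before the text transformer runs, it swaps the embedding of the placeholder token ("*") for a learned vector. A minimal usage sketch (the `clip_embedder` variable and prompt are illustrative; in this commit the manager is normally built via `instantiate_embedding_manager` in ddpm.py):

```py
from ldm.modules.embedding_manager import EmbeddingManager

# `clip_embedder` is assumed to be a FrozenCLIPEmbedder instance: it exposes
# .tokenizer and .transformer, which is how EmbeddingManager picks the CLIP code path.
manager = EmbeddingManager(
    embedder=clip_embedder,
    placeholder_strings=["*"],
    initializer_words=["sculpture"],   # "*" starts out as the embedding of "sculpture"
)

# During training the manager is called inside the patched CLIP embedding forward:
#   inputs_embeds = manager(input_ids, inputs_embeds)
# and the learned vectors are checkpointed separately from the model weights:
manager.save("embeddings.pt")
manager.load("embeddings.pt")
```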
@@ -8,6 +8,27 @@ import kornia
 
 from ldm.modules.x_transformer import Encoder, TransformerWrapper  # TODO: can we directly rely on lucidrains code and simply add this as a reuirement? --> test
 
+def _expand_mask(mask, dtype, tgt_len = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+def _build_causal_attention_mask(bsz, seq_len, dtype):
+    # lazily create causal attention mask, with full attention between the vision tokens
+    # pytorch uses additive attention mask; fill with -inf
+    mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype)
+    mask.fill_(torch.tensor(torch.finfo(dtype).min))
+    mask.triu_(1)  # zero out the lower diagonal
+    mask = mask.unsqueeze(1)  # expand mask
+    return mask
 
 class AbstractEncoder(nn.Module):
     def __init__(self):

@@ -98,18 +119,17 @@ class BERTEmbedder(AbstractEncoder):
                               attn_layers=Encoder(dim=n_embed, depth=n_layer),
                               emb_dropout=embedding_dropout)
 
-    def forward(self, text):
+    def forward(self, text, embedding_manager=None):
         if self.use_tknz_fn:
             tokens = self.tknz_fn(text)#.to(self.device)
         else:
             tokens = text
-        z = self.transformer(tokens, return_embeddings=True)
+        z = self.transformer(tokens, return_embeddings=True, embedding_manager=embedding_manager)
         return z
 
-    def encode(self, text):
+    def encode(self, text, **kwargs):
         # output of length 77
-        return self(text)
+        return self(text, **kwargs)
 
 
 class SpatialRescaler(nn.Module):
     def __init__(self,

@@ -152,22 +172,165 @@ class FrozenCLIPEmbedder(AbstractEncoder):
         self.max_length = max_length
         self.freeze()
 
+        def embedding_forward(
+                self,
+                input_ids = None,
+                position_ids = None,
+                inputs_embeds = None,
+                embedding_manager = None,
+        ) -> torch.Tensor:
+
+            seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
+
+            if position_ids is None:
+                position_ids = self.position_ids[:, :seq_length]
+
+            if inputs_embeds is None:
+                inputs_embeds = self.token_embedding(input_ids)
+
+            if embedding_manager is not None:
+                inputs_embeds = embedding_manager(input_ids, inputs_embeds)
+
+            position_embeddings = self.position_embedding(position_ids)
+            embeddings = inputs_embeds + position_embeddings
+
+            return embeddings
+
+        self.transformer.text_model.embeddings.forward = embedding_forward.__get__(self.transformer.text_model.embeddings)
+
+        def encoder_forward(
+                self,
+                inputs_embeds,
+                attention_mask = None,
+                causal_attention_mask = None,
+                output_attentions = None,
+                output_hidden_states = None,
+                return_dict = None,
+        ):
+            output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+            output_hidden_states = (
+                output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+            )
+            return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+            encoder_states = () if output_hidden_states else None
+            all_attentions = () if output_attentions else None
+
+            hidden_states = inputs_embeds
+            for idx, encoder_layer in enumerate(self.layers):
+                if output_hidden_states:
+                    encoder_states = encoder_states + (hidden_states,)
+
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    causal_attention_mask,
+                    output_attentions=output_attentions,
+                )
+
+                hidden_states = layer_outputs[0]
+
+                if output_attentions:
+                    all_attentions = all_attentions + (layer_outputs[1],)
+
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+
+            return hidden_states
+
+        self.transformer.text_model.encoder.forward = encoder_forward.__get__(self.transformer.text_model.encoder)
+
+        def text_encoder_forward(
+                self,
+                input_ids = None,
+                attention_mask = None,
+                position_ids = None,
+                output_attentions = None,
+                output_hidden_states = None,
+                return_dict = None,
+                embedding_manager = None,
+        ):
+            output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+            output_hidden_states = (
+                output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+            )
+            return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+            if input_ids is None:
+                raise ValueError("You have to specify either input_ids")
+
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+
+            hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids, embedding_manager=embedding_manager)
+
+            bsz, seq_len = input_shape
+            # CLIP's text model uses causal mask, prepare it here.
+            # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+            causal_attention_mask = _build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to(
+                hidden_states.device
+            )
+
+            # expand attention_mask
+            if attention_mask is not None:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
+
+            last_hidden_state = self.encoder(
+                inputs_embeds=hidden_states,
+                attention_mask=attention_mask,
+                causal_attention_mask=causal_attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+
+            last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+            return last_hidden_state
+
+        self.transformer.text_model.forward = text_encoder_forward.__get__(self.transformer.text_model)
+
+        def transformer_forward(
+                self,
+                input_ids = None,
+                attention_mask = None,
+                position_ids = None,
+                output_attentions = None,
+                output_hidden_states = None,
+                return_dict = None,
+                embedding_manager = None,
+        ):
+            return self.text_model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                embedding_manager = embedding_manager
+            )
+
+        self.transformer.forward = transformer_forward.__get__(self.transformer)
+
     def freeze(self):
         self.transformer = self.transformer.eval()
         for param in self.parameters():
             param.requires_grad = False
 
-    def forward(self, text):
+    def forward(self, text, **kwargs):
         batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
                                         return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
         tokens = batch_encoding["input_ids"].to(self.device)
-        outputs = self.transformer(input_ids=tokens)
-
-        z = outputs.last_hidden_state
+        z = self.transformer(input_ids=tokens, **kwargs)
         return z
 
-    def encode(self, text):
-        return self(text)
+    def encode(self, text, **kwargs):
+        return self(text, **kwargs)
 
 
 class FrozenCLIPTextEmbedder(nn.Module):
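The FrozenCLIPEmbedder changes above work by defining replacement forward functions inside `__init__` and binding them onto the already-constructed Hugging Face submodules with `__get__`. A minimal, self-contained sketch of that binding idiom (the class and method here are illustrative, not from the diff):

```py
class Greeter:
    def forward(self):
        return "hello"

def patched_forward(self):
    # `self` is the Greeter instance the function gets bound to
    return "hello, with an embedding manager in the loop"

g = Greeter()
# function.__get__(instance) produces a bound method; this is exactly how the
# embedding/encoder/text_model forwards are swapped out above.
g.forward = patched_forward.__get__(g)
print(g.forward())  # -> "hello, with an embedding manager in the loop"
```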
@@ -485,7 +485,8 @@ class AttentionLayers(nn.Module):
         mask=None,
         context_mask=None,
         mems=None,
-        return_hiddens=False
+        return_hiddens=False,
+        **kwargs
     ):
         hiddens = []
         intermediates = []

@@ -603,11 +604,19 @@ class TransformerWrapper(nn.Module):
         return_mems=False,
         return_attn=False,
         mems=None,
+        embedding_manager=None,
         **kwargs
     ):
         b, n, device, num_mem = *x.shape, x.device, self.num_memory_tokens
-        x = self.token_emb(x)
-        x += self.pos_emb(x)
+
+        embedded_x = self.token_emb(x)
+
+        if embedding_manager:
+            x = embedding_manager(x, embedded_x)
+        else:
+            x = embedded_x
+
+        x = x + self.pos_emb(x)
         x = self.emb_dropout(x)
 
         x = self.project_emb(x)
@@ -1,3 +1,10 @@
+# Copyright (c) 2022 Lincoln D. Stein (https://github.com/lstein)
+
+# Derived from source code carrying the following copyrights
+# Copyright (c) 2022 Machine Vision and Learning Group, LMU Munich
+# Copyright (c) 2022 Robin Rombach and Patrick Esser and contributors
+
+
 """Simplified text to image API for stable diffusion/latent diffusion

 Example Usage:

@@ -11,7 +18,7 @@ t2i = T2I(outdir = <path> // outputs/txt2img-samples
           batch_size   = <integer>   // how many images to generate per sampling (1)
           steps        = <integer>   // 50
           seed         = <integer>   // current system time
-          sampler_name = ['ddim','plms','klms']  // klms
+          sampler_name = ['ddim', 'k_dpm_2_a', 'k_dpm_2', 'k_euler_a', 'k_euler', 'k_heun', 'k_lms', 'plms']  // k_lms
           grid         = <boolean>   // false
           width        = <integer>   // image width, multiple of 64 (512)
           height       = <integer>   // image height, multiple of 64 (512)
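
A hedged usage sketch based on the docstring above, showing the renamed k_lms sampler and the new embedding_path argument; the import path and file locations are assumptions drawn from this diff, not guaranteed by it:

from ldm.simplet2i import T2I  # module path assumed from this fork's layout

t2i = T2I(
    outdir='outputs/txt2img-samples',
    sampler_name='k_lms',                    # default k-diffusion sampler after this change
    embedding_path='/path/to/embedding.pt',  # illustrative path to a trained embedding manager checkpoint
)
results = t2i.txt2img('a photo of *', steps=50)

The "*" placeholder in the prompt mirrors the default placeholder token used by the embedding manager elsewhere in this change.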
@@ -51,6 +58,7 @@ import sys
 import os
 from omegaconf import OmegaConf
 from PIL import Image
+import PIL
 from tqdm import tqdm, trange
 from itertools import islice
 from einops import rearrange, repeat

@@ -89,6 +97,7 @@ class T2I:
     downsampling_factor
     precision
     strength
+    embedding_path

 The vast majority of these arguments default to reasonable values.
 """

@@ -113,7 +122,9 @@ The vast majority of these arguments default to reasonable values.
                  precision='autocast',
                  full_precision=False,
                  strength=0.75, # default in scripts/img2img.py
-                 latent_diffusion_weights=False # just to keep track of this parameter when regenerating prompt
+                 embedding_path=None,
+                 latent_diffusion_weights=False, # just to keep track of this parameter when regenerating prompt
+                 device='cuda'
     ):
         self.outdir = outdir
         self.batch_size = batch_size

@@ -133,17 +144,20 @@ The vast majority of these arguments default to reasonable values.
         self.precision = precision
         self.full_precision = full_precision
         self.strength = strength
+        self.embedding_path = embedding_path
         self.model = None # empty for now
         self.sampler = None
         self.latent_diffusion_weights=latent_diffusion_weights
+        self.device = device
         if seed is None:
             self.seed = self._new_seed()
         else:
             self.seed = seed

+    @torch.no_grad()
     def txt2img(self,prompt,outdir=None,batch_size=None,iterations=None,
                 steps=None,seed=None,grid=None,individual=None,width=None,height=None,
-                cfg_scale=None,ddim_eta=None,strength=None,init_img=None,
+                cfg_scale=None,ddim_eta=None,strength=None,embedding_path=None,init_img=None,
                 skip_normalize=False,variants=None):
         """
         Generate an image from the prompt, writing iteration images into the outdir

@@ -159,9 +173,13 @@ The vast majority of these arguments default to reasonable values.
         batch_size = batch_size or self.batch_size
         iterations = iterations or self.iterations
         strength = strength or self.strength # not actually used here, but preserved for code refactoring
+        embedding_path = embedding_path or self.embedding_path

         model = self.load_model()  # will instantiate the model or return it from cache

+        assert strength<1.0 and strength>=0.0, "strength (-f) must be >=0.0 and <1.0"
+        assert cfg_scale>1.0, "CFG_Scale (-C) must be >1.0"
+
         # grid and individual are mutually exclusive, with individual taking priority.
         # not necessary, but needed for compatability with dream bot
         if (grid is None):
@@ -192,9 +210,7 @@ The vast majority of these arguments default to reasonable values.

         # Gawd. Too many levels of indent here. Need to refactor into smaller routines!
         try:
-            with torch.no_grad():
-                with precision_scope("cuda"):
-                    with model.ema_scope():
+            with precision_scope(self.device.type), model.ema_scope():
                 all_samples = list()
                 for n in trange(iterations, desc="Sampling"):
                     seed_everything(seed)
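
The collapsed with-statement above relies on Python allowing several context managers in one statement (the dropped torch.no_grad() block is covered by the @torch.no_grad() decorator added to txt2img and img2img in this diff). A minimal sketch of the equivalence, with illustrative names that are not from this repository:

from contextlib import contextmanager

@contextmanager
def scope(name):
    print(f"enter {name}")
    yield
    print(f"exit {name}")

# Both forms enter and exit the managers in the same order.
with scope("precision"):
    with scope("ema"):
        pass

with scope("precision"), scope("ema"):
    pass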
@@ -267,9 +283,10 @@ The vast majority of these arguments default to reasonable values.
         return images

     # There is lots of shared code between this and txt2img and should be refactored.
+    @torch.no_grad()
     def img2img(self,prompt,outdir=None,init_img=None,batch_size=None,iterations=None,
                 steps=None,seed=None,grid=None,individual=None,width=None,height=None,
-                cfg_scale=None,ddim_eta=None,strength=None,skip_normalize=False,variants=None):
+                cfg_scale=None,ddim_eta=None,strength=None,skip_normalize=False):
         """
         Generate an image from the prompt and the initial image, writing iteration images into the outdir
         The output is a list of lists in the format: [[filename1,seed1], [filename2,seed2],...]

@@ -282,6 +299,10 @@ The vast majority of these arguments default to reasonable values.
         batch_size = batch_size or self.batch_size
         iterations = iterations or self.iterations
         strength = strength or self.strength
+        embedding_path = embedding_path or self.embedding_path
+
+        assert strength<1.0 and strength>=0.0, "strength (-f) must be >=0.0 and <1.0"
+        assert cfg_scale>1.0, "CFG_Scale (-C) must be >1.0"

         if init_img is None:
             print("no init_img provided!")

@@ -313,7 +334,7 @@ The vast majority of these arguments default to reasonable values.
         assert os.path.isfile(init_img)
         init_image = self._load_img(init_img).to(self.device)
         init_image = repeat(init_image, '1 ... -> b ...', b=batch_size)
-        with precision_scope("cuda"):
+        with precision_scope(self.device.type):
             init_latent = model.get_first_stage_encoding(model.encode_first_stage(init_image))  # move to latent space

         sampler.make_schedule(ddim_num_steps=steps, ddim_eta=ddim_eta, verbose=False)

@@ -335,9 +356,7 @@ The vast majority of these arguments default to reasonable values.

         # Gawd. Too many levels of indent here. Need to refactor into smaller routines!
         try:
-            with torch.no_grad():
-                with precision_scope("cuda"):
-                    with model.ema_scope():
+            with precision_scope(self.device.type), model.ema_scope():
                 all_samples = list()
                 for n in trange(iterations, desc="Sampling"):
                     seed_everything(seed)
@@ -430,25 +449,39 @@ The vast majority of these arguments default to reasonable values.
         seed_everything(self.seed)
         try:
             config = OmegaConf.load(self.config)
-            self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+            self.device = torch.device(self.device) if torch.cuda.is_available() else torch.device("cpu")
             model = self._load_model_from_config(config,self.weights)
+            if self.embedding_path is not None:
+                model.embedding_manager.load(self.embedding_path)
             self.model = model.to(self.device)
+            # model.to doesn't change the cond_stage_model.device used to move the tokenizer output, so set it here
+            self.model.cond_stage_model.device = self.device
         except AttributeError:
             raise SystemExit

+        msg = f'setting sampler to {self.sampler_name}'
         if self.sampler_name=='plms':
-            print("setting sampler to plms")
             self.sampler = PLMSSampler(self.model)
         elif self.sampler_name == 'ddim':
-            print("setting sampler to ddim")
             self.sampler = DDIMSampler(self.model)
-        elif self.sampler_name == 'klms':
-            print("setting sampler to klms")
+        elif self.sampler_name == 'k_dpm_2_a':
+            self.sampler = KSampler(self.model,'dpm_2_ancestral')
+        elif self.sampler_name == 'k_dpm_2':
+            self.sampler = KSampler(self.model,'dpm_2')
+        elif self.sampler_name == 'k_euler_a':
+            self.sampler = KSampler(self.model,'euler_ancestral')
+        elif self.sampler_name == 'k_euler':
+            self.sampler = KSampler(self.model,'euler')
+        elif self.sampler_name == 'k_heun':
+            self.sampler = KSampler(self.model,'heun')
+        elif self.sampler_name == 'k_lms':
             self.sampler = KSampler(self.model,'lms')
         else:
-            print(f"unsupported sampler {self.sampler_name}, defaulting to plms")
+            msg = f'unsupported sampler {self.sampler_name}, defaulting to plms'
             self.sampler = PLMSSampler(self.model)

+        print(msg)
+
         return self.model

     def _load_model_from_config(self, config, ckpt):
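
The chain of elif branches above maps the user-facing sampler names onto k-diffusion sampler identifiers. A table-driven equivalent is sketched below, assuming the same PLMSSampler, DDIMSampler and KSampler interfaces shown in the diff (the helper itself is not part of this commit):

# Sketch: CLI sampler name -> k-diffusion sampler id, with a PLMS fallback.
K_SAMPLERS = {
    'k_dpm_2_a': 'dpm_2_ancestral',
    'k_dpm_2':   'dpm_2',
    'k_euler_a': 'euler_ancestral',
    'k_euler':   'euler',
    'k_heun':    'heun',
    'k_lms':     'lms',
}

def make_sampler(model, name):
    if name == 'plms':
        return PLMSSampler(model)
    if name == 'ddim':
        return DDIMSampler(model)
    if name in K_SAMPLERS:
        return KSampler(model, K_SAMPLERS[name])
    print(f'unsupported sampler {name}, defaulting to plms')
    return PLMSSampler(model)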
@@ -459,7 +492,6 @@ The vast majority of these arguments default to reasonable values.
         sd = pl_sd["state_dict"]
         model = instantiate_from_config(config.model)
         m, u = model.load_state_dict(sd, strict=False)
-        model.cuda()
         model.eval()
         if self.full_precision:
             print('Using slower but more accurate full-precision math (--full_precision)')

@@ -473,7 +505,7 @@ The vast majority of these arguments default to reasonable values.
         w, h = image.size
         print(f"loaded input image of size ({w}, {h}) from {path}")
         w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-        image = image.resize((w, h), resample=Image.Resampling.LANCZOS)
+        image = image.resize((w, h), resample=PIL.Image.LANCZOS)
         image = np.array(image).astype(np.float32) / 255.0
         image = image[None].transpose(0, 3, 1, 2)
         image = torch.from_numpy(image)
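
The switch from Image.Resampling.LANCZOS to PIL.Image.LANCZOS trades the newer Pillow 9.1+ enum for the older module-level constant. If both Pillow generations need to be supported, a version-tolerant lookup is possible; this is only a sketch, not part of this commit:

from PIL import Image

# Pillow >= 9.1 exposes Image.Resampling.LANCZOS; older releases expose Image.LANCZOS.
LANCZOS = getattr(Image, "Resampling", Image).LANCZOS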
@@ -12,6 +12,7 @@ from queue import Queue

 from inspect import isfunction
 from PIL import Image, ImageDraw, ImageFont

 def log_txt_as_img(wh, xc, size=10):
     # wh a tuple of (width, height)
     # xc a list of captions to plot

@@ -20,7 +21,7 @@ def log_txt_as_img(wh, xc, size=10):
     for bi in range(b):
         txt = Image.new("RGB", wh, color="white")
         draw = ImageDraw.Draw(txt)
-        font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
+        font = ImageFont.load_default()
         nc = int(40 * (wh[0] / 256))
         lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))

@@ -73,14 +74,14 @@ def count_params(model, verbose=False):
     return total_params


-def instantiate_from_config(config):
+def instantiate_from_config(config, **kwargs):
     if not "target" in config:
         if config == '__is_first_stage__':
             return None
         elif config == "__is_unconditional__":
             return None
         raise KeyError("Expected key `target` to instantiate.")
-    return get_obj_from_str(config["target"])(**config.get("params", dict()))
+    return get_obj_from_str(config["target"])(**config.get("params", dict()), **kwargs)


 def get_obj_from_str(string, reload=False):
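
With the extra **kwargs, callers can forward constructor arguments that are not present in the config itself. A small self-contained illustration, assuming ldm.util is importable; the Linear target is an arbitrary example, not a config used by this repository:

from ldm.util import instantiate_from_config

config = {
    "target": "torch.nn.Linear",
    "params": {"in_features": 4, "out_features": 2},
}

# 'bias' is not in the config; it is forwarded through **kwargs.
layer = instantiate_from_config(config, bias=False)
print(layer)  # Linear(in_features=4, out_features=2, bias=False)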
62 main.py

@@ -2,6 +2,7 @@ import argparse, os, sys, datetime, glob, importlib, csv
 import numpy as np
 import time
 import torch

 import torchvision
 import pytorch_lightning as pl

@@ -20,6 +21,22 @@ from pytorch_lightning.utilities import rank_zero_info
 from ldm.data.base import Txt2ImgIterableBaseDataset
 from ldm.util import instantiate_from_config

+def load_model_from_config(config, ckpt, verbose=False):
+    print(f"Loading model from {ckpt}")
+    pl_sd = torch.load(ckpt, map_location="cpu")
+    sd = pl_sd["state_dict"]
+    config.model.params.ckpt_path = ckpt
+    model = instantiate_from_config(config.model)
+    m, u = model.load_state_dict(sd, strict=False)
+    if len(m) > 0 and verbose:
+        print("missing keys:")
+        print(m)
+    if len(u) > 0 and verbose:
+        print("unexpected keys:")
+        print(u)
+
+    model.cuda()
+    return model

 def get_parser(**parser_kwargs):
     def str2bool(v):
@@ -120,6 +137,23 @@ def get_parser(**parser_kwargs):
         default=True,
         help="scale base-lr by ngpu * batch_size * n_accumulate",
     )
+
+    parser.add_argument(
+        "--datadir_in_name",
+        type=str2bool,
+        nargs="?",
+        const=True,
+        default=True,
+        help="Prepend the final directory in the data_root to the output directory name")
+
+    parser.add_argument("--actual_resume", type=str, default="", help="Path to model to actually resume from")
+    parser.add_argument("--data_root", type=str, required=True, help="Path to directory with training images")
+
+    parser.add_argument("--embedding_manager_ckpt", type=str, default="", help="Initialize embedding manager from a checkpoint")
+    parser.add_argument("--placeholder_tokens", type=str, nargs="+", default=["*"])
+
+    parser.add_argument("--init_word", type=str, help="Word to use as source for initial token embedding.")
+
     return parser


@@ -502,6 +536,10 @@ if __name__ == "__main__":
             name = "_" + cfg_name
         else:
             name = ""
+
+        if opt.datadir_in_name:
+            now = os.path.basename(os.path.normpath(opt.data_root)) + now
+
         nowname = now + name + opt.postfix
         logdir = os.path.join(opt.logdir, nowname)

@@ -532,6 +570,17 @@ if __name__ == "__main__":
         lightning_config.trainer = trainer_config

         # model
+
+        # config.model.params.personalization_config.params.init_word = opt.init_word
+        config.model.params.personalization_config.params.embedding_manager_ckpt = opt.embedding_manager_ckpt
+        config.model.params.personalization_config.params.placeholder_tokens = opt.placeholder_tokens
+
+        if opt.init_word:
+            config.model.params.personalization_config.params.initializer_words[0] = opt.init_word
+
+        if opt.actual_resume:
+            model = load_model_from_config(config, opt.actual_resume)
+        else:
             model = instantiate_from_config(config.model)

         # trainer and callbacks

@@ -655,11 +704,16 @@ if __name__ == "__main__":
                 del callbacks_cfg['ignore_keys_callback']

         trainer_kwargs["callbacks"] = [instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg]
+        trainer_kwargs["max_steps"] = opt.max_steps
+
         trainer = Trainer.from_argparse_args(trainer_opt, **trainer_kwargs)
         trainer.logdir = logdir  ###

         # data
+        config.data.params.train.params.data_root = opt.data_root
+        config.data.params.validation.params.data_root = opt.data_root
+        data = instantiate_from_config(config.data)
+
         data = instantiate_from_config(config.data)
         # NOTE according to https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
         # calling these ourselves should not be necessary but it is.
@@ -710,8 +764,8 @@ if __name__ == "__main__":

         import signal

-        signal.signal(signal.SIGUSR1, melk)
-        signal.signal(signal.SIGUSR2, divein)
+        signal.signal(signal.SIGTERM, melk)
+        signal.signal(signal.SIGTERM, divein)

         # run
         if opt.train:

@@ -737,5 +791,5 @@ if __name__ == "__main__":
             dst = os.path.join(dst, "debug_runs", name)
             os.makedirs(os.path.split(dst)[0], exist_ok=True)
             os.rename(logdir, dst)
-        if trainer.global_rank == 0:
-            print(trainer.profiler.summary())
+        # if trainer.global_rank == 0:
+        #     print(trainer.profiler.summary())
@@ -9,6 +9,7 @@ kornia==0.6.0
 numpy==1.19.2
 omegaconf==2.1.1
 opencv-python==4.1.2.30
+pillow==9.0.1
 pudb==2019.2
 pytorch
 pytorch-lightning==1.4.2

@@ -1,4 +1,6 @@
 #!/usr/bin/env python3
+# Copyright (c) 2022 Lincoln D. Stein (https://github.com/lstein)
+
 import argparse
 import shlex
 import atexit

@@ -58,7 +60,9 @@ def main():
                weights=weights,
                full_precision=opt.full_precision,
                config=config,
-               latent_diffusion_weights=opt.laion400m  # this is solely for recreating the prompt
+               latent_diffusion_weights=opt.laion400m,  # this is solely for recreating the prompt
+               embedding_path=opt.embedding_path,
+               device=opt.device
     )

     # make sure the output directory exists
@@ -108,6 +112,10 @@ def main_loop(t2i,parser,log,infile):
         if command.startswith(('#','//')):
             continue

+        # before splitting, escape single quotes so as not to mess
+        # up the parser
+        command = command.replace("'","\\'")
+
         try:
             elements = shlex.split(command)
         except ValueError as e:
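
The escaping matters because shlex treats an unmatched apostrophe as an open quotation. A quick illustration of the failure mode and of the escaped form (the prompt text is just an example):

import shlex

try:
    shlex.split("a portrait of dad's cat -n2")
except ValueError as e:
    print(e)                 # No closing quotation

escaped = "a portrait of dad's cat -n2".replace("'", "\\'")
print(shlex.split(escaped))  # ['a', 'portrait', 'of', "dad's", 'cat', '-n2']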
@@ -159,10 +167,16 @@ def main_loop(t2i,parser,log,infile):
             print("Try again with a prompt!")
             continue

+        try:
             if opt.init_img is None:
                 results = t2i.txt2img(**vars(opt))
             else:
+                assert os.path.exists(opt.init_img),f"No file found at {opt.init_img}. On Linux systems, pressing <tab> after -I will autocomplete a list of possible image files."
                 results = t2i.img2img(**vars(opt))
+        except AssertionError as e:
+            print(e)
+            continue
+
         allVariantResults = []
         if opt.variants is not None:

@@ -175,7 +189,11 @@ def main_loop(t2i,parser,log,infile):
             for j in range(0, opt.variants):
                 newopt.init_img = resultPath
                 print(f"{newopt.init_img}")
+                try:
                     variantResults = t2i.img2img(**vars(newopt))
+                except AssertionError as e:
+                    print(e)
+                    continue
                 allVariantResults.append([newopt,variantResults])
             print(f"{opt.variants} Variants generated!")

@@ -242,6 +260,7 @@ def _reconstruct_switches(t2i,opt):
     switches.append(f'-W{opt.width or t2i.width}')
     switches.append(f'-H{opt.height or t2i.height}')
     switches.append(f'-C{opt.cfg_scale or t2i.cfg_scale}')
+    switches.append(f'-m{t2i.sampler_name}')
     if opt.init_img:
         switches.append(f'-I{opt.init_img}')
     if opt.strength and opt.init_img is not None:
@@ -282,14 +301,22 @@ def create_argv_parser():
                         help="number of images to produce per iteration (faster, but doesn't generate individual seeds")
     parser.add_argument('--sampler','-m',
                         dest="sampler_name",
-                        choices=['plms','ddim', 'klms'],
-                        default='klms',
-                        help="which sampler to use (klms) - can only be set on command line")
+                        choices=['ddim', 'k_dpm_2_a', 'k_dpm_2', 'k_euler_a', 'k_euler', 'k_heun', 'k_lms', 'plms'],
+                        default='k_lms',
+                        help="which sampler to use (k_lms) - can only be set on command line")
     parser.add_argument('--outdir',
                         '-o',
                         type=str,
                         default="outputs/img-samples",
                         help="directory in which to place generated images and a log of prompts and seeds")
+    parser.add_argument('--embedding_path',
+                        type=str,
+                        help="Path to a pre-trained embedding manager checkpoint - can only be set on command line")
+    parser.add_argument('--device',
+                        '-d',
+                        type=str,
+                        default="cuda",
+                        help="device to run stable diffusion on. defaults to cuda `torch.cuda.current_device()` if avalible")
     return parser


@@ -397,3 +424,4 @@ if readline_available:

 if __name__ == "__main__":
     main()

83 scripts/merge_embeddings.py Normal file

@@ -0,0 +1,83 @@
+from ldm.modules.encoders.modules import BERTTokenizer
+from ldm.modules.embedding_manager import EmbeddingManager
+
+import argparse, os
+from functools import partial
+
+import torch
+
+def get_placeholder_loop(placeholder_string, tokenizer):
+
+    new_placeholder = None
+
+    while True:
+        if new_placeholder is None:
+            new_placeholder = input(f"Placeholder string {placeholder_string} was already used. Please enter a replacement string: ")
+        else:
+            new_placeholder = input(f"Placeholder string '{new_placeholder}' maps to more than a single token. Please enter another string: ")
+
+        token = tokenizer(new_placeholder)
+
+        if torch.count_nonzero(token) == 3:
+            return new_placeholder, token[0, 1]
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--manager_ckpts",
+        type=str,
+        nargs="+",
+        required=True,
+        help="Paths to a set of embedding managers to be merged."
+    )
+
+    parser.add_argument(
+        "--output_path",
+        type=str,
+        required=True,
+        help="Output path for the merged manager",
+    )
+
+    args = parser.parse_args()
+
+    tokenizer = BERTTokenizer(vq_interface=False, max_length=77)
+    EmbeddingManager = partial(EmbeddingManager, tokenizer, ["*"])
+
+    string_to_token_dict = {}
+    string_to_param_dict = torch.nn.ParameterDict()
+
+    placeholder_to_src = {}
+
+    for manager_ckpt in args.manager_ckpts:
+        print(f"Parsing {manager_ckpt}...")
+
+        manager = EmbeddingManager()
+        manager.load(manager_ckpt)
+
+        for placeholder_string in manager.string_to_token_dict:
+            if not placeholder_string in string_to_token_dict:
+                string_to_token_dict[placeholder_string] = manager.string_to_token_dict[placeholder_string]
+                string_to_param_dict[placeholder_string] = manager.string_to_param_dict[placeholder_string]
+
+                placeholder_to_src[placeholder_string] = manager_ckpt
+            else:
+                new_placeholder, new_token = get_placeholder_loop(placeholder_string, tokenizer)
+                string_to_token_dict[new_placeholder] = new_token
+                string_to_param_dict[new_placeholder] = manager.string_to_param_dict[placeholder_string]
+
+                placeholder_to_src[new_placeholder] = manager_ckpt
+
+    print("Saving combined manager...")
+    merged_manager = EmbeddingManager()
+    merged_manager.string_to_param_dict = string_to_param_dict
+    merged_manager.string_to_token_dict = string_to_token_dict
+    merged_manager.save(args.output_path)
+
+    print("Managers merged. Final list of placeholders: ")
+    print(placeholder_to_src)
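
For reference, the new script would typically be invoked with two or more trained embedding-manager checkpoints, for example (paths are illustrative):

python scripts/merge_embeddings.py --manager_ckpts /path/to/first.pt /path/to/second.pt --output_path /path/to/merged.pt

Each placeholder string is carried over unless it collides with one already merged, in which case the script prompts for a replacement string via get_placeholder_loop above.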
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# Copyright (c) 2022 Lincoln D. Stein (https://github.com/lstein)
 # Before running stable-diffusion on an internet-isolated machine,
 # run this script from one with internet connectivity. The
 # two machines must share a common .cache directory.

@@ -30,3 +31,4 @@ tokenizer =CLIPTokenizer.from_pretrained(version)
 transformer=CLIPTextModel.from_pretrained(version)
 print('\n\n...success')

1 src/clip Submodule
@@ -0,0 +1 @@
+Subproject commit d50d76daa670286dd6cacf3bcd80b5e4823fc8e1

1 src/k-diffusion Submodule
@@ -0,0 +1 @@
+Subproject commit db5799068749bf3a6d5845120ed32df16b7d883b

1 src/taming-transformers Submodule
@@ -0,0 +1 @@
+Subproject commit 24268930bf1dce879235a7fddd0b2355b84d7ea6