mirror of
https://github.com/invoke-ai/InvokeAI
synced 2024-08-30 20:32:17 +00:00
Merge branch 'main' into dev/installer
This commit is contained in:
commit
8ce9f07223
@ -1,19 +1,18 @@
|
||||
*
|
||||
!assets/caution.png
|
||||
!backend
|
||||
!environments-and-requirements
|
||||
!frontend
|
||||
!frontend/dist
|
||||
!ldm
|
||||
!main.py
|
||||
!pyproject.toml
|
||||
!README.md
|
||||
!scripts
|
||||
!server
|
||||
!static
|
||||
!setup.py
|
||||
|
||||
# Guard against pulling in any models that might exist in the directory tree
|
||||
**/*.pt*
|
||||
**.pt*
|
||||
|
||||
# unignore configs, but only ignore the custom models.yaml, in case it exists
|
||||
!configs
|
||||
configs/models.yaml
|
||||
configs/models.yaml.orig
|
||||
|
||||
**/__pycache__
|
||||
|
1
.github/workflows/build-cloud-img.yml
vendored
1
.github/workflows/build-cloud-img.yml
vendored
@ -21,6 +21,7 @@ env:
|
||||
|
||||
jobs:
|
||||
docker:
|
||||
if: github.event.pull_request.draft == false
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
|
35
.github/workflows/build-container.yml
vendored
35
.github/workflows/build-container.yml
vendored
@ -3,63 +3,60 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- 'main'
|
||||
tags:
|
||||
- 'v*.*.*'
|
||||
|
||||
jobs:
|
||||
docker:
|
||||
if: github.event.pull_request.draft == false
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
registry:
|
||||
- ghcr.io
|
||||
flavor:
|
||||
- amd
|
||||
- cuda
|
||||
# - cloud
|
||||
include:
|
||||
- flavor: amd
|
||||
pip-requirements: requirements-lin-amd.txt
|
||||
pip-extra-index-url: 'https://download.pytorch.org/whl/rocm5.2'
|
||||
dockerfile: docker-build/Dockerfile
|
||||
platforms: linux/amd64,linux/arm64
|
||||
- flavor: cuda
|
||||
pip-requirements: requirements-lin-cuda.txt
|
||||
pip-extra-index-url: ''
|
||||
dockerfile: docker-build/Dockerfile
|
||||
platforms: linux/amd64,linux/arm64
|
||||
# - flavor: cloud
|
||||
# pip-requirements: requirements-lin-cuda.txt
|
||||
# dockerfile: docker-build/Dockerfile.cloud
|
||||
# platforms: linux/amd64
|
||||
runs-on: ubuntu-latest
|
||||
name: ${{ matrix.flavor }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
|
||||
- name: Docker meta
|
||||
id: meta
|
||||
uses: docker/metadata-action@v4
|
||||
with:
|
||||
images: ${{ matrix.registry }}/${{ github.repository }}-${{ matrix.flavor }}
|
||||
images: ghcr.io/${{ github.repository }}-${{ matrix.flavor }}
|
||||
tags: |
|
||||
type=ref,event=branch
|
||||
type=ref,event=tag
|
||||
type=semver,pattern={{version}}
|
||||
type=semver,pattern={{major}}.{{minor}}
|
||||
type=semver,pattern={{major}}
|
||||
type=sha
|
||||
flavor: |
|
||||
latest=true
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
|
||||
- if: github.event_name != 'pull_request'
|
||||
name: Docker login
|
||||
- name: Login to GitHub Container Registry
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
registry: ${{ matrix.registry }}
|
||||
username: ${{ github.actor }}
|
||||
registry: ghcr.io
|
||||
username: ${{ github.repository_owner }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Build container
|
||||
@ -71,4 +68,6 @@ jobs:
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
build-args: pip_requirements=${{ matrix.pip-requirements }}
|
||||
build-args: PIP_EXTRA_INDEX_URL=${{ matrix.pip-extra-index-url }}
|
||||
# cache-from: type=gha
|
||||
# cache-to: type=gha,mode=max
|
||||
|
34
.github/workflows/clean-caches.yml
vendored
Normal file
34
.github/workflows/clean-caches.yml
vendored
Normal file
@ -0,0 +1,34 @@
|
||||
name: cleanup caches by a branch
|
||||
on:
|
||||
pull_request:
|
||||
types:
|
||||
- closed
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
cleanup:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Cleanup
|
||||
run: |
|
||||
gh extension install actions/gh-actions-cache
|
||||
|
||||
REPO=${{ github.repository }}
|
||||
BRANCH=${{ github.ref }}
|
||||
|
||||
echo "Fetching list of cache key"
|
||||
cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH | cut -f 1 )
|
||||
|
||||
## Setting this to not fail the workflow while deleting cache keys.
|
||||
set +e
|
||||
echo "Deleting caches..."
|
||||
for cacheKey in $cacheKeysForPR
|
||||
do
|
||||
gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm
|
||||
done
|
||||
echo "Done"
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
1
.github/workflows/lint-frontend.yml
vendored
1
.github/workflows/lint-frontend.yml
vendored
@ -14,6 +14,7 @@ defaults:
|
||||
|
||||
jobs:
|
||||
lint-frontend:
|
||||
if: github.event.pull_request.draft == false
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Setup Node 18
|
||||
|
1
.github/workflows/mkdocs-material.yml
vendored
1
.github/workflows/mkdocs-material.yml
vendored
@ -7,6 +7,7 @@ on:
|
||||
|
||||
jobs:
|
||||
mkdocs-material:
|
||||
if: github.event.pull_request.draft == false
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: checkout sources
|
||||
|
1
.github/workflows/pyflakes.yml
vendored
1
.github/workflows/pyflakes.yml
vendored
@ -9,6 +9,7 @@ on:
|
||||
jobs:
|
||||
pyflakes:
|
||||
name: runner / pyflakes
|
||||
if: github.event.pull_request.draft == false
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
@ -1,59 +1,71 @@
|
||||
FROM python:3.10-slim AS builder
|
||||
# syntax=docker/dockerfile:1
|
||||
FROM python:3.9-slim AS python-base
|
||||
|
||||
# use bash
|
||||
SHELL [ "/bin/bash", "-c" ]
|
||||
|
||||
# Install necesarry packages
|
||||
RUN apt-get update \
|
||||
RUN \
|
||||
--mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||
apt-get update \
|
||||
&& apt-get install -y \
|
||||
--no-install-recommends \
|
||||
libgl1-mesa-glx=20.3.* \
|
||||
libglib2.0-0=2.66.* \
|
||||
libopencv-dev=4.5.* \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ARG APPDIR=/usr/src/app
|
||||
ENV APPDIR ${APPDIR}
|
||||
WORKDIR ${APPDIR}
|
||||
|
||||
FROM python-base AS builder
|
||||
|
||||
RUN \
|
||||
--mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||
apt-get update \
|
||||
&& apt-get install -y \
|
||||
--no-install-recommends \
|
||||
gcc=4:10.2.* \
|
||||
libgl1-mesa-glx=20.3.* \
|
||||
libglib2.0-0=2.66.* \
|
||||
python3-dev=3.9.* \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# set WORKDIR, PATH and copy sources
|
||||
ARG APPDIR=/usr/src/app
|
||||
WORKDIR ${APPDIR}
|
||||
ENV PATH ${APPDIR}/.venv/bin:$PATH
|
||||
ARG PIP_REQUIREMENTS=requirements-lin-cuda.txt
|
||||
COPY . ./environments-and-requirements/${PIP_REQUIREMENTS} ./
|
||||
# copy sources
|
||||
COPY --link . .
|
||||
ARG PIP_EXTRA_INDEX_URL
|
||||
ENV PIP_EXTRA_INDEX_URL ${PIP_EXTRA_INDEX_URL}
|
||||
|
||||
# install requirements
|
||||
RUN python3 -m venv .venv \
|
||||
&& pip install \
|
||||
--upgrade \
|
||||
RUN python3 -m venv invokeai \
|
||||
&& ${APPDIR}/invokeai/bin/pip \
|
||||
install \
|
||||
--no-cache-dir \
|
||||
'wheel>=0.38.4' \
|
||||
&& pip install \
|
||||
--no-cache-dir \
|
||||
-r ${PIP_REQUIREMENTS}
|
||||
--use-pep517 \
|
||||
.
|
||||
|
||||
FROM python:3.10-slim AS runtime
|
||||
FROM python-base AS runtime
|
||||
|
||||
# setup environment
|
||||
ARG APPDIR=/usr/src/app
|
||||
WORKDIR ${APPDIR}
|
||||
COPY --from=builder ${APPDIR} .
|
||||
ENV \
|
||||
PATH=${APPDIR}/.venv/bin:$PATH \
|
||||
INVOKEAI_ROOT=/data \
|
||||
INVOKE_MODEL_RECONFIGURE=--yes
|
||||
COPY --link . .
|
||||
COPY --from=builder ${APPDIR}/invokeai ${APPDIR}/invokeai
|
||||
ENV PATH=${APPDIR}/invokeai/bin:$PATH
|
||||
ENV INVOKEAI_ROOT=/data
|
||||
ENV INVOKE_MODEL_RECONFIGURE="--yes --default_only"
|
||||
|
||||
# Install necesarry packages
|
||||
RUN apt-get update \
|
||||
# build patchmatch
|
||||
RUN \
|
||||
--mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||
apt-get update \
|
||||
&& apt-get install -y \
|
||||
--no-install-recommends \
|
||||
build-essential=12.9 \
|
||||
libgl1-mesa-glx=20.3.* \
|
||||
libglib2.0-0=2.66.* \
|
||||
libopencv-dev=4.5.* \
|
||||
&& ln -sf \
|
||||
/usr/lib/"$(arch)"-linux-gnu/pkgconfig/opencv4.pc \
|
||||
/usr/lib/"$(arch)"-linux-gnu/pkgconfig/opencv.pc \
|
||||
&& python3 -c "from patchmatch import patch_match" \
|
||||
&& PYTHONDONTWRITEBYTECODE=1 \
|
||||
python3 -c "from patchmatch import patch_match" \
|
||||
&& apt-get remove -y \
|
||||
--autoremove \
|
||||
build-essential \
|
||||
@ -61,5 +73,6 @@ RUN apt-get update \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# set Entrypoint and default CMD
|
||||
ENTRYPOINT [ "python3", "scripts/invoke.py" ]
|
||||
ENTRYPOINT [ "invoke" ]
|
||||
CMD [ "--web", "--host=0.0.0.0" ]
|
||||
VOLUME [ "/data" ]
|
||||
|
@ -2,34 +2,41 @@
|
||||
set -e
|
||||
|
||||
# How to use: https://invoke-ai.github.io/InvokeAI/installation/INSTALL_DOCKER/#setup
|
||||
#
|
||||
# Some possible pip extra-index urls (cuda 11.7 is available without extra url):
|
||||
#
|
||||
# CUDA 11.6: https://download.pytorch.org/whl/cu116
|
||||
# ROCm 5.2: https://download.pytorch.org/whl/rocm5.2
|
||||
# CPU: https://download.pytorch.org/whl/cpu
|
||||
#
|
||||
# as found on https://pytorch.org/get-started/locally/
|
||||
|
||||
source ./docker-build/env.sh \
|
||||
|| echo "please execute docker-build/build.sh from repository root" \
|
||||
|| exit 1
|
||||
cd "$(dirname "$0")" || exit 1
|
||||
|
||||
PIP_REQUIREMENTS=${PIP_REQUIREMENTS:-requirements-lin-cuda.txt}
|
||||
DOCKERFILE=${INVOKE_DOCKERFILE:-docker-build/Dockerfile}
|
||||
source ./env.sh
|
||||
|
||||
DOCKERFILE=${INVOKE_DOCKERFILE:-"./Dockerfile"}
|
||||
|
||||
# print the settings
|
||||
echo -e "You are using these values:\n"
|
||||
echo -e "Dockerfile:\t ${DOCKERFILE}"
|
||||
echo -e "Requirements:\t ${PIP_REQUIREMENTS}"
|
||||
echo -e "extra-index-url: ${PIP_EXTRA_INDEX_URL:-none}"
|
||||
echo -e "Volumename:\t ${VOLUMENAME}"
|
||||
echo -e "arch:\t\t ${ARCH}"
|
||||
echo -e "Platform:\t ${PLATFORM}"
|
||||
echo -e "Invokeai_tag:\t ${INVOKEAI_TAG}\n"
|
||||
|
||||
if [[ -n "$(docker volume ls -f name="${VOLUMENAME}" -q)" ]]; then
|
||||
echo -e "Volume already exists\n"
|
||||
echo -e "Volume already exists\n"
|
||||
else
|
||||
echo -n "createing docker volume "
|
||||
docker volume create "${VOLUMENAME}"
|
||||
echo -n "createing docker volume "
|
||||
docker volume create "${VOLUMENAME}"
|
||||
fi
|
||||
|
||||
# Build Container
|
||||
docker build \
|
||||
--platform="${PLATFORM}" \
|
||||
--tag="${INVOKEAI_TAG}" \
|
||||
--build-arg="PIP_REQUIREMENTS=${PIP_REQUIREMENTS}" \
|
||||
--file="${DOCKERFILE}" \
|
||||
.
|
||||
--platform="${PLATFORM}" \
|
||||
--tag="${INVOKEAI_TAG}" \
|
||||
${PIP_EXTRA_INDEX_URL:+--build-arg=PIP_EXTRA_INDEX_URL="${PIP_EXTRA_INDEX_URL}"} \
|
||||
--file="${DOCKERFILE}" \
|
||||
..
|
||||
|
@ -7,4 +7,4 @@ ARCH=${ARCH:-$(uname -m)}
|
||||
PLATFORM=${PLATFORM:-Linux/${ARCH}}
|
||||
CONTAINER_FLAVOR=${CONTAINER_FLAVOR:-cuda}
|
||||
INVOKEAI_BRANCH=$(git branch --show)
|
||||
INVOKEAI_TAG=${REPOSITORY_NAME,,}-${CONTAINER_FLAVOR}:${INVOKEAI_TAG:-${INVOKEAI_BRANCH/\//-}}
|
||||
INVOKEAI_TAG=${REPOSITORY_NAME,,}-${CONTAINER_FLAVOR}:${INVOKEAI_TAG:-${INVOKEAI_BRANCH##*/}}
|
||||
|
@ -4,17 +4,14 @@ set -e
|
||||
# How to use: https://invoke-ai.github.io/InvokeAI/installation/INSTALL_DOCKER/#run-the-container
|
||||
# IMPORTANT: You need to have a token on huggingface.co to be able to download the checkpoints!!!
|
||||
|
||||
source ./docker-build/env.sh \
|
||||
|| echo "please run from repository root" \
|
||||
|| exit 1
|
||||
cd "$(dirname "$0")" || exit 1
|
||||
|
||||
# check if HUGGINGFACE_TOKEN is available
|
||||
# You must have accepted the terms of use for required models
|
||||
HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN:?Please set your token for Huggingface as HUGGINGFACE_TOKEN}
|
||||
source ./env.sh
|
||||
|
||||
echo -e "You are using these values:\n"
|
||||
echo -e "Volumename:\t ${VOLUMENAME}"
|
||||
echo -e "Invokeai_tag:\t ${INVOKEAI_TAG}\n"
|
||||
echo -e "Volumename:\t${VOLUMENAME}"
|
||||
echo -e "Invokeai_tag:\t${INVOKEAI_TAG}"
|
||||
echo -e "local Models:\t${MODELSPATH:-unset}\n"
|
||||
|
||||
docker run \
|
||||
--interactive \
|
||||
@ -23,8 +20,10 @@ docker run \
|
||||
--platform="$PLATFORM" \
|
||||
--name="${REPOSITORY_NAME,,}" \
|
||||
--hostname="${REPOSITORY_NAME,,}" \
|
||||
--mount="source=$VOLUMENAME,target=/data" \
|
||||
--env="HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN}" \
|
||||
--mount=source="$VOLUMENAME",target=/data \
|
||||
${MODELSPATH:+-u "$(id -u):$(id -g)"} \
|
||||
${MODELSPATH:+--mount=type=bind,source=${MODELSPATH},target=/data/models} \
|
||||
${HUGGING_FACE_HUB_TOKEN:+--env=HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}} \
|
||||
--publish=9090:9090 \
|
||||
--cap-add=sys_nice \
|
||||
${GPU_FLAGS:+--gpus=${GPU_FLAGS}} \
|
||||
|
@ -52,12 +52,17 @@ introduces several changes you should know about.
|
||||
path: models/diffusers/hakurei-haifu-diffusion-1.4
|
||||
```
|
||||
|
||||
2. The format of the models directory has changed to mimic the
|
||||
HuggingFace cache directory. By default, diffusers models are
|
||||
now automatically downloaded and retrieved from the directory
|
||||
`ROOTDIR/models/diffusers`, while other models are stored in
|
||||
the directory `ROOTDIR/models/hub`. This organization is the
|
||||
same as that used by HuggingFace for its cache management.
|
||||
2. In order of precedence, InvokeAI will now use HF_HOME, then
|
||||
XDG_CACHE_HOME, then finally default to `ROOTDIR/models` to
|
||||
store HuggingFace diffusers models.
|
||||
|
||||
Consequently, the format of the models directory has changed to
|
||||
mimic the HuggingFace cache directory. When HF_HOME and XDG_HOME
|
||||
are not set, diffusers models are now automatically downloaded
|
||||
and retrieved from the directory `ROOTDIR/models/diffusers`,
|
||||
while other models are stored in the directory
|
||||
`ROOTDIR/models/hub`. This organization is the same as that used
|
||||
by HuggingFace for its cache management.
|
||||
|
||||
This allows you to share diffusers and ckpt model files easily with
|
||||
other machine learning applications that use the HuggingFace
|
||||
@ -66,7 +71,13 @@ introduces several changes you should know about.
|
||||
cache models in. To tell InvokeAI to use the standard HuggingFace
|
||||
cache directory, you would set HF_HOME like this (Linux/Mac):
|
||||
|
||||
`export HF_HOME=~/.cache/hugging_face`
|
||||
`export HF_HOME=~/.cache/huggingface`
|
||||
|
||||
Both HuggingFace and InvokeAI will fall back to the XDG_CACHE_HOME
|
||||
environment variable if HF_HOME is not set; this path
|
||||
takes precedence over `ROOTDIR/models` to allow for the same sharing
|
||||
with other machine learning applications that use HuggingFace
|
||||
libraries.
|
||||
|
||||
3. If you upgrade to InvokeAI 2.3.* from an earlier version, there
|
||||
will be a one-time migration from the old models directory format
|
||||
|
@ -239,28 +239,24 @@ Generate an image with a given prompt, record the seed of the image, and then
|
||||
use the `prompt2prompt` syntax to substitute words in the original prompt for
|
||||
words in a new prompt. This works for `img2img` as well.
|
||||
|
||||
- `a ("fluffy cat").swap("smiling dog") eating a hotdog`.
|
||||
- quotes optional: `a (fluffy cat).swap(smiling dog) eating a hotdog`.
|
||||
- for single word substitutions parentheses are also optional:
|
||||
`a cat.swap(dog) eating a hotdog`.
|
||||
- Supports options `s_start`, `s_end`, `t_start`, `t_end` (each 0-1) loosely
|
||||
corresponding to bloc97's `prompt_edit_spatial_start/_end` and
|
||||
`prompt_edit_tokens_start/_end` but with the math swapped to make it easier to
|
||||
intuitively understand.
|
||||
- Example usage:`a (cat).swap(dog, s_end=0.3) eating a hotdog` - the `s_end`
|
||||
argument means that the "spatial" (self-attention) edit will stop having any
|
||||
effect after 30% (=0.3) of the steps have been done, leaving Stable
|
||||
Diffusion with 70% of the steps where it is free to decide for itself how to
|
||||
reshape the cat-form into a dog form.
|
||||
- The numbers represent a percentage through the step sequence where the edits
|
||||
should happen. 0 means the start (noisy starting image), 1 is the end (final
|
||||
image).
|
||||
- For img2img, the step sequence does not start at 0 but instead at
|
||||
(1-strength) - so if strength is 0.7, s_start and s_end must both be
|
||||
greater than 0.3 (1-0.7) to have any effect.
|
||||
- Convenience option `shape_freedom` (0-1) to specify how much "freedom" Stable
|
||||
Diffusion should have to change the shape of the subject being swapped.
|
||||
- `a (cat).swap(dog, shape_freedom=0.5) eating a hotdog`.
|
||||
For example, consider the prompt `a cat.swap(dog) playing with a ball in the forest`. Normally, because of the word words interact with each other when doing a stable diffusion image generation, these two prompts would generate different compositions:
|
||||
- `a cat playing with a ball in the forest`
|
||||
- `a dog playing with a ball in the forest`
|
||||
|
||||
| `a cat playing with a ball in the forest` | `a dog playing with a ball in the forest` |
|
||||
| --- | --- |
|
||||
| img | img |
|
||||
|
||||
|
||||
- For multiple word swaps, use parentheses: `a (fluffy cat).swap(barking dog) playing with a ball in the forest`.
|
||||
- To swap a comma, use quotes: `a ("fluffy, grey cat").swap("big, barking dog") playing with a ball in the forest`.
|
||||
- Supports options `t_start` and `t_end` (each 0-1) loosely corresponding to bloc97's `prompt_edit_tokens_start/_end` but with the math swapped to make it easier to
|
||||
intuitively understand. `t_start` and `t_end` are used to control on which steps cross-attention control should run. With the default values `t_start=0` and `t_end=1`, cross-attention control is active on every step of image generation. Other values can be used to turn cross-attention control off for part of the image generation process.
|
||||
- For example, if doing a diffusion with 10 steps for the prompt is `a cat.swap(dog, t_start=0.3, t_end=1.0) playing with a ball in the forest`, the first 3 steps will be run as `a cat playing with a ball in the forest`, while the last 7 steps will run as `a dog playing with a ball in the forest`, but the pixels that represent `dog` will be locked to the pixels that would have represented `cat` if the `cat` prompt had been used instead.
|
||||
- Conversely, for `a cat.swap(dog, t_start=0, t_end=0.7) playing with a ball in the forest`, the first 7 steps will run as `a dog playing with a ball in the forest` with the pixels that represent `dog` locked to the same pixels that would have represented `cat` if the `cat` prompt was being used instead. The final 3 steps will just run `a cat playing with a ball in the forest`.
|
||||
> For img2img, the step sequence does not start at 0 but instead at `(1.0-strength)` - so if the img2img `strength` is `0.7`, `t_start` and `t_end` must both be greater than `0.3` (`1.0-0.7`) to have any effect.
|
||||
|
||||
Prompt2prompt `.swap()` is not compatible with xformers, which will be temporarily disabled when doing a `.swap()` - so you should expect to use more VRAM and run slower that with xformers enabled.
|
||||
|
||||
The `prompt2prompt` code is based off
|
||||
[bloc97's colab](https://github.com/bloc97/CrossAttentionControl).
|
||||
|
@ -2,4 +2,5 @@
|
||||
-r environments-and-requirements/requirements-base.txt
|
||||
torch>=1.13.1
|
||||
torchvision>=0.14.1
|
||||
xformers~=0.16
|
||||
-e .
|
||||
|
@ -3,4 +3,5 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu117 --trusted-host https://download.pytorch.org
|
||||
torch==1.13.1
|
||||
torchvision==0.14.1
|
||||
xformers~=0.16
|
||||
-e .
|
||||
|
@ -20,6 +20,7 @@ import torch
|
||||
import transformers
|
||||
from PIL import Image, ImageOps
|
||||
from diffusers.pipeline_utils import DiffusionPipeline
|
||||
from diffusers.utils.import_utils import is_xformers_available
|
||||
from omegaconf import OmegaConf
|
||||
from pytorch_lightning import seed_everything, logging
|
||||
|
||||
@ -203,6 +204,14 @@ class Generate:
|
||||
self.precision = choose_precision(self.device)
|
||||
Globals.full_precision = self.precision=='float32'
|
||||
|
||||
if is_xformers_available():
|
||||
if not Globals.disable_xformers:
|
||||
print('>> xformers memory-efficient attention is available and enabled')
|
||||
else:
|
||||
print('>> xformers memory-efficient attention is available but disabled')
|
||||
else:
|
||||
print('>> xformers not installed')
|
||||
|
||||
# model caching system for fast switching
|
||||
self.model_manager = ModelManager(mconfig,self.device,self.precision,max_loaded_models=max_loaded_models)
|
||||
# don't accept invalid models
|
||||
|
@ -53,10 +53,11 @@ def main():
|
||||
|
||||
if not args.conf:
|
||||
if not os.path.exists(os.path.join(Globals.root,'configs','models.yaml')):
|
||||
print(f"\n** Error. The file {os.path.join(Globals.root,'configs','models.yaml')} could not be found.")
|
||||
print('** Please check the location of your invokeai directory and use the --root_dir option to point to the correct path.')
|
||||
print('** This script will now exit.')
|
||||
sys.exit(-1)
|
||||
report_model_error(opt, e)
|
||||
# print(f"\n** Error. The file {os.path.join(Globals.root,'configs','models.yaml')} could not be found.")
|
||||
# print('** Please check the location of your invokeai directory and use the --root_dir option to point to the correct path.')
|
||||
# print('** This script will now exit.')
|
||||
# sys.exit(-1)
|
||||
|
||||
print(f'>> {ldm.invoke.__app_name__}, version {ldm.invoke.__version__}')
|
||||
print(f'>> InvokeAI runtime directory is "{Globals.root}"')
|
||||
@ -789,8 +790,8 @@ def _get_model_name(existing_names,completer,default_name:str='')->str:
|
||||
model_name = input(f'Short name for this model [{default_name}]: ').strip()
|
||||
if len(model_name)==0:
|
||||
model_name = default_name
|
||||
if not re.match('^[\w._+-]+$',model_name):
|
||||
print('** model name must contain only words, digits and the characters "._+-" **')
|
||||
if not re.match('^[\w._+:/-]+$',model_name):
|
||||
print('** model name must contain only words, digits and the characters "._+:/-" **')
|
||||
elif model_name != default_name and model_name in existing_names:
|
||||
print(f'** the name {model_name} is already in use. Pick another.')
|
||||
else:
|
||||
|
@ -24,9 +24,6 @@ from ...models.diffusion import cross_attention_control
|
||||
from ...models.diffusion.cross_attention_map_saving import AttentionMapSaver
|
||||
from ...modules.prompt_to_embeddings_converter import WeightedPromptFragmentsToEmbeddingsConverter
|
||||
|
||||
# monkeypatch diffusers CrossAttention 🙈
|
||||
# this is to make prompt2prompt and (future) attention maps work
|
||||
attention.CrossAttention = cross_attention_control.InvokeAIDiffusersCrossAttention
|
||||
|
||||
from diffusers.models import AutoencoderKL, UNet2DConditionModel
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
|
||||
@ -295,7 +292,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
|
||||
safety_checker=safety_checker,
|
||||
feature_extractor=feature_extractor,
|
||||
)
|
||||
self.invokeai_diffuser = InvokeAIDiffuserComponent(self.unet, self._unet_forward)
|
||||
self.invokeai_diffuser = InvokeAIDiffuserComponent(self.unet, self._unet_forward, is_running_diffusers=True)
|
||||
use_full_precision = (precision == 'float32' or precision == 'autocast')
|
||||
self.textual_inversion_manager = TextualInversionManager(tokenizer=self.tokenizer,
|
||||
text_encoder=self.text_encoder,
|
||||
@ -307,8 +304,23 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
|
||||
textual_inversion_manager=self.textual_inversion_manager
|
||||
)
|
||||
|
||||
self._enable_memory_efficient_attention()
|
||||
|
||||
|
||||
def _enable_memory_efficient_attention(self):
|
||||
"""
|
||||
if xformers is available, use it, otherwise use sliced attention.
|
||||
"""
|
||||
if is_xformers_available() and not Globals.disable_xformers:
|
||||
self.enable_xformers_memory_efficient_attention()
|
||||
else:
|
||||
if torch.backends.mps.is_available():
|
||||
# until pytorch #91617 is fixed, slicing is borked on MPS
|
||||
# https://github.com/pytorch/pytorch/issues/91617
|
||||
# fix is in https://github.com/kulinseth/pytorch/pull/222 but no idea when it will get merged to pytorch mainline.
|
||||
pass
|
||||
else:
|
||||
self.enable_attention_slicing(slice_size='auto')
|
||||
|
||||
def image_from_embeddings(self, latents: torch.Tensor, num_inference_steps: int,
|
||||
conditioning_data: ConditioningData,
|
||||
@ -373,42 +385,40 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
|
||||
if additional_guidance is None:
|
||||
additional_guidance = []
|
||||
extra_conditioning_info = conditioning_data.extra
|
||||
if extra_conditioning_info is not None and extra_conditioning_info.wants_cross_attention_control:
|
||||
self.invokeai_diffuser.setup_cross_attention_control(extra_conditioning_info,
|
||||
step_count=len(self.scheduler.timesteps))
|
||||
else:
|
||||
self.invokeai_diffuser.remove_cross_attention_control()
|
||||
with self.invokeai_diffuser.custom_attention_context(extra_conditioning_info=extra_conditioning_info,
|
||||
step_count=len(self.scheduler.timesteps)
|
||||
):
|
||||
|
||||
yield PipelineIntermediateState(run_id=run_id, step=-1, timestep=self.scheduler.num_train_timesteps,
|
||||
latents=latents)
|
||||
yield PipelineIntermediateState(run_id=run_id, step=-1, timestep=self.scheduler.num_train_timesteps,
|
||||
latents=latents)
|
||||
|
||||
batch_size = latents.shape[0]
|
||||
batched_t = torch.full((batch_size,), timesteps[0],
|
||||
dtype=timesteps.dtype, device=self.unet.device)
|
||||
latents = self.scheduler.add_noise(latents, noise, batched_t)
|
||||
batch_size = latents.shape[0]
|
||||
batched_t = torch.full((batch_size,), timesteps[0],
|
||||
dtype=timesteps.dtype, device=self.unet.device)
|
||||
latents = self.scheduler.add_noise(latents, noise, batched_t)
|
||||
|
||||
attention_map_saver: Optional[AttentionMapSaver] = None
|
||||
self.invokeai_diffuser.remove_attention_map_saving()
|
||||
for i, t in enumerate(self.progress_bar(timesteps)):
|
||||
batched_t.fill_(t)
|
||||
step_output = self.step(batched_t, latents, conditioning_data,
|
||||
step_index=i,
|
||||
total_step_count=len(timesteps),
|
||||
additional_guidance=additional_guidance)
|
||||
latents = step_output.prev_sample
|
||||
predicted_original = getattr(step_output, 'pred_original_sample', None)
|
||||
attention_map_saver: Optional[AttentionMapSaver] = None
|
||||
|
||||
if i == len(timesteps)-1 and extra_conditioning_info is not None:
|
||||
eos_token_index = extra_conditioning_info.tokens_count_including_eos_bos - 1
|
||||
attention_map_token_ids = range(1, eos_token_index)
|
||||
attention_map_saver = AttentionMapSaver(token_ids=attention_map_token_ids, latents_shape=latents.shape[-2:])
|
||||
self.invokeai_diffuser.setup_attention_map_saving(attention_map_saver)
|
||||
for i, t in enumerate(self.progress_bar(timesteps)):
|
||||
batched_t.fill_(t)
|
||||
step_output = self.step(batched_t, latents, conditioning_data,
|
||||
step_index=i,
|
||||
total_step_count=len(timesteps),
|
||||
additional_guidance=additional_guidance)
|
||||
latents = step_output.prev_sample
|
||||
predicted_original = getattr(step_output, 'pred_original_sample', None)
|
||||
|
||||
yield PipelineIntermediateState(run_id=run_id, step=i, timestep=int(t), latents=latents,
|
||||
predicted_original=predicted_original, attention_map_saver=attention_map_saver)
|
||||
# TODO resuscitate attention map saving
|
||||
#if i == len(timesteps)-1 and extra_conditioning_info is not None:
|
||||
# eos_token_index = extra_conditioning_info.tokens_count_including_eos_bos - 1
|
||||
# attention_map_token_ids = range(1, eos_token_index)
|
||||
# attention_map_saver = AttentionMapSaver(token_ids=attention_map_token_ids, latents_shape=latents.shape[-2:])
|
||||
# self.invokeai_diffuser.setup_attention_map_saving(attention_map_saver)
|
||||
|
||||
self.invokeai_diffuser.remove_attention_map_saving()
|
||||
return latents, attention_map_saver
|
||||
yield PipelineIntermediateState(run_id=run_id, step=i, timestep=int(t), latents=latents,
|
||||
predicted_original=predicted_original, attention_map_saver=attention_map_saver)
|
||||
|
||||
return latents, attention_map_saver
|
||||
|
||||
@torch.inference_mode()
|
||||
def step(self, t: torch.Tensor, latents: torch.Tensor,
|
||||
@ -447,7 +457,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
|
||||
|
||||
return step_output
|
||||
|
||||
def _unet_forward(self, latents, t, text_embeddings):
|
||||
def _unet_forward(self, latents, t, text_embeddings, cross_attention_kwargs: Optional[dict[str,Any]] = None):
|
||||
"""predict the noise residual"""
|
||||
if is_inpainting_model(self.unet) and latents.size(1) == 4:
|
||||
# Pad out normal non-inpainting inputs for an inpainting model.
|
||||
@ -460,7 +470,10 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
|
||||
initial_image_latents=torch.zeros_like(latents[:1], device=latents.device, dtype=latents.dtype)
|
||||
).add_mask_channels(latents)
|
||||
|
||||
return self.unet(latents, t, encoder_hidden_states=text_embeddings).sample
|
||||
return self.unet(sample=latents,
|
||||
timestep=t,
|
||||
encoder_hidden_states=text_embeddings,
|
||||
cross_attention_kwargs=cross_attention_kwargs).sample
|
||||
|
||||
def img2img_from_embeddings(self,
|
||||
init_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
@ -531,6 +544,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
|
||||
init_image = image_resized_to_grid_as_tensor(init_image.convert('RGB'))
|
||||
|
||||
init_image = init_image.to(device=device, dtype=latents_dtype)
|
||||
mask = mask.to(device=device, dtype=latents_dtype)
|
||||
|
||||
if init_image.dim() == 3:
|
||||
init_image = init_image.unsqueeze(0)
|
||||
@ -549,17 +563,22 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
|
||||
|
||||
if mask.dim() == 3:
|
||||
mask = mask.unsqueeze(0)
|
||||
mask = tv_resize(mask, init_image_latents.shape[-2:], T.InterpolationMode.BILINEAR) \
|
||||
latent_mask = tv_resize(mask, init_image_latents.shape[-2:], T.InterpolationMode.BILINEAR) \
|
||||
.to(device=device, dtype=latents_dtype)
|
||||
|
||||
guidance: List[Callable] = []
|
||||
|
||||
if is_inpainting_model(self.unet):
|
||||
# You'd think the inpainting model wouldn't be paying attention to the area it is going to repaint
|
||||
# (that's why there's a mask!) but it seems to really want that blanked out.
|
||||
masked_init_image = init_image * torch.where(mask < 0.5, 1, 0)
|
||||
masked_latents = self.non_noised_latents_from_image(masked_init_image, device=device, dtype=latents_dtype)
|
||||
|
||||
# TODO: we should probably pass this in so we don't have to try/finally around setting it.
|
||||
self.invokeai_diffuser.model_forward_callback = \
|
||||
AddsMaskLatents(self._unet_forward, mask, init_image_latents)
|
||||
AddsMaskLatents(self._unet_forward, latent_mask, masked_latents)
|
||||
else:
|
||||
guidance.append(AddsMaskGuidance(mask, init_image_latents, self.scheduler, noise))
|
||||
guidance.append(AddsMaskGuidance(latent_mask, init_image_latents, self.scheduler, noise))
|
||||
|
||||
try:
|
||||
result_latents, result_attention_maps = self.latents_from_embeddings(
|
||||
@ -578,11 +597,20 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
|
||||
output = InvokeAIStableDiffusionPipelineOutput(images=image, nsfw_content_detected=[], attention_map_saver=result_attention_maps)
|
||||
return self.check_for_safety(output, dtype=conditioning_data.dtype)
|
||||
|
||||
def non_noised_latents_from_image(self, init_image, *, device, dtype):
|
||||
def non_noised_latents_from_image(self, init_image, *, device: torch.device, dtype):
|
||||
init_image = init_image.to(device=device, dtype=dtype)
|
||||
with torch.inference_mode():
|
||||
if device.type == 'mps':
|
||||
# workaround for torch MPS bug that has been fixed in https://github.com/kulinseth/pytorch/pull/222
|
||||
# TODO remove this workaround once kulinseth#222 is merged to pytorch mainline
|
||||
self.vae.to('cpu')
|
||||
init_image = init_image.to('cpu')
|
||||
init_latent_dist = self.vae.encode(init_image).latent_dist
|
||||
init_latents = init_latent_dist.sample().to(dtype=dtype) # FIXME: uses torch.randn. make reproducible!
|
||||
if device.type == 'mps':
|
||||
self.vae.to(device)
|
||||
init_latents = init_latents.to(device)
|
||||
|
||||
init_latents = 0.18215 * init_latents
|
||||
return init_latents
|
||||
|
||||
|
@ -19,10 +19,12 @@ from ldm.util import debug_image
|
||||
|
||||
|
||||
def infill_methods()->list[str]:
|
||||
methods = list()
|
||||
methods = [
|
||||
"tile",
|
||||
"solid",
|
||||
]
|
||||
if PatchMatch.patchmatch_available():
|
||||
methods.append('patchmatch')
|
||||
methods.append('tile')
|
||||
methods.insert(0, 'patchmatch')
|
||||
return methods
|
||||
|
||||
class Inpaint(Img2Img):
|
||||
@ -182,6 +184,7 @@ class Inpaint(Img2Img):
|
||||
infill_method = None,
|
||||
inpaint_width=None,
|
||||
inpaint_height=None,
|
||||
inpaint_fill:tuple(int)=(0x7F, 0x7F, 0x7F, 0xFF),
|
||||
attention_maps_callback=None,
|
||||
**kwargs):
|
||||
"""
|
||||
@ -202,12 +205,17 @@ class Inpaint(Img2Img):
|
||||
# Do infill
|
||||
if infill_method == 'patchmatch' and PatchMatch.patchmatch_available():
|
||||
init_filled = self.infill_patchmatch(self.pil_image.copy())
|
||||
else: # if infill_method == 'tile': # Only two methods right now, so always use 'tile' if not patchmatch
|
||||
elif infill_method == 'tile':
|
||||
init_filled = self.tile_fill_missing(
|
||||
self.pil_image.copy(),
|
||||
seed = self.seed,
|
||||
tile_size = tile_size
|
||||
)
|
||||
elif infill_method == 'solid':
|
||||
solid_bg = PIL.Image.new("RGBA", init_image.size, inpaint_fill)
|
||||
init_filled = PIL.Image.alpha_composite(solid_bg, init_image)
|
||||
else:
|
||||
raise ValueError(f"Non-supported infill type {infill_method}", infill_method)
|
||||
init_filled.paste(init_image, (0,0), init_image.split()[-1])
|
||||
|
||||
# Resize if requested for inpainting
|
||||
|
@ -3,10 +3,10 @@ ldm.invoke.generator.txt2img inherits from ldm.invoke.generator
|
||||
'''
|
||||
|
||||
import math
|
||||
from diffusers.utils.logging import get_verbosity, set_verbosity, set_verbosity_error
|
||||
from typing import Callable, Optional
|
||||
|
||||
import torch
|
||||
from diffusers.utils.logging import get_verbosity, set_verbosity, set_verbosity_error
|
||||
|
||||
from ldm.invoke.generator.base import Generator
|
||||
from ldm.invoke.generator.diffusers_pipeline import trim_to_multiple_of, StableDiffusionGeneratorPipeline, \
|
||||
@ -38,10 +38,6 @@ class Txt2Img2Img(Generator):
|
||||
uc, c, cfg_scale, extra_conditioning_info,
|
||||
threshold = ThresholdSettings(threshold, warmup=0.2) if threshold else None)
|
||||
.add_scheduler_args_if_applicable(pipeline.scheduler, eta=ddim_eta))
|
||||
scale_dim = min(width, height)
|
||||
scale = 512 / scale_dim
|
||||
|
||||
init_width, init_height = trim_to_multiple_of(scale * width, scale * height)
|
||||
|
||||
def make_image(x_T):
|
||||
|
||||
@ -54,6 +50,10 @@ class Txt2Img2Img(Generator):
|
||||
# TODO: threshold = threshold,
|
||||
)
|
||||
|
||||
# Get our initial generation width and height directly from the latent output so
|
||||
# the message below is accurate.
|
||||
init_width = first_pass_latent_output.size()[3] * self.downsampling_factor
|
||||
init_height = first_pass_latent_output.size()[2] * self.downsampling_factor
|
||||
print(
|
||||
f"\n>> Interpolating from {init_width}x{init_height} to {width}x{height} using DDIM sampling"
|
||||
)
|
||||
@ -106,27 +106,35 @@ class Txt2Img2Img(Generator):
|
||||
def get_noise(self,width,height,scale = True):
|
||||
# print(f"Get noise: {width}x{height}")
|
||||
if scale:
|
||||
trained_square = 512 * 512
|
||||
actual_square = width * height
|
||||
scale = math.sqrt(trained_square / actual_square)
|
||||
scaled_width = math.ceil(scale * width / 64) * 64
|
||||
scaled_height = math.ceil(scale * height / 64) * 64
|
||||
# Scale the input width and height for the initial generation
|
||||
# Make their area equivalent to the model's resolution area (e.g. 512*512 = 262144),
|
||||
# while keeping the minimum dimension at least 0.5 * resolution (e.g. 512*0.5 = 256)
|
||||
|
||||
aspect = width / height
|
||||
dimension = self.model.unet.config.sample_size * self.model.vae_scale_factor
|
||||
min_dimension = math.floor(dimension * 0.5)
|
||||
model_area = dimension * dimension # hardcoded for now since all models are trained on square images
|
||||
|
||||
if aspect > 1.0:
|
||||
init_height = max(min_dimension, math.sqrt(model_area / aspect))
|
||||
init_width = init_height * aspect
|
||||
else:
|
||||
init_width = max(min_dimension, math.sqrt(model_area * aspect))
|
||||
init_height = init_width / aspect
|
||||
|
||||
scaled_width, scaled_height = trim_to_multiple_of(math.floor(init_width), math.floor(init_height))
|
||||
|
||||
else:
|
||||
scaled_width = width
|
||||
scaled_height = height
|
||||
|
||||
device = self.model.device
|
||||
device = self.model.device
|
||||
channels = self.latent_channels
|
||||
if channels == 9:
|
||||
channels = 4 # we don't really want noise for all the mask channels
|
||||
shape = (1, channels,
|
||||
scaled_height // self.downsampling_factor, scaled_width // self.downsampling_factor)
|
||||
if self.use_mps_noise or device.type == 'mps':
|
||||
return torch.randn([1,
|
||||
self.latent_channels,
|
||||
scaled_height // self.downsampling_factor,
|
||||
scaled_width // self.downsampling_factor],
|
||||
dtype=self.torch_dtype(),
|
||||
device='cpu').to(device)
|
||||
return torch.randn(shape, dtype=self.torch_dtype(), device='cpu').to(device)
|
||||
else:
|
||||
return torch.randn([1,
|
||||
self.latent_channels,
|
||||
scaled_height // self.downsampling_factor,
|
||||
scaled_width // self.downsampling_factor],
|
||||
dtype=self.torch_dtype(),
|
||||
device=device)
|
||||
return torch.randn(shape, dtype=self.torch_dtype(), device=device)
|
||||
|
@ -125,7 +125,7 @@ class ModelManager(object):
|
||||
Set the default model. The change will not take
|
||||
effect until you call model_manager.commit()
|
||||
'''
|
||||
assert model_name in self.models,f"unknown model '{model_name}'"
|
||||
assert model_name in self.model_names(), f"unknown model '{model_name}'"
|
||||
|
||||
config = self.config
|
||||
for model in config:
|
||||
|
@ -155,7 +155,7 @@ class CrossAttentionControlSubstitute(CrossAttentionControlledFragment):
|
||||
default_options = {
|
||||
's_start': 0.0,
|
||||
's_end': 0.2062994740159002, # ~= shape_freedom=0.5
|
||||
't_start': 0.0,
|
||||
't_start': 0.1,
|
||||
't_end': 1.0
|
||||
}
|
||||
merged_options = default_options
|
||||
|
@ -7,8 +7,10 @@ import torch
|
||||
import diffusers
|
||||
from torch import nn
|
||||
from diffusers.models.unet_2d_condition import UNet2DConditionModel
|
||||
from diffusers.models.cross_attention import AttnProcessor
|
||||
from ldm.invoke.devices import torch_dtype
|
||||
|
||||
|
||||
# adapted from bloc97's CrossAttentionControl colab
|
||||
# https://github.com/bloc97/CrossAttentionControl
|
||||
|
||||
@ -304,11 +306,15 @@ class InvokeAICrossAttentionMixin:
|
||||
|
||||
|
||||
|
||||
def remove_cross_attention_control(model):
|
||||
remove_attention_function(model)
|
||||
def restore_default_cross_attention(model, is_running_diffusers: bool, restore_attention_processor: Optional[AttnProcessor]=None):
|
||||
if is_running_diffusers:
|
||||
unet = model
|
||||
unet.set_attn_processor(restore_attention_processor or CrossAttnProcessor())
|
||||
else:
|
||||
remove_attention_function(model)
|
||||
|
||||
|
||||
def setup_cross_attention_control(model, context: Context):
|
||||
def override_cross_attention(model, context: Context, is_running_diffusers = False):
|
||||
"""
|
||||
Inject attention parameters and functions into the passed in model to enable cross attention editing.
|
||||
|
||||
@ -323,7 +329,7 @@ def setup_cross_attention_control(model, context: Context):
|
||||
# urgh. should this be hardcoded?
|
||||
max_length = 77
|
||||
# mask=1 means use base prompt attention, mask=0 means use edited prompt attention
|
||||
mask = torch.zeros(max_length)
|
||||
mask = torch.zeros(max_length, dtype=torch_dtype(device))
|
||||
indices_target = torch.arange(max_length, dtype=torch.long)
|
||||
indices = torch.arange(max_length, dtype=torch.long)
|
||||
for name, a0, a1, b0, b1 in context.arguments.edit_opcodes:
|
||||
@ -333,10 +339,26 @@ def setup_cross_attention_control(model, context: Context):
|
||||
indices[b0:b1] = indices_target[a0:a1]
|
||||
mask[b0:b1] = 1
|
||||
|
||||
context.register_cross_attention_modules(model)
|
||||
context.cross_attention_mask = mask.to(device)
|
||||
context.cross_attention_index_map = indices.to(device)
|
||||
inject_attention_function(model, context)
|
||||
if is_running_diffusers:
|
||||
unet = model
|
||||
old_attn_processors = unet.attn_processors
|
||||
if torch.backends.mps.is_available():
|
||||
# see note in StableDiffusionGeneratorPipeline.__init__ about borked slicing on MPS
|
||||
unet.set_attn_processor(SwapCrossAttnProcessor())
|
||||
else:
|
||||
# try to re-use an existing slice size
|
||||
default_slice_size = 4
|
||||
slice_size = next((p.slice_size for p in old_attn_processors.values() if type(p) is SlicedAttnProcessor), default_slice_size)
|
||||
unet.set_attn_processor(SlicedSwapCrossAttnProcesser(slice_size=slice_size))
|
||||
return old_attn_processors
|
||||
else:
|
||||
context.register_cross_attention_modules(model)
|
||||
inject_attention_function(model, context)
|
||||
return None
|
||||
|
||||
|
||||
|
||||
|
||||
def get_cross_attention_modules(model, which: CrossAttentionType) -> list[tuple[str, InvokeAICrossAttentionMixin]]:
|
||||
@ -445,6 +467,7 @@ def get_mem_free_total(device):
|
||||
return mem_free_total
|
||||
|
||||
|
||||
|
||||
class InvokeAIDiffusersCrossAttention(diffusers.models.attention.CrossAttention, InvokeAICrossAttentionMixin):
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
@ -460,3 +483,176 @@ class InvokeAIDiffusersCrossAttention(diffusers.models.attention.CrossAttention,
|
||||
hidden_states = self.reshape_batch_dim_to_heads(attention_result)
|
||||
return hidden_states
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## 🧨diffusers implementation follows
|
||||
|
||||
|
||||
"""
|
||||
# base implementation
|
||||
|
||||
class CrossAttnProcessor:
|
||||
def __call__(self, attn: CrossAttention, hidden_states, encoder_hidden_states=None, attention_mask=None):
|
||||
batch_size, sequence_length, _ = hidden_states.shape
|
||||
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length)
|
||||
|
||||
query = attn.to_q(hidden_states)
|
||||
query = attn.head_to_batch_dim(query)
|
||||
|
||||
encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
|
||||
key = attn.to_k(encoder_hidden_states)
|
||||
value = attn.to_v(encoder_hidden_states)
|
||||
key = attn.head_to_batch_dim(key)
|
||||
value = attn.head_to_batch_dim(value)
|
||||
|
||||
attention_probs = attn.get_attention_scores(query, key, attention_mask)
|
||||
hidden_states = torch.bmm(attention_probs, value)
|
||||
hidden_states = attn.batch_to_head_dim(hidden_states)
|
||||
|
||||
# linear proj
|
||||
hidden_states = attn.to_out[0](hidden_states)
|
||||
# dropout
|
||||
hidden_states = attn.to_out[1](hidden_states)
|
||||
|
||||
return hidden_states
|
||||
|
||||
"""
|
||||
from dataclasses import field, dataclass
|
||||
|
||||
import torch
|
||||
|
||||
from diffusers.models.cross_attention import CrossAttention, CrossAttnProcessor, SlicedAttnProcessor, AttnProcessor
|
||||
|
||||
|
||||
@dataclass
|
||||
class SwapCrossAttnContext:
|
||||
modified_text_embeddings: torch.Tensor
|
||||
index_map: torch.Tensor # maps from original prompt token indices to the equivalent tokens in the modified prompt
|
||||
mask: torch.Tensor # in the target space of the index_map
|
||||
cross_attention_types_to_do: list[CrossAttentionType] = field(default_factory=list)
|
||||
|
||||
def __int__(self,
|
||||
cac_types_to_do: [CrossAttentionType],
|
||||
modified_text_embeddings: torch.Tensor,
|
||||
index_map: torch.Tensor,
|
||||
mask: torch.Tensor):
|
||||
self.cross_attention_types_to_do = cac_types_to_do
|
||||
self.modified_text_embeddings = modified_text_embeddings
|
||||
self.index_map = index_map
|
||||
self.mask = mask
|
||||
|
||||
def wants_cross_attention_control(self, attn_type: CrossAttentionType) -> bool:
|
||||
return attn_type in self.cross_attention_types_to_do
|
||||
|
||||
@classmethod
|
||||
def make_mask_and_index_map(cls, edit_opcodes: list[tuple[str, int, int, int, int]], max_length: int) \
|
||||
-> tuple[torch.Tensor, torch.Tensor]:
|
||||
|
||||
# mask=1 means use original prompt attention, mask=0 means use modified prompt attention
|
||||
mask = torch.zeros(max_length)
|
||||
indices_target = torch.arange(max_length, dtype=torch.long)
|
||||
indices = torch.arange(max_length, dtype=torch.long)
|
||||
for name, a0, a1, b0, b1 in edit_opcodes:
|
||||
if b0 < max_length:
|
||||
if name == "equal":
|
||||
# these tokens remain the same as in the original prompt
|
||||
indices[b0:b1] = indices_target[a0:a1]
|
||||
mask[b0:b1] = 1
|
||||
|
||||
return mask, indices
|
||||
|
||||
|
||||
class SlicedSwapCrossAttnProcesser(SlicedAttnProcessor):
|
||||
|
||||
# TODO: dynamically pick slice size based on memory conditions
|
||||
|
||||
def __call__(self, attn: CrossAttention, hidden_states, encoder_hidden_states=None, attention_mask=None,
|
||||
# kwargs
|
||||
swap_cross_attn_context: SwapCrossAttnContext=None):
|
||||
|
||||
attention_type = CrossAttentionType.SELF if encoder_hidden_states is None else CrossAttentionType.TOKENS
|
||||
|
||||
# if cross-attention control is not in play, just call through to the base implementation.
|
||||
if attention_type is CrossAttentionType.SELF or \
|
||||
swap_cross_attn_context is None or \
|
||||
not swap_cross_attn_context.wants_cross_attention_control(attention_type):
|
||||
#print(f"SwapCrossAttnContext for {attention_type} not active - passing request to superclass")
|
||||
return super().__call__(attn, hidden_states, encoder_hidden_states, attention_mask)
|
||||
#else:
|
||||
# print(f"SwapCrossAttnContext for {attention_type} active")
|
||||
|
||||
batch_size, sequence_length, _ = hidden_states.shape
|
||||
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length)
|
||||
|
||||
query = attn.to_q(hidden_states)
|
||||
dim = query.shape[-1]
|
||||
query = attn.head_to_batch_dim(query)
|
||||
|
||||
original_text_embeddings = encoder_hidden_states
|
||||
modified_text_embeddings = swap_cross_attn_context.modified_text_embeddings
|
||||
original_text_key = attn.to_k(original_text_embeddings)
|
||||
modified_text_key = attn.to_k(modified_text_embeddings)
|
||||
original_value = attn.to_v(original_text_embeddings)
|
||||
modified_value = attn.to_v(modified_text_embeddings)
|
||||
|
||||
original_text_key = attn.head_to_batch_dim(original_text_key)
|
||||
modified_text_key = attn.head_to_batch_dim(modified_text_key)
|
||||
original_value = attn.head_to_batch_dim(original_value)
|
||||
modified_value = attn.head_to_batch_dim(modified_value)
|
||||
|
||||
# compute slices and prepare output tensor
|
||||
batch_size_attention = query.shape[0]
|
||||
hidden_states = torch.zeros(
|
||||
(batch_size_attention, sequence_length, dim // attn.heads), device=query.device, dtype=query.dtype
|
||||
)
|
||||
|
||||
# do slices
|
||||
for i in range(max(1,hidden_states.shape[0] // self.slice_size)):
|
||||
start_idx = i * self.slice_size
|
||||
end_idx = (i + 1) * self.slice_size
|
||||
|
||||
query_slice = query[start_idx:end_idx]
|
||||
original_key_slice = original_text_key[start_idx:end_idx]
|
||||
modified_key_slice = modified_text_key[start_idx:end_idx]
|
||||
attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None
|
||||
|
||||
original_attn_slice = attn.get_attention_scores(query_slice, original_key_slice, attn_mask_slice)
|
||||
modified_attn_slice = attn.get_attention_scores(query_slice, modified_key_slice, attn_mask_slice)
|
||||
|
||||
# because the prompt modifications may result in token sequences shifted forwards or backwards,
|
||||
# the original attention probabilities must be remapped to account for token index changes in the
|
||||
# modified prompt
|
||||
remapped_original_attn_slice = torch.index_select(original_attn_slice, -1,
|
||||
swap_cross_attn_context.index_map)
|
||||
|
||||
# only some tokens taken from the original attention probabilities. this is controlled by the mask.
|
||||
mask = swap_cross_attn_context.mask
|
||||
inverse_mask = 1 - mask
|
||||
attn_slice = \
|
||||
remapped_original_attn_slice * mask + \
|
||||
modified_attn_slice * inverse_mask
|
||||
|
||||
del remapped_original_attn_slice, modified_attn_slice
|
||||
|
||||
attn_slice = torch.bmm(attn_slice, modified_value[start_idx:end_idx])
|
||||
hidden_states[start_idx:end_idx] = attn_slice
|
||||
|
||||
|
||||
# done
|
||||
hidden_states = attn.batch_to_head_dim(hidden_states)
|
||||
|
||||
# linear proj
|
||||
hidden_states = attn.to_out[0](hidden_states)
|
||||
# dropout
|
||||
hidden_states = attn.to_out[1](hidden_states)
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class SwapCrossAttnProcessor(SlicedSwapCrossAttnProcesser):
|
||||
|
||||
def __init__(self):
|
||||
super(SwapCrossAttnProcessor, self).__init__(slice_size=int(1e9)) # massive slice size = don't slice
|
||||
|
||||
|
@ -19,9 +19,9 @@ class DDIMSampler(Sampler):
|
||||
all_timesteps_count = kwargs.get('all_timesteps_count', t_enc)
|
||||
|
||||
if extra_conditioning_info is not None and extra_conditioning_info.wants_cross_attention_control:
|
||||
self.invokeai_diffuser.setup_cross_attention_control(extra_conditioning_info, step_count = all_timesteps_count)
|
||||
self.invokeai_diffuser.override_cross_attention(extra_conditioning_info, step_count = all_timesteps_count)
|
||||
else:
|
||||
self.invokeai_diffuser.remove_cross_attention_control()
|
||||
self.invokeai_diffuser.restore_default_cross_attention()
|
||||
|
||||
|
||||
# This is the central routine
|
||||
|
@ -43,9 +43,9 @@ class CFGDenoiser(nn.Module):
|
||||
extra_conditioning_info = kwargs.get('extra_conditioning_info', None)
|
||||
|
||||
if extra_conditioning_info is not None and extra_conditioning_info.wants_cross_attention_control:
|
||||
self.invokeai_diffuser.setup_cross_attention_control(extra_conditioning_info, step_count = t_enc)
|
||||
self.invokeai_diffuser.override_cross_attention(extra_conditioning_info, step_count = t_enc)
|
||||
else:
|
||||
self.invokeai_diffuser.remove_cross_attention_control()
|
||||
self.invokeai_diffuser.restore_default_cross_attention()
|
||||
|
||||
|
||||
def forward(self, x, sigma, uncond, cond, cond_scale):
|
||||
|
@ -21,9 +21,9 @@ class PLMSSampler(Sampler):
|
||||
all_timesteps_count = kwargs.get('all_timesteps_count', t_enc)
|
||||
|
||||
if extra_conditioning_info is not None and extra_conditioning_info.wants_cross_attention_control:
|
||||
self.invokeai_diffuser.setup_cross_attention_control(extra_conditioning_info, step_count = all_timesteps_count)
|
||||
self.invokeai_diffuser.override_cross_attention(extra_conditioning_info, step_count = all_timesteps_count)
|
||||
else:
|
||||
self.invokeai_diffuser.remove_cross_attention_control()
|
||||
self.invokeai_diffuser.restore_default_cross_attention()
|
||||
|
||||
|
||||
# this is the essential routine
|
||||
|
@ -1,14 +1,16 @@
|
||||
import math
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from math import ceil
|
||||
from typing import Callable, Optional, Union
|
||||
from typing import Callable, Optional, Union, Any, Dict
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from diffusers.models.cross_attention import AttnProcessor
|
||||
from ldm.models.diffusion.cross_attention_control import Arguments, \
|
||||
remove_cross_attention_control, setup_cross_attention_control, Context, get_cross_attention_modules, \
|
||||
CrossAttentionType
|
||||
restore_default_cross_attention, override_cross_attention, Context, get_cross_attention_modules, \
|
||||
CrossAttentionType, SwapCrossAttnContext
|
||||
from ldm.models.diffusion.cross_attention_map_saving import AttentionMapSaver
|
||||
|
||||
|
||||
@ -30,39 +32,68 @@ class InvokeAIDiffuserComponent:
|
||||
debug_thresholding = False
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExtraConditioningInfo:
|
||||
def __init__(self, tokens_count_including_eos_bos:int, cross_attention_control_args: Optional[Arguments]):
|
||||
self.tokens_count_including_eos_bos = tokens_count_including_eos_bos
|
||||
self.cross_attention_control_args = cross_attention_control_args
|
||||
|
||||
tokens_count_including_eos_bos: int
|
||||
cross_attention_control_args: Optional[Arguments] = None
|
||||
|
||||
@property
|
||||
def wants_cross_attention_control(self):
|
||||
return self.cross_attention_control_args is not None
|
||||
|
||||
|
||||
def __init__(self, model, model_forward_callback:
|
||||
Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]
|
||||
):
|
||||
Callable[[torch.Tensor, torch.Tensor, torch.Tensor, Optional[dict[str,Any]]], torch.Tensor],
|
||||
is_running_diffusers: bool=False,
|
||||
):
|
||||
"""
|
||||
:param model: the unet model to pass through to cross attention control
|
||||
:param model_forward_callback: a lambda with arguments (x, sigma, conditioning_to_apply). will be called repeatedly. most likely, this should simply call model.forward(x, sigma, conditioning)
|
||||
"""
|
||||
self.conditioning = None
|
||||
self.model = model
|
||||
self.is_running_diffusers = is_running_diffusers
|
||||
self.model_forward_callback = model_forward_callback
|
||||
self.cross_attention_control_context = None
|
||||
|
||||
def setup_cross_attention_control(self, conditioning: ExtraConditioningInfo, step_count: int):
|
||||
@contextmanager
|
||||
def custom_attention_context(self,
|
||||
extra_conditioning_info: Optional[ExtraConditioningInfo],
|
||||
step_count: int):
|
||||
do_swap = extra_conditioning_info is not None and extra_conditioning_info.wants_cross_attention_control
|
||||
old_attn_processor = None
|
||||
if do_swap:
|
||||
old_attn_processor = self.override_cross_attention(extra_conditioning_info,
|
||||
step_count=step_count)
|
||||
try:
|
||||
yield None
|
||||
finally:
|
||||
if old_attn_processor is not None:
|
||||
self.restore_default_cross_attention(old_attn_processor)
|
||||
# TODO resuscitate attention map saving
|
||||
#self.remove_attention_map_saving()
|
||||
|
||||
def override_cross_attention(self, conditioning: ExtraConditioningInfo, step_count: int) -> Dict[str, AttnProcessor]:
|
||||
"""
|
||||
setup cross attention .swap control. for diffusers this replaces the attention processor, so
|
||||
the previous attention processor is returned so that the caller can restore it later.
|
||||
"""
|
||||
self.conditioning = conditioning
|
||||
self.cross_attention_control_context = Context(
|
||||
arguments=self.conditioning.cross_attention_control_args,
|
||||
step_count=step_count
|
||||
)
|
||||
setup_cross_attention_control(self.model, self.cross_attention_control_context)
|
||||
return override_cross_attention(self.model,
|
||||
self.cross_attention_control_context,
|
||||
is_running_diffusers=self.is_running_diffusers)
|
||||
|
||||
def remove_cross_attention_control(self):
|
||||
def restore_default_cross_attention(self, restore_attention_processor: Optional['AttnProcessor']=None):
|
||||
self.conditioning = None
|
||||
self.cross_attention_control_context = None
|
||||
remove_cross_attention_control(self.model)
|
||||
restore_default_cross_attention(self.model,
|
||||
is_running_diffusers=self.is_running_diffusers,
|
||||
restore_attention_processor=restore_attention_processor)
|
||||
|
||||
def setup_attention_map_saving(self, saver: AttentionMapSaver):
|
||||
def callback(slice, dim, offset, slice_size, key):
|
||||
@ -168,7 +199,41 @@ class InvokeAIDiffuserComponent:
|
||||
return unconditioned_next_x, conditioned_next_x
|
||||
|
||||
|
||||
def apply_cross_attention_controlled_conditioning(self, x:torch.Tensor, sigma, unconditioning, conditioning, cross_attention_control_types_to_do):
|
||||
def apply_cross_attention_controlled_conditioning(self,
|
||||
x: torch.Tensor,
|
||||
sigma,
|
||||
unconditioning,
|
||||
conditioning,
|
||||
cross_attention_control_types_to_do):
|
||||
if self.is_running_diffusers:
|
||||
return self.apply_cross_attention_controlled_conditioning__diffusers(x, sigma, unconditioning, conditioning, cross_attention_control_types_to_do)
|
||||
else:
|
||||
return self.apply_cross_attention_controlled_conditioning__compvis(x, sigma, unconditioning, conditioning, cross_attention_control_types_to_do)
|
||||
|
||||
def apply_cross_attention_controlled_conditioning__diffusers(self,
|
||||
x: torch.Tensor,
|
||||
sigma,
|
||||
unconditioning,
|
||||
conditioning,
|
||||
cross_attention_control_types_to_do):
|
||||
context: Context = self.cross_attention_control_context
|
||||
|
||||
cross_attn_processor_context = SwapCrossAttnContext(modified_text_embeddings=context.arguments.edited_conditioning,
|
||||
index_map=context.cross_attention_index_map,
|
||||
mask=context.cross_attention_mask,
|
||||
cross_attention_types_to_do=[])
|
||||
# no cross attention for unconditioning (negative prompt)
|
||||
unconditioned_next_x = self.model_forward_callback(x, sigma, unconditioning,
|
||||
{"swap_cross_attn_context": cross_attn_processor_context})
|
||||
|
||||
# do requested cross attention types for conditioning (positive prompt)
|
||||
cross_attn_processor_context.cross_attention_types_to_do = cross_attention_control_types_to_do
|
||||
conditioned_next_x = self.model_forward_callback(x, sigma, conditioning,
|
||||
{"swap_cross_attn_context": cross_attn_processor_context})
|
||||
return unconditioned_next_x, conditioned_next_x
|
||||
|
||||
|
||||
def apply_cross_attention_controlled_conditioning__compvis(self, x:torch.Tensor, sigma, unconditioning, conditioning, cross_attention_control_types_to_do):
|
||||
# print('pct', percent_through, ': doing cross attention control on', cross_attention_control_types_to_do)
|
||||
# slower non-batched path (20% slower on mac MPS)
|
||||
# We are only interested in using attention maps for conditioned_next_x, but batching them with generation of
|
||||
|
Loading…
Reference in New Issue
Block a user