mirror of https://github.com/invoke-ai/InvokeAI
synced 2024-08-30 20:32:17 +00:00

Merge branch 'main' into dev/installer

This commit is contained in:
commit 8ce9f07223
@@ -1,19 +1,18 @@
 *
+!assets/caution.png
 !backend
-!environments-and-requirements
-!frontend
+!frontend/dist
 !ldm
-!main.py
+!pyproject.toml
+!README.md
 !scripts
-!server
-!static
-!setup.py

 # Guard against pulling in any models that might exist in the directory tree
-**/*.pt*
+**.pt*

 # unignore configs, but only ignore the custom models.yaml, in case it exists
 !configs
 configs/models.yaml
+configs/models.yaml.orig

 **/__pycache__
1 .github/workflows/build-cloud-img.yml vendored
@@ -21,6 +21,7 @@ env:
 jobs:
   docker:
+    if: github.event.pull_request.draft == false
     strategy:
       fail-fast: false
       matrix:
35 .github/workflows/build-container.yml vendored
@@ -3,63 +3,60 @@ on:
   push:
     branches:
       - 'main'
+    tags:
+      - 'v*.*.*'

 jobs:
   docker:
+    if: github.event.pull_request.draft == false
     strategy:
       fail-fast: false
       matrix:
-        registry:
-          - ghcr.io
         flavor:
           - amd
           - cuda
-          # - cloud
         include:
           - flavor: amd
-            pip-requirements: requirements-lin-amd.txt
+            pip-extra-index-url: 'https://download.pytorch.org/whl/rocm5.2'
             dockerfile: docker-build/Dockerfile
             platforms: linux/amd64,linux/arm64
           - flavor: cuda
-            pip-requirements: requirements-lin-cuda.txt
+            pip-extra-index-url: ''
             dockerfile: docker-build/Dockerfile
             platforms: linux/amd64,linux/arm64
-          # - flavor: cloud
-          #   pip-requirements: requirements-lin-cuda.txt
-          #   dockerfile: docker-build/Dockerfile.cloud
-          #   platforms: linux/amd64
     runs-on: ubuntu-latest
     name: ${{ matrix.flavor }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3

-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
-
       - name: Docker meta
         id: meta
         uses: docker/metadata-action@v4
         with:
-          images: ${{ matrix.registry }}/${{ github.repository }}-${{ matrix.flavor }}
+          images: ghcr.io/${{ github.repository }}-${{ matrix.flavor }}
          tags: |
            type=ref,event=branch
            type=ref,event=tag
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
+            type=semver,pattern={{major}}
            type=sha
          flavor: |
            latest=true

+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v2
+
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2

-      - if: github.event_name != 'pull_request'
-        name: Docker login
+      - name: Login to GitHub Container Registry
+        if: github.event_name != 'pull_request'
         uses: docker/login-action@v2
         with:
-          registry: ${{ matrix.registry }}
-          username: ${{ github.actor }}
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
           password: ${{ secrets.GITHUB_TOKEN }}

       - name: Build container
@@ -71,4 +68,6 @@ jobs:
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
-          build-args: pip_requirements=${{ matrix.pip-requirements }}
+          build-args: PIP_EXTRA_INDEX_URL=${{ matrix.pip-extra-index-url }}
+          # cache-from: type=gha
+          # cache-to: type=gha,mode=max
34 .github/workflows/clean-caches.yml vendored Normal file
@@ -0,0 +1,34 @@
+name: cleanup caches by a branch
+on:
+  pull_request:
+    types:
+      - closed
+  workflow_dispatch:
+
+jobs:
+  cleanup:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v3
+
+      - name: Cleanup
+        run: |
+          gh extension install actions/gh-actions-cache
+
+          REPO=${{ github.repository }}
+          BRANCH=${{ github.ref }}
+
+          echo "Fetching list of cache key"
+          cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH | cut -f 1 )
+
+          ## Setting this to not fail the workflow while deleting cache keys.
+          set +e
+          echo "Deleting caches..."
+          for cacheKey in $cacheKeysForPR
+          do
+              gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm
+          done
+          echo "Done"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
1 .github/workflows/lint-frontend.yml vendored
@@ -14,6 +14,7 @@ defaults:
 jobs:
   lint-frontend:
+    if: github.event.pull_request.draft == false
     runs-on: ubuntu-22.04
     steps:
       - name: Setup Node 18
1 .github/workflows/mkdocs-material.yml vendored
@@ -7,6 +7,7 @@ on:
 jobs:
   mkdocs-material:
+    if: github.event.pull_request.draft == false
     runs-on: ubuntu-latest
     steps:
       - name: checkout sources
1 .github/workflows/pyflakes.yml vendored
@@ -9,6 +9,7 @@ on:
 jobs:
   pyflakes:
     name: runner / pyflakes
+    if: github.event.pull_request.draft == false
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
@@ -1,59 +1,71 @@
-FROM python:3.10-slim AS builder
+# syntax=docker/dockerfile:1
+FROM python:3.9-slim AS python-base

 # use bash
 SHELL [ "/bin/bash", "-c" ]

 # Install necesarry packages
-RUN apt-get update \
+RUN \
+  --mount=type=cache,target=/var/cache/apt,sharing=locked \
+  --mount=type=cache,target=/var/lib/apt,sharing=locked \
+  apt-get update \
+  && apt-get install -y \
+    --no-install-recommends \
+    libgl1-mesa-glx=20.3.* \
+    libglib2.0-0=2.66.* \
+    libopencv-dev=4.5.* \
+  && apt-get clean \
+  && rm -rf /var/lib/apt/lists/*
+
+ARG APPDIR=/usr/src/app
+ENV APPDIR ${APPDIR}
+WORKDIR ${APPDIR}
+
+FROM python-base AS builder
+
+RUN \
+  --mount=type=cache,target=/var/cache/apt,sharing=locked \
+  --mount=type=cache,target=/var/lib/apt,sharing=locked \
+  apt-get update \
   && apt-get install -y \
     --no-install-recommends \
     gcc=4:10.2.* \
-    libgl1-mesa-glx=20.3.* \
-    libglib2.0-0=2.66.* \
     python3-dev=3.9.* \
   && apt-get clean \
   && rm -rf /var/lib/apt/lists/*

-# set WORKDIR, PATH and copy sources
-ARG APPDIR=/usr/src/app
-WORKDIR ${APPDIR}
-ENV PATH ${APPDIR}/.venv/bin:$PATH
-ARG PIP_REQUIREMENTS=requirements-lin-cuda.txt
-COPY . ./environments-and-requirements/${PIP_REQUIREMENTS} ./
+# copy sources
+COPY --link . .
+ARG PIP_EXTRA_INDEX_URL
+ENV PIP_EXTRA_INDEX_URL ${PIP_EXTRA_INDEX_URL}

 # install requirements
-RUN python3 -m venv .venv \
-  && pip install \
-    --upgrade \
+RUN python3 -m venv invokeai \
+  && ${APPDIR}/invokeai/bin/pip \
+    install \
     --no-cache-dir \
-    'wheel>=0.38.4' \
-  && pip install \
-    --no-cache-dir \
-    -r ${PIP_REQUIREMENTS}
+    --use-pep517 \
+    .

-FROM python:3.10-slim AS runtime
+FROM python-base AS runtime

 # setup environment
-ARG APPDIR=/usr/src/app
-WORKDIR ${APPDIR}
-COPY --from=builder ${APPDIR} .
-ENV \
-  PATH=${APPDIR}/.venv/bin:$PATH \
-  INVOKEAI_ROOT=/data \
-  INVOKE_MODEL_RECONFIGURE=--yes
+COPY --link . .
+COPY --from=builder ${APPDIR}/invokeai ${APPDIR}/invokeai
+ENV PATH=${APPDIR}/invokeai/bin:$PATH
+ENV INVOKEAI_ROOT=/data
+ENV INVOKE_MODEL_RECONFIGURE="--yes --default_only"

-# Install necesarry packages
-RUN apt-get update \
+# build patchmatch
+RUN \
+  --mount=type=cache,target=/var/cache/apt,sharing=locked \
+  --mount=type=cache,target=/var/lib/apt,sharing=locked \
+  apt-get update \
   && apt-get install -y \
     --no-install-recommends \
     build-essential=12.9 \
-    libgl1-mesa-glx=20.3.* \
-    libglib2.0-0=2.66.* \
-    libopencv-dev=4.5.* \
-  && ln -sf \
-    /usr/lib/"$(arch)"-linux-gnu/pkgconfig/opencv4.pc \
-    /usr/lib/"$(arch)"-linux-gnu/pkgconfig/opencv.pc \
-  && python3 -c "from patchmatch import patch_match" \
+  && PYTHONDONTWRITEBYTECODE=1 \
+    python3 -c "from patchmatch import patch_match" \
   && apt-get remove -y \
     --autoremove \
     build-essential \
@@ -61,5 +73,6 @@ RUN apt-get update \
   && rm -rf /var/lib/apt/lists/*

 # set Entrypoint and default CMD
-ENTRYPOINT [ "python3", "scripts/invoke.py" ]
+ENTRYPOINT [ "invoke" ]
 CMD [ "--web", "--host=0.0.0.0" ]
+VOLUME [ "/data" ]
@@ -2,18 +2,25 @@
 set -e

 # How to use: https://invoke-ai.github.io/InvokeAI/installation/INSTALL_DOCKER/#setup
+#
+# Some possible pip extra-index urls (cuda 11.7 is available without extra url):
+#
+# CUDA 11.6: https://download.pytorch.org/whl/cu116
+# ROCm 5.2: https://download.pytorch.org/whl/rocm5.2
+# CPU: https://download.pytorch.org/whl/cpu
+#
+# as found on https://pytorch.org/get-started/locally/

-source ./docker-build/env.sh \
-  || echo "please execute docker-build/build.sh from repository root" \
-  || exit 1
+cd "$(dirname "$0")" || exit 1

-PIP_REQUIREMENTS=${PIP_REQUIREMENTS:-requirements-lin-cuda.txt}
-DOCKERFILE=${INVOKE_DOCKERFILE:-docker-build/Dockerfile}
+source ./env.sh
+
+DOCKERFILE=${INVOKE_DOCKERFILE:-"./Dockerfile"}

 # print the settings
 echo -e "You are using these values:\n"
 echo -e "Dockerfile:\t ${DOCKERFILE}"
-echo -e "Requirements:\t ${PIP_REQUIREMENTS}"
+echo -e "extra-index-url: ${PIP_EXTRA_INDEX_URL:-none}"
 echo -e "Volumename:\t ${VOLUMENAME}"
 echo -e "arch:\t\t ${ARCH}"
 echo -e "Platform:\t ${PLATFORM}"
@@ -30,6 +37,6 @@ fi
 docker build \
   --platform="${PLATFORM}" \
   --tag="${INVOKEAI_TAG}" \
-  --build-arg="PIP_REQUIREMENTS=${PIP_REQUIREMENTS}" \
+  ${PIP_EXTRA_INDEX_URL:+--build-arg=PIP_EXTRA_INDEX_URL="${PIP_EXTRA_INDEX_URL}"} \
   --file="${DOCKERFILE}" \
-  .
+  ..
@@ -7,4 +7,4 @@ ARCH=${ARCH:-$(uname -m)}
 PLATFORM=${PLATFORM:-Linux/${ARCH}}
 CONTAINER_FLAVOR=${CONTAINER_FLAVOR:-cuda}
 INVOKEAI_BRANCH=$(git branch --show)
-INVOKEAI_TAG=${REPOSITORY_NAME,,}-${CONTAINER_FLAVOR}:${INVOKEAI_TAG:-${INVOKEAI_BRANCH/\//-}}
+INVOKEAI_TAG=${REPOSITORY_NAME,,}-${CONTAINER_FLAVOR}:${INVOKEAI_TAG:-${INVOKEAI_BRANCH##*/}}
@@ -4,17 +4,14 @@ set -e
 # How to use: https://invoke-ai.github.io/InvokeAI/installation/INSTALL_DOCKER/#run-the-container
 # IMPORTANT: You need to have a token on huggingface.co to be able to download the checkpoints!!!

-source ./docker-build/env.sh \
-  || echo "please run from repository root" \
-  || exit 1
+cd "$(dirname "$0")" || exit 1

-# check if HUGGINGFACE_TOKEN is available
-# You must have accepted the terms of use for required models
-HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN:?Please set your token for Huggingface as HUGGINGFACE_TOKEN}
+source ./env.sh

 echo -e "You are using these values:\n"
-echo -e "Volumename:\t ${VOLUMENAME}"
-echo -e "Invokeai_tag:\t ${INVOKEAI_TAG}\n"
+echo -e "Volumename:\t${VOLUMENAME}"
+echo -e "Invokeai_tag:\t${INVOKEAI_TAG}"
+echo -e "local Models:\t${MODELSPATH:-unset}\n"

 docker run \
   --interactive \
@@ -23,8 +20,10 @@ docker run \
   --platform="$PLATFORM" \
   --name="${REPOSITORY_NAME,,}" \
   --hostname="${REPOSITORY_NAME,,}" \
-  --mount="source=$VOLUMENAME,target=/data" \
-  --env="HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN}" \
+  --mount=source="$VOLUMENAME",target=/data \
+  ${MODELSPATH:+-u "$(id -u):$(id -g)"} \
+  ${MODELSPATH:+--mount=type=bind,source=${MODELSPATH},target=/data/models} \
+  ${HUGGING_FACE_HUB_TOKEN:+--env=HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}} \
   --publish=9090:9090 \
   --cap-add=sys_nice \
   ${GPU_FLAGS:+--gpus=${GPU_FLAGS}} \
@@ -52,12 +52,17 @@ introduces several changes you should know about.
       path: models/diffusers/hakurei-haifu-diffusion-1.4
    ```

-2. The format of the models directory has changed to mimic the
-   HuggingFace cache directory. By default, diffusers models are
-   now automatically downloaded and retrieved from the directory
-   `ROOTDIR/models/diffusers`, while other models are stored in
-   the directory `ROOTDIR/models/hub`. This organization is the
-   same as that used by HuggingFace for its cache management.
+2. In order of precedence, InvokeAI will now use HF_HOME, then
+   XDG_CACHE_HOME, then finally default to `ROOTDIR/models` to
+   store HuggingFace diffusers models.
+
+   Consequently, the format of the models directory has changed to
+   mimic the HuggingFace cache directory. When HF_HOME and XDG_HOME
+   are not set, diffusers models are now automatically downloaded
+   and retrieved from the directory `ROOTDIR/models/diffusers`,
+   while other models are stored in the directory
+   `ROOTDIR/models/hub`. This organization is the same as that used
+   by HuggingFace for its cache management.

    This allows you to share diffusers and ckpt model files easily with
    other machine learning applications that use the HuggingFace
@@ -66,7 +71,13 @@ introduces several changes you should know about.
    cache models in. To tell InvokeAI to use the standard HuggingFace
    cache directory, you would set HF_HOME like this (Linux/Mac):

-   `export HF_HOME=~/.cache/hugging_face`
+   `export HF_HOME=~/.cache/huggingface`
+
+   Both HuggingFace and InvokeAI will fall back to the XDG_CACHE_HOME
+   environment variable if HF_HOME is not set; this path
+   takes precedence over `ROOTDIR/models` to allow for the same sharing
+   with other machine learning applications that use HuggingFace
+   libraries.

 3. If you upgrade to InvokeAI 2.3.* from an earlier version, there
    will be a one-time migration from the old models directory format
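The precedence described above (HF_HOME, then XDG_CACHE_HOME, then `ROOTDIR/models`) can be illustrated with a small sketch. The function name and the exact cache sub-paths below are illustrative assumptions, not InvokeAI's actual implementation:

```python
import os
from pathlib import Path

def resolve_diffusers_cache(root_dir: str) -> Path:
    """Hypothetical sketch of the lookup order described above:
    HF_HOME first, then XDG_CACHE_HOME, then ROOTDIR/models."""
    hf_home = os.environ.get("HF_HOME")
    if hf_home:
        return Path(hf_home) / "diffusers"
    xdg_cache = os.environ.get("XDG_CACHE_HOME")
    if xdg_cache:
        return Path(xdg_cache) / "huggingface" / "diffusers"
    return Path(root_dir) / "models" / "diffusers"

# With neither variable set, models land under ROOTDIR/models/diffusers.
print(resolve_diffusers_cache(os.path.expanduser("~/invokeai")))
```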
@@ -239,28 +239,24 @@ Generate an image with a given prompt, record the seed of the image, and then
 use the `prompt2prompt` syntax to substitute words in the original prompt for
 words in a new prompt. This works for `img2img` as well.

-- `a ("fluffy cat").swap("smiling dog") eating a hotdog`.
-- quotes optional: `a (fluffy cat).swap(smiling dog) eating a hotdog`.
-- for single word substitutions parentheses are also optional:
-  `a cat.swap(dog) eating a hotdog`.
-- Supports options `s_start`, `s_end`, `t_start`, `t_end` (each 0-1) loosely
-  corresponding to bloc97's `prompt_edit_spatial_start/_end` and
-  `prompt_edit_tokens_start/_end` but with the math swapped to make it easier to
-  intuitively understand.
-- Example usage:`a (cat).swap(dog, s_end=0.3) eating a hotdog` - the `s_end`
-  argument means that the "spatial" (self-attention) edit will stop having any
-  effect after 30% (=0.3) of the steps have been done, leaving Stable
-  Diffusion with 70% of the steps where it is free to decide for itself how to
-  reshape the cat-form into a dog form.
-- The numbers represent a percentage through the step sequence where the edits
-  should happen. 0 means the start (noisy starting image), 1 is the end (final
-  image).
-- For img2img, the step sequence does not start at 0 but instead at
-  (1-strength) - so if strength is 0.7, s_start and s_end must both be
-  greater than 0.3 (1-0.7) to have any effect.
-- Convenience option `shape_freedom` (0-1) to specify how much "freedom" Stable
-  Diffusion should have to change the shape of the subject being swapped.
-  - `a (cat).swap(dog, shape_freedom=0.5) eating a hotdog`.
+For example, consider the prompt `a cat.swap(dog) playing with a ball in the forest`. Normally, because of the way words interact with each other during stable diffusion image generation, these two prompts would generate different compositions:
+- `a cat playing with a ball in the forest`
+- `a dog playing with a ball in the forest`
+
+| `a cat playing with a ball in the forest` | `a dog playing with a ball in the forest` |
+| --- | --- |
+| img | img |
+
+- For multiple word swaps, use parentheses: `a (fluffy cat).swap(barking dog) playing with a ball in the forest`.
+- To swap a comma, use quotes: `a ("fluffy, grey cat").swap("big, barking dog") playing with a ball in the forest`.
+- Supports options `t_start` and `t_end` (each 0-1) loosely corresponding to bloc97's `prompt_edit_tokens_start/_end` but with the math swapped to make it easier to
+  intuitively understand. `t_start` and `t_end` are used to control on which steps cross-attention control should run. With the default values `t_start=0` and `t_end=1`, cross-attention control is active on every step of image generation. Other values can be used to turn cross-attention control off for part of the image generation process.
+- For example, if doing a diffusion with 10 steps for the prompt `a cat.swap(dog, t_start=0.3, t_end=1.0) playing with a ball in the forest`, the first 3 steps will be run as `a cat playing with a ball in the forest`, while the last 7 steps will run as `a dog playing with a ball in the forest`, but the pixels that represent `dog` will be locked to the pixels that would have represented `cat` if the `cat` prompt had been used instead.
+- Conversely, for `a cat.swap(dog, t_start=0, t_end=0.7) playing with a ball in the forest`, the first 7 steps will run as `a dog playing with a ball in the forest` with the pixels that represent `dog` locked to the same pixels that would have represented `cat` if the `cat` prompt was being used instead. The final 3 steps will just run `a cat playing with a ball in the forest`.
+
+> For img2img, the step sequence does not start at 0 but instead at `(1.0-strength)` - so if the img2img `strength` is `0.7`, `t_start` and `t_end` must both be greater than `0.3` (`1.0-0.7`) to have any effect.
+
+Prompt2prompt `.swap()` is not compatible with xformers, which will be temporarily disabled when doing a `.swap()` - so you should expect to use more VRAM and run slower than with xformers enabled.

 The `prompt2prompt` code is based off
 [bloc97's colab](https://github.com/bloc97/CrossAttentionControl).
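As a rough illustration of the `t_start`/`t_end` arithmetic described in the new documentation text (a sketch only; the function below is not InvokeAI code, and the exact rounding it uses is an assumption):

```python
import math

def swap_window(num_steps: int, t_start: float, t_end: float, strength: float = 1.0) -> range:
    """Illustrative only: the step indices on which a .swap() edit is applied.
    For img2img the usable range begins (1.0 - strength) of the way through."""
    first_usable = math.floor((1.0 - strength) * num_steps)
    start = max(first_usable, math.floor(t_start * num_steps))
    end = min(num_steps, math.ceil(t_end * num_steps))
    return range(start, end)

# 10 steps, t_start=0.3, t_end=1.0 -> the edit applies on steps 3..9,
# matching "the first 3 steps run with the original prompt" above.
print(list(swap_window(10, 0.3, 1.0)))
```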
@@ -2,4 +2,5 @@
 -r environments-and-requirements/requirements-base.txt
 torch>=1.13.1
 torchvision>=0.14.1
+xformers~=0.16
 -e .
@@ -3,4 +3,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu117 --trusted-host https://download.pytorch.org
 torch==1.13.1
 torchvision==0.14.1
+xformers~=0.16
 -e .
@@ -20,6 +20,7 @@ import torch
 import transformers
 from PIL import Image, ImageOps
 from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.utils.import_utils import is_xformers_available
 from omegaconf import OmegaConf
 from pytorch_lightning import seed_everything, logging

@@ -203,6 +204,14 @@ class Generate:
         self.precision = choose_precision(self.device)
         Globals.full_precision = self.precision=='float32'

+        if is_xformers_available():
+            if not Globals.disable_xformers:
+                print('>> xformers memory-efficient attention is available and enabled')
+            else:
+                print('>> xformers memory-efficient attention is available but disabled')
+        else:
+            print('>> xformers not installed')
+
         # model caching system for fast switching
         self.model_manager = ModelManager(mconfig,self.device,self.precision,max_loaded_models=max_loaded_models)
         # don't accept invalid models
@@ -53,10 +53,11 @@ def main():

     if not args.conf:
         if not os.path.exists(os.path.join(Globals.root,'configs','models.yaml')):
-            print(f"\n** Error. The file {os.path.join(Globals.root,'configs','models.yaml')} could not be found.")
-            print('** Please check the location of your invokeai directory and use the --root_dir option to point to the correct path.')
-            print('** This script will now exit.')
-            sys.exit(-1)
+            report_model_error(opt, e)
+            # print(f"\n** Error. The file {os.path.join(Globals.root,'configs','models.yaml')} could not be found.")
+            # print('** Please check the location of your invokeai directory and use the --root_dir option to point to the correct path.')
+            # print('** This script will now exit.')
+            # sys.exit(-1)

     print(f'>> {ldm.invoke.__app_name__}, version {ldm.invoke.__version__}')
     print(f'>> InvokeAI runtime directory is "{Globals.root}"')
@@ -789,8 +790,8 @@ def _get_model_name(existing_names,completer,default_name:str='')->str:
     model_name = input(f'Short name for this model [{default_name}]: ').strip()
     if len(model_name)==0:
         model_name = default_name
-    if not re.match('^[\w._+-]+$',model_name):
-        print('** model name must contain only words, digits and the characters "._+-" **')
+    if not re.match('^[\w._+:/-]+$',model_name):
+        print('** model name must contain only words, digits and the characters "._+:/-" **')
     elif model_name != default_name and model_name in existing_names:
         print(f'** the name {model_name} is already in use. Pick another.')
     else:
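The loosened pattern now admits `:` and `/`, which presumably lets model short names look like HuggingFace-style repo IDs. A quick standalone check of the regex itself (the sample names are made up for illustration):

```python
import re

# The new pattern from the diff above.
pattern = r'^[\w._+:/-]+$'

for name in ['stable-diffusion-1.5', 'runwayml/stable-diffusion-v1-5', 'sd:2.1', 'bad name!']:
    print(name, bool(re.match(pattern, name)))
# The first three match; 'bad name!' is rejected because of the space and '!'.
```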
@@ -24,9 +24,6 @@ from ...models.diffusion import cross_attention_control
 from ...models.diffusion.cross_attention_map_saving import AttentionMapSaver
 from ...modules.prompt_to_embeddings_converter import WeightedPromptFragmentsToEmbeddingsConverter

-# monkeypatch diffusers CrossAttention 🙈
-# this is to make prompt2prompt and (future) attention maps work
-attention.CrossAttention = cross_attention_control.InvokeAIDiffusersCrossAttention

 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
@@ -295,7 +292,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
-        self.invokeai_diffuser = InvokeAIDiffuserComponent(self.unet, self._unet_forward)
+        self.invokeai_diffuser = InvokeAIDiffuserComponent(self.unet, self._unet_forward, is_running_diffusers=True)
         use_full_precision = (precision == 'float32' or precision == 'autocast')
         self.textual_inversion_manager = TextualInversionManager(tokenizer=self.tokenizer,
                                                                  text_encoder=self.text_encoder,
@@ -307,8 +304,23 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
                                                                  textual_inversion_manager=self.textual_inversion_manager
                                                                  )

+        self._enable_memory_efficient_attention()
+
+
+    def _enable_memory_efficient_attention(self):
+        """
+        if xformers is available, use it, otherwise use sliced attention.
+        """
         if is_xformers_available() and not Globals.disable_xformers:
             self.enable_xformers_memory_efficient_attention()
+        else:
+            if torch.backends.mps.is_available():
+                # until pytorch #91617 is fixed, slicing is borked on MPS
+                # https://github.com/pytorch/pytorch/issues/91617
+                # fix is in https://github.com/kulinseth/pytorch/pull/222 but no idea when it will get merged to pytorch mainline.
+                pass
+            else:
+                self.enable_attention_slicing(slice_size='auto')

     def image_from_embeddings(self, latents: torch.Tensor, num_inference_steps: int,
                               conditioning_data: ConditioningData,
@@ -373,11 +385,9 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
         if additional_guidance is None:
             additional_guidance = []
         extra_conditioning_info = conditioning_data.extra
-        if extra_conditioning_info is not None and extra_conditioning_info.wants_cross_attention_control:
-            self.invokeai_diffuser.setup_cross_attention_control(extra_conditioning_info,
-                                                                 step_count=len(self.scheduler.timesteps))
-        else:
-            self.invokeai_diffuser.remove_cross_attention_control()
+        with self.invokeai_diffuser.custom_attention_context(extra_conditioning_info=extra_conditioning_info,
+                                                             step_count=len(self.scheduler.timesteps)
+                                                             ):

             yield PipelineIntermediateState(run_id=run_id, step=-1, timestep=self.scheduler.num_train_timesteps,
                                             latents=latents)
@@ -388,7 +398,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
             latents = self.scheduler.add_noise(latents, noise, batched_t)

             attention_map_saver: Optional[AttentionMapSaver] = None
-            self.invokeai_diffuser.remove_attention_map_saving()
             for i, t in enumerate(self.progress_bar(timesteps)):
                 batched_t.fill_(t)
                 step_output = self.step(batched_t, latents, conditioning_data,
@@ -398,16 +408,16 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
                 latents = step_output.prev_sample
                 predicted_original = getattr(step_output, 'pred_original_sample', None)

-                if i == len(timesteps)-1 and extra_conditioning_info is not None:
-                    eos_token_index = extra_conditioning_info.tokens_count_including_eos_bos - 1
-                    attention_map_token_ids = range(1, eos_token_index)
-                    attention_map_saver = AttentionMapSaver(token_ids=attention_map_token_ids, latents_shape=latents.shape[-2:])
-                    self.invokeai_diffuser.setup_attention_map_saving(attention_map_saver)
+                # TODO resuscitate attention map saving
+                #if i == len(timesteps)-1 and extra_conditioning_info is not None:
+                #    eos_token_index = extra_conditioning_info.tokens_count_including_eos_bos - 1
+                #    attention_map_token_ids = range(1, eos_token_index)
+                #    attention_map_saver = AttentionMapSaver(token_ids=attention_map_token_ids, latents_shape=latents.shape[-2:])
+                #    self.invokeai_diffuser.setup_attention_map_saving(attention_map_saver)

                 yield PipelineIntermediateState(run_id=run_id, step=i, timestep=int(t), latents=latents,
                                                 predicted_original=predicted_original, attention_map_saver=attention_map_saver)

-        self.invokeai_diffuser.remove_attention_map_saving()
         return latents, attention_map_saver

     @torch.inference_mode()
@@ -447,7 +457,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):

         return step_output

-    def _unet_forward(self, latents, t, text_embeddings):
+    def _unet_forward(self, latents, t, text_embeddings, cross_attention_kwargs: Optional[dict[str,Any]] = None):
         """predict the noise residual"""
         if is_inpainting_model(self.unet) and latents.size(1) == 4:
             # Pad out normal non-inpainting inputs for an inpainting model.
@@ -460,7 +470,10 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
                 initial_image_latents=torch.zeros_like(latents[:1], device=latents.device, dtype=latents.dtype)
             ).add_mask_channels(latents)

-        return self.unet(latents, t, encoder_hidden_states=text_embeddings).sample
+        return self.unet(sample=latents,
+                         timestep=t,
+                         encoder_hidden_states=text_embeddings,
+                         cross_attention_kwargs=cross_attention_kwargs).sample

     def img2img_from_embeddings(self,
                                 init_image: Union[torch.FloatTensor, PIL.Image.Image],
@@ -531,6 +544,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
             init_image = image_resized_to_grid_as_tensor(init_image.convert('RGB'))

         init_image = init_image.to(device=device, dtype=latents_dtype)
+        mask = mask.to(device=device, dtype=latents_dtype)

         if init_image.dim() == 3:
             init_image = init_image.unsqueeze(0)
@@ -549,17 +563,22 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):

         if mask.dim() == 3:
             mask = mask.unsqueeze(0)
-        mask = tv_resize(mask, init_image_latents.shape[-2:], T.InterpolationMode.BILINEAR) \
+        latent_mask = tv_resize(mask, init_image_latents.shape[-2:], T.InterpolationMode.BILINEAR) \
             .to(device=device, dtype=latents_dtype)

         guidance: List[Callable] = []

         if is_inpainting_model(self.unet):
+            # You'd think the inpainting model wouldn't be paying attention to the area it is going to repaint
+            # (that's why there's a mask!) but it seems to really want that blanked out.
+            masked_init_image = init_image * torch.where(mask < 0.5, 1, 0)
+            masked_latents = self.non_noised_latents_from_image(masked_init_image, device=device, dtype=latents_dtype)
+
             # TODO: we should probably pass this in so we don't have to try/finally around setting it.
             self.invokeai_diffuser.model_forward_callback = \
-                AddsMaskLatents(self._unet_forward, mask, init_image_latents)
+                AddsMaskLatents(self._unet_forward, latent_mask, masked_latents)
         else:
-            guidance.append(AddsMaskGuidance(mask, init_image_latents, self.scheduler, noise))
+            guidance.append(AddsMaskGuidance(latent_mask, init_image_latents, self.scheduler, noise))

         try:
             result_latents, result_attention_maps = self.latents_from_embeddings(
@@ -578,11 +597,20 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
         output = InvokeAIStableDiffusionPipelineOutput(images=image, nsfw_content_detected=[], attention_map_saver=result_attention_maps)
         return self.check_for_safety(output, dtype=conditioning_data.dtype)

-    def non_noised_latents_from_image(self, init_image, *, device, dtype):
+    def non_noised_latents_from_image(self, init_image, *, device: torch.device, dtype):
         init_image = init_image.to(device=device, dtype=dtype)
         with torch.inference_mode():
+            if device.type == 'mps':
+                # workaround for torch MPS bug that has been fixed in https://github.com/kulinseth/pytorch/pull/222
+                # TODO remove this workaround once kulinseth#222 is merged to pytorch mainline
+                self.vae.to('cpu')
+                init_image = init_image.to('cpu')
             init_latent_dist = self.vae.encode(init_image).latent_dist
             init_latents = init_latent_dist.sample().to(dtype=dtype)  # FIXME: uses torch.randn. make reproducible!
+            if device.type == 'mps':
+                self.vae.to(device)
+                init_latents = init_latents.to(device)
+
         init_latents = 0.18215 * init_latents
         return init_latents
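The new `_enable_memory_efficient_attention()` helper prefers xformers and falls back to sliced attention, except on MPS where slicing was broken at the time. A standalone sketch of the same decision for a stock diffusers pipeline (not InvokeAI's class; the model id is just a placeholder):

```python
import torch
from diffusers import StableDiffusionPipeline
from diffusers.utils.import_utils import is_xformers_available

# Placeholder model id; any diffusers Stable Diffusion checkpoint behaves the same way.
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

if is_xformers_available():
    # memory-efficient attention via the xformers package
    pipe.enable_xformers_memory_efficient_attention()
elif torch.backends.mps.is_available():
    # attention slicing was unreliable on MPS at the time of this change
    # (see pytorch issue #91617), so leave the default attention in place
    pass
else:
    # fall back to sliced attention to reduce peak VRAM usage
    pipe.enable_attention_slicing(slice_size="auto")
```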
@@ -19,10 +19,12 @@ from ldm.util import debug_image


 def infill_methods()->list[str]:
-    methods = list()
+    methods = [
+        "tile",
+        "solid",
+    ]
     if PatchMatch.patchmatch_available():
-        methods.append('patchmatch')
-    methods.append('tile')
+        methods.insert(0, 'patchmatch')
     return methods

 class Inpaint(Img2Img):
@@ -182,6 +184,7 @@ class Inpaint(Img2Img):
             infill_method = None,
             inpaint_width=None,
             inpaint_height=None,
+            inpaint_fill:tuple(int)=(0x7F, 0x7F, 0x7F, 0xFF),
             attention_maps_callback=None,
             **kwargs):
         """
@@ -202,12 +205,17 @@ class Inpaint(Img2Img):
         # Do infill
         if infill_method == 'patchmatch' and PatchMatch.patchmatch_available():
             init_filled = self.infill_patchmatch(self.pil_image.copy())
-        else: # if infill_method == 'tile': # Only two methods right now, so always use 'tile' if not patchmatch
+        elif infill_method == 'tile':
             init_filled = self.tile_fill_missing(
                 self.pil_image.copy(),
                 seed = self.seed,
                 tile_size = tile_size
             )
+        elif infill_method == 'solid':
+            solid_bg = PIL.Image.new("RGBA", init_image.size, inpaint_fill)
+            init_filled = PIL.Image.alpha_composite(solid_bg, init_image)
+        else:
+            raise ValueError(f"Non-supported infill type {infill_method}", infill_method)
         init_filled.paste(init_image, (0,0), init_image.split()[-1])

         # Resize if requested for inpainting
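The new 'solid' infill simply composites the image over an opaque grey background. A minimal PIL sketch of the same idea outside InvokeAI (the file names are illustrative; the fill colour matches the new `inpaint_fill` default):

```python
import PIL.Image

# Illustrative input: any RGBA image whose transparent areas should be infilled.
init_image = PIL.Image.open("masked_input.png").convert("RGBA")

inpaint_fill = (0x7F, 0x7F, 0x7F, 0xFF)  # opaque mid-grey, as in the new default
solid_bg = PIL.Image.new("RGBA", init_image.size, inpaint_fill)

# Transparent pixels take the solid colour; opaque pixels keep the original image.
init_filled = PIL.Image.alpha_composite(solid_bg, init_image)
init_filled.save("solid_infilled.png")
```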
@@ -3,10 +3,10 @@ ldm.invoke.generator.txt2img inherits from ldm.invoke.generator
 '''

 import math
-from diffusers.utils.logging import get_verbosity, set_verbosity, set_verbosity_error
 from typing import Callable, Optional

 import torch
+from diffusers.utils.logging import get_verbosity, set_verbosity, set_verbosity_error

 from ldm.invoke.generator.base import Generator
 from ldm.invoke.generator.diffusers_pipeline import trim_to_multiple_of, StableDiffusionGeneratorPipeline, \
@@ -38,10 +38,6 @@ class Txt2Img2Img(Generator):
                 uc, c, cfg_scale, extra_conditioning_info,
                 threshold = ThresholdSettings(threshold, warmup=0.2) if threshold else None)
             .add_scheduler_args_if_applicable(pipeline.scheduler, eta=ddim_eta))
-        scale_dim = min(width, height)
-        scale = 512 / scale_dim
-
-        init_width, init_height = trim_to_multiple_of(scale * width, scale * height)

         def make_image(x_T):

@@ -54,6 +50,10 @@ class Txt2Img2Img(Generator):
                 # TODO: threshold = threshold,
             )

+            # Get our initial generation width and height directly from the latent output so
+            # the message below is accurate.
+            init_width = first_pass_latent_output.size()[3] * self.downsampling_factor
+            init_height = first_pass_latent_output.size()[2] * self.downsampling_factor
             print(
                 f"\n>> Interpolating from {init_width}x{init_height} to {width}x{height} using DDIM sampling"
             )
@@ -106,27 +106,35 @@ class Txt2Img2Img(Generator):
     def get_noise(self,width,height,scale = True):
         # print(f"Get noise: {width}x{height}")
         if scale:
-            trained_square = 512 * 512
-            actual_square = width * height
-            scale = math.sqrt(trained_square / actual_square)
-            scaled_width = math.ceil(scale * width / 64) * 64
-            scaled_height = math.ceil(scale * height / 64) * 64
+            # Scale the input width and height for the initial generation
+            # Make their area equivalent to the model's resolution area (e.g. 512*512 = 262144),
+            # while keeping the minimum dimension at least 0.5 * resolution (e.g. 512*0.5 = 256)
+
+            aspect = width / height
+            dimension = self.model.unet.config.sample_size * self.model.vae_scale_factor
+            min_dimension = math.floor(dimension * 0.5)
+            model_area = dimension * dimension  # hardcoded for now since all models are trained on square images
+
+            if aspect > 1.0:
+                init_height = max(min_dimension, math.sqrt(model_area / aspect))
+                init_width = init_height * aspect
+            else:
+                init_width = max(min_dimension, math.sqrt(model_area * aspect))
+                init_height = init_width / aspect
+
+            scaled_width, scaled_height = trim_to_multiple_of(math.floor(init_width), math.floor(init_height))
+
         else:
             scaled_width = width
             scaled_height = height

         device = self.model.device
+        channels = self.latent_channels
+        if channels == 9:
+            channels = 4  # we don't really want noise for all the mask channels
+        shape = (1, channels,
+                 scaled_height // self.downsampling_factor, scaled_width // self.downsampling_factor)
         if self.use_mps_noise or device.type == 'mps':
-            return torch.randn([1,
-                                self.latent_channels,
-                                scaled_height // self.downsampling_factor,
-                                scaled_width // self.downsampling_factor],
-                               dtype=self.torch_dtype(),
-                               device='cpu').to(device)
+            return torch.randn(shape, dtype=self.torch_dtype(), device='cpu').to(device)
         else:
-            return torch.randn([1,
-                                self.latent_channels,
-                                scaled_height // self.downsampling_factor,
-                                scaled_width // self.downsampling_factor],
-                               dtype=self.torch_dtype(),
-                               device=device)
+            return torch.randn(shape, dtype=self.torch_dtype(), device=device)
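A quick numeric check of the new scaling logic (a sketch only; `trim_to_multiple_of` is assumed here to round each dimension down to a multiple of 8, which is not necessarily the exact multiple InvokeAI uses):

```python
import math

def trim_to_multiple_of(w, h, multiple=8):
    # assumption: round each dimension down to the nearest multiple (8 here)
    return (w - w % multiple, h - h % multiple)

# Example: ask for a 768x512 image with a model trained at 512x512.
width, height = 768, 512
dimension = 512                      # unet sample_size * vae_scale_factor
model_area = dimension * dimension   # 262144
min_dimension = math.floor(dimension * 0.5)

aspect = width / height              # 1.5
if aspect > 1.0:
    init_height = max(min_dimension, math.sqrt(model_area / aspect))  # ~418.1
    init_width = init_height * aspect                                 # ~627.2
else:
    init_width = max(min_dimension, math.sqrt(model_area * aspect))
    init_height = init_width / aspect

print(trim_to_multiple_of(math.floor(init_width), math.floor(init_height)))
# -> (624, 416): roughly 512*512 worth of pixels at the requested 3:2 aspect
```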
@@ -125,7 +125,7 @@ class ModelManager(object):
         Set the default model. The change will not take
         effect until you call model_manager.commit()
         '''
-        assert model_name in self.models,f"unknown model '{model_name}'"
+        assert model_name in self.model_names(), f"unknown model '{model_name}'"

         config = self.config
         for model in config:
@@ -155,7 +155,7 @@ class CrossAttentionControlSubstitute(CrossAttentionControlledFragment):
         default_options = {
             's_start': 0.0,
             's_end': 0.2062994740159002,  # ~= shape_freedom=0.5
-            't_start': 0.0,
+            't_start': 0.1,
             't_end': 1.0
         }
         merged_options = default_options
@ -7,8 +7,10 @@ import torch
|
|||||||
import diffusers
|
import diffusers
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from diffusers.models.unet_2d_condition import UNet2DConditionModel
|
from diffusers.models.unet_2d_condition import UNet2DConditionModel
|
||||||
|
from diffusers.models.cross_attention import AttnProcessor
|
||||||
from ldm.invoke.devices import torch_dtype
|
from ldm.invoke.devices import torch_dtype
|
||||||
|
|
||||||
|
|
||||||
# adapted from bloc97's CrossAttentionControl colab
|
# adapted from bloc97's CrossAttentionControl colab
|
||||||
# https://github.com/bloc97/CrossAttentionControl
|
# https://github.com/bloc97/CrossAttentionControl
|
||||||
|
|
||||||
@ -304,11 +306,15 @@ class InvokeAICrossAttentionMixin:
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
def remove_cross_attention_control(model):
|
def restore_default_cross_attention(model, is_running_diffusers: bool, restore_attention_processor: Optional[AttnProcessor]=None):
|
||||||
|
if is_running_diffusers:
|
||||||
|
unet = model
|
||||||
|
unet.set_attn_processor(restore_attention_processor or CrossAttnProcessor())
|
||||||
|
else:
|
||||||
remove_attention_function(model)
|
remove_attention_function(model)
|
||||||
|
|
||||||
|
|
||||||
def setup_cross_attention_control(model, context: Context):
|
def override_cross_attention(model, context: Context, is_running_diffusers = False):
|
||||||
"""
|
"""
|
||||||
Inject attention parameters and functions into the passed in model to enable cross attention editing.
|
Inject attention parameters and functions into the passed in model to enable cross attention editing.
|
||||||
|
|
||||||
@ -323,7 +329,7 @@ def setup_cross_attention_control(model, context: Context):
|
|||||||
# urgh. should this be hardcoded?
|
# urgh. should this be hardcoded?
|
||||||
max_length = 77
|
max_length = 77
|
||||||
# mask=1 means use base prompt attention, mask=0 means use edited prompt attention
|
# mask=1 means use base prompt attention, mask=0 means use edited prompt attention
|
||||||
mask = torch.zeros(max_length)
|
mask = torch.zeros(max_length, dtype=torch_dtype(device))
|
||||||
indices_target = torch.arange(max_length, dtype=torch.long)
|
indices_target = torch.arange(max_length, dtype=torch.long)
|
||||||
indices = torch.arange(max_length, dtype=torch.long)
|
indices = torch.arange(max_length, dtype=torch.long)
|
||||||
for name, a0, a1, b0, b1 in context.arguments.edit_opcodes:
|
for name, a0, a1, b0, b1 in context.arguments.edit_opcodes:
|
||||||
@ -333,10 +339,26 @@ def setup_cross_attention_control(model, context: Context):
|
|||||||
indices[b0:b1] = indices_target[a0:a1]
|
indices[b0:b1] = indices_target[a0:a1]
|
||||||
mask[b0:b1] = 1
|
mask[b0:b1] = 1
|
||||||
|
|
||||||
context.register_cross_attention_modules(model)
|
|
||||||
context.cross_attention_mask = mask.to(device)
|
context.cross_attention_mask = mask.to(device)
|
||||||
context.cross_attention_index_map = indices.to(device)
|
context.cross_attention_index_map = indices.to(device)
|
||||||
|
if is_running_diffusers:
|
||||||
|
unet = model
|
||||||
|
old_attn_processors = unet.attn_processors
|
||||||
|
if torch.backends.mps.is_available():
|
||||||
|
# see note in StableDiffusionGeneratorPipeline.__init__ about borked slicing on MPS
|
||||||
|
unet.set_attn_processor(SwapCrossAttnProcessor())
|
||||||
|
else:
|
||||||
|
# try to re-use an existing slice size
|
||||||
|
default_slice_size = 4
|
||||||
|
slice_size = next((p.slice_size for p in old_attn_processors.values() if type(p) is SlicedAttnProcessor), default_slice_size)
|
||||||
|
unet.set_attn_processor(SlicedSwapCrossAttnProcesser(slice_size=slice_size))
|
||||||
|
return old_attn_processors
|
||||||
|
else:
|
||||||
|
context.register_cross_attention_modules(model)
|
||||||
inject_attention_function(model, context)
|
inject_attention_function(model, context)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def get_cross_attention_modules(model, which: CrossAttentionType) -> list[tuple[str, InvokeAICrossAttentionMixin]]:
|
def get_cross_attention_modules(model, which: CrossAttentionType) -> list[tuple[str, InvokeAICrossAttentionMixin]]:
|
||||||
@ -445,6 +467,7 @@ def get_mem_free_total(device):
|
|||||||
return mem_free_total
|
return mem_free_total
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class InvokeAIDiffusersCrossAttention(diffusers.models.attention.CrossAttention, InvokeAICrossAttentionMixin):
|
class InvokeAIDiffusersCrossAttention(diffusers.models.attention.CrossAttention, InvokeAICrossAttentionMixin):
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
@ -460,3 +483,176 @@ class InvokeAIDiffusersCrossAttention(diffusers.models.attention.CrossAttention,
|
|||||||
hidden_states = self.reshape_batch_dim_to_heads(attention_result)
|
hidden_states = self.reshape_batch_dim_to_heads(attention_result)
|
||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## 🧨diffusers implementation follows
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
# base implementation
|
||||||
|
|
||||||
|
class CrossAttnProcessor:
|
||||||
|
def __call__(self, attn: CrossAttention, hidden_states, encoder_hidden_states=None, attention_mask=None):
|
||||||
|
batch_size, sequence_length, _ = hidden_states.shape
|
||||||
|
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length)
|
||||||
|
|
||||||
|
query = attn.to_q(hidden_states)
|
||||||
|
query = attn.head_to_batch_dim(query)
|
||||||
|
|
||||||
|
encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
|
||||||
|
key = attn.to_k(encoder_hidden_states)
|
||||||
|
value = attn.to_v(encoder_hidden_states)
|
||||||
|
key = attn.head_to_batch_dim(key)
|
||||||
|
value = attn.head_to_batch_dim(value)
|
||||||
|
|
||||||
|
attention_probs = attn.get_attention_scores(query, key, attention_mask)
|
||||||
|
hidden_states = torch.bmm(attention_probs, value)
|
||||||
|
hidden_states = attn.batch_to_head_dim(hidden_states)
|
||||||
|
|
||||||
|
# linear proj
|
||||||
|
hidden_states = attn.to_out[0](hidden_states)
|
||||||
|
# dropout
|
||||||
|
hidden_states = attn.to_out[1](hidden_states)
|
||||||
|
|
||||||
|
return hidden_states
|
||||||
|
|
||||||
|
"""
|
||||||
|
from dataclasses import field, dataclass
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from diffusers.models.cross_attention import CrossAttention, CrossAttnProcessor, SlicedAttnProcessor, AttnProcessor
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SwapCrossAttnContext:
    modified_text_embeddings: torch.Tensor
    index_map: torch.Tensor  # maps from original prompt token indices to the equivalent tokens in the modified prompt
    mask: torch.Tensor  # in the target space of the index_map
    cross_attention_types_to_do: list[CrossAttentionType] = field(default_factory=list)

    def __int__(self,
                cac_types_to_do: [CrossAttentionType],
                modified_text_embeddings: torch.Tensor,
                index_map: torch.Tensor,
                mask: torch.Tensor):
        self.cross_attention_types_to_do = cac_types_to_do
        self.modified_text_embeddings = modified_text_embeddings
        self.index_map = index_map
        self.mask = mask

    def wants_cross_attention_control(self, attn_type: CrossAttentionType) -> bool:
        return attn_type in self.cross_attention_types_to_do

    @classmethod
    def make_mask_and_index_map(cls, edit_opcodes: list[tuple[str, int, int, int, int]], max_length: int) \
            -> tuple[torch.Tensor, torch.Tensor]:

        # mask=1 means use original prompt attention, mask=0 means use modified prompt attention
        mask = torch.zeros(max_length)
        indices_target = torch.arange(max_length, dtype=torch.long)
        indices = torch.arange(max_length, dtype=torch.long)
        for name, a0, a1, b0, b1 in edit_opcodes:
            if b0 < max_length:
                if name == "equal":
                    # these tokens remain the same as in the original prompt
                    indices[b0:b1] = indices_target[a0:a1]
                    mask[b0:b1] = 1

        return mask, indices
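To make the opcode handling above concrete, here is a small worked example. It assumes the edit opcodes come from difflib.SequenceMatcher run over the original and edited token streams; the token strings and max_length are made up for illustration:

from difflib import SequenceMatcher

original_tokens = ["<bos>", "a", "cat", "sitting", "<eos>"]
edited_tokens   = ["<bos>", "a", "smiling", "cat", "sitting", "<eos>"]
opcodes = SequenceMatcher(None, original_tokens, edited_tokens).get_opcodes()
# -> [('equal', 0, 2, 0, 2), ('insert', 2, 2, 2, 3), ('equal', 2, 5, 3, 6)]

mask, index_map = SwapCrossAttnContext.make_mask_and_index_map(opcodes, max_length=8)
# mask      -> tensor([1., 1., 0., 1., 1., 1., 0., 0.])  (1 = keep the original prompt's attention at this slot)
# index_map -> tensor([0, 1, 2, 2, 3, 4, 6, 7])          (edited slot i reads original column index_map[i])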
class SlicedSwapCrossAttnProcesser(SlicedAttnProcessor):

    # TODO: dynamically pick slice size based on memory conditions

    def __call__(self, attn: CrossAttention, hidden_states, encoder_hidden_states=None, attention_mask=None,
                 # kwargs
                 swap_cross_attn_context: SwapCrossAttnContext=None):

        attention_type = CrossAttentionType.SELF if encoder_hidden_states is None else CrossAttentionType.TOKENS

        # if cross-attention control is not in play, just call through to the base implementation.
        if attention_type is CrossAttentionType.SELF or \
                swap_cross_attn_context is None or \
                not swap_cross_attn_context.wants_cross_attention_control(attention_type):
            #print(f"SwapCrossAttnContext for {attention_type} not active - passing request to superclass")
            return super().__call__(attn, hidden_states, encoder_hidden_states, attention_mask)
        #else:
        #    print(f"SwapCrossAttnContext for {attention_type} active")

        batch_size, sequence_length, _ = hidden_states.shape
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length)

        query = attn.to_q(hidden_states)
        dim = query.shape[-1]
        query = attn.head_to_batch_dim(query)

        original_text_embeddings = encoder_hidden_states
        modified_text_embeddings = swap_cross_attn_context.modified_text_embeddings
        original_text_key = attn.to_k(original_text_embeddings)
        modified_text_key = attn.to_k(modified_text_embeddings)
        original_value = attn.to_v(original_text_embeddings)
        modified_value = attn.to_v(modified_text_embeddings)

        original_text_key = attn.head_to_batch_dim(original_text_key)
        modified_text_key = attn.head_to_batch_dim(modified_text_key)
        original_value = attn.head_to_batch_dim(original_value)
        modified_value = attn.head_to_batch_dim(modified_value)

        # compute slices and prepare output tensor
        batch_size_attention = query.shape[0]
        hidden_states = torch.zeros(
            (batch_size_attention, sequence_length, dim // attn.heads), device=query.device, dtype=query.dtype
        )

        # do slices
        for i in range(max(1, hidden_states.shape[0] // self.slice_size)):
            start_idx = i * self.slice_size
            end_idx = (i + 1) * self.slice_size

            query_slice = query[start_idx:end_idx]
            original_key_slice = original_text_key[start_idx:end_idx]
            modified_key_slice = modified_text_key[start_idx:end_idx]
            attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None

            original_attn_slice = attn.get_attention_scores(query_slice, original_key_slice, attn_mask_slice)
            modified_attn_slice = attn.get_attention_scores(query_slice, modified_key_slice, attn_mask_slice)

            # because the prompt modifications may result in token sequences shifted forwards or backwards,
            # the original attention probabilities must be remapped to account for token index changes in the
            # modified prompt
            remapped_original_attn_slice = torch.index_select(original_attn_slice, -1,
                                                              swap_cross_attn_context.index_map)

            # only some tokens taken from the original attention probabilities. this is controlled by the mask.
            mask = swap_cross_attn_context.mask
            inverse_mask = 1 - mask
            attn_slice = \
                remapped_original_attn_slice * mask + \
                modified_attn_slice * inverse_mask

            del remapped_original_attn_slice, modified_attn_slice

            attn_slice = torch.bmm(attn_slice, modified_value[start_idx:end_idx])
            hidden_states[start_idx:end_idx] = attn_slice

        # done
        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        return hidden_states
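The remap-and-blend step above is easier to see with toy numbers. This sketch (made-up values) shows what index_select followed by the mask blend does to a single row of attention probabilities:

import torch

# one query position attending over 4 token slots of the *original* prompt
original_attn = torch.tensor([[0.7, 0.1, 0.1, 0.1]])
# the same query position attending over the *modified* prompt
modified_attn = torch.tensor([[0.2, 0.3, 0.4, 0.1]])

# edited slot i should read original column index_map[i]; mask=1 keeps the original probability
index_map = torch.tensor([0, 0, 1, 2])    # e.g. a token was inserted, shifting later columns right
mask = torch.tensor([1.0, 0.0, 1.0, 1.0])

remapped_original = torch.index_select(original_attn, -1, index_map)  # -> [[0.7, 0.7, 0.1, 0.1]]
blended = remapped_original * mask + modified_attn * (1 - mask)       # -> [[0.7, 0.3, 0.1, 0.1]]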
class SwapCrossAttnProcessor(SlicedSwapCrossAttnProcesser):

    def __init__(self):
        super(SwapCrossAttnProcessor, self).__init__(slice_size=int(1e9))  # massive slice size = don't slice
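Once one of these processors is installed on a diffusers UNet, the per-call swap context reaches __call__ above through diffusers' cross_attention_kwargs, whose entries are forwarded to the processor as keyword arguments. A sketch with assumed variable names (illustrative only, not the pipeline's actual call site):

noise_pred = unet(latents, t, encoder_hidden_states=text_embeddings,
                  cross_attention_kwargs={"swap_cross_attn_context": swap_context}).sample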
@@ -19,9 +19,9 @@ class DDIMSampler(Sampler):
        all_timesteps_count = kwargs.get('all_timesteps_count', t_enc)

        if extra_conditioning_info is not None and extra_conditioning_info.wants_cross_attention_control:
-           self.invokeai_diffuser.setup_cross_attention_control(extra_conditioning_info, step_count = all_timesteps_count)
+           self.invokeai_diffuser.override_cross_attention(extra_conditioning_info, step_count = all_timesteps_count)
        else:
-           self.invokeai_diffuser.remove_cross_attention_control()
+           self.invokeai_diffuser.restore_default_cross_attention()

        # This is the central routine
@@ -43,9 +43,9 @@ class CFGDenoiser(nn.Module):
        extra_conditioning_info = kwargs.get('extra_conditioning_info', None)

        if extra_conditioning_info is not None and extra_conditioning_info.wants_cross_attention_control:
-           self.invokeai_diffuser.setup_cross_attention_control(extra_conditioning_info, step_count = t_enc)
+           self.invokeai_diffuser.override_cross_attention(extra_conditioning_info, step_count = t_enc)
        else:
-           self.invokeai_diffuser.remove_cross_attention_control()
+           self.invokeai_diffuser.restore_default_cross_attention()


    def forward(self, x, sigma, uncond, cond, cond_scale):
@@ -21,9 +21,9 @@ class PLMSSampler(Sampler):
        all_timesteps_count = kwargs.get('all_timesteps_count', t_enc)

        if extra_conditioning_info is not None and extra_conditioning_info.wants_cross_attention_control:
-           self.invokeai_diffuser.setup_cross_attention_control(extra_conditioning_info, step_count = all_timesteps_count)
+           self.invokeai_diffuser.override_cross_attention(extra_conditioning_info, step_count = all_timesteps_count)
        else:
-           self.invokeai_diffuser.remove_cross_attention_control()
+           self.invokeai_diffuser.restore_default_cross_attention()

        # this is the essential routine
@@ -1,14 +1,16 @@
import math
+from contextlib import contextmanager
from dataclasses import dataclass
from math import ceil
-from typing import Callable, Optional, Union
+from typing import Callable, Optional, Union, Any, Dict

import numpy as np
import torch

+from diffusers.models.cross_attention import AttnProcessor
from ldm.models.diffusion.cross_attention_control import Arguments, \
-    remove_cross_attention_control, setup_cross_attention_control, Context, get_cross_attention_modules, \
-    CrossAttentionType
+    restore_default_cross_attention, override_cross_attention, Context, get_cross_attention_modules, \
+    CrossAttentionType, SwapCrossAttnContext
from ldm.models.diffusion.cross_attention_map_saving import AttentionMapSaver
@@ -30,17 +32,20 @@ class InvokeAIDiffuserComponent:
    debug_thresholding = False

+   @dataclass
    class ExtraConditioningInfo:
-       def __init__(self, tokens_count_including_eos_bos:int, cross_attention_control_args: Optional[Arguments]):
-           self.tokens_count_including_eos_bos = tokens_count_including_eos_bos
-           self.cross_attention_control_args = cross_attention_control_args
+       tokens_count_including_eos_bos: int
+       cross_attention_control_args: Optional[Arguments] = None

        @property
        def wants_cross_attention_control(self):
            return self.cross_attention_control_args is not None


    def __init__(self, model, model_forward_callback:
-                Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]
+                Callable[[torch.Tensor, torch.Tensor, torch.Tensor, Optional[dict[str,Any]]], torch.Tensor],
+                is_running_diffusers: bool=False,
                 ):
        """
        :param model: the unet model to pass through to cross attention control
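Because the dataclass keeps the same field names and gives cross_attention_control_args a default, existing call sites keep working. A small sketch (values are placeholders):

info = InvokeAIDiffuserComponent.ExtraConditioningInfo(tokens_count_including_eos_bos=77)
assert info.wants_cross_attention_control is False   # no cross_attention_control_args supplied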
@@ -48,21 +53,47 @@ class InvokeAIDiffuserComponent:
        """
        self.conditioning = None
        self.model = model
+       self.is_running_diffusers = is_running_diffusers
        self.model_forward_callback = model_forward_callback
        self.cross_attention_control_context = None

-   def setup_cross_attention_control(self, conditioning: ExtraConditioningInfo, step_count: int):
+   @contextmanager
+   def custom_attention_context(self,
+                                extra_conditioning_info: Optional[ExtraConditioningInfo],
+                                step_count: int):
+       do_swap = extra_conditioning_info is not None and extra_conditioning_info.wants_cross_attention_control
+       old_attn_processor = None
+       if do_swap:
+           old_attn_processor = self.override_cross_attention(extra_conditioning_info,
+                                                              step_count=step_count)
+       try:
+           yield None
+       finally:
+           if old_attn_processor is not None:
+               self.restore_default_cross_attention(old_attn_processor)
+           # TODO resuscitate attention map saving
+           #self.remove_attention_map_saving()
+
+   def override_cross_attention(self, conditioning: ExtraConditioningInfo, step_count: int) -> Dict[str, AttnProcessor]:
+       """
+       setup cross attention .swap control. for diffusers this replaces the attention processor, so
+       the previous attention processor is returned so that the caller can restore it later.
+       """
        self.conditioning = conditioning
        self.cross_attention_control_context = Context(
            arguments=self.conditioning.cross_attention_control_args,
            step_count=step_count
        )
-       setup_cross_attention_control(self.model, self.cross_attention_control_context)
+       return override_cross_attention(self.model,
+                                       self.cross_attention_control_context,
+                                       is_running_diffusers=self.is_running_diffusers)

-   def remove_cross_attention_control(self):
+   def restore_default_cross_attention(self, restore_attention_processor: Optional['AttnProcessor']=None):
        self.conditioning = None
        self.cross_attention_control_context = None
-       remove_cross_attention_control(self.model)
+       restore_default_cross_attention(self.model,
+                                       is_running_diffusers=self.is_running_diffusers,
+                                       restore_attention_processor=restore_attention_processor)

    def setup_attention_map_saving(self, saver: AttentionMapSaver):
        def callback(slice, dim, offset, slice_size, key):
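A usage sketch for the new context manager, with the surrounding names assumed for illustration: the swap processors are installed only when the conditioning actually requests cross-attention control, and the previous processors are restored on exit even if sampling raises.

with diffuser.custom_attention_context(extra_conditioning_info, step_count=len(timesteps)):
    for t in timesteps:
        ...  # each model call inside this block runs through the swap-aware attention processors
# leaving the block restores the original attention processors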
@@ -168,7 +199,41 @@ class InvokeAIDiffuserComponent:
        return unconditioned_next_x, conditioned_next_x


-   def apply_cross_attention_controlled_conditioning(self, x:torch.Tensor, sigma, unconditioning, conditioning, cross_attention_control_types_to_do):
+   def apply_cross_attention_controlled_conditioning(self,
+                                                     x: torch.Tensor,
+                                                     sigma,
+                                                     unconditioning,
+                                                     conditioning,
+                                                     cross_attention_control_types_to_do):
+       if self.is_running_diffusers:
+           return self.apply_cross_attention_controlled_conditioning__diffusers(x, sigma, unconditioning, conditioning, cross_attention_control_types_to_do)
+       else:
+           return self.apply_cross_attention_controlled_conditioning__compvis(x, sigma, unconditioning, conditioning, cross_attention_control_types_to_do)
+
+   def apply_cross_attention_controlled_conditioning__diffusers(self,
+                                                                x: torch.Tensor,
+                                                                sigma,
+                                                                unconditioning,
+                                                                conditioning,
+                                                                cross_attention_control_types_to_do):
+       context: Context = self.cross_attention_control_context
+
+       cross_attn_processor_context = SwapCrossAttnContext(modified_text_embeddings=context.arguments.edited_conditioning,
+                                                           index_map=context.cross_attention_index_map,
+                                                           mask=context.cross_attention_mask,
+                                                           cross_attention_types_to_do=[])
+       # no cross attention for unconditioning (negative prompt)
+       unconditioned_next_x = self.model_forward_callback(x, sigma, unconditioning,
+                                                          {"swap_cross_attn_context": cross_attn_processor_context})
+
+       # do requested cross attention types for conditioning (positive prompt)
+       cross_attn_processor_context.cross_attention_types_to_do = cross_attention_control_types_to_do
+       conditioned_next_x = self.model_forward_callback(x, sigma, conditioning,
+                                                        {"swap_cross_attn_context": cross_attn_processor_context})
+       return unconditioned_next_x, conditioned_next_x
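The same SwapCrossAttnContext instance is reused for both passes; only cross_attention_types_to_do changes between them, so the negative-prompt pass reports no control requested and the processor falls through to plain sliced attention. A sketch with placeholder tensors (names assumed):

ctx = SwapCrossAttnContext(modified_text_embeddings=edited_embeddings,  # placeholder tensors
                           index_map=index_map, mask=mask,
                           cross_attention_types_to_do=[])
assert not ctx.wants_cross_attention_control(CrossAttentionType.TOKENS)  # unconditioned pass
ctx.cross_attention_types_to_do = [CrossAttentionType.TOKENS]
assert ctx.wants_cross_attention_control(CrossAttentionType.TOKENS)      # conditioned pass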
+   def apply_cross_attention_controlled_conditioning__compvis(self, x:torch.Tensor, sigma, unconditioning, conditioning, cross_attention_control_types_to_do):
        # print('pct', percent_through, ': doing cross attention control on', cross_attention_control_types_to_do)
        # slower non-batched path (20% slower on mac MPS)
        # We are only interested in using attention maps for conditioned_next_x, but batching them with generation of