mirror of https://github.com/invoke-ai/InvokeAI
synced 2024-08-30 20:32:17 +00:00

Merge branch 'main' into dev/installer

This commit is contained in:
commit 8ce9f07223
@@ -1,19 +1,18 @@
 *
+!assets/caution.png
 !backend
-!environments-and-requirements
-!frontend
+!frontend/dist
 !ldm
-!main.py
+!pyproject.toml
+!README.md
 !scripts
-!server
-!static
-!setup.py

 # Guard against pulling in any models that might exist in the directory tree
-**/*.pt*
+**.pt*

 # unignore configs, but only ignore the custom models.yaml, in case it exists
 !configs
 configs/models.yaml
+configs/models.yaml.orig

 **/__pycache__
1 .github/workflows/build-cloud-img.yml vendored
@@ -21,6 +21,7 @@ env:
 jobs:
   docker:
+    if: github.event.pull_request.draft == false
     strategy:
       fail-fast: false
       matrix:
35 .github/workflows/build-container.yml vendored
@@ -3,63 +3,60 @@ on:
   push:
     branches:
       - 'main'
+    tags:
+      - 'v*.*.*'

 jobs:
   docker:
+    if: github.event.pull_request.draft == false
     strategy:
       fail-fast: false
       matrix:
-        registry:
-          - ghcr.io
         flavor:
           - amd
           - cuda
-          # - cloud
         include:
           - flavor: amd
-            pip-requirements: requirements-lin-amd.txt
+            pip-extra-index-url: 'https://download.pytorch.org/whl/rocm5.2'
             dockerfile: docker-build/Dockerfile
             platforms: linux/amd64,linux/arm64
           - flavor: cuda
-            pip-requirements: requirements-lin-cuda.txt
+            pip-extra-index-url: ''
             dockerfile: docker-build/Dockerfile
             platforms: linux/amd64,linux/arm64
-          # - flavor: cloud
-          #   pip-requirements: requirements-lin-cuda.txt
-          #   dockerfile: docker-build/Dockerfile.cloud
-          #   platforms: linux/amd64
     runs-on: ubuntu-latest
     name: ${{ matrix.flavor }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3

-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
-
       - name: Docker meta
         id: meta
         uses: docker/metadata-action@v4
         with:
-          images: ${{ matrix.registry }}/${{ github.repository }}-${{ matrix.flavor }}
+          images: ghcr.io/${{ github.repository }}-${{ matrix.flavor }}
          tags: |
            type=ref,event=branch
            type=ref,event=tag
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
+            type=semver,pattern={{major}}
            type=sha
          flavor: |
            latest=true

+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v2
+
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2

-      - if: github.event_name != 'pull_request'
-        name: Docker login
+      - name: Login to GitHub Container Registry
+        if: github.event_name != 'pull_request'
         uses: docker/login-action@v2
         with:
-          registry: ${{ matrix.registry }}
-          username: ${{ github.actor }}
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
           password: ${{ secrets.GITHUB_TOKEN }}

       - name: Build container
@@ -71,4 +68,6 @@ jobs:
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
-          build-args: pip_requirements=${{ matrix.pip-requirements }}
+          build-args: PIP_EXTRA_INDEX_URL=${{ matrix.pip-extra-index-url }}
+          # cache-from: type=gha
+          # cache-to: type=gha,mode=max
34 .github/workflows/clean-caches.yml vendored Normal file
@@ -0,0 +1,34 @@
+name: cleanup caches by a branch
+on:
+  pull_request:
+    types:
+      - closed
+  workflow_dispatch:
+
+jobs:
+  cleanup:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v3
+
+      - name: Cleanup
+        run: |
+          gh extension install actions/gh-actions-cache
+
+          REPO=${{ github.repository }}
+          BRANCH=${{ github.ref }}
+
+          echo "Fetching list of cache key"
+          cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH | cut -f 1 )
+
+          ## Setting this to not fail the workflow while deleting cache keys.
+          set +e
+          echo "Deleting caches..."
+          for cacheKey in $cacheKeysForPR
+          do
+              gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm
+          done
+          echo "Done"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
1 .github/workflows/lint-frontend.yml vendored
@@ -14,6 +14,7 @@ defaults:
 jobs:
   lint-frontend:
+    if: github.event.pull_request.draft == false
     runs-on: ubuntu-22.04
     steps:
       - name: Setup Node 18
1 .github/workflows/mkdocs-material.yml vendored
@@ -7,6 +7,7 @@ on:
 jobs:
   mkdocs-material:
+    if: github.event.pull_request.draft == false
     runs-on: ubuntu-latest
     steps:
       - name: checkout sources
1 .github/workflows/pyflakes.yml vendored
@@ -9,6 +9,7 @@ on:
 jobs:
   pyflakes:
     name: runner / pyflakes
+    if: github.event.pull_request.draft == false
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
@@ -1,59 +1,71 @@
-FROM python:3.10-slim AS builder
+# syntax=docker/dockerfile:1
+FROM python:3.9-slim AS python-base

 # use bash
 SHELL [ "/bin/bash", "-c" ]

 # Install necesarry packages
-RUN apt-get update \
+RUN \
+  --mount=type=cache,target=/var/cache/apt,sharing=locked \
+  --mount=type=cache,target=/var/lib/apt,sharing=locked \
+  apt-get update \
+  && apt-get install -y \
+    --no-install-recommends \
+    libgl1-mesa-glx=20.3.* \
+    libglib2.0-0=2.66.* \
+    libopencv-dev=4.5.* \
+  && apt-get clean \
+  && rm -rf /var/lib/apt/lists/*
+
+ARG APPDIR=/usr/src/app
+ENV APPDIR ${APPDIR}
+WORKDIR ${APPDIR}
+
+FROM python-base AS builder
+
+RUN \
+  --mount=type=cache,target=/var/cache/apt,sharing=locked \
+  --mount=type=cache,target=/var/lib/apt,sharing=locked \
+  apt-get update \
   && apt-get install -y \
     --no-install-recommends \
     gcc=4:10.2.* \
-    libgl1-mesa-glx=20.3.* \
-    libglib2.0-0=2.66.* \
     python3-dev=3.9.* \
   && apt-get clean \
   && rm -rf /var/lib/apt/lists/*

-# set WORKDIR, PATH and copy sources
-ARG APPDIR=/usr/src/app
-WORKDIR ${APPDIR}
-ENV PATH ${APPDIR}/.venv/bin:$PATH
-ARG PIP_REQUIREMENTS=requirements-lin-cuda.txt
-COPY . ./environments-and-requirements/${PIP_REQUIREMENTS} ./
+# copy sources
+COPY --link . .
+ARG PIP_EXTRA_INDEX_URL
+ENV PIP_EXTRA_INDEX_URL ${PIP_EXTRA_INDEX_URL}

 # install requirements
-RUN python3 -m venv .venv \
-  && pip install \
-    --upgrade \
+RUN python3 -m venv invokeai \
+  && ${APPDIR}/invokeai/bin/pip \
+    install \
     --no-cache-dir \
-    'wheel>=0.38.4' \
-  && pip install \
-    --no-cache-dir \
-    -r ${PIP_REQUIREMENTS}
+    --use-pep517 \
+    .

-FROM python:3.10-slim AS runtime
+FROM python-base AS runtime

 # setup environment
-ARG APPDIR=/usr/src/app
-WORKDIR ${APPDIR}
-COPY --from=builder ${APPDIR} .
-ENV \
-  PATH=${APPDIR}/.venv/bin:$PATH \
-  INVOKEAI_ROOT=/data \
-  INVOKE_MODEL_RECONFIGURE=--yes
+COPY --link . .
+COPY --from=builder ${APPDIR}/invokeai ${APPDIR}/invokeai
+ENV PATH=${APPDIR}/invokeai/bin:$PATH
+ENV INVOKEAI_ROOT=/data
+ENV INVOKE_MODEL_RECONFIGURE="--yes --default_only"

-# Install necesarry packages
-RUN apt-get update \
+# build patchmatch
+RUN \
+  --mount=type=cache,target=/var/cache/apt,sharing=locked \
+  --mount=type=cache,target=/var/lib/apt,sharing=locked \
+  apt-get update \
   && apt-get install -y \
     --no-install-recommends \
     build-essential=12.9 \
-    libgl1-mesa-glx=20.3.* \
-    libglib2.0-0=2.66.* \
-    libopencv-dev=4.5.* \
-  && ln -sf \
-    /usr/lib/"$(arch)"-linux-gnu/pkgconfig/opencv4.pc \
-    /usr/lib/"$(arch)"-linux-gnu/pkgconfig/opencv.pc \
-  && python3 -c "from patchmatch import patch_match" \
+  && PYTHONDONTWRITEBYTECODE=1 \
+    python3 -c "from patchmatch import patch_match" \
   && apt-get remove -y \
     --autoremove \
     build-essential \
@@ -61,5 +73,6 @@ RUN apt-get update \
   && rm -rf /var/lib/apt/lists/*

 # set Entrypoint and default CMD
-ENTRYPOINT [ "python3", "scripts/invoke.py" ]
+ENTRYPOINT [ "invoke" ]
 CMD [ "--web", "--host=0.0.0.0" ]
+VOLUME [ "/data" ]
@@ -2,18 +2,25 @@
 set -e

 # How to use: https://invoke-ai.github.io/InvokeAI/installation/INSTALL_DOCKER/#setup
+#
+# Some possible pip extra-index urls (cuda 11.7 is available without extra url):
+#
+# CUDA 11.6: https://download.pytorch.org/whl/cu116
+# ROCm 5.2: https://download.pytorch.org/whl/rocm5.2
+# CPU: https://download.pytorch.org/whl/cpu
+#
+# as found on https://pytorch.org/get-started/locally/

-source ./docker-build/env.sh \
-  || echo "please execute docker-build/build.sh from repository root" \
-  || exit 1
+cd "$(dirname "$0")" || exit 1

-PIP_REQUIREMENTS=${PIP_REQUIREMENTS:-requirements-lin-cuda.txt}
-DOCKERFILE=${INVOKE_DOCKERFILE:-docker-build/Dockerfile}
+source ./env.sh
+
+DOCKERFILE=${INVOKE_DOCKERFILE:-"./Dockerfile"}

 # print the settings
 echo -e "You are using these values:\n"
 echo -e "Dockerfile:\t ${DOCKERFILE}"
-echo -e "Requirements:\t ${PIP_REQUIREMENTS}"
+echo -e "extra-index-url: ${PIP_EXTRA_INDEX_URL:-none}"
 echo -e "Volumename:\t ${VOLUMENAME}"
 echo -e "arch:\t\t ${ARCH}"
 echo -e "Platform:\t ${PLATFORM}"
@@ -30,6 +37,6 @@ fi
 docker build \
   --platform="${PLATFORM}" \
   --tag="${INVOKEAI_TAG}" \
-  --build-arg="PIP_REQUIREMENTS=${PIP_REQUIREMENTS}" \
+  ${PIP_EXTRA_INDEX_URL:+--build-arg=PIP_EXTRA_INDEX_URL="${PIP_EXTRA_INDEX_URL}"} \
   --file="${DOCKERFILE}" \
-  .
+  ..
@@ -7,4 +7,4 @@ ARCH=${ARCH:-$(uname -m)}
 PLATFORM=${PLATFORM:-Linux/${ARCH}}
 CONTAINER_FLAVOR=${CONTAINER_FLAVOR:-cuda}
 INVOKEAI_BRANCH=$(git branch --show)
-INVOKEAI_TAG=${REPOSITORY_NAME,,}-${CONTAINER_FLAVOR}:${INVOKEAI_TAG:-${INVOKEAI_BRANCH/\//-}}
+INVOKEAI_TAG=${REPOSITORY_NAME,,}-${CONTAINER_FLAVOR}:${INVOKEAI_TAG:-${INVOKEAI_BRANCH##*/}}
@@ -4,17 +4,14 @@ set -e
 # How to use: https://invoke-ai.github.io/InvokeAI/installation/INSTALL_DOCKER/#run-the-container
 # IMPORTANT: You need to have a token on huggingface.co to be able to download the checkpoints!!!

-source ./docker-build/env.sh \
-  || echo "please run from repository root" \
-  || exit 1
+cd "$(dirname "$0")" || exit 1

-# check if HUGGINGFACE_TOKEN is available
-# You must have accepted the terms of use for required models
-HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN:?Please set your token for Huggingface as HUGGINGFACE_TOKEN}
+source ./env.sh

 echo -e "You are using these values:\n"
-echo -e "Volumename:\t ${VOLUMENAME}"
-echo -e "Invokeai_tag:\t ${INVOKEAI_TAG}\n"
+echo -e "Volumename:\t${VOLUMENAME}"
+echo -e "Invokeai_tag:\t${INVOKEAI_TAG}"
+echo -e "local Models:\t${MODELSPATH:-unset}\n"

 docker run \
   --interactive \
@@ -23,8 +20,10 @@ docker run \
   --platform="$PLATFORM" \
   --name="${REPOSITORY_NAME,,}" \
   --hostname="${REPOSITORY_NAME,,}" \
-  --mount="source=$VOLUMENAME,target=/data" \
-  --env="HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN}" \
+  --mount=source="$VOLUMENAME",target=/data \
+  ${MODELSPATH:+-u "$(id -u):$(id -g)"} \
+  ${MODELSPATH:+--mount=type=bind,source=${MODELSPATH},target=/data/models} \
+  ${HUGGING_FACE_HUB_TOKEN:+--env=HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}} \
   --publish=9090:9090 \
   --cap-add=sys_nice \
   ${GPU_FLAGS:+--gpus=${GPU_FLAGS}} \
@@ -52,12 +52,17 @@ introduces several changes you should know about.
       path: models/diffusers/hakurei-haifu-diffusion-1.4
    ```

-2. The format of the models directory has changed to mimic the
-   HuggingFace cache directory. By default, diffusers models are
-   now automatically downloaded and retrieved from the directory
-   `ROOTDIR/models/diffusers`, while other models are stored in
-   the directory `ROOTDIR/models/hub`. This organization is the
-   same as that used by HuggingFace for its cache management.
+2. In order of precedence, InvokeAI will now use HF_HOME, then
+   XDG_CACHE_HOME, then finally default to `ROOTDIR/models` to
+   store HuggingFace diffusers models.
+
+   Consequently, the format of the models directory has changed to
+   mimic the HuggingFace cache directory. When HF_HOME and XDG_HOME
+   are not set, diffusers models are now automatically downloaded
+   and retrieved from the directory `ROOTDIR/models/diffusers`,
+   while other models are stored in the directory
+   `ROOTDIR/models/hub`. This organization is the same as that used
+   by HuggingFace for its cache management.

    This allows you to share diffusers and ckpt model files easily with
    other machine learning applications that use the HuggingFace
@@ -66,7 +71,13 @@ introduces several changes you should know about.
    cache models in. To tell InvokeAI to use the standard HuggingFace
    cache directory, you would set HF_HOME like this (Linux/Mac):

-   `export HF_HOME=~/.cache/hugging_face`
+   `export HF_HOME=~/.cache/huggingface`
+
+   Both HuggingFace and InvokeAI will fall back to the XDG_CACHE_HOME
+   environment variable if HF_HOME is not set; this path
+   takes precedence over `ROOTDIR/models` to allow for the same sharing
+   with other machine learning applications that use HuggingFace
+   libraries.

 3. If you upgrade to InvokeAI 2.3.* from an earlier version, there
    will be a one-time migration from the old models directory format
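The precedence described above (HF_HOME, then XDG_CACHE_HOME, then `ROOTDIR/models`) can be illustrated with a small sketch. The function name and the exact cache sub-paths below are illustrative assumptions, not InvokeAI's actual implementation:

```python
import os
from pathlib import Path

def resolve_diffusers_cache(root_dir: str) -> Path:
    """Hypothetical sketch of the lookup order described above:
    HF_HOME first, then XDG_CACHE_HOME, then ROOTDIR/models."""
    hf_home = os.environ.get("HF_HOME")
    if hf_home:
        return Path(hf_home) / "diffusers"
    xdg_cache = os.environ.get("XDG_CACHE_HOME")
    if xdg_cache:
        return Path(xdg_cache) / "huggingface" / "diffusers"
    return Path(root_dir) / "models" / "diffusers"

# With neither variable set, models land under ROOTDIR/models/diffusers.
print(resolve_diffusers_cache(os.path.expanduser("~/invokeai")))
```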
@@ -239,28 +239,24 @@ Generate an image with a given prompt, record the seed of the image, and then
 use the `prompt2prompt` syntax to substitute words in the original prompt for
 words in a new prompt. This works for `img2img` as well.

-- `a ("fluffy cat").swap("smiling dog") eating a hotdog`.
-- quotes optional: `a (fluffy cat).swap(smiling dog) eating a hotdog`.
-- for single word substitutions parentheses are also optional:
-  `a cat.swap(dog) eating a hotdog`.
-- Supports options `s_start`, `s_end`, `t_start`, `t_end` (each 0-1) loosely
-  corresponding to bloc97's `prompt_edit_spatial_start/_end` and
-  `prompt_edit_tokens_start/_end` but with the math swapped to make it easier to
-  intuitively understand.
-- Example usage:`a (cat).swap(dog, s_end=0.3) eating a hotdog` - the `s_end`
-  argument means that the "spatial" (self-attention) edit will stop having any
-  effect after 30% (=0.3) of the steps have been done, leaving Stable
-  Diffusion with 70% of the steps where it is free to decide for itself how to
-  reshape the cat-form into a dog form.
-- The numbers represent a percentage through the step sequence where the edits
-  should happen. 0 means the start (noisy starting image), 1 is the end (final
-  image).
-- For img2img, the step sequence does not start at 0 but instead at
-  (1-strength) - so if strength is 0.7, s_start and s_end must both be
-  greater than 0.3 (1-0.7) to have any effect.
-- Convenience option `shape_freedom` (0-1) to specify how much "freedom" Stable
-  Diffusion should have to change the shape of the subject being swapped.
-  - `a (cat).swap(dog, shape_freedom=0.5) eating a hotdog`.
+For example, consider the prompt `a cat.swap(dog) playing with a ball in the forest`. Normally, because of the way words interact with each other during stable diffusion image generation, these two prompts would generate different compositions:
+- `a cat playing with a ball in the forest`
+- `a dog playing with a ball in the forest`
+
+| `a cat playing with a ball in the forest` | `a dog playing with a ball in the forest` |
+| --- | --- |
+| img | img |
+
+- For multiple word swaps, use parentheses: `a (fluffy cat).swap(barking dog) playing with a ball in the forest`.
+- To swap a comma, use quotes: `a ("fluffy, grey cat").swap("big, barking dog") playing with a ball in the forest`.
+- Supports options `t_start` and `t_end` (each 0-1) loosely corresponding to bloc97's `prompt_edit_tokens_start/_end` but with the math swapped to make it easier to
+  intuitively understand. `t_start` and `t_end` are used to control on which steps cross-attention control should run. With the default values `t_start=0` and `t_end=1`, cross-attention control is active on every step of image generation. Other values can be used to turn cross-attention control off for part of the image generation process.
+- For example, if doing a diffusion with 10 steps for the prompt `a cat.swap(dog, t_start=0.3, t_end=1.0) playing with a ball in the forest`, the first 3 steps will be run as `a cat playing with a ball in the forest`, while the last 7 steps will run as `a dog playing with a ball in the forest`, but the pixels that represent `dog` will be locked to the pixels that would have represented `cat` if the `cat` prompt had been used instead.
+- Conversely, for `a cat.swap(dog, t_start=0, t_end=0.7) playing with a ball in the forest`, the first 7 steps will run as `a dog playing with a ball in the forest` with the pixels that represent `dog` locked to the same pixels that would have represented `cat` if the `cat` prompt was being used instead. The final 3 steps will just run `a cat playing with a ball in the forest`.
+
+> For img2img, the step sequence does not start at 0 but instead at `(1.0-strength)` - so if the img2img `strength` is `0.7`, `t_start` and `t_end` must both be greater than `0.3` (`1.0-0.7`) to have any effect.
+
+Prompt2prompt `.swap()` is not compatible with xformers, which will be temporarily disabled when doing a `.swap()` - so you should expect to use more VRAM and run slower than with xformers enabled.

 The `prompt2prompt` code is based off
 [bloc97's colab](https://github.com/bloc97/CrossAttentionControl).
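As a rough illustration of the `t_start`/`t_end` arithmetic described in the new documentation text (a sketch only; the function below is not InvokeAI code, and the exact rounding it uses is an assumption):

```python
import math

def swap_window(num_steps: int, t_start: float, t_end: float, strength: float = 1.0) -> range:
    """Illustrative only: the step indices on which a .swap() edit is applied.
    For img2img the usable range begins (1.0 - strength) of the way through."""
    first_usable = math.floor((1.0 - strength) * num_steps)
    start = max(first_usable, math.floor(t_start * num_steps))
    end = min(num_steps, math.ceil(t_end * num_steps))
    return range(start, end)

# 10 steps, t_start=0.3, t_end=1.0 -> the edit applies on steps 3..9,
# matching "the first 3 steps run with the original prompt" above.
print(list(swap_window(10, 0.3, 1.0)))
```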
@@ -2,4 +2,5 @@
 -r environments-and-requirements/requirements-base.txt
 torch>=1.13.1
 torchvision>=0.14.1
+xformers~=0.16
 -e .
@@ -3,4 +3,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu117 --trusted-host https://download.pytorch.org
 torch==1.13.1
 torchvision==0.14.1
+xformers~=0.16
 -e .
@@ -20,6 +20,7 @@ import torch
 import transformers
 from PIL import Image, ImageOps
 from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.utils.import_utils import is_xformers_available
 from omegaconf import OmegaConf
 from pytorch_lightning import seed_everything, logging

@@ -203,6 +204,14 @@ class Generate:
         self.precision = choose_precision(self.device)
         Globals.full_precision = self.precision=='float32'

+        if is_xformers_available():
+            if not Globals.disable_xformers:
+                print('>> xformers memory-efficient attention is available and enabled')
+            else:
+                print('>> xformers memory-efficient attention is available but disabled')
+        else:
+            print('>> xformers not installed')
+
         # model caching system for fast switching
         self.model_manager = ModelManager(mconfig,self.device,self.precision,max_loaded_models=max_loaded_models)
         # don't accept invalid models
@@ -53,10 +53,11 @@ def main():

     if not args.conf:
         if not os.path.exists(os.path.join(Globals.root,'configs','models.yaml')):
-            print(f"\n** Error. The file {os.path.join(Globals.root,'configs','models.yaml')} could not be found.")
-            print('** Please check the location of your invokeai directory and use the --root_dir option to point to the correct path.')
-            print('** This script will now exit.')
-            sys.exit(-1)
+            report_model_error(opt, e)
+            # print(f"\n** Error. The file {os.path.join(Globals.root,'configs','models.yaml')} could not be found.")
+            # print('** Please check the location of your invokeai directory and use the --root_dir option to point to the correct path.')
+            # print('** This script will now exit.')
+            # sys.exit(-1)

     print(f'>> {ldm.invoke.__app_name__}, version {ldm.invoke.__version__}')
     print(f'>> InvokeAI runtime directory is "{Globals.root}"')
@@ -789,8 +790,8 @@ def _get_model_name(existing_names,completer,default_name:str='')->str:
     model_name = input(f'Short name for this model [{default_name}]: ').strip()
     if len(model_name)==0:
         model_name = default_name
-    if not re.match('^[\w._+-]+$',model_name):
-        print('** model name must contain only words, digits and the characters "._+-" **')
+    if not re.match('^[\w._+:/-]+$',model_name):
+        print('** model name must contain only words, digits and the characters "._+:/-" **')
     elif model_name != default_name and model_name in existing_names:
         print(f'** the name {model_name} is already in use. Pick another.')
     else:
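The loosened pattern now admits `:` and `/`, which presumably lets model short names look like HuggingFace-style repo IDs. A quick standalone check of the regex itself (the sample names are made up for illustration):

```python
import re

# The new pattern from the diff above.
pattern = r'^[\w._+:/-]+$'

for name in ['stable-diffusion-1.5', 'runwayml/stable-diffusion-v1-5', 'sd:2.1', 'bad name!']:
    print(name, bool(re.match(pattern, name)))
# The first three match; 'bad name!' is rejected because of the space and '!'.
```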
@@ -24,9 +24,6 @@ from ...models.diffusion import cross_attention_control
 from ...models.diffusion.cross_attention_map_saving import AttentionMapSaver
 from ...modules.prompt_to_embeddings_converter import WeightedPromptFragmentsToEmbeddingsConverter

-# monkeypatch diffusers CrossAttention 🙈
-# this is to make prompt2prompt and (future) attention maps work
-attention.CrossAttention = cross_attention_control.InvokeAIDiffusersCrossAttention

 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
@@ -295,7 +292,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
-        self.invokeai_diffuser = InvokeAIDiffuserComponent(self.unet, self._unet_forward)
+        self.invokeai_diffuser = InvokeAIDiffuserComponent(self.unet, self._unet_forward, is_running_diffusers=True)
         use_full_precision = (precision == 'float32' or precision == 'autocast')
         self.textual_inversion_manager = TextualInversionManager(tokenizer=self.tokenizer,
                                                                  text_encoder=self.text_encoder,
@@ -307,8 +304,23 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
                                                                  textual_inversion_manager=self.textual_inversion_manager
                                                                  )

+        self._enable_memory_efficient_attention()
+
+
+    def _enable_memory_efficient_attention(self):
+        """
+        if xformers is available, use it, otherwise use sliced attention.
+        """
         if is_xformers_available() and not Globals.disable_xformers:
             self.enable_xformers_memory_efficient_attention()
+        else:
+            if torch.backends.mps.is_available():
+                # until pytorch #91617 is fixed, slicing is borked on MPS
+                # https://github.com/pytorch/pytorch/issues/91617
+                # fix is in https://github.com/kulinseth/pytorch/pull/222 but no idea when it will get merged to pytorch mainline.
+                pass
+            else:
+                self.enable_attention_slicing(slice_size='auto')

     def image_from_embeddings(self, latents: torch.Tensor, num_inference_steps: int,
                               conditioning_data: ConditioningData,
@@ -373,11 +385,9 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
         if additional_guidance is None:
             additional_guidance = []
         extra_conditioning_info = conditioning_data.extra
-        if extra_conditioning_info is not None and extra_conditioning_info.wants_cross_attention_control:
-            self.invokeai_diffuser.setup_cross_attention_control(extra_conditioning_info,
-                                                                 step_count=len(self.scheduler.timesteps))
-        else:
-            self.invokeai_diffuser.remove_cross_attention_control()
+        with self.invokeai_diffuser.custom_attention_context(extra_conditioning_info=extra_conditioning_info,
+                                                             step_count=len(self.scheduler.timesteps)
+                                                             ):

             yield PipelineIntermediateState(run_id=run_id, step=-1, timestep=self.scheduler.num_train_timesteps,
                                             latents=latents)
@@ -388,7 +398,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
             latents = self.scheduler.add_noise(latents, noise, batched_t)

             attention_map_saver: Optional[AttentionMapSaver] = None
-            self.invokeai_diffuser.remove_attention_map_saving()
             for i, t in enumerate(self.progress_bar(timesteps)):
                 batched_t.fill_(t)
                 step_output = self.step(batched_t, latents, conditioning_data,
@@ -398,16 +408,16 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
                 latents = step_output.prev_sample
                 predicted_original = getattr(step_output, 'pred_original_sample', None)

-                if i == len(timesteps)-1 and extra_conditioning_info is not None:
-                    eos_token_index = extra_conditioning_info.tokens_count_including_eos_bos - 1
-                    attention_map_token_ids = range(1, eos_token_index)
-                    attention_map_saver = AttentionMapSaver(token_ids=attention_map_token_ids, latents_shape=latents.shape[-2:])
-                    self.invokeai_diffuser.setup_attention_map_saving(attention_map_saver)
+                # TODO resuscitate attention map saving
+                #if i == len(timesteps)-1 and extra_conditioning_info is not None:
+                #    eos_token_index = extra_conditioning_info.tokens_count_including_eos_bos - 1
+                #    attention_map_token_ids = range(1, eos_token_index)
+                #    attention_map_saver = AttentionMapSaver(token_ids=attention_map_token_ids, latents_shape=latents.shape[-2:])
+                #    self.invokeai_diffuser.setup_attention_map_saving(attention_map_saver)

                 yield PipelineIntermediateState(run_id=run_id, step=i, timestep=int(t), latents=latents,
                                                 predicted_original=predicted_original, attention_map_saver=attention_map_saver)

-        self.invokeai_diffuser.remove_attention_map_saving()
         return latents, attention_map_saver

     @torch.inference_mode()
@@ -447,7 +457,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):

         return step_output

-    def _unet_forward(self, latents, t, text_embeddings):
+    def _unet_forward(self, latents, t, text_embeddings, cross_attention_kwargs: Optional[dict[str,Any]] = None):
         """predict the noise residual"""
         if is_inpainting_model(self.unet) and latents.size(1) == 4:
             # Pad out normal non-inpainting inputs for an inpainting model.
@@ -460,7 +470,10 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
                 initial_image_latents=torch.zeros_like(latents[:1], device=latents.device, dtype=latents.dtype)
             ).add_mask_channels(latents)

-        return self.unet(latents, t, encoder_hidden_states=text_embeddings).sample
+        return self.unet(sample=latents,
+                         timestep=t,
+                         encoder_hidden_states=text_embeddings,
+                         cross_attention_kwargs=cross_attention_kwargs).sample

     def img2img_from_embeddings(self,
                                 init_image: Union[torch.FloatTensor, PIL.Image.Image],
@@ -531,6 +544,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
             init_image = image_resized_to_grid_as_tensor(init_image.convert('RGB'))

         init_image = init_image.to(device=device, dtype=latents_dtype)
+        mask = mask.to(device=device, dtype=latents_dtype)

         if init_image.dim() == 3:
             init_image = init_image.unsqueeze(0)
@@ -549,17 +563,22 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):

         if mask.dim() == 3:
             mask = mask.unsqueeze(0)
-        mask = tv_resize(mask, init_image_latents.shape[-2:], T.InterpolationMode.BILINEAR) \
+        latent_mask = tv_resize(mask, init_image_latents.shape[-2:], T.InterpolationMode.BILINEAR) \
             .to(device=device, dtype=latents_dtype)

         guidance: List[Callable] = []

         if is_inpainting_model(self.unet):
+            # You'd think the inpainting model wouldn't be paying attention to the area it is going to repaint
+            # (that's why there's a mask!) but it seems to really want that blanked out.
+            masked_init_image = init_image * torch.where(mask < 0.5, 1, 0)
+            masked_latents = self.non_noised_latents_from_image(masked_init_image, device=device, dtype=latents_dtype)
+
             # TODO: we should probably pass this in so we don't have to try/finally around setting it.
             self.invokeai_diffuser.model_forward_callback = \
-                AddsMaskLatents(self._unet_forward, mask, init_image_latents)
+                AddsMaskLatents(self._unet_forward, latent_mask, masked_latents)
         else:
-            guidance.append(AddsMaskGuidance(mask, init_image_latents, self.scheduler, noise))
+            guidance.append(AddsMaskGuidance(latent_mask, init_image_latents, self.scheduler, noise))

         try:
             result_latents, result_attention_maps = self.latents_from_embeddings(
@@ -578,11 +597,20 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
         output = InvokeAIStableDiffusionPipelineOutput(images=image, nsfw_content_detected=[], attention_map_saver=result_attention_maps)
         return self.check_for_safety(output, dtype=conditioning_data.dtype)

-    def non_noised_latents_from_image(self, init_image, *, device, dtype):
+    def non_noised_latents_from_image(self, init_image, *, device: torch.device, dtype):
         init_image = init_image.to(device=device, dtype=dtype)
         with torch.inference_mode():
+            if device.type == 'mps':
+                # workaround for torch MPS bug that has been fixed in https://github.com/kulinseth/pytorch/pull/222
+                # TODO remove this workaround once kulinseth#222 is merged to pytorch mainline
+                self.vae.to('cpu')
+                init_image = init_image.to('cpu')
             init_latent_dist = self.vae.encode(init_image).latent_dist
             init_latents = init_latent_dist.sample().to(dtype=dtype)  # FIXME: uses torch.randn. make reproducible!
+            if device.type == 'mps':
+                self.vae.to(device)
+                init_latents = init_latents.to(device)
+
         init_latents = 0.18215 * init_latents
         return init_latents
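The new `_enable_memory_efficient_attention()` helper prefers xformers and falls back to sliced attention, except on MPS where slicing was broken at the time. A standalone sketch of the same decision for a stock diffusers pipeline (not InvokeAI's class; the model id is just a placeholder):

```python
import torch
from diffusers import StableDiffusionPipeline
from diffusers.utils.import_utils import is_xformers_available

# Placeholder model id; any diffusers Stable Diffusion checkpoint behaves the same way.
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

if is_xformers_available():
    # memory-efficient attention via the xformers package
    pipe.enable_xformers_memory_efficient_attention()
elif torch.backends.mps.is_available():
    # attention slicing was unreliable on MPS at the time of this change
    # (see pytorch issue #91617), so leave the default attention in place
    pass
else:
    # fall back to sliced attention to reduce peak VRAM usage
    pipe.enable_attention_slicing(slice_size="auto")
```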
@@ -19,10 +19,12 @@ from ldm.util import debug_image


 def infill_methods()->list[str]:
-    methods = list()
+    methods = [
+        "tile",
+        "solid",
+    ]
     if PatchMatch.patchmatch_available():
-        methods.append('patchmatch')
-    methods.append('tile')
+        methods.insert(0, 'patchmatch')
     return methods

 class Inpaint(Img2Img):
@@ -182,6 +184,7 @@ class Inpaint(Img2Img):
             infill_method = None,
             inpaint_width=None,
             inpaint_height=None,
+            inpaint_fill:tuple(int)=(0x7F, 0x7F, 0x7F, 0xFF),
             attention_maps_callback=None,
             **kwargs):
         """
@@ -202,12 +205,17 @@ class Inpaint(Img2Img):
         # Do infill
         if infill_method == 'patchmatch' and PatchMatch.patchmatch_available():
             init_filled = self.infill_patchmatch(self.pil_image.copy())
-        else: # if infill_method == 'tile': # Only two methods right now, so always use 'tile' if not patchmatch
+        elif infill_method == 'tile':
             init_filled = self.tile_fill_missing(
                 self.pil_image.copy(),
                 seed = self.seed,
                 tile_size = tile_size
             )
+        elif infill_method == 'solid':
+            solid_bg = PIL.Image.new("RGBA", init_image.size, inpaint_fill)
+            init_filled = PIL.Image.alpha_composite(solid_bg, init_image)
+        else:
+            raise ValueError(f"Non-supported infill type {infill_method}", infill_method)
         init_filled.paste(init_image, (0,0), init_image.split()[-1])

         # Resize if requested for inpainting
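The new 'solid' infill simply composites the image over an opaque grey background. A minimal PIL sketch of the same idea outside InvokeAI (the file names are illustrative; the fill colour matches the new `inpaint_fill` default):

```python
import PIL.Image

# Illustrative input: any RGBA image whose transparent areas should be infilled.
init_image = PIL.Image.open("masked_input.png").convert("RGBA")

inpaint_fill = (0x7F, 0x7F, 0x7F, 0xFF)  # opaque mid-grey, as in the new default
solid_bg = PIL.Image.new("RGBA", init_image.size, inpaint_fill)

# Transparent pixels take the solid colour; opaque pixels keep the original image.
init_filled = PIL.Image.alpha_composite(solid_bg, init_image)
init_filled.save("solid_infilled.png")
```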
@@ -3,10 +3,10 @@ ldm.invoke.generator.txt2img inherits from ldm.invoke.generator
 '''

 import math
-from diffusers.utils.logging import get_verbosity, set_verbosity, set_verbosity_error
 from typing import Callable, Optional

 import torch
+from diffusers.utils.logging import get_verbosity, set_verbosity, set_verbosity_error

 from ldm.invoke.generator.base import Generator
 from ldm.invoke.generator.diffusers_pipeline import trim_to_multiple_of, StableDiffusionGeneratorPipeline, \
@@ -38,10 +38,6 @@ class Txt2Img2Img(Generator):
                 uc, c, cfg_scale, extra_conditioning_info,
                 threshold = ThresholdSettings(threshold, warmup=0.2) if threshold else None)
             .add_scheduler_args_if_applicable(pipeline.scheduler, eta=ddim_eta))
-        scale_dim = min(width, height)
-        scale = 512 / scale_dim
-
-        init_width, init_height = trim_to_multiple_of(scale * width, scale * height)

         def make_image(x_T):

@@ -54,6 +50,10 @@ class Txt2Img2Img(Generator):
                 # TODO: threshold = threshold,
             )

+            # Get our initial generation width and height directly from the latent output so
+            # the message below is accurate.
+            init_width = first_pass_latent_output.size()[3] * self.downsampling_factor
+            init_height = first_pass_latent_output.size()[2] * self.downsampling_factor
             print(
                 f"\n>> Interpolating from {init_width}x{init_height} to {width}x{height} using DDIM sampling"
             )
@@ -106,27 +106,35 @@ class Txt2Img2Img(Generator):
     def get_noise(self,width,height,scale = True):
         # print(f"Get noise: {width}x{height}")
         if scale:
-            trained_square = 512 * 512
-            actual_square = width * height
-            scale = math.sqrt(trained_square / actual_square)
-            scaled_width = math.ceil(scale * width / 64) * 64
-            scaled_height = math.ceil(scale * height / 64) * 64
+            # Scale the input width and height for the initial generation
+            # Make their area equivalent to the model's resolution area (e.g. 512*512 = 262144),
+            # while keeping the minimum dimension at least 0.5 * resolution (e.g. 512*0.5 = 256)
+
+            aspect = width / height
+            dimension = self.model.unet.config.sample_size * self.model.vae_scale_factor
+            min_dimension = math.floor(dimension * 0.5)
+            model_area = dimension * dimension  # hardcoded for now since all models are trained on square images
+
+            if aspect > 1.0:
+                init_height = max(min_dimension, math.sqrt(model_area / aspect))
+                init_width = init_height * aspect
+            else:
+                init_width = max(min_dimension, math.sqrt(model_area * aspect))
+                init_height = init_width / aspect
+
+            scaled_width, scaled_height = trim_to_multiple_of(math.floor(init_width), math.floor(init_height))
+
         else:
             scaled_width = width
             scaled_height = height

         device = self.model.device
+        channels = self.latent_channels
+        if channels == 9:
+            channels = 4  # we don't really want noise for all the mask channels
+        shape = (1, channels,
+                 scaled_height // self.downsampling_factor, scaled_width // self.downsampling_factor)
         if self.use_mps_noise or device.type == 'mps':
-            return torch.randn([1,
-                                self.latent_channels,
-                                scaled_height // self.downsampling_factor,
-                                scaled_width // self.downsampling_factor],
-                               dtype=self.torch_dtype(),
-                               device='cpu').to(device)
+            return torch.randn(shape, dtype=self.torch_dtype(), device='cpu').to(device)
         else:
-            return torch.randn([1,
-                                self.latent_channels,
-                                scaled_height // self.downsampling_factor,
-                                scaled_width // self.downsampling_factor],
-                               dtype=self.torch_dtype(),
-                               device=device)
+            return torch.randn(shape, dtype=self.torch_dtype(), device=device)
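A quick numeric check of the new scaling logic (a sketch only; `trim_to_multiple_of` is assumed here to round each dimension down to a multiple of 8, which is not necessarily the exact multiple InvokeAI uses):

```python
import math

def trim_to_multiple_of(w, h, multiple=8):
    # assumption: round each dimension down to the nearest multiple (8 here)
    return (w - w % multiple, h - h % multiple)

# Example: ask for a 768x512 image with a model trained at 512x512.
width, height = 768, 512
dimension = 512                      # unet sample_size * vae_scale_factor
model_area = dimension * dimension   # 262144
min_dimension = math.floor(dimension * 0.5)

aspect = width / height              # 1.5
if aspect > 1.0:
    init_height = max(min_dimension, math.sqrt(model_area / aspect))  # ~418.1
    init_width = init_height * aspect                                 # ~627.2
else:
    init_width = max(min_dimension, math.sqrt(model_area * aspect))
    init_height = init_width / aspect

print(trim_to_multiple_of(math.floor(init_width), math.floor(init_height)))
# -> (624, 416): roughly 512*512 worth of pixels at the requested 3:2 aspect
```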
@@ -125,7 +125,7 @@ class ModelManager(object):
         Set the default model. The change will not take
         effect until you call model_manager.commit()
         '''
-        assert model_name in self.models,f"unknown model '{model_name}'"
+        assert model_name in self.model_names(), f"unknown model '{model_name}'"

         config = self.config
         for model in config:
@@ -155,7 +155,7 @@ class CrossAttentionControlSubstitute(CrossAttentionControlledFragment):
         default_options = {
             's_start': 0.0,
             's_end': 0.2062994740159002,  # ~= shape_freedom=0.5
-            't_start': 0.0,
+            't_start': 0.1,
             't_end': 1.0
         }
         merged_options = default_options
@ -7,8 +7,10 @@ import torch
|
|||||||
import diffusers
|
import diffusers
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from diffusers.models.unet_2d_condition import UNet2DConditionModel
|
from diffusers.models.unet_2d_condition import UNet2DConditionModel
|
||||||
|
from diffusers.models.cross_attention import AttnProcessor
|
||||||
from ldm.invoke.devices import torch_dtype
|
from ldm.invoke.devices import torch_dtype
|
||||||
|
|
||||||
|
|
||||||
# adapted from bloc97's CrossAttentionControl colab
|
# adapted from bloc97's CrossAttentionControl colab
|
||||||
# https://github.com/bloc97/CrossAttentionControl
|
# https://github.com/bloc97/CrossAttentionControl
|
||||||
|
|
||||||
@ -304,11 +306,15 @@ class InvokeAICrossAttentionMixin:
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
def remove_cross_attention_control(model):
|
def restore_default_cross_attention(model, is_running_diffusers: bool, restore_attention_processor: Optional[AttnProcessor]=None):
|
||||||
|
if is_running_diffusers:
|
||||||
|
unet = model
|
||||||
|
unet.set_attn_processor(restore_attention_processor or CrossAttnProcessor())
|
||||||
|
else:
|
||||||
remove_attention_function(model)
|
remove_attention_function(model)
|
||||||
|
|
||||||
|
|
||||||
def setup_cross_attention_control(model, context: Context):
|
def override_cross_attention(model, context: Context, is_running_diffusers = False):
|
||||||
"""
|
"""
|
||||||
Inject attention parameters and functions into the passed in model to enable cross attention editing.
|
Inject attention parameters and functions into the passed in model to enable cross attention editing.
|
||||||
|
|
||||||
@ -323,7 +329,7 @@ def setup_cross_attention_control(model, context: Context):
|
|||||||
# urgh. should this be hardcoded?
|
# urgh. should this be hardcoded?
|
||||||
max_length = 77
|
max_length = 77
|
||||||
# mask=1 means use base prompt attention, mask=0 means use edited prompt attention
|
# mask=1 means use base prompt attention, mask=0 means use edited prompt attention
|
||||||
mask = torch.zeros(max_length)
|
mask = torch.zeros(max_length, dtype=torch_dtype(device))
|
||||||
indices_target = torch.arange(max_length, dtype=torch.long)
|
indices_target = torch.arange(max_length, dtype=torch.long)
|
||||||
indices = torch.arange(max_length, dtype=torch.long)
|
indices = torch.arange(max_length, dtype=torch.long)
|
||||||
for name, a0, a1, b0, b1 in context.arguments.edit_opcodes:
|
for name, a0, a1, b0, b1 in context.arguments.edit_opcodes:
|
||||||
@ -333,10 +339,26 @@ def setup_cross_attention_control(model, context: Context):
|
|||||||
indices[b0:b1] = indices_target[a0:a1]
|
indices[b0:b1] = indices_target[a0:a1]
|
||||||
mask[b0:b1] = 1
|
mask[b0:b1] = 1
|
||||||
|
|
||||||
context.register_cross_attention_modules(model)
|
|
||||||
context.cross_attention_mask = mask.to(device)
|
context.cross_attention_mask = mask.to(device)
|
||||||
context.cross_attention_index_map = indices.to(device)
|
context.cross_attention_index_map = indices.to(device)
|
||||||
|
if is_running_diffusers:
|
||||||
|
unet = model
|
||||||
|
old_attn_processors = unet.attn_processors
|
||||||
|
if torch.backends.mps.is_available():
|
||||||
|
# see note in StableDiffusionGeneratorPipeline.__init__ about borked slicing on MPS
|
||||||
|
unet.set_attn_processor(SwapCrossAttnProcessor())
|
||||||
|
else:
|
||||||
|
# try to re-use an existing slice size
|
||||||
|
default_slice_size = 4
|
||||||
|
slice_size = next((p.slice_size for p in old_attn_processors.values() if type(p) is SlicedAttnProcessor), default_slice_size)
|
||||||
|
unet.set_attn_processor(SlicedSwapCrossAttnProcesser(slice_size=slice_size))
|
||||||
|
return old_attn_processors
|
||||||
|
else:
|
||||||
|
context.register_cross_attention_modules(model)
|
||||||
inject_attention_function(model, context)
|
inject_attention_function(model, context)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def get_cross_attention_modules(model, which: CrossAttentionType) -> list[tuple[str, InvokeAICrossAttentionMixin]]:
|
def get_cross_attention_modules(model, which: CrossAttentionType) -> list[tuple[str, InvokeAICrossAttentionMixin]]:
|
||||||
@ -445,6 +467,7 @@ def get_mem_free_total(device):
|
|||||||
return mem_free_total
|
return mem_free_total
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class InvokeAIDiffusersCrossAttention(diffusers.models.attention.CrossAttention, InvokeAICrossAttentionMixin):
|
class InvokeAIDiffusersCrossAttention(diffusers.models.attention.CrossAttention, InvokeAICrossAttentionMixin):
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
@ -460,3 +483,176 @@ class InvokeAIDiffusersCrossAttention(diffusers.models.attention.CrossAttention,
|
|||||||
hidden_states = self.reshape_batch_dim_to_heads(attention_result)
|
hidden_states = self.reshape_batch_dim_to_heads(attention_result)
|
||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## 🧨diffusers implementation follows
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
# base implementation
|
||||||
|
|
||||||
|
class CrossAttnProcessor:
|
||||||
|
def __call__(self, attn: CrossAttention, hidden_states, encoder_hidden_states=None, attention_mask=None):
|
||||||
|
batch_size, sequence_length, _ = hidden_states.shape
|
||||||
|
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length)
|
||||||
|
|
||||||
|
query = attn.to_q(hidden_states)
|
||||||
|
query = attn.head_to_batch_dim(query)
|
||||||
|
|
||||||
|
encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
|
||||||
|
key = attn.to_k(encoder_hidden_states)
|
||||||
|
value = attn.to_v(encoder_hidden_states)
|
||||||
|
key = attn.head_to_batch_dim(key)
|
||||||
|
value = attn.head_to_batch_dim(value)
|
||||||
|
|
||||||
|
attention_probs = attn.get_attention_scores(query, key, attention_mask)
|
||||||
|
hidden_states = torch.bmm(attention_probs, value)
|
||||||
|
hidden_states = attn.batch_to_head_dim(hidden_states)
|
||||||
|
|
||||||
|
# linear proj
|
||||||
|
hidden_states = attn.to_out[0](hidden_states)
|
||||||
|
# dropout
|
||||||
|
hidden_states = attn.to_out[1](hidden_states)
|
||||||
|
|
||||||
|
return hidden_states
|
||||||
|
|
||||||
|
"""
|
||||||
|
from dataclasses import field, dataclass
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from diffusers.models.cross_attention import CrossAttention, CrossAttnProcessor, SlicedAttnProcessor, AttnProcessor
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SwapCrossAttnContext:
    modified_text_embeddings: torch.Tensor
    index_map: torch.Tensor  # maps from original prompt token indices to the equivalent tokens in the modified prompt
    mask: torch.Tensor  # in the target space of the index_map
    cross_attention_types_to_do: list[CrossAttentionType] = field(default_factory=list)

    def __int__(self,
                cac_types_to_do: [CrossAttentionType],
                modified_text_embeddings: torch.Tensor,
                index_map: torch.Tensor,
                mask: torch.Tensor):
        self.cross_attention_types_to_do = cac_types_to_do
        self.modified_text_embeddings = modified_text_embeddings
        self.index_map = index_map
        self.mask = mask

    def wants_cross_attention_control(self, attn_type: CrossAttentionType) -> bool:
        return attn_type in self.cross_attention_types_to_do

    @classmethod
    def make_mask_and_index_map(cls, edit_opcodes: list[tuple[str, int, int, int, int]], max_length: int) \
            -> tuple[torch.Tensor, torch.Tensor]:

        # mask=1 means use original prompt attention, mask=0 means use modified prompt attention
        mask = torch.zeros(max_length)
        indices_target = torch.arange(max_length, dtype=torch.long)
        indices = torch.arange(max_length, dtype=torch.long)
        for name, a0, a1, b0, b1 in edit_opcodes:
            if b0 < max_length:
                if name == "equal":
                    # these tokens remain the same as in the original prompt
                    indices[b0:b1] = indices_target[a0:a1]
                    mask[b0:b1] = 1

        return mask, indices
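To make the opcode handling above concrete, here is a small worked example. It assumes the edit opcodes come from difflib.SequenceMatcher run over the original and edited token streams; the token strings and max_length are made up for illustration:

from difflib import SequenceMatcher

original_tokens = ["<bos>", "a", "cat", "sitting", "<eos>"]
edited_tokens   = ["<bos>", "a", "smiling", "cat", "sitting", "<eos>"]
opcodes = SequenceMatcher(None, original_tokens, edited_tokens).get_opcodes()
# -> [('equal', 0, 2, 0, 2), ('insert', 2, 2, 2, 3), ('equal', 2, 5, 3, 6)]

mask, index_map = SwapCrossAttnContext.make_mask_and_index_map(opcodes, max_length=8)
# mask      -> tensor([1., 1., 0., 1., 1., 1., 0., 0.])  (1 = keep the original prompt's attention at this slot)
# index_map -> tensor([0, 1, 2, 2, 3, 4, 6, 7])          (edited slot i reads original column index_map[i])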
class SlicedSwapCrossAttnProcesser(SlicedAttnProcessor):

    # TODO: dynamically pick slice size based on memory conditions

    def __call__(self, attn: CrossAttention, hidden_states, encoder_hidden_states=None, attention_mask=None,
                 # kwargs
                 swap_cross_attn_context: SwapCrossAttnContext=None):

        attention_type = CrossAttentionType.SELF if encoder_hidden_states is None else CrossAttentionType.TOKENS

        # if cross-attention control is not in play, just call through to the base implementation.
        if attention_type is CrossAttentionType.SELF or \
                swap_cross_attn_context is None or \
                not swap_cross_attn_context.wants_cross_attention_control(attention_type):
            #print(f"SwapCrossAttnContext for {attention_type} not active - passing request to superclass")
            return super().__call__(attn, hidden_states, encoder_hidden_states, attention_mask)
        #else:
        #    print(f"SwapCrossAttnContext for {attention_type} active")

        batch_size, sequence_length, _ = hidden_states.shape
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length)

        query = attn.to_q(hidden_states)
        dim = query.shape[-1]
        query = attn.head_to_batch_dim(query)

        original_text_embeddings = encoder_hidden_states
        modified_text_embeddings = swap_cross_attn_context.modified_text_embeddings
        original_text_key = attn.to_k(original_text_embeddings)
        modified_text_key = attn.to_k(modified_text_embeddings)
        original_value = attn.to_v(original_text_embeddings)
        modified_value = attn.to_v(modified_text_embeddings)

        original_text_key = attn.head_to_batch_dim(original_text_key)
        modified_text_key = attn.head_to_batch_dim(modified_text_key)
        original_value = attn.head_to_batch_dim(original_value)
        modified_value = attn.head_to_batch_dim(modified_value)

        # compute slices and prepare output tensor
        batch_size_attention = query.shape[0]
        hidden_states = torch.zeros(
            (batch_size_attention, sequence_length, dim // attn.heads), device=query.device, dtype=query.dtype
        )

        # do slices
        for i in range(max(1, hidden_states.shape[0] // self.slice_size)):
            start_idx = i * self.slice_size
            end_idx = (i + 1) * self.slice_size

            query_slice = query[start_idx:end_idx]
            original_key_slice = original_text_key[start_idx:end_idx]
            modified_key_slice = modified_text_key[start_idx:end_idx]
            attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None

            original_attn_slice = attn.get_attention_scores(query_slice, original_key_slice, attn_mask_slice)
            modified_attn_slice = attn.get_attention_scores(query_slice, modified_key_slice, attn_mask_slice)

            # because the prompt modifications may result in token sequences shifted forwards or backwards,
            # the original attention probabilities must be remapped to account for token index changes in the
            # modified prompt
            remapped_original_attn_slice = torch.index_select(original_attn_slice, -1,
                                                              swap_cross_attn_context.index_map)

            # only some tokens taken from the original attention probabilities. this is controlled by the mask.
            mask = swap_cross_attn_context.mask
            inverse_mask = 1 - mask
            attn_slice = \
                remapped_original_attn_slice * mask + \
                modified_attn_slice * inverse_mask

            del remapped_original_attn_slice, modified_attn_slice

            attn_slice = torch.bmm(attn_slice, modified_value[start_idx:end_idx])
            hidden_states[start_idx:end_idx] = attn_slice

        # done
        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        return hidden_states
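The remap-and-blend step above is easier to see with toy numbers. This sketch (made-up values) shows what index_select followed by the mask blend does to a single row of attention probabilities:

import torch

# one query position attending over 4 token slots of the *original* prompt
original_attn = torch.tensor([[0.7, 0.1, 0.1, 0.1]])
# the same query position attending over the *modified* prompt
modified_attn = torch.tensor([[0.2, 0.3, 0.4, 0.1]])

# edited slot i should read original column index_map[i]; mask=1 keeps the original probability
index_map = torch.tensor([0, 0, 1, 2])    # e.g. a token was inserted, shifting later columns right
mask = torch.tensor([1.0, 0.0, 1.0, 1.0])

remapped_original = torch.index_select(original_attn, -1, index_map)  # -> [[0.7, 0.7, 0.1, 0.1]]
blended = remapped_original * mask + modified_attn * (1 - mask)       # -> [[0.7, 0.3, 0.1, 0.1]]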
class SwapCrossAttnProcessor(SlicedSwapCrossAttnProcesser):

    def __init__(self):
        super(SwapCrossAttnProcessor, self).__init__(slice_size=int(1e9))  # massive slice size = don't slice
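Once one of these processors is installed on a diffusers UNet, the per-call swap context reaches __call__ above through diffusers' cross_attention_kwargs, whose entries are forwarded to the processor as keyword arguments. A sketch with assumed variable names (illustrative only, not the pipeline's actual call site):

noise_pred = unet(latents, t, encoder_hidden_states=text_embeddings,
                  cross_attention_kwargs={"swap_cross_attn_context": swap_context}).sample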
@@ -19,9 +19,9 @@ class DDIMSampler(Sampler):
        all_timesteps_count = kwargs.get('all_timesteps_count', t_enc)

        if extra_conditioning_info is not None and extra_conditioning_info.wants_cross_attention_control:
-           self.invokeai_diffuser.setup_cross_attention_control(extra_conditioning_info, step_count = all_timesteps_count)
+           self.invokeai_diffuser.override_cross_attention(extra_conditioning_info, step_count = all_timesteps_count)
        else:
-           self.invokeai_diffuser.remove_cross_attention_control()
+           self.invokeai_diffuser.restore_default_cross_attention()

        # This is the central routine
@@ -43,9 +43,9 @@ class CFGDenoiser(nn.Module):
        extra_conditioning_info = kwargs.get('extra_conditioning_info', None)

        if extra_conditioning_info is not None and extra_conditioning_info.wants_cross_attention_control:
-           self.invokeai_diffuser.setup_cross_attention_control(extra_conditioning_info, step_count = t_enc)
+           self.invokeai_diffuser.override_cross_attention(extra_conditioning_info, step_count = t_enc)
        else:
-           self.invokeai_diffuser.remove_cross_attention_control()
+           self.invokeai_diffuser.restore_default_cross_attention()


    def forward(self, x, sigma, uncond, cond, cond_scale):
@@ -21,9 +21,9 @@ class PLMSSampler(Sampler):
        all_timesteps_count = kwargs.get('all_timesteps_count', t_enc)

        if extra_conditioning_info is not None and extra_conditioning_info.wants_cross_attention_control:
-           self.invokeai_diffuser.setup_cross_attention_control(extra_conditioning_info, step_count = all_timesteps_count)
+           self.invokeai_diffuser.override_cross_attention(extra_conditioning_info, step_count = all_timesteps_count)
        else:
-           self.invokeai_diffuser.remove_cross_attention_control()
+           self.invokeai_diffuser.restore_default_cross_attention()

        # this is the essential routine
@@ -1,14 +1,16 @@
import math
+from contextlib import contextmanager
from dataclasses import dataclass
from math import ceil
-from typing import Callable, Optional, Union
+from typing import Callable, Optional, Union, Any, Dict

import numpy as np
import torch

+from diffusers.models.cross_attention import AttnProcessor
from ldm.models.diffusion.cross_attention_control import Arguments, \
-    remove_cross_attention_control, setup_cross_attention_control, Context, get_cross_attention_modules, \
-    CrossAttentionType
+    restore_default_cross_attention, override_cross_attention, Context, get_cross_attention_modules, \
+    CrossAttentionType, SwapCrossAttnContext
from ldm.models.diffusion.cross_attention_map_saving import AttentionMapSaver
@@ -30,17 +32,20 @@ class InvokeAIDiffuserComponent:
    debug_thresholding = False

+   @dataclass
    class ExtraConditioningInfo:
-       def __init__(self, tokens_count_including_eos_bos:int, cross_attention_control_args: Optional[Arguments]):
-           self.tokens_count_including_eos_bos = tokens_count_including_eos_bos
-           self.cross_attention_control_args = cross_attention_control_args
+       tokens_count_including_eos_bos: int
+       cross_attention_control_args: Optional[Arguments] = None

        @property
        def wants_cross_attention_control(self):
            return self.cross_attention_control_args is not None


    def __init__(self, model, model_forward_callback:
-                Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]
+                Callable[[torch.Tensor, torch.Tensor, torch.Tensor, Optional[dict[str,Any]]], torch.Tensor],
+                is_running_diffusers: bool=False,
                 ):
        """
        :param model: the unet model to pass through to cross attention control
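Because the dataclass keeps the same field names and gives cross_attention_control_args a default, existing call sites keep working. A small sketch (values are placeholders):

info = InvokeAIDiffuserComponent.ExtraConditioningInfo(tokens_count_including_eos_bos=77)
assert info.wants_cross_attention_control is False   # no cross_attention_control_args supplied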
@@ -48,21 +53,47 @@ class InvokeAIDiffuserComponent:
        """
        self.conditioning = None
        self.model = model
+       self.is_running_diffusers = is_running_diffusers
        self.model_forward_callback = model_forward_callback
        self.cross_attention_control_context = None

-   def setup_cross_attention_control(self, conditioning: ExtraConditioningInfo, step_count: int):
+   @contextmanager
+   def custom_attention_context(self,
+                                extra_conditioning_info: Optional[ExtraConditioningInfo],
+                                step_count: int):
+       do_swap = extra_conditioning_info is not None and extra_conditioning_info.wants_cross_attention_control
+       old_attn_processor = None
+       if do_swap:
+           old_attn_processor = self.override_cross_attention(extra_conditioning_info,
+                                                              step_count=step_count)
+       try:
+           yield None
+       finally:
+           if old_attn_processor is not None:
+               self.restore_default_cross_attention(old_attn_processor)
+           # TODO resuscitate attention map saving
+           #self.remove_attention_map_saving()
+
+   def override_cross_attention(self, conditioning: ExtraConditioningInfo, step_count: int) -> Dict[str, AttnProcessor]:
+       """
+       setup cross attention .swap control. for diffusers this replaces the attention processor, so
+       the previous attention processor is returned so that the caller can restore it later.
+       """
        self.conditioning = conditioning
        self.cross_attention_control_context = Context(
            arguments=self.conditioning.cross_attention_control_args,
            step_count=step_count
        )
-       setup_cross_attention_control(self.model, self.cross_attention_control_context)
+       return override_cross_attention(self.model,
+                                       self.cross_attention_control_context,
+                                       is_running_diffusers=self.is_running_diffusers)

-   def remove_cross_attention_control(self):
+   def restore_default_cross_attention(self, restore_attention_processor: Optional['AttnProcessor']=None):
        self.conditioning = None
        self.cross_attention_control_context = None
-       remove_cross_attention_control(self.model)
+       restore_default_cross_attention(self.model,
+                                       is_running_diffusers=self.is_running_diffusers,
+                                       restore_attention_processor=restore_attention_processor)

    def setup_attention_map_saving(self, saver: AttentionMapSaver):
        def callback(slice, dim, offset, slice_size, key):
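A usage sketch for the new context manager, with the surrounding names assumed for illustration: the swap processors are installed only when the conditioning actually requests cross-attention control, and the previous processors are restored on exit even if sampling raises.

with diffuser.custom_attention_context(extra_conditioning_info, step_count=len(timesteps)):
    for t in timesteps:
        ...  # each model call inside this block runs through the swap-aware attention processors
# leaving the block restores the original attention processors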
@@ -168,7 +199,41 @@ class InvokeAIDiffuserComponent:
        return unconditioned_next_x, conditioned_next_x


-   def apply_cross_attention_controlled_conditioning(self, x:torch.Tensor, sigma, unconditioning, conditioning, cross_attention_control_types_to_do):
+   def apply_cross_attention_controlled_conditioning(self,
+                                                     x: torch.Tensor,
+                                                     sigma,
+                                                     unconditioning,
+                                                     conditioning,
+                                                     cross_attention_control_types_to_do):
+       if self.is_running_diffusers:
+           return self.apply_cross_attention_controlled_conditioning__diffusers(x, sigma, unconditioning, conditioning, cross_attention_control_types_to_do)
+       else:
+           return self.apply_cross_attention_controlled_conditioning__compvis(x, sigma, unconditioning, conditioning, cross_attention_control_types_to_do)
+
+   def apply_cross_attention_controlled_conditioning__diffusers(self,
+                                                                x: torch.Tensor,
+                                                                sigma,
+                                                                unconditioning,
+                                                                conditioning,
+                                                                cross_attention_control_types_to_do):
+       context: Context = self.cross_attention_control_context
+
+       cross_attn_processor_context = SwapCrossAttnContext(modified_text_embeddings=context.arguments.edited_conditioning,
+                                                           index_map=context.cross_attention_index_map,
+                                                           mask=context.cross_attention_mask,
+                                                           cross_attention_types_to_do=[])
+       # no cross attention for unconditioning (negative prompt)
+       unconditioned_next_x = self.model_forward_callback(x, sigma, unconditioning,
+                                                          {"swap_cross_attn_context": cross_attn_processor_context})
+
+       # do requested cross attention types for conditioning (positive prompt)
+       cross_attn_processor_context.cross_attention_types_to_do = cross_attention_control_types_to_do
+       conditioned_next_x = self.model_forward_callback(x, sigma, conditioning,
+                                                        {"swap_cross_attn_context": cross_attn_processor_context})
+       return unconditioned_next_x, conditioned_next_x
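The same SwapCrossAttnContext instance is reused for both passes; only cross_attention_types_to_do changes between them, so the negative-prompt pass reports no control requested and the processor falls through to plain sliced attention. A sketch with placeholder tensors (names assumed):

ctx = SwapCrossAttnContext(modified_text_embeddings=edited_embeddings,  # placeholder tensors
                           index_map=index_map, mask=mask,
                           cross_attention_types_to_do=[])
assert not ctx.wants_cross_attention_control(CrossAttentionType.TOKENS)  # unconditioned pass
ctx.cross_attention_types_to_do = [CrossAttentionType.TOKENS]
assert ctx.wants_cross_attention_control(CrossAttentionType.TOKENS)      # conditioned pass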
+   def apply_cross_attention_controlled_conditioning__compvis(self, x:torch.Tensor, sigma, unconditioning, conditioning, cross_attention_control_types_to_do):
        # print('pct', percent_through, ': doing cross attention control on', cross_attention_control_types_to_do)
        # slower non-batched path (20% slower on mac MPS)
        # We are only interested in using attention maps for conditioned_next_x, but batching them with generation of