Mirror of https://github.com/invoke-ai/InvokeAI (synced 2024-08-30 20:32:17 +00:00)

Commit: Merge branch 'main' into bugfix/convert-v2-models
@@ -247,11 +247,14 @@ class Generator:
        fixdevice = 'cpu' if (self.model.device.type == 'mps') else self.model.device
        # limit noise to only the diffusion image channels, not the mask channels
        input_channels = min(self.latent_channels, 4)
        # round up to the nearest block of 8
        temp_width = int((width + 7) / 8) * 8
        temp_height = int((height + 7) / 8) * 8
        noise = torch.stack([
-           rand_perlin_2d((height, width),
+           rand_perlin_2d((temp_height, temp_width),
                           (8, 8),
                           device = self.model.device).to(fixdevice) for _ in range(input_channels)], dim=0).to(self.model.device)
-       return noise
+       return noise[0:4, 0:height, 0:width]

    def new_seed(self):
        self.seed = random.randrange(0, np.iinfo(np.uint32).max)
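For readers skimming the hunk above: the new code pads the requested dimensions up to the nearest multiple of 8 so the (8, 8) Perlin grid divides them evenly, then crops back to the requested size and keeps at most four (diffusion) channels. A minimal standalone sketch of that idea, not part of the commit; the project's rand_perlin_2d helper is stubbed with uniform noise here:

    import torch

    def rand_perlin_2d(shape, res, device=None):
        # stand-in for the project's Perlin-noise helper; plain uniform noise here
        return torch.rand(shape, device=device)

    def perlin_noise_like(height: int, width: int, channels: int = 4) -> torch.Tensor:
        # round each dimension up to the nearest multiple of 8 so an (8, 8) grid divides it evenly
        temp_height = int((height + 7) / 8) * 8
        temp_width = int((width + 7) / 8) * 8
        noise = torch.stack([rand_perlin_2d((temp_height, temp_width), (8, 8))
                             for _ in range(channels)], dim=0)
        # crop back to the requested size, keeping at most 4 diffusion channels
        return noise[0:4, 0:height, 0:width]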
@@ -3,39 +3,34 @@ from __future__ import annotations

import dataclasses
import inspect
import secrets
import sys
from collections.abc import Sequence
from dataclasses import dataclass, field
from typing import List, Optional, Union, Callable, Type, TypeVar, Generic, Any

if sys.version_info < (3, 10):
    from typing_extensions import ParamSpec
else:
    from typing import ParamSpec

import PIL.Image
import einops
import psutil
import torch
import torchvision.transforms as T
from diffusers.utils.import_utils import is_xformers_available

from ...models.diffusion.cross_attention_map_saving import AttentionMapSaver
from ...modules.prompt_to_embeddings_converter import WeightedPromptFragmentsToEmbeddingsConverter


from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.schedulers.scheduling_utils import SchedulerMixin, SchedulerOutput
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.outputs import BaseOutput
from torchvision.transforms.functional import resize as tv_resize
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
from typing_extensions import ParamSpec

from ldm.invoke.globals import Globals
-from ldm.models.diffusion.shared_invokeai_diffusion import InvokeAIDiffuserComponent, ThresholdSettings
+from ldm.models.diffusion.shared_invokeai_diffusion import InvokeAIDiffuserComponent, PostprocessingSettings
from ldm.modules.textual_inversion_manager import TextualInversionManager
from ..offloading import LazilyLoadedModelGroup, FullyLoadedModelGroup, ModelGroup
from ...models.diffusion.cross_attention_map_saving import AttentionMapSaver
from ...modules.prompt_to_embeddings_converter import WeightedPromptFragmentsToEmbeddingsConverter


@dataclass
@@ -199,8 +194,10 @@ class ConditioningData:
    """
    extra: Optional[InvokeAIDiffuserComponent.ExtraConditioningInfo] = None
    scheduler_args: dict[str, Any] = field(default_factory=dict)
    """Additional arguments to pass to scheduler.step."""
-   threshold: Optional[ThresholdSettings] = None
    """
    Additional arguments to pass to invokeai_diffuser.do_latent_postprocessing().
    """
+   postprocessing_settings: Optional[PostprocessingSettings] = None

    @property
    def dtype(self):
@@ -262,6 +259,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
        feature_extractor ([`CLIPFeatureExtractor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """
+   _model_group: ModelGroup

    ID_LENGTH = 8
@@ -271,7 +269,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
-       scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+       scheduler: KarrasDiffusionSchedulers,
        safety_checker: Optional[StableDiffusionSafetyChecker],
        feature_extractor: Optional[CLIPFeatureExtractor],
        requires_safety_checker: bool = False,
@@ -301,8 +299,11 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
            textual_inversion_manager=self.textual_inversion_manager
        )

        self._model_group = FullyLoadedModelGroup(self.unet.device)
        self._model_group.install(*self._submodels)

-   def _adjust_memory_efficient_attention(self, latents: Torch.tensor):
+   def _adjust_memory_efficient_attention(self, latents: torch.Tensor):
        """
        if xformers is available, use it, otherwise use sliced attention.
        """
@@ -320,7 +321,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
        elif self.device.type == 'cuda':
            mem_free, _ = torch.cuda.mem_get_info(self.device)
        else:
-           raise ValueError(f"unrecognized device {device}")
+           raise ValueError(f"unrecognized device {self.device}")
        # input tensor of [1, 4, h/8, w/8]
        # output tensor of [16, (h/8 * w/8), (h/8 * w/8)]
        bytes_per_element_needed_for_baddbmm_duplication = latents.element_size() + 4
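The surrounding method sizes the attention workspace against free memory: for a [1, 4, h/8, w/8] latent, the baddbmm buffer is roughly 16 * (h/8 * w/8)^2 elements at (element_size + 4) bytes each. A rough standalone sketch of that check, inferred from the comments above rather than copied from the commit; the 3/4 headroom factor is illustrative:

    import torch

    def needs_attention_slicing(latents: torch.Tensor, mem_free: int) -> bool:
        # latents are [1, 4, h/8, w/8]; the attention baddbmm produces [16, h/8*w/8, h/8*w/8]
        _, _, h, w = latents.shape
        bytes_per_element = latents.element_size() + 4
        max_size_required = 16 * (h * w) ** 2 * bytes_per_element
        return max_size_required > mem_free * 3 / 4  # leave some headroom (illustrative margin)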
@@ -334,6 +335,66 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
            self.disable_attention_slicing()

    def enable_offload_submodels(self, device: torch.device):
        """
        Offload each submodel when it's not in use.

        Useful for low-vRAM situations where the size of the model in memory is a big chunk of
        the total available resource, and you want to free up as much for inference as possible.

        This requires more moving parts and may add some delay as the U-Net is swapped out for the
        VAE and vice-versa.
        """
        models = self._submodels
        if self._model_group is not None:
            self._model_group.uninstall(*models)
        group = LazilyLoadedModelGroup(device)
        group.install(*models)
        self._model_group = group

    def disable_offload_submodels(self):
        """
        Leave all submodels loaded.

        Appropriate for cases where the size of the model in memory is small compared to the memory
        required for inference. Avoids the delay and complexity of shuffling the submodels to and
        from the GPU.
        """
        models = self._submodels
        if self._model_group is not None:
            self._model_group.uninstall(*models)
        group = FullyLoadedModelGroup(self._model_group.execution_device)
        group.install(*models)
        self._model_group = group

    def offload_all(self):
        """Offload all this pipeline's models to CPU."""
        self._model_group.offload_current()

    def ready(self):
        """
        Ready this pipeline's models.

        i.e. pre-load them to the GPU if appropriate.
        """
        self._model_group.ready()

    def to(self, torch_device: Optional[Union[str, torch.device]] = None):
        if torch_device is None:
            return self
        self._model_group.set_device(torch_device)
        self._model_group.ready()

    @property
    def device(self) -> torch.device:
        return self._model_group.execution_device

    @property
    def _submodels(self) -> Sequence[torch.nn.Module]:
        module_names, _, _ = self.extract_init_dict(dict(self.config))
        values = [getattr(self, name) for name in module_names.keys()]
        return [m for m in values if isinstance(m, torch.nn.Module)]

    def image_from_embeddings(self, latents: torch.Tensor, num_inference_steps: int,
                              conditioning_data: ConditioningData,
                              *,
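Hypothetical usage of the two new methods above, assuming a StableDiffusionGeneratorPipeline has already been constructed; enabling offload swaps the FullyLoadedModelGroup for a LazilyLoadedModelGroup so only the active submodel sits on the GPU:

    import torch

    # `pipe` is a StableDiffusionGeneratorPipeline built elsewhere (hypothetical)
    pipe.enable_offload_submodels(torch.device("cuda"))  # submodels swap lazily; only the active one is on the GPU
    # ... run txt2img / img2img as usual ...
    pipe.disable_offload_submodels()                     # back to keeping every submodel resident
    pipe.offload_all()                                   # or push everything to CPU between requests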
@@ -375,7 +436,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
                              callback: Callable[[PipelineIntermediateState], None] = None
                              ) -> tuple[torch.Tensor, Optional[AttentionMapSaver]]:
        if timesteps is None:
-           self.scheduler.set_timesteps(num_inference_steps, device=self.unet.device)
+           self.scheduler.set_timesteps(num_inference_steps, device=self._model_group.device_for(self.unet))
            timesteps = self.scheduler.timesteps
        infer_latents_from_embeddings = GeneratorToCallbackinator(self.generate_latents_from_embeddings, PipelineIntermediateState)
        result: PipelineIntermediateState = infer_latents_from_embeddings(
@@ -407,7 +468,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):

        batch_size = latents.shape[0]
        batched_t = torch.full((batch_size,), timesteps[0],
-                              dtype=timesteps.dtype, device=self.unet.device)
+                              dtype=timesteps.dtype, device=self._model_group.device_for(self.unet))
        latents = self.scheduler.add_noise(latents, noise, batched_t)

        attention_map_saver: Optional[AttentionMapSaver] = None
@@ -419,6 +480,15 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
                                          total_step_count=len(timesteps),
                                          additional_guidance=additional_guidance)
            latents = step_output.prev_sample

            latents = self.invokeai_diffuser.do_latent_postprocessing(
                postprocessing_settings=conditioning_data.postprocessing_settings,
                latents=latents,
                sigma=batched_t,
                step_index=i,
                total_step_count=len(timesteps)
            )

            predicted_original = getattr(step_output, 'pred_original_sample', None)

            # TODO resuscitate attention map saving
@@ -455,7 +525,6 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
            conditioning_data.guidance_scale,
            step_index=step_index,
            total_step_count=total_step_count,
-           threshold=conditioning_data.threshold
        )

        # compute the previous noisy sample x_t -> x_t-1
@@ -483,9 +552,8 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
            initial_image_latents=torch.zeros_like(latents[:1], device=latents.device, dtype=latents.dtype)
        ).add_mask_channels(latents)

-       return self.unet(sample=latents,
-                        timestep=t,
-                        encoder_hidden_states=text_embeddings,
+       # First three args should be positional, not keywords, so torch hooks can see them.
+       return self.unet(latents, t, text_embeddings,
                         cross_attention_kwargs=cross_attention_kwargs).sample

    def img2img_from_embeddings(self,
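The comment about positional arguments matters because LazilyLoadedModelGroup (added later in this commit, in ldm/invoke/offloading.py) relies on forward-pre-hooks, and those hooks only ever see positional arguments. A small self-contained demonstration, not from the commit:

    import torch

    class Toy(torch.nn.Module):
        def forward(self, x, scale=1.0):
            return x * scale

    toy = Toy()
    toy.register_forward_pre_hook(lambda module, args: print("hook saw:", args))

    toy(torch.ones(1))        # hook saw: (tensor([1.]),)
    toy(x=torch.ones(1))      # hook saw: ()  -- keyword arguments never reach the hook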
@@ -504,9 +572,9 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
            init_image = einops.rearrange(init_image, 'c h w -> 1 c h w')

        # 6. Prepare latent variables
-       device = self.unet.device
-       latents_dtype = self.unet.dtype
-       initial_latents = self.non_noised_latents_from_image(init_image, device=device, dtype=latents_dtype)
+       initial_latents = self.non_noised_latents_from_image(
+           init_image, device=self._model_group.device_for(self.unet),
+           dtype=self.unet.dtype)
        noise = noise_func(initial_latents)

        return self.img2img_from_latents_and_embeddings(initial_latents, num_inference_steps,
@@ -519,7 +587,8 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
                                             strength,
                                             noise: torch.Tensor, run_id=None, callback=None
                                             ) -> InvokeAIStableDiffusionPipelineOutput:
-       timesteps, _ = self.get_img2img_timesteps(num_inference_steps, strength, self.unet.device)
+       timesteps, _ = self.get_img2img_timesteps(num_inference_steps, strength,
+                                                 device=self._model_group.device_for(self.unet))
        result_latents, result_attention_maps = self.latents_from_embeddings(
            initial_latents, num_inference_steps, conditioning_data,
            timesteps=timesteps,
@@ -558,7 +627,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
                             run_id=None,
                             noise_func=None,
                             ) -> InvokeAIStableDiffusionPipelineOutput:
-       device = self.unet.device
+       device = self._model_group.device_for(self.unet)
        latents_dtype = self.unet.dtype

        if isinstance(init_image, PIL.Image.Image):
@@ -622,6 +691,8 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
            # TODO remove this workaround once kulinseth#222 is merged to pytorch mainline
            self.vae.to('cpu')
            init_image = init_image.to('cpu')
+       else:
+           self._model_group.load(self.vae)
        init_latent_dist = self.vae.encode(init_image).latent_dist
        init_latents = init_latent_dist.sample().to(dtype=dtype) # FIXME: uses torch.randn. make reproducible!
        if device.type == 'mps':
@@ -633,8 +704,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):

    def check_for_safety(self, output, dtype):
        with torch.inference_mode():
-           screened_images, has_nsfw_concept = self.run_safety_checker(
-               output.images, device=self._execution_device, dtype=dtype)
+           screened_images, has_nsfw_concept = self.run_safety_checker(output.images, dtype=dtype)
        screened_attention_map_saver = None
        if has_nsfw_concept is None or not has_nsfw_concept:
            screened_attention_map_saver = output.attention_map_saver
@@ -643,6 +713,12 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
            # block the attention maps if NSFW content is detected
            attention_map_saver=screened_attention_map_saver)

+   def run_safety_checker(self, image, device=None, dtype=None):
+       # overriding to use the model group for device info instead of requiring the caller to know.
+       if self.safety_checker is not None:
+           device = self._model_group.device_for(self.safety_checker)
+       return super().run_safety_checker(image, device, dtype)
+
    @torch.inference_mode()
    def get_learned_conditioning(self, c: List[List[str]], *, return_tokens=True, fragment_weights=None):
        """
@@ -652,7 +728,7 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
            text=c,
            fragment_weights=fragment_weights,
            should_return_tokens=return_tokens,
-           device=self.device)
+           device=self._model_group.device_for(self.unet))

    @property
    def cond_stage_model(self):
@@ -673,6 +749,11 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
        """Compatible with DiffusionWrapper"""
        return self.unet.in_channels

+   def decode_latents(self, latents):
+       # Explicit call to get the vae loaded, since `decode` isn't the forward method.
+       self._model_group.load(self.vae)
+       return super().decode_latents(latents)
+
    def debug_latents(self, latents, msg):
        with torch.inference_mode():
            from ldm.util import debug_image
@@ -7,7 +7,7 @@ from diffusers import logging

from ldm.invoke.generator.base import Generator
from ldm.invoke.generator.diffusers_pipeline import StableDiffusionGeneratorPipeline, ConditioningData
-from ldm.models.diffusion.shared_invokeai_diffusion import ThresholdSettings
+from ldm.models.diffusion.shared_invokeai_diffusion import PostprocessingSettings


class Img2Img(Generator):

@@ -33,7 +33,7 @@ class Img2Img(Generator):
        conditioning_data = (
            ConditioningData(
                uc, c, cfg_scale, extra_conditioning_info,
-               threshold = ThresholdSettings(threshold, warmup=0.2) if threshold else None)
+               postprocessing_settings = PostprocessingSettings(threshold, warmup=0.2) if threshold else None)
            .add_scheduler_args_if_applicable(pipeline.scheduler, eta=ddim_eta))
@@ -6,7 +6,7 @@ import torch

from .base import Generator
from .diffusers_pipeline import StableDiffusionGeneratorPipeline, ConditioningData
-from ...models.diffusion.shared_invokeai_diffusion import ThresholdSettings
+from ...models.diffusion.shared_invokeai_diffusion import PostprocessingSettings


class Txt2Img(Generator):

@@ -33,7 +33,7 @@ class Txt2Img(Generator):
        conditioning_data = (
            ConditioningData(
                uc, c, cfg_scale, extra_conditioning_info,
-               threshold = ThresholdSettings(threshold, warmup=0.2) if threshold else None)
+               postprocessing_settings = PostprocessingSettings(threshold, warmup=0.2) if threshold else None)
            .add_scheduler_args_if_applicable(pipeline.scheduler, eta=ddim_eta))

        def make_image(x_T) -> PIL.Image.Image:
@@ -11,7 +11,7 @@ from diffusers.utils.logging import get_verbosity, set_verbosity, set_verbosity_error
from ldm.invoke.generator.base import Generator
from ldm.invoke.generator.diffusers_pipeline import trim_to_multiple_of, StableDiffusionGeneratorPipeline, \
    ConditioningData
-from ldm.models.diffusion.shared_invokeai_diffusion import ThresholdSettings
+from ldm.models.diffusion.shared_invokeai_diffusion import PostprocessingSettings


class Txt2Img2Img(Generator):

@@ -36,7 +36,7 @@ class Txt2Img2Img(Generator):
        conditioning_data = (
            ConditioningData(
                uc, c, cfg_scale, extra_conditioning_info,
-               threshold = ThresholdSettings(threshold, warmup=0.2) if threshold else None)
+               postprocessing_settings = PostprocessingSettings(threshold=threshold, warmup=0.2) if threshold else None)
            .add_scheduler_args_if_applicable(pipeline.scheduler, eta=ddim_eta))

        def make_image(x_T):

@@ -47,7 +47,6 @@ class Txt2Img2Img(Generator):
            conditioning_data=conditioning_data,
            noise=x_T,
            callback=step_callback,
-           # TODO: threshold = threshold,
        )

        # Get our initial generation width and height directly from the latent output so
@@ -33,7 +33,7 @@ Globals.models_file = 'models.yaml'
Globals.models_dir = 'models'
Globals.config_dir = 'configs'
Globals.autoscan_dir = 'weights'
-Globals.converted_ckpts_dir = 'converted-ckpts'
+Globals.converted_ckpts_dir = 'converted_ckpts'

# Try loading patchmatch
Globals.try_patchmatch = True

@@ -66,6 +66,9 @@ def global_models_dir()->Path:
def global_autoscan_dir()->Path:
    return Path(Globals.root, Globals.autoscan_dir)

+def global_converted_ckpts_dir()->Path:
+    return Path(global_models_dir(), Globals.converted_ckpts_dir)
+
def global_set_root(root_dir:Union[str,Path]):
    Globals.root = root_dir
@@ -25,8 +25,6 @@ import torch
import transformers
from diffusers import AutoencoderKL
from diffusers import logging as dlogging
-from diffusers.utils.logging import (get_verbosity, set_verbosity,
-                                     set_verbosity_error)
from huggingface_hub import scan_cache_dir
from omegaconf import OmegaConf
from omegaconf.dictconfig import DictConfig
@@ -49,9 +47,10 @@ class ModelManager(object):
    def __init__(
        self,
        config: OmegaConf,
-       device_type: str = "cpu",
+       device_type: str | torch.device = "cpu",
        precision: str = "float16",
        max_loaded_models=DEFAULT_MAX_MODELS,
+       sequential_offload = False
    ):
        """
        Initialize with the path to the models.yaml config file,
@@ -69,6 +68,7 @@ class ModelManager(object):
        self.models = {}
        self.stack = [] # this is an LRU FIFO
        self.current_model = None
+       self.sequential_offload = sequential_offload

    def valid_model(self, model_name: str) -> bool:
        """
@@ -529,7 +529,10 @@ class ModelManager(object):
        dlogging.set_verbosity(verbosity)
        assert pipeline is not None, OSError(f'"{name_or_path}" could not be loaded')

-       pipeline.to(self.device)
+       if self.sequential_offload:
+           pipeline.enable_offload_submodels(self.device)
+       else:
+           pipeline.to(self.device)

        model_hash = self._diffuser_sha256(name_or_path)
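A hypothetical construction showing how the new sequential_offload flag reaches the branch above; the config path and argument values are illustrative only:

    from omegaconf import OmegaConf

    manager = ModelManager(OmegaConf.load("configs/models.yaml"),  # illustrative path
                           device_type="cuda",
                           precision="float16",
                           sequential_offload=True)  # diffusers pipelines get enable_offload_submodels()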
@@ -761,7 +764,7 @@ class ModelManager(object):
            return

        model_name = model_name or diffusers_path.name
-       model_description = model_description or "Optimized version of {model_name}"
+       model_description = model_description or f"Optimized version of {model_name}"
        print(f">> Optimizing {model_name} (30-60s)")
        try:
            # By passing the specified VAE to the conversion function, the autoencoder
@@ -801,15 +804,17 @@ class ModelManager(object):
        models_folder_safetensors = Path(search_folder).glob("**/*.safetensors")

        ckpt_files = [x for x in models_folder_ckpt if x.is_file()]
-       safetensor_files = [x for x in models_folder_safetensors if x.is_file]
+       safetensor_files = [x for x in models_folder_safetensors if x.is_file()]

        files = ckpt_files + safetensor_files

        found_models = []
        for file in files:
-           found_models.append(
-               {"name": file.stem, "location": str(file.resolve()).replace("\\", "/")}
-           )
+           location = str(file.resolve()).replace("\\", "/")
+           if 'model.safetensors' not in location and 'diffusion_pytorch_model.safetensors' not in location:
+               found_models.append(
+                   {"name": file.stem, "location": location}
+               )

        return search_folder, found_models
@@ -994,12 +999,12 @@ class ModelManager(object):
        if self.device == "cpu":
            return model

-       # diffusers really really doesn't like us moving a float16 model onto CPU
-       verbosity = get_verbosity()
-       set_verbosity_error()
+       if isinstance(model, StableDiffusionGeneratorPipeline):
+           model.offload_all()
+           return model
+
        model.cond_stage_model.device = "cpu"
        model.to("cpu")
-       set_verbosity(verbosity)

        for submodel in ("first_stage_model", "cond_stage_model", "model"):
            try:
@@ -1012,6 +1017,10 @@ class ModelManager(object):
        if self.device == "cpu":
            return model

+       if isinstance(model, StableDiffusionGeneratorPipeline):
+           model.ready()
+           return model
+
        model.to(self.device)
        model.cond_stage_model.device = self.device
@@ -1162,7 +1171,7 @@ class ModelManager(object):
            strategy.execute()

    @staticmethod
-   def _abs_path(path: Union(str, Path)) -> Path:
+   def _abs_path(path: str | Path) -> Path:
        if path is None or Path(path).is_absolute():
            return path
        return Path(Globals.root, path).resolve()
ldm/invoke/offloading.py (new file, 247 lines)

@@ -0,0 +1,247 @@
from __future__ import annotations

import warnings
import weakref
from abc import ABCMeta, abstractmethod
from collections.abc import MutableMapping
from typing import Callable

import torch
from accelerate.utils import send_to_device
from torch.utils.hooks import RemovableHandle

OFFLOAD_DEVICE = torch.device("cpu")


class _NoModel:
    """Symbol that indicates no model is loaded.

    (We can't weakref.ref(None), so this was my best idea at the time to come up with something
    type-checkable.)
    """

    def __bool__(self):
        return False

    def to(self, device: torch.device):
        pass

    def __repr__(self):
        return "<NO MODEL>"


NO_MODEL = _NoModel()
class ModelGroup(metaclass=ABCMeta):
    """
    A group of models.

    The use case I had in mind when writing this is the sub-models used by a DiffusionPipeline,
    e.g. its text encoder, U-net, VAE, etc.

    Those models are :py:class:`diffusers.ModelMixin`, but "model" is interchangeable with
    :py:class:`torch.nn.Module` here.
    """

    def __init__(self, execution_device: torch.device):
        self.execution_device = execution_device

    @abstractmethod
    def install(self, *models: torch.nn.Module):
        """Add models to this group."""
        pass

    @abstractmethod
    def uninstall(self, models: torch.nn.Module):
        """Remove models from this group."""
        pass

    @abstractmethod
    def uninstall_all(self):
        """Remove all models from this group."""

    @abstractmethod
    def load(self, model: torch.nn.Module):
        """Load this model to the execution device."""
        pass

    @abstractmethod
    def offload_current(self):
        """Offload the current model(s) from the execution device."""
        pass

    @abstractmethod
    def ready(self):
        """Ready this group for use."""
        pass

    @abstractmethod
    def set_device(self, device: torch.device):
        """Change which device models from this group will execute on."""
        pass

    @abstractmethod
    def device_for(self, model) -> torch.device:
        """Get the device the given model will execute on.

        The model should already be a member of this group.
        """
        pass

    @abstractmethod
    def __contains__(self, model):
        """Check if the model is a member of this group."""
        pass

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} object at {id(self):x}: " \
               f"device={self.execution_device} >"
class LazilyLoadedModelGroup(ModelGroup):
    """
    Only one model from this group is loaded on the GPU at a time.

    Running the forward method of a model will displace the previously-loaded model,
    offloading it to CPU.

    If you call other methods on the model, e.g. ``model.encode(x)`` instead of ``model(x)``,
    you will need to explicitly load it with :py:method:`.load(model)`.

    This implementation relies on pytorch forward-pre-hooks, and it will copy forward arguments
    to the appropriate execution device, as long as they are positional arguments and not keyword
    arguments. (I didn't make the rules; that's the way the pytorch 1.13 API works for hooks.)
    """

    _hooks: MutableMapping[torch.nn.Module, RemovableHandle]
    _current_model_ref: Callable[[], torch.nn.Module | _NoModel]

    def __init__(self, execution_device: torch.device):
        super().__init__(execution_device)
        self._hooks = weakref.WeakKeyDictionary()
        self._current_model_ref = weakref.ref(NO_MODEL)

    def install(self, *models: torch.nn.Module):
        for model in models:
            self._hooks[model] = model.register_forward_pre_hook(self._pre_hook)

    def uninstall(self, *models: torch.nn.Module):
        for model in models:
            hook = self._hooks.pop(model)
            hook.remove()
            if self.is_current_model(model):
                # no longer hooked by this object, so don't claim to manage it
                self.clear_current_model()

    def uninstall_all(self):
        self.uninstall(*self._hooks.keys())

    def _pre_hook(self, module: torch.nn.Module, forward_input):
        self.load(module)
        if len(forward_input) == 0:
            warnings.warn(f"Hook for {module.__class__.__name__} got no input. "
                          f"Inputs must be positional, not keywords.", stacklevel=3)
        return send_to_device(forward_input, self.execution_device)

    def load(self, module):
        if not self.is_current_model(module):
            self.offload_current()
            self._load(module)

    def offload_current(self):
        module = self._current_model_ref()
        if module is not NO_MODEL:
            module.to(device=OFFLOAD_DEVICE)
        self.clear_current_model()

    def _load(self, module: torch.nn.Module) -> torch.nn.Module:
        assert self.is_empty(), f"A model is already loaded: {self._current_model_ref()}"
        module = module.to(self.execution_device)
        self.set_current_model(module)
        return module

    def is_current_model(self, model: torch.nn.Module) -> bool:
        """Is the given model the one currently loaded on the execution device?"""
        return self._current_model_ref() is model

    def is_empty(self):
        """Are none of this group's models loaded on the execution device?"""
        return self._current_model_ref() is NO_MODEL

    def set_current_model(self, value):
        self._current_model_ref = weakref.ref(value)

    def clear_current_model(self):
        self._current_model_ref = weakref.ref(NO_MODEL)

    def set_device(self, device: torch.device):
        if device == self.execution_device:
            return
        self.execution_device = device
        current = self._current_model_ref()
        if current is not NO_MODEL:
            current.to(device)

    def device_for(self, model):
        if model not in self:
            raise KeyError(f"This does not manage this model {type(model).__name__}", model)
        return self.execution_device  # this implementation only dispatches to one device

    def ready(self):
        pass  # always ready to load on-demand

    def __contains__(self, model):
        return model in self._hooks

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} object at {id(self):x}: " \
               f"current_model={type(self._current_model_ref()).__name__} >"
class FullyLoadedModelGroup(ModelGroup):
    """
    A group of models without any implicit loading or unloading.

    :py:meth:`.ready` loads _all_ the models to the execution device at once.
    """

    _models: weakref.WeakSet

    def __init__(self, execution_device: torch.device):
        super().__init__(execution_device)
        self._models = weakref.WeakSet()

    def install(self, *models: torch.nn.Module):
        for model in models:
            self._models.add(model)
            model.to(device=self.execution_device)

    def uninstall(self, *models: torch.nn.Module):
        for model in models:
            self._models.remove(model)

    def uninstall_all(self):
        self.uninstall(*self._models)

    def load(self, model):
        model.to(device=self.execution_device)

    def offload_current(self):
        for model in self._models:
            model.to(device=OFFLOAD_DEVICE)

    def ready(self):
        for model in self._models:
            self.load(model)

    def set_device(self, device: torch.device):
        self.execution_device = device
        for model in self._models:
            if model.device != OFFLOAD_DEVICE:
                model.to(device=device)

    def device_for(self, model):
        if model not in self:
            raise KeyError(f"This does not manage this model {type(model).__name__}", model)
        return self.execution_device  # this implementation only dispatches to one device

    def __contains__(self, model):
        return model in self._models
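A small usage sketch of the two group classes above, not part of the commit. With the lazy group, calling a module's forward pulls it onto the execution device and pushes the previously active one back to CPU; the Linear layers here are stand-ins for the pipeline's real submodels:

    import torch

    unet_stub = torch.nn.Linear(4, 4)   # stand-ins for the pipeline's submodels
    vae_stub = torch.nn.Linear(4, 4)

    group = LazilyLoadedModelGroup(torch.device("cpu"))  # use "cuda" on a GPU machine
    group.install(unet_stub, vae_stub)

    unet_stub(torch.zeros(1, 4))       # forward-pre-hook loads unet_stub, offloads anything else
    group.load(vae_stub)               # explicit load for non-forward calls such as vae.encode(...)
    print(group.device_for(vae_stub))  # -> the group's execution device

    group.uninstall_all()
    everything = FullyLoadedModelGroup(torch.device("cpu"))
    everything.install(unet_stub, vae_stub)  # all members moved to the execution device immediately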