new OffloadingDevice loads one model at a time, on demand (#2596)

* new OffloadingDevice loads one model at a time, on demand

* fixup! new OffloadingDevice loads one model at a time, on demand

* fix(prompt_to_embeddings): call the text encoder directly instead of its forward method,

allowing any associated hooks to run with it (see the short illustration below the commit metadata).

* more attempts to get things on the right device from the offloader

* more attempts to get things on the right device from the offloader

* make offloading methods an explicit part of the pipeline interface

* inlining some calls where device is only used once

* ensure model group is ready after pipeline.to is called

* fixup! Strategize slicing based on free [V]RAM (#2572)

* doc(offloading): docstrings for offloading.ModelGroup

* doc(offloading): docstrings for offloading-related pipeline methods

* refactor(offloading): s/SimpleModelGroup/FullyLoadedModelGroup

* refactor(offloading): s/HotSeatModelGroup/LazilyLoadedModelGroup

to frame it in the same terms as "FullyLoadedModelGroup"

---------

Co-authored-by: Damian Stewart <null@damianstewart.com>
Author: Kevin Turner
Date:   2023-02-16 15:48:27 -08:00 (committed by GitHub)
parent 2468ba7445
commit 8a0d45ac5a
5 changed files with 371 additions and 47 deletions
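
A quick illustration of the hook behaviour behind the prompt_to_embeddings fix: PyTorch only runs registered forward hooks (the mechanism device offloaders use to move weights into place) when a module is invoked through its __call__, not when forward is called directly, which is why the commit calls the text encoder itself rather than its forward method. The encoder stand-in and hook below are invented for this sketch, not code from the repository.

import torch
from torch import nn

# stand-in for the text encoder; any nn.Module behaves the same way
encoder = nn.Linear(4, 4)

def offload_style_hook(module, args):
    # a real offloading hook would move the module's weights onto the GPU here
    print("pre-forward hook ran")

encoder.register_forward_pre_hook(offload_style_hook)

x = torch.randn(1, 4)
encoder(x)          # dispatches through __call__, so the hook fires
encoder.forward(x)  # bypasses the hook machinery entirely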

@@ -25,8 +25,6 @@ import torch
 import transformers
 from diffusers import AutoencoderKL
 from diffusers import logging as dlogging
-from diffusers.utils.logging import (get_verbosity, set_verbosity,
-                                     set_verbosity_error)
 from huggingface_hub import scan_cache_dir
 from omegaconf import OmegaConf
 from omegaconf.dictconfig import DictConfig
@@ -49,9 +47,10 @@ class ModelManager(object):
     def __init__(
         self,
         config: OmegaConf,
-        device_type: str = "cpu",
+        device_type: str | torch.device = "cpu",
         precision: str = "float16",
         max_loaded_models=DEFAULT_MAX_MODELS,
+        sequential_offload = False
     ):
         """
         Initialize with the path to the models.yaml config file,
@@ -69,6 +68,7 @@ class ModelManager(object):
         self.models = {}
         self.stack = []  # this is an LRU FIFO
         self.current_model = None
+        self.sequential_offload = sequential_offload
 
     def valid_model(self, model_name: str) -> bool:
         """
@@ -529,7 +529,10 @@ class ModelManager(object):
         dlogging.set_verbosity(verbosity)
 
         assert pipeline is not None, OSError(f'"{name_or_path}" could not be loaded')
-        pipeline.to(self.device)
+        if self.sequential_offload:
+            pipeline.enable_offload_submodels(self.device)
+        else:
+            pipeline.to(self.device)
 
         model_hash = self._diffuser_sha256(name_or_path)
@@ -748,7 +751,6 @@ class ModelManager(object):
         into models.yaml.
         """
         new_config = None
 
-        import transformers
 
         from ldm.invoke.ckpt_to_diffuser import convert_ckpt_to_diffuser
@@ -995,12 +997,12 @@ class ModelManager(object):
         if self.device == "cpu":
             return model
 
-        # diffusers really really doesn't like us moving a float16 model onto CPU
-        verbosity = get_verbosity()
-        set_verbosity_error()
+        if isinstance(model, StableDiffusionGeneratorPipeline):
+            model.offload_all()
+            return model
+
         model.cond_stage_model.device = "cpu"
         model.to("cpu")
-        set_verbosity(verbosity)
 
         for submodel in ("first_stage_model", "cond_stage_model", "model"):
             try:
@@ -1013,6 +1015,10 @@ class ModelManager(object):
         if self.device == "cpu":
             return model
 
+        if isinstance(model, StableDiffusionGeneratorPipeline):
+            model.ready()
+            return model
+
         model.to(self.device)
         model.cond_stage_model.device = self.device
@@ -1163,7 +1169,7 @@ class ModelManager(object):
             strategy.execute()
 
     @staticmethod
-    def _abs_path(path: Union(str, Path)) -> Path:
+    def _abs_path(path: str | Path) -> Path:
         if path is None or Path(path).is_absolute():
             return path
         return Path(Globals.root, path).resolve()
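
For orientation, a rough sketch of the model-group idea named in the commit message: a "fully loaded" group keeps every installed model on the execution device, while a "lazily loaded" group parks models on the CPU and uses forward pre-hooks to pull one onto the device only when it is about to run. The class names come from the commit message above; the method names (install, ready, offload_current) and all bodies are assumptions made for this sketch, not the shipped offloading module.

import torch
from torch import nn

class FullyLoadedModelGroup:
    """Sketch: every installed model lives on the execution device."""

    def __init__(self, execution_device: torch.device):
        self.execution_device = execution_device
        self._models: set[nn.Module] = set()

    def install(self, *models: nn.Module):
        for model in models:
            self._models.add(model)
            model.to(self.execution_device)

    def ready(self):
        # make sure everything is back on the execution device
        for model in self._models:
            model.to(self.execution_device)

    def offload_current(self):
        for model in self._models:
            model.to("cpu")

class LazilyLoadedModelGroup(FullyLoadedModelGroup):
    """Sketch: load one model at a time, on demand, via forward pre-hooks."""

    def install(self, *models: nn.Module):
        for model in models:
            self._models.add(model)
            model.to("cpu")
            model.register_forward_pre_hook(self._load_on_demand)

    def _load_on_demand(self, module: nn.Module, args):
        # evict whichever model is resident, then bring the requested one up
        for other in self._models:
            if other is not module:
                other.to("cpu")
        module.to(self.execution_device)

Read against the diff above, the sequential_offload branch in the model manager chooses between these two behaviours: pipeline.enable_offload_submodels(self.device) opts the pipeline's submodels into on-demand loading, while the pre-existing pipeline.to(self.device) keeps everything resident; offload_all() and ready() appear to move the whole group off the device and back onto it, respectively.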