new OffloadingDevice loads one model at a time, on demand (#2596)

* new OffloadingDevice loads one model at a time, on demand

* fixup! new OffloadingDevice loads one model at a time, on demand

* fix(prompt_to_embeddings): call the text encoder directly instead of its forward method,

allowing any associated hooks to run with it (see the short illustration below the commit metadata).

* more attempts to get things on the right device from the offloader

* more attempts to get things on the right device from the offloader

* make offloading methods an explicit part of the pipeline interface

* inlining some calls where device is only used once

* ensure model group is ready after pipeline.to is called

* fixup! Strategize slicing based on free [V]RAM (#2572)

* doc(offloading): docstrings for offloading.ModelGroup

* doc(offloading): docstrings for offloading-related pipeline methods

* refactor(offloading): s/SimpleModelGroup/FullyLoadedModelGroup

* refactor(offloading): s/HotSeatModelGroup/LazilyLoadedModelGroup

to frame it in the same terms as "FullyLoadedModelGroup"

---------

Co-authored-by: Damian Stewart <null@damianstewart.com>
Author: Kevin Turner
Date:   2023-02-16 15:48:27 -08:00 (committed by GitHub)
parent 2468ba7445
commit 8a0d45ac5a
5 changed files with 371 additions and 47 deletions
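
A quick illustration of the hook behaviour behind the prompt_to_embeddings fix: PyTorch only runs registered forward hooks (the mechanism device offloaders use to move weights into place) when a module is invoked through its __call__, not when forward is called directly, which is why the commit calls the text encoder itself rather than its forward method. The encoder stand-in and hook below are invented for this sketch, not code from the repository.

import torch
from torch import nn

# stand-in for the text encoder; any nn.Module behaves the same way
encoder = nn.Linear(4, 4)

def offload_style_hook(module, args):
    # a real offloading hook would move the module's weights onto the GPU here
    print("pre-forward hook ran")

encoder.register_forward_pre_hook(offload_style_hook)

x = torch.randn(1, 4)
encoder(x)          # dispatches through __call__, so the hook fires
encoder.forward(x)  # bypasses the hook machinery entirely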

@@ -25,8 +25,6 @@ import torch
 import transformers
 from diffusers import AutoencoderKL
 from diffusers import logging as dlogging
-from diffusers.utils.logging import (get_verbosity, set_verbosity,
-                                     set_verbosity_error)
 from huggingface_hub import scan_cache_dir
 from omegaconf import OmegaConf
 from omegaconf.dictconfig import DictConfig
@@ -49,9 +47,10 @@ class ModelManager(object):
     def __init__(
         self,
         config: OmegaConf,
-        device_type: str = "cpu",
+        device_type: str | torch.device = "cpu",
         precision: str = "float16",
         max_loaded_models=DEFAULT_MAX_MODELS,
+        sequential_offload = False
     ):
         """
         Initialize with the path to the models.yaml config file,
@@ -69,6 +68,7 @@ class ModelManager(object):
         self.models = {}
         self.stack = []  # this is an LRU FIFO
         self.current_model = None
+        self.sequential_offload = sequential_offload
 
     def valid_model(self, model_name: str) -> bool:
         """
@@ -529,7 +529,10 @@ class ModelManager(object):
         dlogging.set_verbosity(verbosity)
 
         assert pipeline is not None, OSError(f'"{name_or_path}" could not be loaded')
-        pipeline.to(self.device)
+        if self.sequential_offload:
+            pipeline.enable_offload_submodels(self.device)
+        else:
+            pipeline.to(self.device)
 
         model_hash = self._diffuser_sha256(name_or_path)
@@ -748,7 +751,6 @@ class ModelManager(object):
         into models.yaml.
         """
         new_config = None
 
-        import transformers
 
         from ldm.invoke.ckpt_to_diffuser import convert_ckpt_to_diffuser
@@ -995,12 +997,12 @@ class ModelManager(object):
         if self.device == "cpu":
             return model
 
-        # diffusers really really doesn't like us moving a float16 model onto CPU
-        verbosity = get_verbosity()
-        set_verbosity_error()
+        if isinstance(model, StableDiffusionGeneratorPipeline):
+            model.offload_all()
+            return model
+
         model.cond_stage_model.device = "cpu"
         model.to("cpu")
-        set_verbosity(verbosity)
 
         for submodel in ("first_stage_model", "cond_stage_model", "model"):
             try:
@@ -1013,6 +1015,10 @@ class ModelManager(object):
         if self.device == "cpu":
             return model
 
+        if isinstance(model, StableDiffusionGeneratorPipeline):
+            model.ready()
+            return model
+
         model.to(self.device)
         model.cond_stage_model.device = self.device
@@ -1163,7 +1169,7 @@ class ModelManager(object):
             strategy.execute()
 
     @staticmethod
-    def _abs_path(path: Union(str, Path)) -> Path:
+    def _abs_path(path: str | Path) -> Path:
         if path is None or Path(path).is_absolute():
             return path
         return Path(Globals.root, path).resolve()
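
For orientation, a rough sketch of the model-group idea named in the commit message: a "fully loaded" group keeps every installed model on the execution device, while a "lazily loaded" group parks models on the CPU and uses forward pre-hooks to pull one onto the device only when it is about to run. The class names come from the commit message above; the method names (install, ready, offload_current) and all bodies are assumptions made for this sketch, not the shipped offloading module.

import torch
from torch import nn

class FullyLoadedModelGroup:
    """Sketch: every installed model lives on the execution device."""

    def __init__(self, execution_device: torch.device):
        self.execution_device = execution_device
        self._models: set[nn.Module] = set()

    def install(self, *models: nn.Module):
        for model in models:
            self._models.add(model)
            model.to(self.execution_device)

    def ready(self):
        # make sure everything is back on the execution device
        for model in self._models:
            model.to(self.execution_device)

    def offload_current(self):
        for model in self._models:
            model.to("cpu")

class LazilyLoadedModelGroup(FullyLoadedModelGroup):
    """Sketch: load one model at a time, on demand, via forward pre-hooks."""

    def install(self, *models: nn.Module):
        for model in models:
            self._models.add(model)
            model.to("cpu")
            model.register_forward_pre_hook(self._load_on_demand)

    def _load_on_demand(self, module: nn.Module, args):
        # evict whichever model is resident, then bring the requested one up
        for other in self._models:
            if other is not module:
                other.to("cpu")
        module.to(self.execution_device)

Read against the diff above, the sequential_offload branch in the model manager chooses between these two behaviours: pipeline.enable_offload_submodels(self.device) opts the pipeline's submodels into on-demand loading, while the pre-existing pipeline.to(self.device) keeps everything resident; offload_all() and ready() appear to move the whole group off the device and back onto it, respectively.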