From 667171ed904d74bd28d18ba904ab430e70e5c5fc Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Sun, 7 May 2023 18:07:28 -0400 Subject: [PATCH] cap model cache size using bytes, not # models --- .../app/services/model_manager_initializer.py | 2 +- invokeai/backend/args.py | 10 +- .../backend/model_management/model_cache.py | 119 +++++++++++++----- .../backend/model_management/model_manager.py | 6 +- 4 files changed, 99 insertions(+), 38 deletions(-) diff --git a/invokeai/app/services/model_manager_initializer.py b/invokeai/app/services/model_manager_initializer.py index c7924e797d..cae650fcfa 100644 --- a/invokeai/app/services/model_manager_initializer.py +++ b/invokeai/app/services/model_manager_initializer.py @@ -59,7 +59,7 @@ def get_model_manager(config: Args, logger: types.ModuleType) -> ModelManager: config.conf, precision=dtype, device_type=device, - max_loaded_models=config.max_loaded_models, + max_cache_size=config.max_cache_size, # temporarily disabled until model manager stabilizes # embedding_path = Path(embedding_path), logger = logger, diff --git a/invokeai/backend/args.py b/invokeai/backend/args.py index eb8b396ee0..2cd44b8f7b 100644 --- a/invokeai/backend/args.py +++ b/invokeai/backend/args.py @@ -502,11 +502,11 @@ class Args(object): help="Deprecated way to set --precision=float32", ) model_group.add_argument( - "--max_loaded_models", - dest="max_loaded_models", - type=int, - default=2, - help="Maximum number of models to keep in memory for fast switching, including the one in GPU", + "--max_cache_size", + dest="max_cache_size", + type=float, + default=6.0, + help="Maximum size of the model RAM cache (in GB). 6 GB is sufficient to keep 2-3 diffusers models in RAM simultaneously.", ) model_group.add_argument( "--free_gpu_mem", diff --git a/invokeai/backend/model_management/model_cache.py b/invokeai/backend/model_management/model_cache.py index c316c4292f..97c21159b4 100644 --- a/invokeai/backend/model_management/model_cache.py +++ b/invokeai/backend/model_management/model_cache.py @@ -19,11 +19,13 @@ context. Use like this: import contextlib import gc import hashlib +import logging import warnings from collections import Counter from enum import Enum from pathlib import Path -from typing import Sequence, Union, Tuple, types +from psutil import Process +from typing import Dict, Sequence, Union, Tuple, types import torch import safetensors.torch @@ -41,7 +43,12 @@ import invokeai.backend.util.logging as logger from ..globals import global_cache_dir from ..stable_diffusion import StableDiffusionGeneratorPipeline -MAX_MODELS = 4 +# Maximum size of the cache, in gigs +# Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously +DEFAULT_MAX_CACHE_SIZE = 6.0 + +# actual size of a gig +GIG = 1073741824 # This is the mapping from the stable diffusion submodel dict key to the class class SDModelType(Enum): @@ -65,6 +72,24 @@ class ModelStatus(Enum): in_ram='cached' in_vram='in gpu' active='locked in gpu' + +# This is used to guesstimate the size of a model before we load it. +# After loading, we will know it exactly. 
+# Sizes are in Gigs, estimated for float16; double for float32 +SIZE_GUESSTIMATE = { + SDModelType.diffusion_pipeline: 2.5, + SDModelType.diffusers: 2.5, + SDModelType.vae: 0.35, + SDModelType.text_encoder: 0.5, + SDModelType.tokenizer: 0.0001, + SDModelType.unet: 3.4, + SDModelType.scheduler: 0.0001, + SDModelType.safety_checker: 1.2, + SDModelType.feature_extractor: 0.0001, + SDModelType.lora: 0.1, + SDModelType.textual_inversion: 0.0001, + SDModelType.ckpt: 4.2, +} # The list of model classes we know how to fetch, for typechecking ModelClass = Union[tuple([x.value for x in SDModelType])] @@ -90,7 +115,7 @@ class ModelLocker(object): class ModelCache(object): def __init__( self, - max_models: int=MAX_MODELS, + max_cache_size: float=DEFAULT_MAX_CACHE_SIZE, execution_device: torch.device=torch.device('cuda'), storage_device: torch.device=torch.device('cpu'), precision: torch.dtype=torch.float16, @@ -113,13 +138,15 @@ class ModelCache(object): self.lazy_offloading = lazy_offloading self.sequential_offload: bool=sequential_offload self.precision: torch.dtype=precision - self.max_models: int=max_models + self.current_cache_size: int=0 + self.max_cache_size: int=max_cache_size self.execution_device: torch.device=execution_device self.storage_device: torch.device=storage_device self.sha_chunksize=sha_chunksize self.logger = logger self.loaded_models: set = set() # set of model keys loaded in GPU self.locked_models: Counter = Counter() # set of model keys locked in GPU + self.model_sizes: Dict[str,int] = dict() def get_model( self, @@ -172,21 +199,33 @@ class ModelCache(object): model_type.value, revision, subfolder - ) + ) + if key in self.models: # cached - move to bottom of stack with contextlib.suppress(ValueError): self.stack.remove(key) self.stack.append(key) model = self.models[key] + else: # not cached -load - self._make_cache_room() - model = self._load_model_from_storage( - repo_id_or_path=repo_id_or_path, - model_class=model_type.value, - subfolder=subfolder, - revision=revision, - legacy_info=legacy_info, - ) + self.logger.info(f'Loading model {repo_id_or_path}, type {model_type}') + + # this will remove older cached models until + # there is sufficient room to load the requested model + self._make_cache_room(key, model_type) + + with MemoryUsage() as usage: + model = self._load_model_from_storage( + repo_id_or_path=repo_id_or_path, + model_class=model_type.value, + subfolder=subfolder, + revision=revision, + legacy_info=legacy_info, + ) + logger.debug(f'Actual memory used to load model: {(usage.mem_used/GIG):.2f} GB') + self.model_sizes[key] = usage.mem_used + self.current_cache_size += usage.mem_used + if model_type==SDModelType.diffusion_pipeline and attach_model_part[0]: self.attach_part(model,*attach_model_part) self.stack.append(key) # add to LRU cache @@ -200,11 +239,11 @@ class ModelCache(object): def uncache_model(self, key: str): '''Remove corresponding model from the cache''' if key is not None and key in self.models: - with contextlib.suppress(ValueError): + with contextlib.suppress(ValueError), contextlib.suppress(KeyError): del self.models[key] del self.locked_models[key] - self.stack.remove(key) self.loaded_models.remove(key) + self.stack.remove(key) class ModelLocker(object): def __init__(self, cache, key, model, gpu_load): @@ -304,9 +343,9 @@ class ModelCache(object): else: return self._hf_commit_hash(repo_id_or_path,revision) - def cache_size(self)->int: - "Return the current number of models cached." 
- return len(self.models) + def cache_size(self)->float: + "Return the current size of the cache, in GB" + return self.current_cache_size / GIG @classmethod def is_legacy_ckpt(cls, repo_id_or_path: Union[str,Path])->bool: @@ -342,18 +381,29 @@ class ModelCache(object): return self.execution_device.type == 'cuda' def _print_cuda_stats(self): - vram = "%4.2fG" % (torch.cuda.memory_allocated() / 1e9) + vram = "%4.2fG" % (torch.cuda.memory_allocated() / GIG) + ram = "%4.2fG" % (self.current_cache_size / GIG) loaded_models = len(self.loaded_models) locked_models = len([x for x in self.locked_models if self.locked_models[x]>0]) - logger.debug(f"Current VRAM usage: {vram}; locked_models/loaded_models = {locked_models}/{loaded_models}") + logger.debug(f"Current VRAM/RAM usage: {vram}/{ram}; locked_models/loaded_models = {locked_models}/{loaded_models}") - def _make_cache_room(self): - models_in_ram = len(self.models) - while models_in_ram >= self.max_models: + def _make_cache_room(self, key, model_type): + # calculate how much memory this model will require + multiplier = 2 if self.precision==torch.float32 else 1 + bytes_needed = int(self.model_sizes.get(key,0) or SIZE_GUESSTIMATE[model_type]*GIG*multiplier) + maximum_size = self.max_cache_size * GIG # stored in GB, convert to bytes + current_size = self.current_cache_size + + adjective = 'guesstimated' if key not in self.model_sizes else 'known from previous load' + logger.debug(f'{(bytes_needed/GIG):.2f} GB needed to load this model ({adjective})') + while current_size+bytes_needed > maximum_size: if least_recently_used_key := self.stack.pop(0): - logger.debug(f'Maximum cache size reached: cache_size={models_in_ram}; unloading model {least_recently_used_key}') - del self.models[least_recently_used_key] - models_in_ram = len(self.models) + model_size = self.model_sizes.get(least_recently_used_key,0) + logger.debug(f'Max cache size exceeded: cache_size={(current_size/GIG):.2f} GB, need an additional {(bytes_needed/GIG):.2f} GB') + logger.debug(f'Unloading model {least_recently_used_key} to free {(model_size/GIG):.2f} GB') + self.uncache_model(least_recently_used_key) + current_size -= model_size + self.current_cache_size = current_size gc.collect() def _offload_unlocked_models(self): @@ -393,8 +443,8 @@ class ModelCache(object): revision, model_class, ) - if self.sequential_offload and isinstance(model,StableDiffusionGeneratorPipeline): - model.enable_offload_submodels(self.execution_device) + if self.sequential_offload and isinstance(model,StableDiffusionGeneratorPipeline): + model.enable_offload_submodels(self.execution_device) return model def _load_diffusers_from_storage( @@ -411,7 +461,6 @@ class ModelCache(object): :param revision: model revision :param model_class: class of model to return, defaults to StableDiffusionGeneratorPIpeline ''' - self.logger.info(f'Loading model {repo_id_or_path}') revisions = [revision] if revision \ else ['fp16','main'] if self.precision==torch.float16 \ else ['main'] @@ -529,3 +578,15 @@ class SilenceWarnings(object): transformers_logging.set_verbosity(self.transformers_verbosity) diffusers_logging.set_verbosity(self.diffusers_verbosity) warnings.simplefilter('default') + +class MemoryUsage(object): + def __init__(self): + self.vms = None + self.mem_used = 0 + + def __enter__(self): + self.vms = Process().memory_info().vms + return self + + def __exit__(self, *args): + self.mem_used = Process().memory_info().vms - self.vms diff --git a/invokeai/backend/model_management/model_manager.py 
b/invokeai/backend/model_management/model_manager.py index 7c3c32c004..368d581cd8 100644 --- a/invokeai/backend/model_management/model_manager.py +++ b/invokeai/backend/model_management/model_manager.py @@ -141,7 +141,7 @@ class SDLegacyType(Enum): V2_v = auto() UNKNOWN = auto() -DEFAULT_MAX_MODELS = 2 +MAX_CACHE_SIZE = 6.0 # GB class ModelManager(object): """ @@ -155,7 +155,7 @@ class ModelManager(object): config_path: Path, device_type: torch.device = CUDA_DEVICE, precision: torch.dtype = torch.float16, - max_loaded_models=DEFAULT_MAX_MODELS, + max_cache_size=MAX_CACHE_SIZE, sequential_offload=False, logger: types.ModuleType = logger, ): @@ -168,7 +168,7 @@ class ModelManager(object): self.config_path = config_path self.config = OmegaConf.load(self.config_path) self.cache = ModelCache( - max_models=max_loaded_models, + max_cache_size=max_cache_size, execution_device = device_type, precision = precision, sequential_offload = sequential_offload,
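
The core change in model_cache.py is that eviction now runs on a byte budget rather than a model count: before a load, _make_cache_room() pops keys off the front of the LRU stack and uncaches them until the cache's running byte total plus the incoming model's size fits under max_cache_size. A minimal standalone sketch of that accounting follows; the ByteBudgetLRU class and its method names are hypothetical illustrations, not code from the patch.

from collections import OrderedDict

GIG = 2 ** 30  # bytes per GB, same value as the patch's GIG constant


class ByteBudgetLRU:
    """Toy cache that evicts on total bytes rather than entry count."""

    def __init__(self, max_gb: float = 6.0):
        self.max_bytes = int(max_gb * GIG)
        self.entries: "OrderedDict[str, int]" = OrderedDict()  # key -> size in bytes, oldest first
        self.total = 0

    def make_room(self, bytes_needed: int) -> None:
        # Drop least-recently-used entries until the incoming model fits the budget.
        while self.total + bytes_needed > self.max_bytes and self.entries:
            _, size = self.entries.popitem(last=False)
            self.total -= size

    def add(self, key: str, size: int) -> None:
        self.make_room(size)
        self.entries[key] = size
        self.total += size

    def touch(self, key: str) -> None:
        # A cache hit moves the key to the most-recently-used end,
        # as get_model() does with its stack.
        self.entries.move_to_end(key)


cache = ByteBudgetLRU(max_gb=6.0)
cache.add("sd-1.5", int(2.5 * GIG))
cache.add("vae", int(0.35 * GIG))
cache.add("sd-2.1", int(4.2 * GIG))   # evicts "sd-1.5" so the total stays under 6 GB
print(list(cache.entries), f"{cache.total / GIG:.2f} GB")   # ['vae', 'sd-2.1'] 4.55 GB

The budget itself comes from the renamed option: --max_cache_size takes a float number of GB (default 6.0) and replaces --max_loaded_models, and ModelManager now passes it through to ModelCache as max_cache_size instead of max_models.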
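
A model that has never been loaded has no measured footprint, so _make_cache_room() falls back to the SIZE_GUESSTIMATE table (per-type sizes in GB at float16, doubled for float32); once a model has been loaded once, the size recorded in self.model_sizes wins and no multiplier is applied. The sketch below mirrors that fallback using only an excerpt of the table, a hypothetical bytes_needed() helper, and the torch dtype check reduced to a boolean for brevity.

GIG = 2 ** 30  # same value as the patch's GIG constant

# Excerpt of the patch's SIZE_GUESSTIMATE table (sizes in GB at float16).
SIZE_GUESSTIMATE_GB = {"diffusion_pipeline": 2.5, "vae": 0.35, "unet": 3.4}


def bytes_needed(key: str, model_type: str, known_sizes: dict, fp32: bool) -> int:
    # Prefer the size measured on a previous load; otherwise use the per-type
    # guesstimate, doubled when running at float32.
    multiplier = 2 if fp32 else 1
    return int(known_sizes.get(key, 0) or SIZE_GUESSTIMATE_GB[model_type] * GIG * multiplier)


# Never loaded before: an fp32 pipeline is budgeted at 2 * 2.5 = 5 GB.
print(bytes_needed("stable-diffusion-v1-5", "diffusion_pipeline", {}, fp32=True) / GIG)   # 5.0
# Previously measured: the recorded size wins, with no multiplier.
sizes = {"stable-diffusion-v1-5": int(2.1 * GIG)}
print(bytes_needed("stable-diffusion-v1-5", "diffusion_pipeline", sizes, fp32=True) / GIG)  # ~2.1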
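
Actual sizes are measured by the new MemoryUsage context manager, which records how much the process's virtual memory (psutil VMS) grows while the model loads; get_model() stores the result in self.model_sizes and adds it to current_cache_size. The snippet below exercises the same pattern on its own: the class mirrors the one added to model_cache.py, and the bytearray allocation is only a stand-in for a model load.

from psutil import Process

GIG = 2 ** 30


class MemoryUsage:
    """Record how much the process's virtual memory grows inside a `with` block
    (mirrors the class added to model_cache.py by this patch)."""

    def __init__(self):
        self.vms = None
        self.mem_used = 0

    def __enter__(self):
        self.vms = Process().memory_info().vms
        return self

    def __exit__(self, *args):
        self.mem_used = Process().memory_info().vms - self.vms


# Stand-in for a model load: allocate ~0.5 GB and see it reflected in the measurement.
with MemoryUsage() as usage:
    blob = bytearray(512 * 1024 * 1024)
print(f"load consumed {usage.mem_used / GIG:.2f} GB")   # roughly 0.50 on most platforms

VMS counts mapped address space rather than resident pages, so the figure is approximate, but it only needs to be consistent enough to charge against the cache budget.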