From d6cb0e54b347a4707ab3239ec9310604a6b933b6 Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Sun, 9 Jul 2023 14:26:30 -0400
Subject: [PATCH 1/6] don't unload models from GPU until the space is needed

---
 .../backend/model_management/model_cache.py   | 23 ++++++++++++++-----
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/invokeai/backend/model_management/model_cache.py b/invokeai/backend/model_management/model_cache.py
index e80e065cff..ad95a02e52 100644
--- a/invokeai/backend/model_management/model_cache.py
+++ b/invokeai/backend/model_management/model_cache.py
@@ -201,14 +201,22 @@ class ModelCache(object):
             self._cache_stack.remove(key)
         self._cache_stack.append(key)

-        return self.ModelLocker(self, key, cache_entry.model, gpu_load)
+        return self.ModelLocker(self, key, cache_entry.model, gpu_load, cache_entry.size)

     class ModelLocker(object):
-        def __init__(self, cache, key, model, gpu_load):
+        def __init__(self, cache, key, model, gpu_load, size_needed):
+            '''
+            :param cache: The model_cache object
+            :param key: The key of the model to lock in GPU
+            :param model: The model to lock
+            :param gpu_load: True if load into gpu
+            :param size_needed: Size of the model to load
+            '''
             self.gpu_load = gpu_load
             self.cache = cache
             self.key = key
             self.model = model
+            self.size_needed = size_needed
             self.cache_entry = self.cache._cached_models[self.key]

         def __enter__(self) -> Any:
@@ -222,7 +230,7 @@ class ModelCache(object):

             try:
                 if self.cache.lazy_offloading:
-                    self.cache._offload_unlocked_models()
+                    self.cache._offload_unlocked_models(self.size_needed)

                 if self.model.device != self.cache.execution_device:
                     self.cache.logger.debug(f'Moving {self.key} into {self.cache.execution_device}')
@@ -337,9 +345,12 @@ class ModelCache(object):

         self.logger.debug(f"After unloading: cached_models={len(self._cached_models)}")

-
-    def _offload_unlocked_models(self):
-        for model_key, cache_entry in self._cached_models.items():
+    def _offload_unlocked_models(self, size_needed: int=0):
+        for model_key, cache_entry in sorted(self._cached_models.items(), key=lambda x:x[1].size):
+            free_mem, used_mem = torch.cuda.mem_get_info()
+            self.logger.debug(f'Require {(size_needed/GIG):.2f}GB VRAM. Have {((free_mem-reserve)/GIG):.2f}GB available.')
+            if free_mem-reserve > size_needed:
+                return
             if not cache_entry.locked and cache_entry.loaded:
                 self.logger.debug(f'Offloading {model_key} from {self.execution_device} into {self.storage_device}')
                 with VRAMUsage() as mem:
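The strategy in this first commit is easier to see outside the diff: models now stay in VRAM until an incoming model actually needs the space, at which point the cache evicts the smallest unlocked entries first and stops as soon as the GPU reports enough free memory. Below is a minimal, self-contained sketch of that loop, not the InvokeAI API itself; the `cached_models` dict and the `entry` attributes (`size`, `locked`, `loaded`, `model`) are hypothetical stand-ins for the real cache records, and the undefined `reserve` visible in the hunk above is cleaned up in the next commit.

```python
import torch

GIG = 2 ** 30  # bytes per GiB, matching the GIG constant in model_cache.py

def offload_until_fits(cached_models: dict, size_needed: int, storage_device: str = "cpu") -> None:
    """Sketch of the patch 1 strategy: evict the smallest unlocked models first,
    stopping as soon as the GPU has room for the model about to be loaded."""
    for key, entry in sorted(cached_models.items(), key=lambda kv: kv[1].size):
        free_mem, _total_mem = torch.cuda.mem_get_info()  # (free, total) in bytes
        if free_mem > size_needed:
            return  # enough room already; leave the rest of the cache on the GPU
        if not entry.locked and entry.loaded:
            entry.model.to(storage_device)  # move weights back to CPU RAM
            entry.loaded = False
```

Sorting by size means the cheapest evictions happen first, so a small incoming model does not push a large, expensive-to-reload checkpoint off the GPU.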
From 8d7dba937d85f38acfc17a6d942a66a6a7c69d52 Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Sun, 9 Jul 2023 14:37:45 -0400
Subject: [PATCH 2/6] fix undefined variable

---
 invokeai/backend/model_management/model_cache.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/invokeai/backend/model_management/model_cache.py b/invokeai/backend/model_management/model_cache.py
index ad95a02e52..ecda15beac 100644
--- a/invokeai/backend/model_management/model_cache.py
+++ b/invokeai/backend/model_management/model_cache.py
@@ -348,8 +348,8 @@ class ModelCache(object):
     def _offload_unlocked_models(self, size_needed: int=0):
         for model_key, cache_entry in sorted(self._cached_models.items(), key=lambda x:x[1].size):
             free_mem, used_mem = torch.cuda.mem_get_info()
-            self.logger.debug(f'Require {(size_needed/GIG):.2f}GB VRAM. Have {((free_mem-reserve)/GIG):.2f}GB available.')
-            if free_mem-reserve > size_needed:
+            self.logger.debug(f'Require {(size_needed/GIG):.2f}GB VRAM. Have {(free_mem/GIG):.2f}GB available.')
+            if free_mem > size_needed:
                 return
             if not cache_entry.locked and cache_entry.loaded:
                 self.logger.debug(f'Offloading {model_key} from {self.execution_device} into {self.storage_device}')
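One detail worth noting while reading this fix: `torch.cuda.mem_get_info()` wraps `cudaMemGetInfo` and returns a `(free, total)` tuple in bytes, so the `used_mem` name in the loop actually receives the device's total memory. A quick check of the values, assuming a CUDA device is available:

```python
import torch

GIG = 2 ** 30
free_bytes, total_bytes = torch.cuda.mem_get_info()  # (free, total) in bytes, not (free, used)
used_bytes = total_bytes - free_bytes                # derive "used" explicitly if needed
print(f"free={free_bytes / GIG:.2f}GB  used={used_bytes / GIG:.2f}GB  total={total_bytes / GIG:.2f}GB")
```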
From 5759a390f937f18f2532970901161bb7b36a32f0 Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Sun, 9 Jul 2023 18:35:04 -0400
Subject: [PATCH 3/6] introduce gpu_mem_reserved configuration parameter

---
 invokeai/app/services/config.py                  |  4 +++-
 invokeai/backend/model_management/model_cache.py | 15 ++++++++++-----
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/invokeai/app/services/config.py b/invokeai/app/services/config.py
index 9c67eeca4c..921f01665c 100644
--- a/invokeai/app/services/config.py
+++ b/invokeai/app/services/config.py
@@ -23,9 +23,10 @@ InvokeAI:
     xformers_enabled: false
     sequential_guidance: false
     precision: float16
-    max_loaded_models: 4
+    max_cache_size: 6
     always_use_cpu: false
     free_gpu_mem: false
+    reserve_gpu_mem: 1
   Features:
     nsfw_checker: true
     restore: true
@@ -365,6 +366,7 @@ setting environment variables INVOKEAI_.
     free_gpu_mem : bool = Field(default=False, description="If true, purge model from GPU after each generation.", category='Memory/Performance')
     max_loaded_models : int = Field(default=3, gt=0, description="(DEPRECATED: use max_cache_size) Maximum number of models to keep in memory for rapid switching", category='Memory/Performance')
     max_cache_size : float = Field(default=6.0, gt=0, description="Maximum memory amount used by model cache for rapid switching", category='Memory/Performance')
+    gpu_mem_reserved : float = Field(default=1.75, ge=0, description="Amount of VRAM to reserve for use during generation", category='Memory/Performance')
     precision : Literal[tuple(['auto','float16','float32','autocast'])] = Field(default='float16',description='Floating point precision', category='Memory/Performance')
     sequential_guidance : bool = Field(default=False, description="Whether to calculate guidance in serial instead of in parallel, lowering memory requirements", category='Memory/Performance')
     xformers_enabled : bool = Field(default=True, description="Enable/disable memory-efficient attention", category='Memory/Performance')
diff --git a/invokeai/backend/model_management/model_cache.py b/invokeai/backend/model_management/model_cache.py
index ecda15beac..f6d3c49bc0 100644
--- a/invokeai/backend/model_management/model_cache.py
+++ b/invokeai/backend/model_management/model_cache.py
@@ -36,6 +36,9 @@ from .models import BaseModelType, ModelType, SubModelType, ModelBase
 # Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously
 DEFAULT_MAX_CACHE_SIZE = 6.0

+# amount of GPU memory to hold in reserve for use by generations (GB)
+DEFAULT_GPU_MEM_RESERVED= 1.75
+
 # actual size of a gig
 GIG = 1073741824

@@ -88,6 +91,7 @@ class ModelCache(object):
         sequential_offload: bool=False,
         lazy_offloading: bool=True,
         sha_chunksize: int = 16777216,
+        gpu_mem_reserved: float=DEFAULT_GPU_MEM_RESERVED,
         logger: types.ModuleType = logger
     ):
         '''
@@ -99,12 +103,11 @@
         :param sequential_offload: Conserve VRAM by loading and unloading each stage of the pipeline sequentially
         :param sha_chunksize: Chunksize to use when calculating sha256 model hash
         '''
-        #max_cache_size = 9999
         self.model_infos: Dict[str, ModelBase] = dict()
         self.lazy_offloading = lazy_offloading
-        #self.sequential_offload: bool=sequential_offload
         self.precision: torch.dtype=precision
-        self.max_cache_size: int=max_cache_size
+        self.max_cache_size: float=max_cache_size
+        self.gpu_mem_reserved: float=gpu_mem_reserved
         self.execution_device: torch.device=execution_device
         self.storage_device: torch.device=storage_device
         self.sha_chunksize=sha_chunksize
@@ -346,11 +349,13 @@ class ModelCache(object):
         self.logger.debug(f"After unloading: cached_models={len(self._cached_models)}")

     def _offload_unlocked_models(self, size_needed: int=0):
+        reserved = self.gpu_mem_reserved * GIG
         for model_key, cache_entry in sorted(self._cached_models.items(), key=lambda x:x[1].size):
             free_mem, used_mem = torch.cuda.mem_get_info()
-            self.logger.debug(f'Require {(size_needed/GIG):.2f}GB VRAM. Have {(free_mem/GIG):.2f}GB available.')
+            free_mem -= reserved
+            self.logger.debug(f'Require {(size_needed/GIG):.2f}GB VRAM. Have {(free_mem/GIG):.2f}GB available ({(reserved/GIG):.2f} reserved).')
             if free_mem > size_needed:
-                return
+                break
             if not cache_entry.locked and cache_entry.loaded:
                 self.logger.debug(f'Offloading {model_key} from {self.execution_device} into {self.storage_device}')
                 with VRAMUsage() as mem:
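At this stage of the series, `gpu_mem_reserved` is headroom held back for the generation step itself: the offload loop subtracts it from the free VRAM reported by the driver before asking whether the incoming model fits. A minimal sketch of that test with illustrative numbers (nothing here is measured from a real device):

```python
GIG = 2 ** 30

def needs_offload(free_mem: int, size_needed: int, gpu_mem_reserved: float = 1.75) -> bool:
    """Return True when more models must leave the GPU before loading.

    free_mem and size_needed are in bytes; gpu_mem_reserved is in GB, mirroring
    the new config field (VRAM kept free for the generation itself).
    """
    usable = free_mem - gpu_mem_reserved * GIG
    return usable <= size_needed

# 3 GB free, a 2 GB model incoming, 1.75 GB held in reserve -> keep offloading.
print(needs_offload(free_mem=3 * GIG, size_needed=2 * GIG))  # True
```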
From c9c61ee4596016f6438811574af67a9e1b9e856a Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Mon, 10 Jul 2023 18:46:32 -0400
Subject: [PATCH 4/6] Update invokeai/app/services/config.py

Co-authored-by: Eugene Brodsky
---
 invokeai/app/services/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/invokeai/app/services/config.py b/invokeai/app/services/config.py
index 921f01665c..f37145a4bf 100644
--- a/invokeai/app/services/config.py
+++ b/invokeai/app/services/config.py
@@ -26,7 +26,7 @@ InvokeAI:
     max_cache_size: 6
     always_use_cpu: false
     free_gpu_mem: false
-    reserve_gpu_mem: 1
+    gpu_mem_reserved: 1
   Features:
     nsfw_checker: true
     restore: true

From d32f9f7cb0f88390271cb85e212e9c06db9707c2 Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Tue, 11 Jul 2023 15:16:35 -0400
Subject: [PATCH 5/6] reverse logic of gpu_mem_reserved

- gpu_mem_reserved now indicates the amount of VRAM that will be reserved
  for model caching (similar to max_cache_size).
---
 invokeai/app/services/config.py                    |  4 ++--
 invokeai/backend/model_management/model_cache.py   | 11 ++++++-----
 invokeai/backend/model_management/model_manager.py |  1 +
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/invokeai/app/services/config.py b/invokeai/app/services/config.py
index f37145a4bf..548700e816 100644
--- a/invokeai/app/services/config.py
+++ b/invokeai/app/services/config.py
@@ -26,7 +26,7 @@ InvokeAI:
     max_cache_size: 6
     always_use_cpu: false
     free_gpu_mem: false
-    gpu_mem_reserved: 1
+    gpu_mem_reserved: 2.7
   Features:
     nsfw_checker: true
     restore: true
@@ -366,7 +366,7 @@ setting environment variables INVOKEAI_.
     free_gpu_mem : bool = Field(default=False, description="If true, purge model from GPU after each generation.", category='Memory/Performance')
     max_loaded_models : int = Field(default=3, gt=0, description="(DEPRECATED: use max_cache_size) Maximum number of models to keep in memory for rapid switching", category='Memory/Performance')
     max_cache_size : float = Field(default=6.0, gt=0, description="Maximum memory amount used by model cache for rapid switching", category='Memory/Performance')
-    gpu_mem_reserved : float = Field(default=1.75, ge=0, description="Amount of VRAM to reserve for use during generation", category='Memory/Performance')
+    gpu_mem_reserved : float = Field(default=2.75, ge=0, description="Amount of VRAM reserved for model storage", category='Memory/Performance')
     precision : Literal[tuple(['auto','float16','float32','autocast'])] = Field(default='float16',description='Floating point precision', category='Memory/Performance')
     sequential_guidance : bool = Field(default=False, description="Whether to calculate guidance in serial instead of in parallel, lowering memory requirements", category='Memory/Performance')
     xformers_enabled : bool = Field(default=True, description="Enable/disable memory-efficient attention", category='Memory/Performance')
diff --git a/invokeai/backend/model_management/model_cache.py b/invokeai/backend/model_management/model_cache.py
index f6d3c49bc0..b3284226e1 100644
--- a/invokeai/backend/model_management/model_cache.py
+++ b/invokeai/backend/model_management/model_cache.py
@@ -37,7 +37,7 @@ from .models import BaseModelType, ModelType, SubModelType, ModelBase
 DEFAULT_MAX_CACHE_SIZE = 6.0

 # amount of GPU memory to hold in reserve for use by generations (GB)
-DEFAULT_GPU_MEM_RESERVED= 1.75
+DEFAULT_GPU_MEM_RESERVED= 2.75

 # actual size of a gig
 GIG = 1073741824
@@ -350,17 +350,18 @@ class ModelCache(object):

     def _offload_unlocked_models(self, size_needed: int=0):
         reserved = self.gpu_mem_reserved * GIG
+        vram_in_use = torch.cuda.memory_allocated()
+        self.logger.debug(f'{(vram_in_use/GIG):.2f}GB VRAM used for models; max allowed={(reserved/GIG):.2f}GB')
         for model_key, cache_entry in sorted(self._cached_models.items(), key=lambda x:x[1].size):
-            free_mem, used_mem = torch.cuda.mem_get_info()
-            free_mem -= reserved
-            self.logger.debug(f'Require {(size_needed/GIG):.2f}GB VRAM. Have {(free_mem/GIG):.2f}GB available ({(reserved/GIG):.2f} reserved).')
-            if free_mem > size_needed:
+            if vram_in_use <= reserved:
                 break
             if not cache_entry.locked and cache_entry.loaded:
                 self.logger.debug(f'Offloading {model_key} from {self.execution_device} into {self.storage_device}')
                 with VRAMUsage() as mem:
                     cache_entry.model.to(self.storage_device)
                 self.logger.debug(f'GPU VRAM freed: {(mem.vram_used/GIG):.2f} GB')
+                vram_in_use += mem.vram_used # note vram_used is negative
+                self.logger.debug(f'{(vram_in_use/GIG):.2f}GB VRAM used for models; max allowed={(reserved/GIG):.2f}GB')

     def _local_model_hash(self, model_path: Union[str, Path]) -> str:
         sha = hashlib.sha256()
diff --git a/invokeai/backend/model_management/model_manager.py b/invokeai/backend/model_management/model_manager.py
index d092e05c05..f74be90639 100644
--- a/invokeai/backend/model_management/model_manager.py
+++ b/invokeai/backend/model_management/model_manager.py
@@ -340,6 +340,7 @@ class ModelManager(object):
             precision = precision,
             sequential_offload = sequential_offload,
             logger = logger,
+            gpu_mem_reserved = self.app_config.gpu_mem_reserved
         )

         self.cache_keys = dict()
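The reversal described in this commit message turns the setting into a budget for the cache itself: instead of keeping a slice of VRAM free for generation, the loop now compares `torch.cuda.memory_allocated()` (bytes currently held by tensors on the device) against the configured cap and evicts smallest-first until the cache fits under it. The sketch below restates that accounting in isolation; the cache-entry attributes are hypothetical stand-ins, and it uses direct `memory_allocated()` deltas where the real code reuses its `VRAMUsage` context manager (whose `vram_used` comes out negative after freeing).

```python
import torch

GIG = 2 ** 30

def enforce_vram_budget(cached_models: dict, budget_gb: float, storage_device: str = "cpu") -> None:
    """Evict unlocked models, smallest first, until tensor allocations fit the budget."""
    budget = budget_gb * GIG
    vram_in_use = torch.cuda.memory_allocated()  # bytes allocated by live tensors
    for key, entry in sorted(cached_models.items(), key=lambda kv: kv[1].size):
        if vram_in_use <= budget:
            break  # the cache is already within its VRAM allowance
        if not entry.locked and entry.loaded:
            before = torch.cuda.memory_allocated()
            entry.model.to(storage_device)          # move weights back to CPU RAM
            entry.loaded = False
            vram_in_use -= before - torch.cuda.memory_allocated()  # credit what was freed
```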
From dab03fb6460b5ba233216f7903f99f800b9297c8 Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Tue, 11 Jul 2023 15:25:39 -0400
Subject: [PATCH 6/6] rename gpu_mem_reserved to max_vram_cache_size

To be consistent with max_cache_size, the amount of memory to hold in
VRAM for model caching is now controlled by the max_vram_cache_size
configuration parameter.
---
 invokeai/app/services/config.py                    | 9 +++++----
 invokeai/backend/model_management/model_cache.py   | 8 ++++----
 invokeai/backend/model_management/model_manager.py | 2 +-
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/invokeai/app/services/config.py b/invokeai/app/services/config.py
index 548700e816..e5d1612ed6 100644
--- a/invokeai/app/services/config.py
+++ b/invokeai/app/services/config.py
@@ -24,9 +24,9 @@ InvokeAI:
     sequential_guidance: false
     precision: float16
     max_cache_size: 6
+    max_vram_cache_size: 2.7
     always_use_cpu: false
     free_gpu_mem: false
-    gpu_mem_reserved: 2.7
   Features:
     nsfw_checker: true
     restore: true
@@ -271,7 +271,7 @@ class InvokeAISettings(BaseSettings):

     @classmethod
     def _excluded(self)->List[str]:
-        return ['type','initconf']
+        return ['type','initconf', 'gpu_mem_reserved', 'max_loaded_models']

     class Config:
         env_file_encoding = 'utf-8'
@@ -364,9 +364,10 @@ setting environment variables INVOKEAI_.

     always_use_cpu : bool = Field(default=False, description="If true, use the CPU for rendering even if a GPU is available.", category='Memory/Performance')
     free_gpu_mem : bool = Field(default=False, description="If true, purge model from GPU after each generation.", category='Memory/Performance')
-    max_loaded_models : int = Field(default=3, gt=0, description="(DEPRECATED: use max_cache_size) Maximum number of models to keep in memory for rapid switching", category='Memory/Performance')
+    max_loaded_models : int = Field(default=3, gt=0, description="(DEPRECATED: use max_cache_size) Maximum number of models to keep in memory for rapid switching", category='DEPRECATED')
     max_cache_size : float = Field(default=6.0, gt=0, description="Maximum memory amount used by model cache for rapid switching", category='Memory/Performance')
-    gpu_mem_reserved : float = Field(default=2.75, ge=0, description="Amount of VRAM reserved for model storage", category='Memory/Performance')
+    max_vram_cache_size : float = Field(default=2.75, ge=0, description="Amount of VRAM reserved for model storage", category='Memory/Performance')
+    gpu_mem_reserved : float = Field(default=2.75, ge=0, description="DEPRECATED: use max_vram_cache_size. Amount of VRAM reserved for model storage", category='DEPRECATED')
     precision : Literal[tuple(['auto','float16','float32','autocast'])] = Field(default='float16',description='Floating point precision', category='Memory/Performance')
     sequential_guidance : bool = Field(default=False, description="Whether to calculate guidance in serial instead of in parallel, lowering memory requirements", category='Memory/Performance')
     xformers_enabled : bool = Field(default=True, description="Enable/disable memory-efficient attention", category='Memory/Performance')
diff --git a/invokeai/backend/model_management/model_cache.py b/invokeai/backend/model_management/model_cache.py
index b3284226e1..e4cba3517e 100644
--- a/invokeai/backend/model_management/model_cache.py
+++ b/invokeai/backend/model_management/model_cache.py
@@ -37,7 +37,7 @@ from .models import BaseModelType, ModelType, SubModelType, ModelBase
 DEFAULT_MAX_CACHE_SIZE = 6.0

 # amount of GPU memory to hold in reserve for use by generations (GB)
-DEFAULT_GPU_MEM_RESERVED= 2.75
+DEFAULT_MAX_VRAM_CACHE_SIZE= 2.75

 # actual size of a gig
 GIG = 1073741824
@@ -85,13 +85,13 @@ class ModelCache(object):
     def __init__(
         self,
         max_cache_size: float=DEFAULT_MAX_CACHE_SIZE,
+        max_vram_cache_size: float=DEFAULT_MAX_VRAM_CACHE_SIZE,
         execution_device: torch.device=torch.device('cuda'),
         storage_device: torch.device=torch.device('cpu'),
         precision: torch.dtype=torch.float16,
         sequential_offload: bool=False,
         lazy_offloading: bool=True,
         sha_chunksize: int = 16777216,
-        gpu_mem_reserved: float=DEFAULT_GPU_MEM_RESERVED,
         logger: types.ModuleType = logger
     ):
         '''
@@ -107,7 +107,7 @@ class ModelCache(object):
         self.lazy_offloading = lazy_offloading
         self.precision: torch.dtype=precision
         self.max_cache_size: float=max_cache_size
-        self.gpu_mem_reserved: float=gpu_mem_reserved
+        self.max_vram_cache_size: float=max_vram_cache_size
         self.execution_device: torch.device=execution_device
         self.storage_device: torch.device=storage_device
         self.sha_chunksize=sha_chunksize
@@ -349,7 +349,7 @@ class ModelCache(object):
         self.logger.debug(f"After unloading: cached_models={len(self._cached_models)}")

     def _offload_unlocked_models(self, size_needed: int=0):
-        reserved = self.gpu_mem_reserved * GIG
+        reserved = self.max_vram_cache_size * GIG
         vram_in_use = torch.cuda.memory_allocated()
         self.logger.debug(f'{(vram_in_use/GIG):.2f}GB VRAM used for models; max allowed={(reserved/GIG):.2f}GB')
         for model_key, cache_entry in sorted(self._cached_models.items(), key=lambda x:x[1].size):
diff --git a/invokeai/backend/model_management/model_manager.py b/invokeai/backend/model_management/model_manager.py
index f74be90639..6670ca06ae 100644
--- a/invokeai/backend/model_management/model_manager.py
+++ b/invokeai/backend/model_management/model_manager.py
@@ -336,11 +336,11 @@ class ModelManager(object):
         self.logger = logger
         self.cache = ModelCache(
             max_cache_size=max_cache_size,
+            max_vram_cache_size = self.app_config.max_vram_cache_size,
             execution_device = device_type,
             precision = precision,
             sequential_offload = sequential_offload,
             logger = logger,
-            gpu_mem_reserved = self.app_config.gpu_mem_reserved
         )

         self.cache_keys = dict()
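A final note on the deprecation handling at the end of the series: the old `gpu_mem_reserved` and `max_loaded_models` fields remain on the settings class, so existing `invokeai.yaml` files and `INVOKEAI_*` environment variables still parse, but they are moved to a `DEPRECATED` category and listed in `_excluded()` alongside `type` and `initconf`. Below is a stripped-down sketch of that pattern using a pydantic v1-style `BaseSettings` (consistent with the inner `class Config` visible in the hunk above); the field names come from the diff, while the class name, the `dump_for_yaml` helper, and the way exclusions are applied are simplified assumptions for illustration only.

```python
from typing import List
from pydantic import BaseSettings, Field  # pydantic v1 API

class CacheSettings(BaseSettings):
    # current knobs: CPU-side and VRAM-side cache limits, both in GB
    max_cache_size: float = Field(default=6.0, gt=0, description="RAM used by the model cache")
    max_vram_cache_size: float = Field(default=2.75, ge=0, description="VRAM reserved for model storage")
    # deprecated names are still accepted on input so old configs keep working
    gpu_mem_reserved: float = Field(default=2.75, ge=0, description="DEPRECATED: use max_vram_cache_size")
    max_loaded_models: int = Field(default=3, gt=0, description="DEPRECATED: use max_cache_size")

    @classmethod
    def _excluded(cls) -> List[str]:
        return ["gpu_mem_reserved", "max_loaded_models"]

    def dump_for_yaml(self) -> dict:
        # leave the deprecated keys out when settings are written back out
        return {k: v for k, v in self.dict().items() if k not in self._excluded()}

settings = CacheSettings(gpu_mem_reserved=2.0)  # an old key still parses...
print(settings.dump_for_yaml())                 # ...but is not emitted again
```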