From 667171ed904d74bd28d18ba904ab430e70e5c5fc Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Sun, 7 May 2023 18:07:28 -0400 Subject: [PATCH] cap model cache size using bytes, not # models --- .../app/services/model_manager_initializer.py | 2 +- invokeai/backend/args.py | 10 +- .../backend/model_management/model_cache.py | 119 +++++++++++++----- .../backend/model_management/model_manager.py | 6 +- 4 files changed, 99 insertions(+), 38 deletions(-) diff --git a/invokeai/app/services/model_manager_initializer.py b/invokeai/app/services/model_manager_initializer.py index c7924e797d..cae650fcfa 100644 --- a/invokeai/app/services/model_manager_initializer.py +++ b/invokeai/app/services/model_manager_initializer.py @@ -59,7 +59,7 @@ def get_model_manager(config: Args, logger: types.ModuleType) -> ModelManager: config.conf, precision=dtype, device_type=device, - max_loaded_models=config.max_loaded_models, + max_cache_size=config.max_cache_size, # temporarily disabled until model manager stabilizes # embedding_path = Path(embedding_path), logger = logger, diff --git a/invokeai/backend/args.py b/invokeai/backend/args.py index eb8b396ee0..2cd44b8f7b 100644 --- a/invokeai/backend/args.py +++ b/invokeai/backend/args.py @@ -502,11 +502,11 @@ class Args(object): help="Deprecated way to set --precision=float32", ) model_group.add_argument( - "--max_loaded_models", - dest="max_loaded_models", - type=int, - default=2, - help="Maximum number of models to keep in memory for fast switching, including the one in GPU", + "--max_cache_size", + dest="max_cache_size", + type=float, + default=6.0, + help="Maximum size of the model RAM cache (in GB). 6 GB is sufficient to keep 2-3 diffusers models in RAM simultaneously.", ) model_group.add_argument( "--free_gpu_mem", diff --git a/invokeai/backend/model_management/model_cache.py b/invokeai/backend/model_management/model_cache.py index c316c4292f..97c21159b4 100644 --- a/invokeai/backend/model_management/model_cache.py +++ b/invokeai/backend/model_management/model_cache.py @@ -19,11 +19,13 @@ context. Use like this: import contextlib import gc import hashlib +import logging import warnings from collections import Counter from enum import Enum from pathlib import Path -from typing import Sequence, Union, Tuple, types +from psutil import Process +from typing import Dict, Sequence, Union, Tuple, types import torch import safetensors.torch @@ -41,7 +43,12 @@ import invokeai.backend.util.logging as logger from ..globals import global_cache_dir from ..stable_diffusion import StableDiffusionGeneratorPipeline -MAX_MODELS = 4 +# Maximum size of the cache, in gigs +# Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously +DEFAULT_MAX_CACHE_SIZE = 6.0 + +# actual size of a gig +GIG = 1073741824 # This is the mapping from the stable diffusion submodel dict key to the class class SDModelType(Enum): @@ -65,6 +72,24 @@ class ModelStatus(Enum): in_ram='cached' in_vram='in gpu' active='locked in gpu' + +# This is used to guesstimate the size of a model before we load it. +# After loading, we will know it exactly. 
+# Sizes are in Gigs, estimated for float16; double for float32 +SIZE_GUESSTIMATE = { + SDModelType.diffusion_pipeline: 2.5, + SDModelType.diffusers: 2.5, + SDModelType.vae: 0.35, + SDModelType.text_encoder: 0.5, + SDModelType.tokenizer: 0.0001, + SDModelType.unet: 3.4, + SDModelType.scheduler: 0.0001, + SDModelType.safety_checker: 1.2, + SDModelType.feature_extractor: 0.0001, + SDModelType.lora: 0.1, + SDModelType.textual_inversion: 0.0001, + SDModelType.ckpt: 4.2, +} # The list of model classes we know how to fetch, for typechecking ModelClass = Union[tuple([x.value for x in SDModelType])] @@ -90,7 +115,7 @@ class ModelLocker(object): class ModelCache(object): def __init__( self, - max_models: int=MAX_MODELS, + max_cache_size: float=DEFAULT_MAX_CACHE_SIZE, execution_device: torch.device=torch.device('cuda'), storage_device: torch.device=torch.device('cpu'), precision: torch.dtype=torch.float16, @@ -113,13 +138,15 @@ class ModelCache(object): self.lazy_offloading = lazy_offloading self.sequential_offload: bool=sequential_offload self.precision: torch.dtype=precision - self.max_models: int=max_models + self.current_cache_size: int=0 + self.max_cache_size: int=max_cache_size self.execution_device: torch.device=execution_device self.storage_device: torch.device=storage_device self.sha_chunksize=sha_chunksize self.logger = logger self.loaded_models: set = set() # set of model keys loaded in GPU self.locked_models: Counter = Counter() # set of model keys locked in GPU + self.model_sizes: Dict[str,int] = dict() def get_model( self, @@ -172,21 +199,33 @@ class ModelCache(object): model_type.value, revision, subfolder - ) + ) + if key in self.models: # cached - move to bottom of stack with contextlib.suppress(ValueError): self.stack.remove(key) self.stack.append(key) model = self.models[key] + else: # not cached -load - self._make_cache_room() - model = self._load_model_from_storage( - repo_id_or_path=repo_id_or_path, - model_class=model_type.value, - subfolder=subfolder, - revision=revision, - legacy_info=legacy_info, - ) + self.logger.info(f'Loading model {repo_id_or_path}, type {model_type}') + + # this will remove older cached models until + # there is sufficient room to load the requested model + self._make_cache_room(key, model_type) + + with MemoryUsage() as usage: + model = self._load_model_from_storage( + repo_id_or_path=repo_id_or_path, + model_class=model_type.value, + subfolder=subfolder, + revision=revision, + legacy_info=legacy_info, + ) + logger.debug(f'Actual memory used to load model: {(usage.mem_used/GIG):.2f} GB') + self.model_sizes[key] = usage.mem_used + self.current_cache_size += usage.mem_used + if model_type==SDModelType.diffusion_pipeline and attach_model_part[0]: self.attach_part(model,*attach_model_part) self.stack.append(key) # add to LRU cache @@ -200,11 +239,11 @@ class ModelCache(object): def uncache_model(self, key: str): '''Remove corresponding model from the cache''' if key is not None and key in self.models: - with contextlib.suppress(ValueError): + with contextlib.suppress(ValueError), contextlib.suppress(KeyError): del self.models[key] del self.locked_models[key] - self.stack.remove(key) self.loaded_models.remove(key) + self.stack.remove(key) class ModelLocker(object): def __init__(self, cache, key, model, gpu_load): @@ -304,9 +343,9 @@ class ModelCache(object): else: return self._hf_commit_hash(repo_id_or_path,revision) - def cache_size(self)->int: - "Return the current number of models cached." 
- return len(self.models) + def cache_size(self)->float: + "Return the current size of the cache, in GB" + return self.current_cache_size / GIG @classmethod def is_legacy_ckpt(cls, repo_id_or_path: Union[str,Path])->bool: @@ -342,18 +381,29 @@ class ModelCache(object): return self.execution_device.type == 'cuda' def _print_cuda_stats(self): - vram = "%4.2fG" % (torch.cuda.memory_allocated() / 1e9) + vram = "%4.2fG" % (torch.cuda.memory_allocated() / GIG) + ram = "%4.2fG" % (self.current_cache_size / GIG) loaded_models = len(self.loaded_models) locked_models = len([x for x in self.locked_models if self.locked_models[x]>0]) - logger.debug(f"Current VRAM usage: {vram}; locked_models/loaded_models = {locked_models}/{loaded_models}") + logger.debug(f"Current VRAM/RAM usage: {vram}/{ram}; locked_models/loaded_models = {locked_models}/{loaded_models}") - def _make_cache_room(self): - models_in_ram = len(self.models) - while models_in_ram >= self.max_models: + def _make_cache_room(self, key, model_type): + # calculate how much memory this model will require + multiplier = 2 if self.precision==torch.float32 else 1 + bytes_needed = int(self.model_sizes.get(key,0) or SIZE_GUESSTIMATE[model_type]*GIG*multiplier) + maximum_size = self.max_cache_size * GIG # stored in GB, convert to bytes + current_size = self.current_cache_size + + adjective = 'guesstimated' if key not in self.model_sizes else 'known from previous load' + logger.debug(f'{(bytes_needed/GIG):.2f} GB needed to load this model ({adjective})') + while current_size+bytes_needed > maximum_size: if least_recently_used_key := self.stack.pop(0): - logger.debug(f'Maximum cache size reached: cache_size={models_in_ram}; unloading model {least_recently_used_key}') - del self.models[least_recently_used_key] - models_in_ram = len(self.models) + model_size = self.model_sizes.get(least_recently_used_key,0) + logger.debug(f'Max cache size exceeded: cache_size={(current_size/GIG):.2f} GB, need an additional {(bytes_needed/GIG):.2f} GB') + logger.debug(f'Unloading model {least_recently_used_key} to free {(model_size/GIG):.2f} GB') + self.uncache_model(least_recently_used_key) + current_size -= model_size + self.current_cache_size = current_size gc.collect() def _offload_unlocked_models(self): @@ -393,8 +443,8 @@ class ModelCache(object): revision, model_class, ) - if self.sequential_offload and isinstance(model,StableDiffusionGeneratorPipeline): - model.enable_offload_submodels(self.execution_device) + if self.sequential_offload and isinstance(model,StableDiffusionGeneratorPipeline): + model.enable_offload_submodels(self.execution_device) return model def _load_diffusers_from_storage( @@ -411,7 +461,6 @@ class ModelCache(object): :param revision: model revision :param model_class: class of model to return, defaults to StableDiffusionGeneratorPIpeline ''' - self.logger.info(f'Loading model {repo_id_or_path}') revisions = [revision] if revision \ else ['fp16','main'] if self.precision==torch.float16 \ else ['main'] @@ -529,3 +578,15 @@ class SilenceWarnings(object): transformers_logging.set_verbosity(self.transformers_verbosity) diffusers_logging.set_verbosity(self.diffusers_verbosity) warnings.simplefilter('default') + +class MemoryUsage(object): + def __init__(self): + self.vms = None + self.mem_used = 0 + + def __enter__(self): + self.vms = Process().memory_info().vms + return self + + def __exit__(self, *args): + self.mem_used = Process().memory_info().vms - self.vms diff --git a/invokeai/backend/model_management/model_manager.py 
b/invokeai/backend/model_management/model_manager.py index 7c3c32c004..368d581cd8 100644 --- a/invokeai/backend/model_management/model_manager.py +++ b/invokeai/backend/model_management/model_manager.py @@ -141,7 +141,7 @@ class SDLegacyType(Enum): V2_v = auto() UNKNOWN = auto() -DEFAULT_MAX_MODELS = 2 +MAX_CACHE_SIZE = 6.0 # GB class ModelManager(object): """ @@ -155,7 +155,7 @@ class ModelManager(object): config_path: Path, device_type: torch.device = CUDA_DEVICE, precision: torch.dtype = torch.float16, - max_loaded_models=DEFAULT_MAX_MODELS, + max_cache_size=MAX_CACHE_SIZE, sequential_offload=False, logger: types.ModuleType = logger, ): @@ -168,7 +168,7 @@ class ModelManager(object): self.config_path = config_path self.config = OmegaConf.load(self.config_path) self.cache = ModelCache( - max_models=max_loaded_models, + max_cache_size=max_cache_size, execution_device = device_type, precision = precision, sequential_offload = sequential_offload,
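
The core change in model_cache.py is that eviction now runs on a byte budget rather than a model count: before a load, _make_cache_room() pops keys off the front of the LRU stack and uncaches them until the cache's running byte total plus the incoming model's size fits under max_cache_size. A minimal standalone sketch of that accounting follows; the ByteBudgetLRU class and its method names are hypothetical illustrations, not code from the patch.

from collections import OrderedDict

GIG = 2 ** 30  # bytes per GB, same value as the patch's GIG constant


class ByteBudgetLRU:
    """Toy cache that evicts on total bytes rather than entry count."""

    def __init__(self, max_gb: float = 6.0):
        self.max_bytes = int(max_gb * GIG)
        self.entries: "OrderedDict[str, int]" = OrderedDict()  # key -> size in bytes, oldest first
        self.total = 0

    def make_room(self, bytes_needed: int) -> None:
        # Drop least-recently-used entries until the incoming model fits the budget.
        while self.total + bytes_needed > self.max_bytes and self.entries:
            _, size = self.entries.popitem(last=False)
            self.total -= size

    def add(self, key: str, size: int) -> None:
        self.make_room(size)
        self.entries[key] = size
        self.total += size

    def touch(self, key: str) -> None:
        # A cache hit moves the key to the most-recently-used end,
        # as get_model() does with its stack.
        self.entries.move_to_end(key)


cache = ByteBudgetLRU(max_gb=6.0)
cache.add("sd-1.5", int(2.5 * GIG))
cache.add("vae", int(0.35 * GIG))
cache.add("sd-2.1", int(4.2 * GIG))   # evicts "sd-1.5" so the total stays under 6 GB
print(list(cache.entries), f"{cache.total / GIG:.2f} GB")   # ['vae', 'sd-2.1'] 4.55 GB

The budget itself comes from the renamed option: --max_cache_size takes a float number of GB (default 6.0) and replaces --max_loaded_models, and ModelManager now passes it through to ModelCache as max_cache_size instead of max_models.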
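
A model that has never been loaded has no measured footprint, so _make_cache_room() falls back to the SIZE_GUESSTIMATE table (per-type sizes in GB at float16, doubled for float32); once a model has been loaded once, the size recorded in self.model_sizes wins and no multiplier is applied. The sketch below mirrors that fallback using only an excerpt of the table, a hypothetical bytes_needed() helper, and the torch dtype check reduced to a boolean for brevity.

GIG = 2 ** 30  # same value as the patch's GIG constant

# Excerpt of the patch's SIZE_GUESSTIMATE table (sizes in GB at float16).
SIZE_GUESSTIMATE_GB = {"diffusion_pipeline": 2.5, "vae": 0.35, "unet": 3.4}


def bytes_needed(key: str, model_type: str, known_sizes: dict, fp32: bool) -> int:
    # Prefer the size measured on a previous load; otherwise use the per-type
    # guesstimate, doubled when running at float32.
    multiplier = 2 if fp32 else 1
    return int(known_sizes.get(key, 0) or SIZE_GUESSTIMATE_GB[model_type] * GIG * multiplier)


# Never loaded before: an fp32 pipeline is budgeted at 2 * 2.5 = 5 GB.
print(bytes_needed("stable-diffusion-v1-5", "diffusion_pipeline", {}, fp32=True) / GIG)   # 5.0
# Previously measured: the recorded size wins, with no multiplier.
sizes = {"stable-diffusion-v1-5": int(2.1 * GIG)}
print(bytes_needed("stable-diffusion-v1-5", "diffusion_pipeline", sizes, fp32=True) / GIG)  # ~2.1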
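
Actual sizes are measured by the new MemoryUsage context manager, which records how much the process's virtual memory (psutil VMS) grows while the model loads; get_model() stores the result in self.model_sizes and adds it to current_cache_size. The snippet below exercises the same pattern on its own: the class mirrors the one added to model_cache.py, and the bytearray allocation is only a stand-in for a model load.

from psutil import Process

GIG = 2 ** 30


class MemoryUsage:
    """Record how much the process's virtual memory grows inside a `with` block
    (mirrors the class added to model_cache.py by this patch)."""

    def __init__(self):
        self.vms = None
        self.mem_used = 0

    def __enter__(self):
        self.vms = Process().memory_info().vms
        return self

    def __exit__(self, *args):
        self.mem_used = Process().memory_info().vms - self.vms


# Stand-in for a model load: allocate ~0.5 GB and see it reflected in the measurement.
with MemoryUsage() as usage:
    blob = bytearray(512 * 1024 * 1024)
print(f"load consumed {usage.mem_used / GIG:.2f} GB")   # roughly 0.50 on most platforms

VMS counts mapped address space rather than resident pages, so the figure is approximate, but it only needs to be consistent enough to charge against the cache budget.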