fix conflicts

2024-08-30 20:32:17 +00:00 · 2023-07-11 15:55:10 -04:00 · 2023-07-11 15:55:10 -04:00 · 25591788c1
commit 25591788c1
parent fabcf276ac dab03fb646
3 changed files with 35 additions and 13 deletions
--- a/invokeai/app/services/config.py
+++ b/invokeai/app/services/config.py
@ -23,7 +23,8 @@ InvokeAI:
    xformers_enabled: false
    sequential_guidance: false
    precision: float16
-    max_loaded_models: 4
+    max_cache_size: 6
+    max_vram_cache_size: 2.7
    always_use_cpu: false
    free_gpu_mem: false
  Features:
@ -269,8 +270,9 @@ class InvokeAISettings(BaseSettings):
        parser.add_parser(cls.cmd_name(), help=cls.__doc__)

    @classmethod
-    def _excluded(self)->Set[str]:
-        return {'type','initconf','version'}
+    def _excluded(self)->List[str]:
+        # combination of deprecated parameters and internal ones
+        return ['type','initconf', 'gpu_mem_reserved', 'max_loaded_models', 'version']

    class Config:
        env_file_encoding = 'utf-8'
@ -363,8 +365,10 @@ setting environment variables INVOKEAI_<setting>.

    always_use_cpu      : bool = Field(default=False, description="If true, use the CPU for rendering even if a GPU is available.", category='Memory/Performance')
    free_gpu_mem        : bool = Field(default=False, description="If true, purge model from GPU after each generation.", category='Memory/Performance')
-    max_loaded_models   : int = Field(default=3, gt=0, description="(DEPRECATED: use max_cache_size) Maximum number of models to keep in memory for rapid switching", category='Memory/Performance')
+    max_loaded_models   : int = Field(default=3, gt=0, description="(DEPRECATED: use max_cache_size) Maximum number of models to keep in memory for rapid switching", category='DEPRECATED')
    max_cache_size      : float = Field(default=6.0, gt=0, description="Maximum memory amount used by model cache for rapid switching", category='Memory/Performance')
+    max_vram_cache_size : float = Field(default=2.75, ge=0, description="Amount of VRAM reserved for model storage", category='Memory/Performance')
+    gpu_mem_reserved    : float = Field(default=2.75, ge=0, description="DEPRECATED: use max_vram_cache_size. Amount of VRAM reserved for model storage", category='DEPRECATED')
    precision           : Literal[tuple(['auto','float16','float32','autocast'])] = Field(default='float16',description='Floating point precision', category='Memory/Performance')
    sequential_guidance : bool = Field(default=False, description="Whether to calculate guidance in serial instead of in parallel, lowering memory requirements", category='Memory/Performance')
    xformers_enabled    : bool = Field(default=True, description="Enable/disable memory-efficient attention", category='Memory/Performance')
--- a/invokeai/backend/model_management/model_cache.py
+++ b/invokeai/backend/model_management/model_cache.py
@ -36,6 +36,9 @@ from .models import BaseModelType, ModelType, SubModelType, ModelBase
 # Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously
 DEFAULT_MAX_CACHE_SIZE = 6.0

+# maximum amount of GPU memory (GB) that cached models may occupy before unlocked models are offloaded
+DEFAULT_MAX_VRAM_CACHE_SIZE= 2.75
+
 # actual size of a gig
 GIG = 1073741824

@ -82,6 +85,7 @@ class ModelCache(object):
    def __init__(
        self,
        max_cache_size: float=DEFAULT_MAX_CACHE_SIZE,
+        max_vram_cache_size: float=DEFAULT_MAX_VRAM_CACHE_SIZE,
        execution_device: torch.device=torch.device('cuda'),
        storage_device: torch.device=torch.device('cpu'),
        precision: torch.dtype=torch.float16,
@ -99,12 +103,11 @@ class ModelCache(object):
        :param sequential_offload: Conserve VRAM by loading and unloading each stage of the pipeline sequentially
        :param sha_chunksize: Chunksize to use when calculating sha256 model hash
        '''
-        #max_cache_size = 9999
        self.model_infos: Dict[str, ModelBase] = dict()
        self.lazy_offloading = lazy_offloading
-        #self.sequential_offload: bool=sequential_offload
        self.precision: torch.dtype=precision
-        self.max_cache_size: int=max_cache_size
+        self.max_cache_size: float=max_cache_size
+        self.max_vram_cache_size: float=max_vram_cache_size
        self.execution_device: torch.device=execution_device
        self.storage_device: torch.device=storage_device
        self.sha_chunksize=sha_chunksize
@ -201,14 +204,22 @@ class ModelCache(object):
            self._cache_stack.remove(key)
        self._cache_stack.append(key)

-        return self.ModelLocker(self, key, cache_entry.model, gpu_load)
+        return self.ModelLocker(self, key, cache_entry.model, gpu_load, cache_entry.size)

    class ModelLocker(object):
-        def __init__(self, cache, key, model, gpu_load):
+        def __init__(self, cache, key, model, gpu_load, size_needed):
+            '''
+            :param cache: The model_cache object
+            :param key: The key of the model to lock in GPU
+            :param model: The model to lock
+            :param gpu_load: True if the model should be loaded into the GPU
+            :param size_needed: Size of the model to load
+            '''
            self.gpu_load = gpu_load
            self.cache = cache
            self.key = key
            self.model = model
+            self.size_needed = size_needed
            self.cache_entry = self.cache._cached_models[self.key]

        def __enter__(self) -> Any:
@ -222,7 +233,7 @@ class ModelCache(object):

                try:
                    if self.cache.lazy_offloading:
-                       self.cache._offload_unlocked_models()
+                       self.cache._offload_unlocked_models(self.size_needed)
                       
                    if self.model.device != self.cache.execution_device:
                        self.cache.logger.debug(f'Moving {self.key} into {self.cache.execution_device}')
@ -337,14 +348,20 @@ class ModelCache(object):

        self.logger.debug(f"After unloading: cached_models={len(self._cached_models)}")

-
-    def _offload_unlocked_models(self):
-        for model_key, cache_entry in self._cached_models.items():
+    def _offload_unlocked_models(self, size_needed: int=0):
+        reserved = self.max_vram_cache_size * GIG
+        vram_in_use = torch.cuda.memory_allocated()
+        self.logger.debug(f'{(vram_in_use/GIG):.2f}GB VRAM used for models; max allowed={(reserved/GIG):.2f}GB')
+        for model_key, cache_entry in sorted(self._cached_models.items(), key=lambda x:x[1].size):
+            if vram_in_use <= reserved:
+                break
            if not cache_entry.locked and cache_entry.loaded:
                self.logger.debug(f'Offloading {model_key} from {self.execution_device} into {self.storage_device}')
                with VRAMUsage() as mem:
                    cache_entry.model.to(self.storage_device)
                self.logger.debug(f'GPU VRAM freed: {(mem.vram_used/GIG):.2f} GB')
+                vram_in_use += mem.vram_used  # note vram_used is negative
+                self.logger.debug(f'{(vram_in_use/GIG):.2f}GB VRAM used for models; max allowed={(reserved/GIG):.2f}GB')
        
    def _local_model_hash(self, model_path: Union[str, Path]) -> str:
        sha = hashlib.sha256()
--- a/invokeai/backend/model_management/model_manager.py
+++ b/invokeai/backend/model_management/model_manager.py
@ -340,6 +340,7 @@ class ModelManager(object):
        self.logger = logger
        self.cache = ModelCache(
            max_cache_size=max_cache_size,
+            max_vram_cache_size = self.app_config.max_vram_cache_size,
            execution_device = device_type,
            precision = precision,
            sequential_offload = sequential_offload,