Mirror of https://github.com/invoke-ai/InvokeAI (synced 2024-08-30 20:32:17 +00:00)
reverse logic of gpu_mem_reserved
- gpu_mem_reserved now indicates the amount of VRAM that will be reserved for model caching (similar to max_cache_size).
This commit is contained in:
parent 83ec4c983c
commit d32f9f7cb0
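For orientation, the change in meaning can be summarized in a small sketch. The helpers below are hypothetical (should_stop_offloading_old / should_stop_offloading_new do not exist in the codebase); the real logic lives in ModelCache._offload_unlocked_models, shown in the diff further down.

import torch

GIG = 1073741824  # bytes per GiB, same constant the cache code below uses

def should_stop_offloading_old(size_needed: int, gpu_mem_reserved_gb: float) -> bool:
    # Old meaning: gpu_mem_reserved is headroom kept FREE for generation.
    # Offloading stops once free VRAM, minus the reserve, covers the request.
    free_mem, _total = torch.cuda.mem_get_info()
    return (free_mem - gpu_mem_reserved_gb * GIG) > size_needed

def should_stop_offloading_new(gpu_mem_reserved_gb: float) -> bool:
    # New meaning: gpu_mem_reserved caps how much VRAM cached models may occupy.
    # Offloading stops once allocated VRAM drops to or below that cap.
    return torch.cuda.memory_allocated() <= gpu_mem_reserved_gb * GIG

Under the new reading the default also rises from 1.75 to 2.75 GB, since it now bounds model storage rather than generation headroom.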
@@ -26,7 +26,7 @@ InvokeAI:
     max_cache_size: 6
     always_use_cpu: false
     free_gpu_mem: false
-    gpu_mem_reserved: 1
+    gpu_mem_reserved: 2.7
   Features:
     nsfw_checker: true
     restore: true
@@ -366,7 +366,7 @@ setting environment variables INVOKEAI_<setting>.
     free_gpu_mem : bool = Field(default=False, description="If true, purge model from GPU after each generation.", category='Memory/Performance')
     max_loaded_models : int = Field(default=3, gt=0, description="(DEPRECATED: use max_cache_size) Maximum number of models to keep in memory for rapid switching", category='Memory/Performance')
     max_cache_size : float = Field(default=6.0, gt=0, description="Maximum memory amount used by model cache for rapid switching", category='Memory/Performance')
-    gpu_mem_reserved : float = Field(default=1.75, ge=0, description="Amount of VRAM to reserve for use during generation", category='Memory/Performance')
+    gpu_mem_reserved : float = Field(default=2.75, ge=0, description="Amount of VRAM reserved for model storage", category='Memory/Performance')
     precision : Literal[tuple(['auto','float16','float32','autocast'])] = Field(default='float16',description='Floating point precision', category='Memory/Performance')
     sequential_guidance : bool = Field(default=False, description="Whether to calculate guidance in serial instead of in parallel, lowering memory requirements", category='Memory/Performance')
     xformers_enabled : bool = Field(default=True, description="Enable/disable memory-efficient attention", category='Memory/Performance')
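As the hunk context notes, any of these fields can be overridden with an INVOKEAI_<setting> environment variable. A stripped-down sketch of that mechanism, assuming a pydantic v1-style BaseSettings (MemorySettings is a stand-in class, not InvokeAI's actual config):

import os
from pydantic import BaseSettings, Field  # pydantic v1-style settings

class MemorySettings(BaseSettings):
    # Simplified stand-in for the Memory/Performance fields in the diff above.
    max_cache_size: float = Field(default=6.0, gt=0, description="Maximum memory used by model cache (GB)")
    gpu_mem_reserved: float = Field(default=2.75, ge=0, description="Amount of VRAM reserved for model storage (GB)")

    class Config:
        env_prefix = "INVOKEAI_"  # so INVOKEAI_GPU_MEM_RESERVED overrides the default

os.environ["INVOKEAI_GPU_MEM_RESERVED"] = "3.5"
print(MemorySettings().gpu_mem_reserved)  # -> 3.5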
@@ -37,7 +37,7 @@ from .models import BaseModelType, ModelType, SubModelType, ModelBase
 DEFAULT_MAX_CACHE_SIZE = 6.0

 # amount of GPU memory to hold in reserve for use by generations (GB)
-DEFAULT_GPU_MEM_RESERVED= 1.75
+DEFAULT_GPU_MEM_RESERVED= 2.75

 # actual size of a gig
 GIG = 1073741824
@@ -350,17 +350,18 @@ class ModelCache(object):

     def _offload_unlocked_models(self, size_needed: int=0):
         reserved = self.gpu_mem_reserved * GIG
+        vram_in_use = torch.cuda.memory_allocated()
+        self.logger.debug(f'{(vram_in_use/GIG):.2f}GB VRAM used for models; max allowed={(reserved/GIG):.2f}GB')
         for model_key, cache_entry in sorted(self._cached_models.items(), key=lambda x:x[1].size):
-            free_mem, used_mem = torch.cuda.mem_get_info()
-            free_mem -= reserved
-            self.logger.debug(f'Require {(size_needed/GIG):.2f}GB VRAM. Have {(free_mem/GIG):.2f}GB available ({(reserved/GIG):.2f} reserved).')
-            if free_mem > size_needed:
+            if vram_in_use <= reserved:
                 break
             if not cache_entry.locked and cache_entry.loaded:
                 self.logger.debug(f'Offloading {model_key} from {self.execution_device} into {self.storage_device}')
                 with VRAMUsage() as mem:
                     cache_entry.model.to(self.storage_device)
                 self.logger.debug(f'GPU VRAM freed: {(mem.vram_used/GIG):.2f} GB')
+                vram_in_use += mem.vram_used # note vram_used is negative
+                self.logger.debug(f'{(vram_in_use/GIG):.2f}GB VRAM used for models; max allowed={(reserved/GIG):.2f}GB')

     def _local_model_hash(self, model_path: Union[str, Path]) -> str:
         sha = hashlib.sha256()
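The loop above wraps each offload in a VRAMUsage context manager and, per the added comment, expects vram_used to be negative when memory is freed. A plausible shape for such a helper, assuming it simply records the change in torch.cuda.memory_allocated() across the block (the real implementation may differ):

import torch

class VRAMUsageSketch:
    """Records the change in allocated VRAM across a `with` block.

    vram_used ends up negative when the block frees memory, matching the
    'note vram_used is negative' comment in the diff above.
    """
    def __enter__(self):
        self._start = torch.cuda.memory_allocated()
        return self

    def __exit__(self, *args):
        self.vram_used = torch.cuda.memory_allocated() - self._start
        return False  # do not suppress exceptions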
@@ -340,6 +340,7 @@ class ModelManager(object):
             precision = precision,
             sequential_offload = sequential_offload,
             logger = logger,
+            gpu_mem_reserved = self.app_config.gpu_mem_reserved
         )
         self.cache_keys = dict()
