diff --git a/invokeai/app/services/model_manager/model_manager_default.py b/invokeai/app/services/model_manager/model_manager_default.py
index b160ff6fed..de6e5f09d8 100644
--- a/invokeai/app/services/model_manager/model_manager_default.py
+++ b/invokeai/app/services/model_manager/model_manager_default.py
@@ -80,6 +80,7 @@ class ModelManagerService(ModelManagerServiceBase):
         ram_cache = ModelCache(
             max_cache_size=app_config.ram,
             max_vram_cache_size=app_config.vram,
+            lazy_offloading=app_config.lazy_offload,
             logger=logger,
             execution_device=execution_device,
         )
diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py
index b8312f619a..4d5d09864e 100644
--- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py
@@ -421,13 +421,20 @@ class ModelCache(ModelCacheBase[AnyModel]):
 
         self.logger.debug(f"After making room: cached_models={len(self._cached_models)}")
 
+    def _free_vram(self, device: torch.device) -> int:
+        vram_device = (  # mem_get_info() needs an indexed device
+            device if device.index is not None else torch.device(str(device), index=0)
+        )
+        free_mem, _ = torch.cuda.mem_get_info(vram_device)
+        for _, cache_entry in self._cached_models.items():
+            if cache_entry.loaded and not cache_entry.locked:
+                free_mem += cache_entry.size
+        return free_mem
+
     def _check_free_vram(self, target_device: torch.device, needed_size: int) -> None:
         if target_device.type != "cuda":
             return
-        vram_device = (  # mem_get_info() needs an indexed device
-            target_device if target_device.index is not None else torch.device(str(target_device), index=0)
-        )
-        free_mem, _ = torch.cuda.mem_get_info(torch.device(vram_device))
+        free_mem = self._free_vram(target_device)
         if needed_size > free_mem:
             needed_gb = round(needed_size / GIG, 2)
             free_gb = round(free_mem / GIG, 2)
diff --git a/invokeai/backend/model_manager/load/model_cache/model_locker.py b/invokeai/backend/model_manager/load/model_cache/model_locker.py
index 81dca346e5..ea38d3773c 100644
--- a/invokeai/backend/model_manager/load/model_cache/model_locker.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_locker.py
@@ -33,14 +33,13 @@ class ModelLocker(ModelLockerBase):
             return self.model
 
         # NOTE that the model has to have the to() method in order for this code to move it into GPU!
-        self._cache_entry.lock()
-
         try:
             if self._cache.lazy_offloading:
                 self._cache.offload_unlocked_models(self._cache_entry.size)
 
             self._cache.move_model_to_device(self._cache_entry, self._cache.execution_device)
             self._cache_entry.loaded = True
+            self._cache_entry.lock()
 
             self._cache.logger.debug(f"Locking {self._cache_entry.key} in {self._cache.execution_device}")
             self._cache.print_cuda_stats()
@@ -51,6 +50,7 @@ class ModelLocker(ModelLockerBase):
         except Exception:
             self._cache_entry.unlock()
             raise
+
        return self.model
 
     def unlock(self) -> None:
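
Note: below is a minimal standalone sketch (not part of the diff) of the accounting that the new _free_vram() introduces: the free VRAM reported by torch.cuda.mem_get_info() plus the sizes of loaded-but-unlocked cache entries, since those can be offloaded before the incoming model is moved onto the GPU. FakeCacheEntry and effective_free_vram are hypothetical names used only for illustration.

# Illustrative sketch of the free-VRAM accounting; names are hypothetical.
from dataclasses import dataclass


@dataclass
class FakeCacheEntry:
    size: int      # bytes occupied on the execution device
    loaded: bool   # currently resident in VRAM
    locked: bool   # in active use; cannot be offloaded


def effective_free_vram(reported_free: int, entries: list[FakeCacheEntry]) -> int:
    """Free VRAM plus everything that could be offloaded to make room."""
    reclaimable = sum(e.size for e in entries if e.loaded and not e.locked)
    return reported_free + reclaimable


entries = [
    FakeCacheEntry(size=2 * 2**30, loaded=True, locked=False),   # offloadable
    FakeCacheEntry(size=4 * 2**30, loaded=True, locked=True),    # pinned by an active generation
    FakeCacheEntry(size=1 * 2**30, loaded=False, locked=False),  # already offloaded to RAM
]

# With 3 GiB actually free, the check now sees 3 + 2 = 5 GiB of usable headroom.
print(effective_free_vram(3 * 2**30, entries) / 2**30)  # 5.0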