diff --git a/invokeai/app/services/config/config_default.py b/invokeai/app/services/config/config_default.py
index 895008b753..4f4d4850da 100644
--- a/invokeai/app/services/config/config_default.py
+++ b/invokeai/app/services/config/config_default.py
@@ -24,6 +24,7 @@ INIT_FILE = Path("invokeai.yaml")
 DB_FILE = Path("invokeai.db")
 LEGACY_INIT_FILE = Path("invokeai.init")
 DEFAULT_RAM_CACHE = 10.0
+DEFAULT_VRAM_CACHE = 0.25
 DEFAULT_CONVERT_CACHE = 20.0
 DEVICE = Literal["auto", "cpu", "cuda:0", "cuda:1", "cuda:2", "cuda:3", "cuda:4", "cuda:5", "cuda:6", "cuda:7", "mps"]
 PRECISION = Literal["auto", "float16", "bfloat16", "float32", "autocast"]
@@ -99,7 +100,9 @@ class InvokeAIAppConfig(BaseSettings):
         profile_prefix: An optional prefix for profile output files.
         profiles_dir: Path to profiles output directory.
         ram: Maximum memory amount used by memory model cache for rapid switching (GB).
+        vram: Amount of VRAM reserved for model storage (GB).
         convert_cache: Maximum size of on-disk converted models cache (GB).
+        lazy_offload: Keep models in VRAM until their space is needed.
         log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.
         device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities. Valid values: `auto`, `cpu`, `cuda:0`, `cuda:1`, `cuda:2`, `cuda:3`, `cuda:4`, `cuda:5`, `cuda:6`, `cuda:7`, `mps`
         devices: List of execution devices; will override default device selected.
@@ -167,7 +170,9 @@ class InvokeAIAppConfig(BaseSettings):
 
     # CACHE
     ram: float = Field(default_factory=get_default_ram_cache_size, gt=0, description="Maximum memory amount used by memory model cache for rapid switching (GB).")
+    vram: float = Field(default=DEFAULT_VRAM_CACHE, ge=0, description="Amount of VRAM reserved for model storage (GB).")
     convert_cache: float = Field(default=DEFAULT_CONVERT_CACHE, ge=0, description="Maximum size of on-disk converted models cache (GB).")
+    lazy_offload: bool = Field(default=True, description="Keep models in VRAM until their space is needed.")
     log_memory_usage: bool = Field(default=False, description="If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.")
 
     # DEVICE
@@ -366,9 +371,6 @@ def migrate_v3_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig:
             # `max_cache_size` was renamed to `ram` some time in v3, but both names were used
             if k == "max_cache_size" and "ram" not in category_dict:
                 parsed_config_dict["ram"] = v
-            # vram was removed in v4.0.2
-            if k in ["vram", "max_vram_cache_size", "lazy_offload"]:
-                continue
             # autocast was removed in v4.0.1
             if k == "precision" and v == "autocast":
                 parsed_config_dict["precision"] = "auto"
@@ -419,6 +421,9 @@ def migrate_v4_0_0_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig
 def migrate_v4_0_1_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig:
     """Migrate v4.0.1 config dictionary to a current config object.
 
+    A few new multi-GPU options were added in v4.0.2, so this migration simply
+    updates the schema version.
+
     Args:
         config_dict: A dictionary of settings from a v4.0.1 config file.
 
@@ -426,15 +431,16 @@ def migrate_v4_0_1_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig
         An instance of `InvokeAIAppConfig` with the migrated settings.
     """
     parsed_config_dict: dict[str, Any] = {}
     for k, v in config_dict.items():
-        if k not in ["vram", "lazy_offload"]:
-            parsed_config_dict[k] = v
+        # `vram` and `lazy_offload` are valid settings again, so they are carried over unchanged
+        parsed_config_dict[k] = v
         if k == "schema_version":
             parsed_config_dict[k] = CONFIG_SCHEMA_VERSION
     config = DefaultInvokeAIAppConfig.model_validate(parsed_config_dict)
     return config
 
 
+# TODO: replace this with a formal registration and migration system
 def load_and_migrate_config(config_path: Path) -> InvokeAIAppConfig:
     """Load and migrate a config file to the latest version.
 
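Reviewer note: a minimal sketch of how the restored settings read once the hunks above land, assuming no `INVOKEAI_`-prefixed environment variables are set (the override values below are illustrative only, not part of the patch):

    from invokeai.app.services.config.config_default import InvokeAIAppConfig

    config = InvokeAIAppConfig()
    assert config.vram == 0.25          # DEFAULT_VRAM_CACHE introduced above
    assert config.lazy_offload is True  # models stay in VRAM until their space is needed

    # Explicit overrides behave like any other pydantic-settings field:
    tuned = InvokeAIAppConfig(vram=2.0, lazy_offload=False)
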
diff --git a/invokeai/app/services/model_manager/model_manager_default.py b/invokeai/app/services/model_manager/model_manager_default.py
index ccb68f783b..fbb19ab527 100644
--- a/invokeai/app/services/model_manager/model_manager_default.py
+++ b/invokeai/app/services/model_manager/model_manager_default.py
@@ -76,6 +76,8 @@ class ModelManagerService(ModelManagerServiceBase):
 
         ram_cache = ModelCache(
             max_cache_size=app_config.ram,
+            max_vram_cache_size=app_config.vram,
+            lazy_offloading=app_config.lazy_offload,
             logger=logger,
         )
         convert_cache = ModelConvertCache(cache_path=app_config.convert_cache_path, max_size=app_config.convert_cache)
diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py
index c86ec5ddda..6e6553db47 100644
--- a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py
@@ -113,12 +113,28 @@ class ModelCacheBase(ABC, Generic[T]):
         """
         pass
 
+    @property
+    @abstractmethod
+    def lazy_offloading(self) -> bool:
+        """Return true if the cache is configured to lazily offload models in VRAM."""
+        pass
+
     @property
     @abstractmethod
     def max_cache_size(self) -> float:
         """Return true if the cache is configured to lazily offload models in VRAM."""
         pass
 
+    @abstractmethod
+    def offload_unlocked_models(self, size_required: int) -> None:
+        """Offload from VRAM any models not actively in use."""
+        pass
+
+    @abstractmethod
+    def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> None:
+        """Move model into the indicated device."""
+        pass
+
     @property
     @abstractmethod
     def stats(self) -> Optional[CacheStats]:
diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py
index 910087c4bb..3cebfb8820 100644
--- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py
@@ -19,8 +19,10 @@ context. Use like this:
 """
 
 import gc
+import math
 import sys
 import threading
+import time
 from contextlib import contextmanager, suppress
 from logging import Logger
 from threading import BoundedSemaphore
@@ -29,7 +31,7 @@ from typing import Dict, Generator, List, Optional, Set
 import torch
 
 from invokeai.backend.model_manager import AnyModel, SubModelType
-from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot
+from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot, get_pretty_snapshot_diff
 from invokeai.backend.util.devices import TorchDevice
 from invokeai.backend.util.logging import InvokeAILogger
 
@@ -40,6 +42,11 @@ from .model_locker import ModelLocker
 # Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously
 DEFAULT_MAX_CACHE_SIZE = 6.0
 
+# amount of GPU memory to hold in reserve for use by generations (GB)
+# Empirically this value seems to improve performance without starving other
+# processes.
+DEFAULT_MAX_VRAM_CACHE_SIZE = 0.25
+
 # actual size of a gig
 GIG = 1073741824
 
@@ -53,10 +60,12 @@ class ModelCache(ModelCacheBase[AnyModel]):
     def __init__(
         self,
         max_cache_size: float = DEFAULT_MAX_CACHE_SIZE,
+        max_vram_cache_size: float = DEFAULT_MAX_VRAM_CACHE_SIZE,
         storage_device: torch.device = torch.device("cpu"),
         execution_devices: Optional[Set[torch.device]] = None,
         precision: torch.dtype = torch.float16,
         sequential_offload: bool = False,
+        lazy_offloading: bool = True,
         sha_chunksize: int = 16777216,
         log_memory_usage: bool = False,
         logger: Optional[Logger] = None,
@@ -67,14 +76,18 @@ class ModelCache(ModelCacheBase[AnyModel]):
         :param max_cache_size: Maximum size of the RAM cache [6.0 GB]
         :param storage_device: Torch device to save inactive model in [torch.device('cpu')]
         :param precision: Precision for loaded models [torch.float16]
+        :param lazy_offloading: Keep model in VRAM until another model needs to be loaded
         :param sequential_offload: Conserve VRAM by loading and unloading each stage of the pipeline sequentially
         :param log_memory_usage: If True, a memory snapshot will be captured before and after every model cache
            operation, and the result will be logged (at debug level). There is a time cost to capturing the memory
            snapshots, so it is recommended to disable this feature unless you are actively inspecting the model
            cache's behaviour.
         """
+        # allow lazy offloading only when vram cache enabled
+        self._lazy_offloading = lazy_offloading and max_vram_cache_size > 0
         self._precision: torch.dtype = precision
         self._max_cache_size: float = max_cache_size
+        self._max_vram_cache_size: float = max_vram_cache_size
         self._storage_device: torch.device = storage_device
         self._ram_lock = threading.Lock()
         self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__)
@@ -98,6 +111,11 @@ class ModelCache(ModelCacheBase[AnyModel]):
         """Return the logger used by the cache."""
         return self._logger
 
+    @property
+    def lazy_offloading(self) -> bool:
+        """Return true if the cache is configured to lazily offload models in VRAM."""
+        return self._lazy_offloading
+
     @property
     def storage_device(self) -> torch.device:
         """Return the storage device (e.g. "CPU" for RAM)."""
@@ -277,6 +295,87 @@ class ModelCache(ModelCacheBase[AnyModel]):
         else:
             return model_key
 
+    def offload_unlocked_models(self, size_required: int) -> None:
+        """Move any unused models from VRAM."""
+        reserved = self._max_vram_cache_size * GIG
+        vram_in_use = torch.cuda.memory_allocated() + size_required
+        self.logger.debug(f"{(vram_in_use/GIG):.2f}GB VRAM needed for models; max allowed={(reserved/GIG):.2f}GB")
+        for _, cache_entry in sorted(self._cached_models.items(), key=lambda x: x[1].size):
+            if vram_in_use <= reserved:
+                break
+            if not cache_entry.loaded:
+                continue
+            if not cache_entry.locked:
+                self.move_model_to_device(cache_entry, self.storage_device)
+                cache_entry.loaded = False
+                vram_in_use = torch.cuda.memory_allocated() + size_required
+                self.logger.debug(
+                    f"Removing {cache_entry.key} from VRAM to free {(cache_entry.size/GIG):.2f}GB; vram free = {(torch.cuda.memory_allocated()/GIG):.2f}GB"
+                )
+
+        TorchDevice.empty_cache()
+
+    def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> None:
+        """Move model into the indicated device.
+
+        :param cache_entry: The CacheRecord for the model
+        :param target_device: The torch.device to move the model into
+
+        May raise a torch.cuda.OutOfMemoryError
+        """
+        # These attributes are not in the base ModelMixin class but in various derived classes.
+        # Some models don't have these attributes, in which case they run in RAM/CPU.
+        self.logger.debug(f"Called to move {cache_entry.key} to {target_device}")
+        if not (hasattr(cache_entry.model, "device") and hasattr(cache_entry.model, "to")):
+            return
+
+        source_device = cache_entry.model.device
+
+        # Note: We compare device types only so that 'cuda' == 'cuda:0'.
+        # This would need to be revised to support multi-GPU.
+        if torch.device(source_device).type == torch.device(target_device).type:
+            return
+
+        start_model_to_time = time.time()
+        snapshot_before = self._capture_memory_snapshot()
+        try:
+            cache_entry.model.to(target_device)
+        except Exception as e:  # blow away cache entry
+            self._delete_cache_entry(cache_entry)
+            raise e
+
+        snapshot_after = self._capture_memory_snapshot()
+        end_model_to_time = time.time()
+        self.logger.debug(
+            f"Moved model '{cache_entry.key}' from {source_device} to"
+            f" {target_device} in {(end_model_to_time-start_model_to_time):.2f}s."
+            f"Estimated model size: {(cache_entry.size/GIG):.3f} GB."
+            f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
+        )
+
+        if (
+            snapshot_before is not None
+            and snapshot_after is not None
+            and snapshot_before.vram is not None
+            and snapshot_after.vram is not None
+        ):
+            vram_change = abs(snapshot_before.vram - snapshot_after.vram)
+
+            # If the estimated model size does not match the change in VRAM, log a warning.
+            if not math.isclose(
+                vram_change,
+                cache_entry.size,
+                rel_tol=0.1,
+                abs_tol=10 * MB,
+            ):
+                self.logger.debug(
+                    f"Moving model '{cache_entry.key}' from {source_device} to"
+                    f" {target_device} caused an unexpected change in VRAM usage. The model's"
+                    " estimated size may be incorrect. Estimated model size:"
+                    f" {(cache_entry.size/GIG):.3f} GB.\n"
+                    f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
+                )
+
     def print_cuda_stats(self) -> None:
         """Log CUDA diagnostics."""
         vram = "%4.2fG" % (torch.cuda.memory_allocated() / GIG)
diff --git a/invokeai/backend/model_manager/load/model_cache/model_locker.py b/invokeai/backend/model_manager/load/model_cache/model_locker.py
index c7685fc8f7..b9349ea3dd 100644
--- a/invokeai/backend/model_manager/load/model_cache/model_locker.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_locker.py
@@ -2,7 +2,6 @@
 Base class and implementation of a class that moves models in and out of VRAM.
 """
 
-import copy
 from typing import Optional
 
 import torch
@@ -55,13 +54,14 @@ class ModelLocker(ModelLockerBase):
         # NOTE that the model has to have the to() method in order for this code to move it into GPU!
         self._cache_entry.lock()
         try:
-            # We wait for a gpu to be free - may raise a ValueError
-            self._execution_device = self._cache.get_execution_device()
-            self._cache.logger.debug(f"Locking {self._cache_entry.key} in {self._execution_device}")
-            model_in_gpu = copy.deepcopy(self._cache_entry.model)
-            if hasattr(model_in_gpu, "to"):
-                model_in_gpu.to(self._execution_device)
+            if self._cache.lazy_offloading:
+                self._cache.offload_unlocked_models(self._cache_entry.size)
+
+            execution_device = self._cache.get_execution_device()
+            self._cache.move_model_to_device(self._cache_entry, execution_device)
             self._cache_entry.loaded = True
+
+            self._cache.logger.debug(f"Locking {self._cache_entry.key} in {execution_device}")
             self._cache.print_cuda_stats()
         except torch.cuda.OutOfMemoryError:
             self._cache.logger.warning("Insufficient GPU memory to load model. Aborting")
@@ -70,11 +70,15 @@ class ModelLocker(ModelLockerBase):
         except Exception:
             self._cache_entry.unlock()
             raise
-        return model_in_gpu
+
+        return self.model
 
     def unlock(self) -> None:
         """Call upon exit from context."""
         if not hasattr(self.model, "to"):
             return
+
         self._cache_entry.unlock()
-        self._cache.print_cuda_stats()
+        if not self._cache.lazy_offloading:
+            self._cache.offload_unlocked_models(self._cache_entry.size)
+        self._cache.print_cuda_stats()
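
Reviewer note: the eviction policy in offload_unlocked_models() can be hard to see through the CUDA bookkeeping. Below is a small self-contained sketch of the same idea: unlocked, resident models are pushed back to the storage device smallest-first until the projected VRAM use fits under the configured reserve. The names (PlanEntry, plan_offloads) are illustrative and not part of the patch, and the real method re-reads torch.cuda.memory_allocated() after every move instead of subtracting estimated sizes.

    from dataclasses import dataclass
    from typing import List

    GIG = 2**30  # bytes per GB, matching the GIG constant in model_cache_default.py


    @dataclass
    class PlanEntry:
        key: str
        size: int      # estimated model size in bytes
        loaded: bool   # currently resident in VRAM
        locked: bool   # actively in use by a generation


    def plan_offloads(entries: List[PlanEntry], vram_allocated: int, size_required: int, max_vram_cache_gb: float) -> List[str]:
        """Return the keys that would be moved back to the storage device, smallest first."""
        reserved = max_vram_cache_gb * GIG
        projected = vram_allocated + size_required
        offloaded: List[str] = []
        for entry in sorted(entries, key=lambda e: e.size):
            if projected <= reserved:
                break
            if not entry.loaded or entry.locked:
                continue
            offloaded.append(entry.key)
            projected -= entry.size  # the real code re-measures allocated VRAM here
        return offloaded


    # With the default 0.25 GB reserve, locking a 2 GB model evicts both idle models.
    entries = [
        PlanEntry("sdxl-vae", int(0.3 * GIG), loaded=True, locked=False),
        PlanEntry("sdxl-unet", int(5.0 * GIG), loaded=True, locked=False),
    ]
    print(plan_offloads(entries, vram_allocated=int(5.3 * GIG), size_required=2 * GIG, max_vram_cache_gb=0.25))
    # -> ['sdxl-vae', 'sdxl-unet']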