Add debug logging of changes in RAM and VRAM for all model cache operations.

Ryan Dick 2023-09-28 16:27:16 -04:00
parent 44d68f5ed5
commit 594fd3ba6d

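In essence, every model-cache operation (load from disk, move to GPU, offload to CPU) gains the same instrumentation: capture a RAM/VRAM snapshot before and after, time the operation, and log the delta at debug level. A rough standalone sketch of that pattern (the helper name and the callable argument are illustrative, not taken from the commit; GIG mirrors the constant used in the diff below):

import gc
import time

import psutil
import torch

GIG = 2**30  # bytes per GiB, mirroring the GIG constant used in the diff below


def log_memory_delta(label: str, operation) -> None:
    """Illustrative helper: run `operation` and print its RAM/VRAM impact."""
    gc.collect()  # reduce noise in the RSS reading, as MemorySnapshot.capture() does
    ram_before = psutil.Process().memory_info().rss
    vram_before = torch.cuda.memory_allocated() if torch.cuda.is_available() else 0

    start = time.time()
    operation()
    elapsed = time.time() - start

    ram_after = psutil.Process().memory_info().rss
    vram_after = torch.cuda.memory_allocated() if torch.cuda.is_available() else 0
    print(
        f"{label} took {elapsed:.2f}s."
        f" RAM: {(ram_after - ram_before)/GIG:+.2f}GB,"
        f" VRAM: {(vram_after - vram_before)/GIG:+.2f}GB"
    )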

@@ -20,11 +20,13 @@ import gc
 import hashlib
 import os
 import sys
+import time
 from contextlib import suppress
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Dict, Optional, Type, Union, types
 
+import psutil
 import torch
 
 import invokeai.backend.util.logging as logger
@@ -211,17 +213,27 @@ class ModelCache(object):
             if self.stats:
                 self.stats.misses += 1
 
-            # this will remove older cached models until
-            # there is sufficient room to load the requested model
-            self._make_cache_room(model_info.get_size(submodel))
+            self_reported_model_size_before_load = model_info.get_size(submodel)
+            # Remove old models from the cache to make room for the new model.
+            self._make_cache_room(self_reported_model_size_before_load)
 
-            # clean memory to make MemoryUsage() more accurate
-            gc.collect()
+            # Load the model from disk and capture a memory snapshot before/after.
+            start_load_time = time.time()
+            snapshot_before = MemorySnapshot.capture()
             model = model_info.get_model(child_type=submodel, torch_dtype=self.precision)
-            if mem_used := model_info.get_size(submodel):
-                self.logger.debug(f"CPU RAM used for load: {(mem_used/GIG):.2f} GB")
+            snapshot_after = MemorySnapshot.capture()
+            end_load_time = time.time()
 
-            cache_entry = _CacheRecord(self, model, mem_used)
+            self_reported_model_size_after_load = model_info.get_size(submodel)
+
+            self.logger.debug(
+                f"Moved model '{key}' from disk to cpu in {(end_load_time-start_load_time):.2f}s. Self-reported size"
+                f" before/after load: {(self_reported_model_size_before_load/GIG):.2f}GB /"
+                f" {(self_reported_model_size_after_load/GIG):.2f}GB."
+                f" {get_pretty_snapshot_diff(snapshot_before, snapshot_after)}."
+            )
+
+            cache_entry = _CacheRecord(self, model, self_reported_model_size_after_load)
             self._cached_models[key] = cache_entry
         else:
             if self.stats:
@@ -271,10 +283,17 @@ class ModelCache(object):
                     self.cache._offload_unlocked_models(self.size_needed)
 
                 if self.model.device != self.cache.execution_device:
-                    self.cache.logger.debug(f"Moving {self.key} into {self.cache.execution_device}")
-                    with VRAMUsage() as mem:
-                        self.model.to(self.cache.execution_device)  # move into GPU
-                    self.cache.logger.debug(f"GPU VRAM used for load: {(mem.vram_used/GIG):.2f} GB")
+                    start_model_to_time = time.time()
+                    snapshot_before = MemorySnapshot.capture()
+                    self.model.to(self.cache.execution_device)  # move into GPU
+                    snapshot_after = MemorySnapshot.capture()
+                    end_model_to_time = time.time()
+                    self.cache.logger.debug(
+                        f"Moved model '{self.key}' from {self.cache.storage_device} to"
+                        f" {self.cache.execution_device} in {(end_model_to_time-start_model_to_time):.2f}s."
+                        f" Estimated model size: {(self.cache_entry.size/GIG):.2f} GB."
+                        f" {get_pretty_snapshot_diff(snapshot_before, snapshot_after)}."
+                    )
 
                 self.cache.logger.debug(f"Locking {self.key} in {self.cache.execution_device}")
                 self.cache._print_cuda_stats()
@@ -427,11 +446,19 @@ class ModelCache(object):
             if vram_in_use <= reserved:
                 break
             if not cache_entry.locked and cache_entry.loaded:
-                self.logger.debug(f"Offloading {model_key} from {self.execution_device} into {self.storage_device}")
-                with VRAMUsage() as mem:
-                    cache_entry.model.to(self.storage_device)
-                self.logger.debug(f"GPU VRAM freed: {(mem.vram_used/GIG):.2f} GB")
-                vram_in_use += mem.vram_used  # note vram_used is negative
+                start_model_to_time = time.time()
+                snapshot_before = MemorySnapshot.capture()
+                cache_entry.model.to(self.storage_device)
+                snapshot_after = MemorySnapshot.capture()
+                end_model_to_time = time.time()
+                self.logger.debug(
+                    f"Moved model '{model_key}' from {self.execution_device} to {self.storage_device} in"
+                    f" {(end_model_to_time-start_model_to_time):.2f}s. Estimated model size:"
+                    f" {(cache_entry.size/GIG):.2f} GB."
+                    f" {get_pretty_snapshot_diff(snapshot_before, snapshot_after)}."
+                )
+                vram_in_use = snapshot_after.vram or 0
+
         self.logger.debug(f"{(vram_in_use/GIG):.2f}GB VRAM used for models; max allowed={(reserved/GIG):.2f}GB")
 
         gc.collect()
@@ -460,14 +487,60 @@ class ModelCache(object):
         return hash
 
 
-class VRAMUsage(object):
-    def __init__(self):
-        self.vram = None
-        self.vram_used = 0
-
-    def __enter__(self):
-        self.vram = torch.cuda.memory_allocated()
-        return self
-
-    def __exit__(self, *args):
-        self.vram_used = torch.cuda.memory_allocated() - self.vram
+class MemorySnapshot:
+    """A snapshot of RAM and VRAM usage. All values are in bytes."""
+
+    def __init__(self, process_ram: int, vram: Optional[int]):
+        """Initialize a MemorySnapshot.
+
+        Most of the time, `MemorySnapshot` will be constructed with `MemorySnapshot.capture()`.
+
+        Args:
+            process_ram (int): CPU RAM used by the current process.
+            vram (Optional[int]): VRAM used by torch.
+        """
+        self.process_ram = process_ram
+        self.vram = vram
+
+    @classmethod
+    def capture(cls, run_garbage_collector: bool = True):
+        """Capture and return a MemorySnapshot.
+
+        Note: This function has significant overhead, particularly if `run_garbage_collector == True`.
+
+        Args:
+            run_garbage_collector (bool, optional): If true, gc.collect() will be run before checking the process RAM
+                usage. Defaults to True.
+
+        Returns:
+            MemorySnapshot
+        """
+        if run_garbage_collector:
+            gc.collect()
+
+        # According to the psutil docs (https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info), rss is
+        # supported on all platforms.
+        process_ram = psutil.Process().memory_info().rss
+
+        if choose_torch_device() == torch.device("cuda"):
+            vram = torch.cuda.memory_allocated()
+        else:
+            # TODO: We could add support for mps.current_allocated_memory() as well. Leaving out for now until we have
+            # time to test it properly.
+            vram = None
+
+        return cls(process_ram, vram)
+
+
+def get_pretty_snapshot_diff(snapshot_1: MemorySnapshot, snapshot_2: MemorySnapshot) -> str:
+    """Get a pretty string describing the difference between two `MemorySnapshot`s."""
+    ram_diff = snapshot_2.process_ram - snapshot_1.process_ram
+    msg = f"RAM ({(ram_diff/GIG):+.2f}): {(snapshot_1.process_ram/GIG):.2f}GB -> {(snapshot_2.process_ram/GIG):.2f}GB"
+
+    vram_diff = None
+    if snapshot_1.vram is not None and snapshot_2.vram is not None:
+        vram_diff = snapshot_2.vram - snapshot_1.vram
+        msg += f", VRAM ({(vram_diff/GIG):+.2f}): {(snapshot_1.vram/GIG):.2f}GB -> {(snapshot_2.vram/GIG):.2f}GB"
+
+    return msg