Merge branch 'main' into ryan/model-tests
Commit 4f97bd4418
@@ -256,6 +256,10 @@ manager, please follow these steps:
    **highly recommended** if your virtual environment is located outside of
    your runtime directory.
+
+    !!! tip
+
+        On linux, it is recommended to run invokeai with the following env var: `MALLOC_MMAP_THRESHOLD_=1048576`. For example: `MALLOC_MMAP_THRESHOLD_=1048576 invokeai --web`. This helps to prevent memory fragmentation that can lead to memory accumulation over time. This env var is set automatically when running via `invoke.sh`.

10. Render away!

    Browse the [features](../features/index.md) section to learn about all the
@@ -46,6 +46,9 @@ if [ "$(uname -s)" == "Darwin" ]; then
    export PYTORCH_ENABLE_MPS_FALLBACK=1
fi

+# Avoid glibc memory fragmentation. See invokeai/backend/model_management/README.md for details.
+export MALLOC_MMAP_THRESHOLD_=1048576
+
# Primary function for the case statement to determine user input
do_choice() {
    case $1 in
invokeai/backend/model_management/README.md (new file, 27 lines)
@@ -0,0 +1,27 @@
# Model Cache

## `glibc` Memory Allocator Fragmentation

Python (and PyTorch) relies on the memory allocator from the C Standard Library (`libc`). On Linux, with the GNU C Standard Library implementation (`glibc`), our memory access patterns have been observed to cause severe memory fragmentation. This fragmentation results in large amounts of memory that has been freed but can't be released back to the OS. Loading models from disk and moving them between CPU/CUDA seem to be the operations that contribute most to the fragmentation. This memory fragmentation issue can result in OOM crashes during frequent model switching, even if `max_cache_size` is set to a reasonable value (e.g. an OOM crash with `max_cache_size=16` on a system with 32GB of RAM).

This problem may also exist on other OSes and other `libc` implementations, but at the time of writing it has only been investigated on Linux with `glibc`.

To better understand how the `glibc` memory allocator works, see these references:

- Basics: https://www.gnu.org/software/libc/manual/html_node/The-GNU-Allocator.html
- Details: https://sourceware.org/glibc/wiki/MallocInternals

Note the differences between memory allocated as chunks in an arena vs. memory allocated with `mmap`. Under `glibc`'s default configuration, most model tensors get allocated as chunks in an arena, making them vulnerable to the problem of fragmentation.

We can work around this memory fragmentation issue by setting the following env var:

```bash
# Force blocks >1MB to be allocated with `mmap` so that they are released to the system immediately when they are freed.
MALLOC_MMAP_THRESHOLD_=1048576
```

See the following references for more information about the `malloc` tunable parameters:

- https://www.gnu.org/software/libc/manual/html_node/Malloc-Tunable-Parameters.html
- https://www.gnu.org/software/libc/manual/html_node/Memory-Allocation-Tunables.html
- https://man7.org/linux/man-pages/man3/mallopt.3.html
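As an illustration only (not part of this change), the same threshold can in principle be set programmatically through `glibc`'s `mallopt(3)`. The sketch below assumes a `glibc` system where `libc.so.6` is loadable and where `M_MMAP_THRESHOLD` has its documented value of `-3`:

```python
import ctypes

# Sketch: programmatic equivalent of MALLOC_MMAP_THRESHOLD_=1048576 via mallopt(3).
# Assumes glibc; M_MMAP_THRESHOLD is -3 per glibc's <malloc.h>.
libc = ctypes.CDLL("libc.so.6")
M_MMAP_THRESHOLD = -3

# mallopt() returns 1 on success, 0 on error.
ok = libc.mallopt(M_MMAP_THRESHOLD, 1048576)
print(f"mallopt applied: {bool(ok)}")
```

In practice the env var is the simpler option, since it takes effect before any allocations happen.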
The model cache emits debug logs that provide visibility into the state of the `libc` memory allocator. See the `LibcUtil` class for more info on how these `libc` malloc stats are collected.
invokeai/backend/model_management/libc_util.py (new file, 75 lines)
@@ -0,0 +1,75 @@
import ctypes


class Struct_mallinfo2(ctypes.Structure):
    """A ctypes Structure that matches the libc mallinfo2 struct.

    Docs:
    - https://man7.org/linux/man-pages/man3/mallinfo.3.html
    - https://www.gnu.org/software/libc/manual/html_node/Statistics-of-Malloc.html

    struct mallinfo2 {
        size_t arena;     /* Non-mmapped space allocated (bytes) */
        size_t ordblks;   /* Number of free chunks */
        size_t smblks;    /* Number of free fastbin blocks */
        size_t hblks;     /* Number of mmapped regions */
        size_t hblkhd;    /* Space allocated in mmapped regions (bytes) */
        size_t usmblks;   /* See below */
        size_t fsmblks;   /* Space in freed fastbin blocks (bytes) */
        size_t uordblks;  /* Total allocated space (bytes) */
        size_t fordblks;  /* Total free space (bytes) */
        size_t keepcost;  /* Top-most, releasable space (bytes) */
    };
    """

    _fields_ = [
        ("arena", ctypes.c_size_t),
        ("ordblks", ctypes.c_size_t),
        ("smblks", ctypes.c_size_t),
        ("hblks", ctypes.c_size_t),
        ("hblkhd", ctypes.c_size_t),
        ("usmblks", ctypes.c_size_t),
        ("fsmblks", ctypes.c_size_t),
        ("uordblks", ctypes.c_size_t),
        ("fordblks", ctypes.c_size_t),
        ("keepcost", ctypes.c_size_t),
    ]

    def __str__(self):
        s = ""
        s += f"{'arena': <10}= {(self.arena/2**30):15.5f} # Non-mmapped space allocated (GB) (uordblks + fordblks)\n"
        s += f"{'ordblks': <10}= {(self.ordblks): >15} # Number of free chunks\n"
        s += f"{'smblks': <10}= {(self.smblks): >15} # Number of free fastbin blocks\n"
        s += f"{'hblks': <10}= {(self.hblks): >15} # Number of mmapped regions\n"
        s += f"{'hblkhd': <10}= {(self.hblkhd/2**30):15.5f} # Space allocated in mmapped regions (GB)\n"
        s += f"{'usmblks': <10}= {(self.usmblks): >15} # Unused\n"
        s += f"{'fsmblks': <10}= {(self.fsmblks/2**30):15.5f} # Space in freed fastbin blocks (GB)\n"
        s += (
            f"{'uordblks': <10}= {(self.uordblks/2**30):15.5f} # Space used by in-use allocations (non-mmapped)"
            " (GB)\n"
        )
        s += f"{'fordblks': <10}= {(self.fordblks/2**30):15.5f} # Space in free blocks (non-mmapped) (GB)\n"
        s += f"{'keepcost': <10}= {(self.keepcost/2**30):15.5f} # Top-most, releasable space (GB)\n"
        return s


class LibcUtil:
    """A utility class for interacting with the C Standard Library (`libc`) via ctypes.

    Note that this class will raise on __init__() if 'libc.so.6' can't be found. Take care to handle environments where
    this shared library is not available.

    TODO: Improve cross-OS compatibility of this class.
    """

    def __init__(self):
        self._libc = ctypes.cdll.LoadLibrary("libc.so.6")

    def mallinfo2(self) -> Struct_mallinfo2:
        """Calls `libc` `mallinfo2`.

        Docs: https://man7.org/linux/man-pages/man3/mallinfo.3.html
        """
        mallinfo2 = self._libc.mallinfo2
        mallinfo2.restype = Struct_mallinfo2
        return mallinfo2()
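For context, a minimal usage sketch of the class above (illustrative, not part of the diff); the `OSError` handling mirrors how `MemorySnapshot.capture()` guards against systems without `libc.so.6`:

```python
from invokeai.backend.model_management.libc_util import LibcUtil

# Print glibc malloc statistics, tolerating systems where libc.so.6 is unavailable.
try:
    print(LibcUtil().mallinfo2())
except OSError:
    print("libc.so.6 not found; malloc stats are unsupported on this system.")
```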
invokeai/backend/model_management/memory_snapshot.py (new file, 94 lines)
@@ -0,0 +1,94 @@
import gc
from typing import Optional

import psutil
import torch

from invokeai.backend.model_management.libc_util import LibcUtil, Struct_mallinfo2

GB = 2**30  # 1 GB


class MemorySnapshot:
    """A snapshot of RAM and VRAM usage. All values are in bytes."""

    def __init__(self, process_ram: int, vram: Optional[int], malloc_info: Optional[Struct_mallinfo2]):
        """Initialize a MemorySnapshot.

        Most of the time, `MemorySnapshot` will be constructed with `MemorySnapshot.capture()`.

        Args:
            process_ram (int): CPU RAM used by the current process.
            vram (Optional[int]): VRAM used by torch.
            malloc_info (Optional[Struct_mallinfo2]): Malloc info obtained from LibcUtil.
        """
        self.process_ram = process_ram
        self.vram = vram
        self.malloc_info = malloc_info

    @classmethod
    def capture(cls, run_garbage_collector: bool = True):
        """Capture and return a MemorySnapshot.

        Note: This function has significant overhead, particularly if `run_garbage_collector == True`.

        Args:
            run_garbage_collector (bool, optional): If true, gc.collect() will be run before checking the process RAM
                usage. Defaults to True.

        Returns:
            MemorySnapshot
        """
        if run_garbage_collector:
            gc.collect()

        # According to the psutil docs (https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info), rss is
        # supported on all platforms.
        process_ram = psutil.Process().memory_info().rss

        if torch.cuda.is_available():
            vram = torch.cuda.memory_allocated()
        else:
            # TODO: We could add support for mps.current_allocated_memory() as well. Leaving out for now until we have
            # time to test it properly.
            vram = None

        try:
            malloc_info = LibcUtil().mallinfo2()
        except OSError:
            # This is expected in environments that do not have the 'libc.so.6' shared library.
            malloc_info = None

        return cls(process_ram, vram, malloc_info)


def get_pretty_snapshot_diff(snapshot_1: MemorySnapshot, snapshot_2: MemorySnapshot) -> str:
    """Get a pretty string describing the difference between two `MemorySnapshot`s."""

    def get_msg_line(prefix: str, val1: int, val2: int):
        diff = val2 - val1
        return f"{prefix: <30} ({(diff/GB):+5.3f}): {(val1/GB):5.3f}GB -> {(val2/GB):5.3f}GB\n"

    msg = ""

    msg += get_msg_line("Process RAM", snapshot_1.process_ram, snapshot_2.process_ram)

    if snapshot_1.malloc_info is not None and snapshot_2.malloc_info is not None:
        msg += get_msg_line("libc mmap allocated", snapshot_1.malloc_info.hblkhd, snapshot_2.malloc_info.hblkhd)

        msg += get_msg_line("libc arena used", snapshot_1.malloc_info.uordblks, snapshot_2.malloc_info.uordblks)

        msg += get_msg_line("libc arena free", snapshot_1.malloc_info.fordblks, snapshot_2.malloc_info.fordblks)

        libc_total_allocated_1 = snapshot_1.malloc_info.arena + snapshot_1.malloc_info.hblkhd
        libc_total_allocated_2 = snapshot_2.malloc_info.arena + snapshot_2.malloc_info.hblkhd
        msg += get_msg_line("libc total allocated", libc_total_allocated_1, libc_total_allocated_2)

        libc_total_used_1 = snapshot_1.malloc_info.uordblks + snapshot_1.malloc_info.hblkhd
        libc_total_used_2 = snapshot_2.malloc_info.uordblks + snapshot_2.malloc_info.hblkhd
        msg += get_msg_line("libc total used", libc_total_used_1, libc_total_used_2)

    if snapshot_1.vram is not None and snapshot_2.vram is not None:
        msg += get_msg_line("VRAM", snapshot_1.vram, snapshot_2.vram)

    return msg
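A minimal usage sketch of how these helpers fit together (illustrative only, not part of the diff); the tensor size is arbitrary:

```python
import torch

from invokeai.backend.model_management.memory_snapshot import MemorySnapshot, get_pretty_snapshot_diff

# Measure the memory impact of an allocation by diffing two snapshots.
snapshot_before = MemorySnapshot.capture()
x = torch.zeros((1024, 1024, 64))  # ~256MB of fp32, purely illustrative
snapshot_after = MemorySnapshot.capture()
print(get_pretty_snapshot_diff(snapshot_before, snapshot_after))
```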
invokeai/backend/model_management/model_cache.py
@@ -18,8 +18,10 @@ context. Use like this:
import gc
import hashlib
+import math
import os
import sys
+import time
from contextlib import suppress
from dataclasses import dataclass, field
from pathlib import Path
@@ -28,6 +30,7 @@ from typing import Any, Dict, Optional, Type, Union, types
import torch

import invokeai.backend.util.logging as logger
+from invokeai.backend.model_management.memory_snapshot import MemorySnapshot, get_pretty_snapshot_diff

from ..util.devices import choose_torch_device
from .models import BaseModelType, ModelBase, ModelType, SubModelType
@@ -44,6 +47,8 @@ DEFAULT_MAX_VRAM_CACHE_SIZE = 2.75

# actual size of a gig
GIG = 1073741824
+# Size of a MB in bytes.
+MB = 2**20


@dataclass
@@ -205,22 +210,43 @@ class ModelCache(object):
        cache_entry = self._cached_models.get(key, None)
        if cache_entry is None:
            self.logger.info(
-                f"Loading model {model_path}, type {base_model.value}:{model_type.value}{':'+submodel.value if submodel else ''}"
+                f"Loading model {model_path}, type"
+                f" {base_model.value}:{model_type.value}{':'+submodel.value if submodel else ''}"
            )
            if self.stats:
                self.stats.misses += 1

-            # this will remove older cached models until
-            # there is sufficient room to load the requested model
-            self._make_cache_room(model_info.get_size(submodel))
+            self_reported_model_size_before_load = model_info.get_size(submodel)
+            # Remove old models from the cache to make room for the new model.
+            self._make_cache_room(self_reported_model_size_before_load)

-            # clean memory to make MemoryUsage() more accurate
-            gc.collect()
+            # Load the model from disk and capture a memory snapshot before/after.
+            start_load_time = time.time()
+            snapshot_before = MemorySnapshot.capture()
            model = model_info.get_model(child_type=submodel, torch_dtype=self.precision)
-            if mem_used := model_info.get_size(submodel):
-                self.logger.debug(f"CPU RAM used for load: {(mem_used/GIG):.2f} GB")
+            snapshot_after = MemorySnapshot.capture()
+            end_load_time = time.time()

-            cache_entry = _CacheRecord(self, model, mem_used)
+            self_reported_model_size_after_load = model_info.get_size(submodel)
+
+            self.logger.debug(
+                f"Moved model '{key}' from disk to cpu in {(end_load_time-start_load_time):.2f}s.\n"
+                f"Self-reported size before/after load: {(self_reported_model_size_before_load/GIG):.3f}GB /"
+                f" {(self_reported_model_size_after_load/GIG):.3f}GB.\n"
+                f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
+            )
+
+            # We only log a warning for over-reported (not under-reported) model sizes before load. There is a known
+            # issue where models report their fp32 size before load, and are then loaded as fp16. Once this issue is
+            # addressed, it would make sense to log a warning for both over-reported and under-reported model sizes.
+            if (self_reported_model_size_after_load - self_reported_model_size_before_load) > 10 * MB:
+                self.logger.warning(
+                    f"Model '{key}' mis-reported its size before load. Self-reported size before/after load:"
+                    f" {(self_reported_model_size_before_load/GIG):.2f}GB /"
+                    f" {(self_reported_model_size_after_load/GIG):.2f}GB."
+                )
+
+            cache_entry = _CacheRecord(self, model, self_reported_model_size_after_load)
            self._cached_models[key] = cache_entry
        else:
            if self.stats:
@@ -240,6 +266,45 @@ class ModelCache(object):

        return self.ModelLocker(self, key, cache_entry.model, gpu_load, cache_entry.size)

+    def _move_model_to_device(self, key: str, target_device: torch.device):
+        cache_entry = self._cached_models[key]
+
+        source_device = cache_entry.model.device
+        # Note: We compare device types only so that 'cuda' == 'cuda:0'. This would need to be revised to support
+        # multi-GPU.
+        if torch.device(source_device).type == torch.device(target_device).type:
+            return
+
+        start_model_to_time = time.time()
+        snapshot_before = MemorySnapshot.capture()
+        cache_entry.model.to(target_device)
+        snapshot_after = MemorySnapshot.capture()
+        end_model_to_time = time.time()
+        self.logger.debug(
+            f"Moved model '{key}' from {source_device} to"
+            f" {target_device} in {(end_model_to_time-start_model_to_time):.2f}s.\n"
+            f"Estimated model size: {(cache_entry.size/GIG):.3f} GB.\n"
+            f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
+        )
+
+        if snapshot_before.vram is not None and snapshot_after.vram is not None:
+            vram_change = abs(snapshot_before.vram - snapshot_after.vram)
+
+            # If the estimated model size does not match the change in VRAM, log a warning.
+            if not math.isclose(
+                vram_change,
+                cache_entry.size,
+                rel_tol=0.1,
+                abs_tol=10 * MB,
+            ):
+                self.logger.warning(
+                    f"Moving model '{key}' from {source_device} to"
+                    f" {target_device} caused an unexpected change in VRAM usage. The model's"
+                    " estimated size may be incorrect. Estimated model size:"
+                    f" {(cache_entry.size/GIG):.3f} GB.\n"
+                    f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
+                )
+
    class ModelLocker(object):
        def __init__(self, cache, key, model, gpu_load, size_needed):
            """
@@ -269,11 +334,7 @@ class ModelCache(object):
            if self.cache.lazy_offloading:
                self.cache._offload_unlocked_models(self.size_needed)

-            if self.model.device != self.cache.execution_device:
-                self.cache.logger.debug(f"Moving {self.key} into {self.cache.execution_device}")
-                with VRAMUsage() as mem:
-                    self.model.to(self.cache.execution_device)  # move into GPU
-                self.cache.logger.debug(f"GPU VRAM used for load: {(mem.vram_used/GIG):.2f} GB")
+            self.cache._move_model_to_device(self.key, self.cache.execution_device)

            self.cache.logger.debug(f"Locking {self.key} in {self.cache.execution_device}")
            self.cache._print_cuda_stats()
@@ -286,7 +347,7 @@ class ModelCache(object):
            # in the event that the caller wants the model in RAM, we
            # move it into CPU if it is in GPU and not locked
            elif self.cache_entry.loaded and not self.cache_entry.locked:
-                self.model.to(self.cache.storage_device)
+                self.cache._move_model_to_device(self.key, self.cache.storage_device)

            return self.model

@@ -339,7 +400,8 @@ class ModelCache(object):
                locked_models += 1

        self.logger.debug(
-            f"Current VRAM/RAM usage: {vram}/{ram}; cached_models/loaded_models/locked_models/ = {cached_models}/{loaded_models}/{locked_models}"
+            f"Current VRAM/RAM usage: {vram}/{ram}; cached_models/loaded_models/locked_models/ ="
+            f" {cached_models}/{loaded_models}/{locked_models}"
        )

    def _cache_size(self) -> int:
@@ -354,7 +416,8 @@ class ModelCache(object):

        if current_size + bytes_needed > maximum_size:
            self.logger.debug(
-                f"Max cache size exceeded: {(current_size/GIG):.2f}/{self.max_cache_size:.2f} GB, need an additional {(bytes_needed/GIG):.2f} GB"
+                f"Max cache size exceeded: {(current_size/GIG):.2f}/{self.max_cache_size:.2f} GB, need an additional"
+                f" {(bytes_needed/GIG):.2f} GB"
            )

        self.logger.debug(f"Before unloading: cached_models={len(self._cached_models)}")
@@ -387,7 +450,8 @@ class ModelCache(object):

            device = cache_entry.model.device if hasattr(cache_entry.model, "device") else None
            self.logger.debug(
-                f"Model: {model_key}, locks: {cache_entry._locks}, device: {device}, loaded: {cache_entry.loaded}, refs: {refs}"
+                f"Model: {model_key}, locks: {cache_entry._locks}, device: {device}, loaded: {cache_entry.loaded},"
+                f" refs: {refs}"
            )

            # 2 refs:
@@ -423,11 +487,9 @@ class ModelCache(object):
            if vram_in_use <= reserved:
                break
            if not cache_entry.locked and cache_entry.loaded:
-                self.logger.debug(f"Offloading {model_key} from {self.execution_device} into {self.storage_device}")
-                with VRAMUsage() as mem:
-                    cache_entry.model.to(self.storage_device)
-                self.logger.debug(f"GPU VRAM freed: {(mem.vram_used/GIG):.2f} GB")
-                vram_in_use += mem.vram_used  # note vram_used is negative
+                self._move_model_to_device(model_key, self.storage_device)
+
+                vram_in_use = torch.cuda.memory_allocated()
                self.logger.debug(f"{(vram_in_use/GIG):.2f}GB VRAM used for models; max allowed={(reserved/GIG):.2f}GB")

        gc.collect()
@@ -454,16 +516,3 @@ class ModelCache(object):
        with open(hashpath, "w") as f:
            f.write(hash)
        return hash
-
-
-class VRAMUsage(object):
-    def __init__(self):
-        self.vram = None
-        self.vram_used = 0
-
-    def __enter__(self):
-        self.vram = torch.cuda.memory_allocated()
-        return self
-
-    def __exit__(self, *args):
-        self.vram_used = torch.cuda.memory_allocated() - self.vram
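To illustrate the tolerance used in `_move_model_to_device` above (a sketch with made-up numbers, not code from the diff): the observed VRAM change is accepted if it is within 10% of the estimated model size, or within 10 MB of it.

```python
import math

MB = 2**20
GIG = 2**30

estimated_model_size = int(2.1 * GIG)
observed_vram_change = int(2.0 * GIG)

# Same check as in _move_model_to_device: close enough -> no warning is logged.
print(math.isclose(observed_vram_change, estimated_model_size, rel_tol=0.1, abs_tol=10 * MB))  # True
```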
tests/backend/model_management/test_libc_util.py (new file, 24 lines)
@@ -0,0 +1,24 @@
import pytest

from invokeai.backend.model_management.libc_util import LibcUtil, Struct_mallinfo2


def test_libc_util_mallinfo2():
    """Smoke test of LibcUtil().mallinfo2()."""
    try:
        libc = LibcUtil()
    except OSError:
        # TODO: Set the expected result preemptively based on the system properties.
        pytest.xfail("libc shared library is not available on this system.")

    info = libc.mallinfo2()

    assert info.arena > 0


def test_struct_mallinfo2_to_str():
    """Smoke test of Struct_mallinfo2.__str__()."""
    info = Struct_mallinfo2()
    info_str = str(info)

    assert len(info_str) > 0
tests/backend/model_management/test_memory_snapshot.py (new file, 35 lines)
@@ -0,0 +1,35 @@
import pytest

from invokeai.backend.model_management.libc_util import Struct_mallinfo2
from invokeai.backend.model_management.memory_snapshot import MemorySnapshot, get_pretty_snapshot_diff


def test_memory_snapshot_capture():
    """Smoke test of MemorySnapshot.capture()."""
    snapshot = MemorySnapshot.capture()

    # We just check process_ram, because it is the only field that should be supported on all platforms.
    assert snapshot.process_ram > 0


snapshots = [
    MemorySnapshot(process_ram=1.0, vram=2.0, malloc_info=Struct_mallinfo2()),
    MemorySnapshot(process_ram=1.0, vram=2.0, malloc_info=None),
    MemorySnapshot(process_ram=1.0, vram=None, malloc_info=Struct_mallinfo2()),
    MemorySnapshot(process_ram=1.0, vram=None, malloc_info=None),
]


@pytest.mark.parametrize("snapshot_1", snapshots)
@pytest.mark.parametrize("snapshot_2", snapshots)
def test_get_pretty_snapshot_diff(snapshot_1, snapshot_2):
    """Test that get_pretty_snapshot_diff() works with various combinations of missing MemorySnapshot fields."""
    msg = get_pretty_snapshot_diff(snapshot_1, snapshot_2)

    expected_lines = 1
    if snapshot_1.vram is not None and snapshot_2.vram is not None:
        expected_lines += 1
    if snapshot_1.malloc_info is not None and snapshot_2.malloc_info is not None:
        expected_lines += 5

    assert len(msg.splitlines()) == expected_lines
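To try the new tests locally, something like the following should work (a sketch; it assumes pytest is installed in the environment and is run from the repo root):

```python
# Programmatic equivalent of `pytest tests/backend/model_management -v`.
import pytest

raise SystemExit(pytest.main(["tests/backend/model_management", "-v"]))
```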