fixup unit tests and remove debugging statements

2024-08-30 20:32:17 +00:00 · 2024-06-02 18:19:29 -04:00
parent e26360f85b
commit 589a7959c0
11 changed files with 61 additions and 186 deletions
--- a/invokeai/app/api/dependencies.py
+++ b/invokeai/app/api/dependencies.py
@ -4,7 +4,6 @@ from logging import Logger

 import torch

-import invokeai.backend.util.devices  # horrible hack
 from invokeai.app.services.object_serializer.object_serializer_disk import ObjectSerializerDisk
 from invokeai.app.services.object_serializer.object_serializer_forward_cache import ObjectSerializerForwardCache
 from invokeai.app.services.shared.sqlite.sqlite_util import init_db
--- a/invokeai/app/invocations/compel.py
+++ b/invokeai/app/invocations/compel.py
@ -99,6 +99,7 @@ class CompelInvocation(BaseInvocation):
                textual_inversion_manager=ti_manager,
                dtype_for_device_getter=TorchDevice.choose_torch_dtype,
                truncate_long_prompts=False,
+                device=TorchDevice.choose_torch_device(),
            )

            conjunction = Compel.parse_prompt_string(self.prompt)
@ -113,6 +114,7 @@ class CompelInvocation(BaseInvocation):
        conditioning_data = ConditioningFieldData(conditionings=[BasicConditioningInfo(embeds=c)])

        conditioning_name = context.conditioning.save(conditioning_data)
+
        return ConditioningOutput(
            conditioning=ConditioningField(
                conditioning_name=conditioning_name,
--- a/invokeai/app/services/invocation_stats/invocation_stats_default.py
+++ b/invokeai/app/services/invocation_stats/invocation_stats_default.py
@ -74,9 +74,9 @@ class InvocationStatsService(InvocationStatsServiceBase):
            )
            self._stats[graph_execution_state_id].add_node_execution_stats(node_stats)

-    def reset_stats(self):
-        self._stats = {}
-        self._cache_stats = {}
+    def reset_stats(self, graph_execution_state_id: str):
+        self._stats.pop(graph_execution_state_id)
+        self._cache_stats.pop(graph_execution_state_id)

    def get_stats(self, graph_execution_state_id: str) -> InvocationStatsSummary:
        graph_stats_summary = self._get_graph_summary(graph_execution_state_id)
--- a/invokeai/app/services/model_manager/model_manager_default.py
+++ b/invokeai/app/services/model_manager/model_manager_default.py
@ -76,8 +76,6 @@ class ModelManagerService(ModelManagerServiceBase):

        ram_cache = ModelCache(
            max_cache_size=app_config.ram,
-            max_vram_cache_size=app_config.vram,
-            lazy_offloading=app_config.lazy_offload,
            logger=logger,
        )
        convert_cache = ModelConvertCache(cache_path=app_config.convert_cache_path, max_size=app_config.convert_cache)
--- a/invokeai/app/services/session_processor/session_processor_default.py
+++ b/invokeai/app/services/session_processor/session_processor_default.py
@ -1,7 +1,7 @@
 import traceback
 from contextlib import suppress
 from queue import Queue
-from threading import BoundedSemaphore, Thread, Lock
+from threading import BoundedSemaphore, Lock, Thread
 from threading import Event as ThreadEvent
 from typing import Optional, Set

@ -61,7 +61,9 @@ class DefaultSessionRunner(SessionRunnerBase):
        self._on_after_run_session_callbacks = on_after_run_session_callbacks or []
        self._process_lock = Lock()

-    def start(self, services: InvocationServices, cancel_event: ThreadEvent, profiler: Optional[Profiler] = None) -> None:
+    def start(
+        self, services: InvocationServices, cancel_event: ThreadEvent, profiler: Optional[Profiler] = None
+    ) -> None:
        self._services = services
        self._cancel_event = cancel_event
        self._profiler = profiler
@ -214,7 +216,7 @@ class DefaultSessionRunner(SessionRunnerBase):
            # we don't care about that - suppress the error.
            with suppress(GESStatsNotFoundError):
                self._services.performance_statistics.log_stats(queue_item.session.id)
-                self._services.performance_statistics.reset_stats()
+                self._services.performance_statistics.reset_stats(queue_item.session.id)

            for callback in self._on_after_run_session_callbacks:
                callback(queue_item=queue_item)
@ -384,7 +386,6 @@ class DefaultSessionProcessor(SessionProcessorBase):
            )
            worker.start()

-
    def stop(self, *args, **kwargs) -> None:
        self._stop_event.set()

@ -465,7 +466,7 @@ class DefaultSessionProcessor(SessionProcessorBase):
                    # Run the graph
                    # self.session_runner.run(queue_item=self._queue_item)

-                except Exception as e:
+                except Exception:
                    # Wait for next polling interval or event to try again
                    poll_now_event.wait(self._polling_interval)
                    continue
@ -494,7 +495,7 @@ class DefaultSessionProcessor(SessionProcessorBase):
                with self._invoker.services.model_manager.load.ram_cache.reserve_execution_device():
                    # Run the session on the reserved GPU
                    self.session_runner.run(queue_item=queue_item)
-            except Exception as e:
+            except Exception:
                continue
            finally:
                self._active_queue_items.remove(queue_item)
--- a/invokeai/app/services/session_queue/session_queue_common.py
+++ b/invokeai/app/services/session_queue/session_queue_common.py
@ -239,6 +239,7 @@ class SessionQueueItemWithoutGraph(BaseModel):
    def __hash__(self) -> int:
        return self.item_id

+
 class SessionQueueItemDTO(SessionQueueItemWithoutGraph):
    pass

--- a/invokeai/app/services/shared/invocation_context.py
+++ b/invokeai/app/services/shared/invocation_context.py
@ -325,7 +325,6 @@ class ConditioningInterface(InvocationContextInterface):
        Returns:
            The loaded conditioning data.
        """
-
        return self._services.conditioning.load(name)


--- a/invokeai/backend/model_manager/load/model_cache/model_cache_base.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_cache_base.py
@ -43,26 +43,9 @@ T = TypeVar("T")

@dataclass
 class CacheRecord(Generic[T]):
-    """
-    Elements of the cache:
-
-    key: Unique key for each model, same as used in the models database.
-    model: Model in memory.
-    state_dict: A read-only copy of the model's state dict in RAM. It will be
-                used as a template for creating a copy in the VRAM.
-    size: Size of the model
-    loaded: True if the model's state dict is currently in VRAM
-
-    Before a model is executed, the state_dict template is copied into VRAM,
-    and then injected into the model. When the model is finished, the VRAM
-    copy of the state dict is deleted, and the RAM version is reinjected
-    into the model.
-    """
+    """Elements of the cache."""

    key: str
-    model: T
-    device: torch.device
-    state_dict: Optional[Dict[str, torch.Tensor]]
    size: int
    model: T
    loaded: bool = False
@ -130,28 +113,12 @@ class ModelCacheBase(ABC, Generic[T]):
        """
        pass

-    @property
-    @abstractmethod
-    def lazy_offloading(self) -> bool:
-        """Return true if the cache is configured to lazily offload models in VRAM."""
-        pass
-
    @property
    @abstractmethod
    def max_cache_size(self) -> float:
        """Return true if the cache is configured to lazily offload models in VRAM."""
        pass

-    @abstractmethod
-    def offload_unlocked_models(self, size_required: int) -> None:
-        """Offload from VRAM any models not actively in use."""
-        pass
-
-    @abstractmethod
-    def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> None:
-        """Move model into the indicated device."""
-        pass
-
    @property
    @abstractmethod
    def stats(self) -> Optional[CacheStats]:
--- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py
@ -19,10 +19,8 @@ context. Use like this:
 """

 import gc
-import math
 import sys
 import threading
-import time
 from contextlib import contextmanager, suppress
 from logging import Logger
 from threading import BoundedSemaphore
@ -31,7 +29,7 @@ from typing import Dict, Generator, List, Optional, Set
 import torch

 from invokeai.backend.model_manager import AnyModel, SubModelType
-from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot, get_pretty_snapshot_diff
+from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot
 from invokeai.backend.util.devices import TorchDevice
 from invokeai.backend.util.logging import InvokeAILogger

@ -42,11 +40,6 @@ from .model_locker import ModelLocker
 # Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously
 DEFAULT_MAX_CACHE_SIZE = 6.0

-# amount of GPU memory to hold in reserve for use by generations (GB)
-# Empirically this value seems to improve performance without starving other
-# processes.
-DEFAULT_MAX_VRAM_CACHE_SIZE = 0.25
-
 # actual size of a gig
 GIG = 1073741824

@ -60,12 +53,10 @@ class ModelCache(ModelCacheBase[AnyModel]):
    def __init__(
        self,
        max_cache_size: float = DEFAULT_MAX_CACHE_SIZE,
-        max_vram_cache_size: float = DEFAULT_MAX_VRAM_CACHE_SIZE,
        storage_device: torch.device = torch.device("cpu"),
        execution_devices: Optional[Set[torch.device]] = None,
        precision: torch.dtype = torch.float16,
        sequential_offload: bool = False,
-        lazy_offloading: bool = True,
        sha_chunksize: int = 16777216,
        log_memory_usage: bool = False,
        logger: Optional[Logger] = None,
@ -76,18 +67,14 @@ class ModelCache(ModelCacheBase[AnyModel]):
        :param max_cache_size: Maximum size of the RAM cache [6.0 GB]
        :param storage_device: Torch device to save inactive model in [torch.device('cpu')]
        :param precision: Precision for loaded models [torch.float16]
-        :param lazy_offloading: Keep model in VRAM until another model needs to be loaded
        :param sequential_offload: Conserve VRAM by loading and unloading each stage of the pipeline sequentially
        :param log_memory_usage: If True, a memory snapshot will be captured before and after every model cache
            operation, and the result will be logged (at debug level). There is a time cost to capturing the memory
            snapshots, so it is recommended to disable this feature unless you are actively inspecting the model cache's
            behaviour.
        """
-        # allow lazy offloading only when vram cache enabled
-        self._lazy_offloading = lazy_offloading and max_vram_cache_size > 0
        self._precision: torch.dtype = precision
        self._max_cache_size: float = max_cache_size
-        self._max_vram_cache_size: float = max_vram_cache_size
        self._storage_device: torch.device = storage_device
        self._ram_lock = threading.Lock()
        self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__)
@ -111,11 +98,6 @@ class ModelCache(ModelCacheBase[AnyModel]):
        """Return the logger used by the cache."""
        return self._logger

-    @property
-    def lazy_offloading(self) -> bool:
-        """Return true if the cache is configured to lazily offload models in VRAM."""
-        return self._lazy_offloading
-
    @property
    def storage_device(self) -> torch.device:
        """Return the storage device (e.g. "CPU" for RAM)."""
@ -233,10 +215,9 @@ class ModelCache(ModelCacheBase[AnyModel]):
            if key in self._cached_models:
                return
            self.make_room(size)
-        state_dict = model.state_dict() if isinstance(model, torch.nn.Module) else None
-        cache_record = CacheRecord(key=key, model=model, device=self.storage_device, state_dict=state_dict, size=size)
-        self._cached_models[key] = cache_record
-        self._cache_stack.append(key)
+            cache_record = CacheRecord(key, model=model, size=size)
+            self._cached_models[key] = cache_record
+            self._cache_stack.append(key)

    def get(
        self,
@ -296,107 +277,6 @@ class ModelCache(ModelCacheBase[AnyModel]):
        else:
            return model_key

-    def offload_unlocked_models(self, size_required: int) -> None:
-        """Move any unused models from VRAM."""
-        reserved = self._max_vram_cache_size * GIG
-        vram_in_use = torch.cuda.memory_allocated() + size_required
-        self.logger.debug(f"{(vram_in_use/GIG):.2f}GB VRAM needed for models; max allowed={(reserved/GIG):.2f}GB")
-        for _, cache_entry in sorted(self._cached_models.items(), key=lambda x: x[1].size):
-            if vram_in_use <= reserved:
-                break
-            if not cache_entry.loaded:
-                continue
-            if not cache_entry.locked:
-                self.move_model_to_device(cache_entry, self.storage_device)
-                cache_entry.loaded = False
-                vram_in_use = torch.cuda.memory_allocated() + size_required
-                self.logger.debug(
-                    f"Removing {cache_entry.key} from VRAM to free {(cache_entry.size/GIG):.2f}GB; vram free = {(torch.cuda.memory_allocated()/GIG):.2f}GB"
-                )
-
-        TorchDevice.empty_cache()
-
-    def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> None:
-        """Move model into the indicated device.
-
-        :param cache_entry: The CacheRecord for the model
-        :param target_device: The torch.device to move the model into
-
-        May raise a torch.cuda.OutOfMemoryError
-        """
-        # These attributes are not in the base ModelMixin class but in various derived classes.
-        # Some models don't have these attributes, in which case they run in RAM/CPU.
-        self.logger.debug(f"Called to move {cache_entry.key} to {target_device}")
-        if not (hasattr(cache_entry.model, "device") and hasattr(cache_entry.model, "to")):
-            return
-
-        source_device = cache_entry.device
-
-        # Note: We compare device types only so that 'cuda' == 'cuda:0'.
-        # This would need to be revised to support multi-GPU.
-        if torch.device(source_device).type == torch.device(target_device).type:
-            return
-
-        # This roundabout method for moving the model around is done to avoid
-        # the cost of moving the model from RAM to VRAM and then back from VRAM to RAM.
-        # When moving to VRAM, we copy (not move) each element of the state dict from
-        # RAM to a new state dict in VRAM, and then inject it into the model.
-        # This operation is slightly faster than running `to()` on the whole model.
-        #
-        # When the model needs to be removed from VRAM we simply delete the copy
-        # of the state dict in VRAM, and reinject the state dict that is cached
-        # in RAM into the model. So this operation is very fast.
-        start_model_to_time = time.time()
-        snapshot_before = self._capture_memory_snapshot()
-
-        try:
-            if cache_entry.state_dict is not None:
-                assert hasattr(cache_entry.model, "load_state_dict")
-                if target_device == self.storage_device:
-                    cache_entry.model.load_state_dict(cache_entry.state_dict, assign=True)
-                else:
-                    new_dict: Dict[str, torch.Tensor] = {}
-                    for k, v in cache_entry.state_dict.items():
-                        new_dict[k] = v.to(torch.device(target_device), copy=True)
-                    cache_entry.model.load_state_dict(new_dict, assign=True)
-            cache_entry.model.to(target_device)
-            cache_entry.device = target_device
-        except Exception as e:  # blow away cache entry
-            self._delete_cache_entry(cache_entry)
-            raise e
-
-        snapshot_after = self._capture_memory_snapshot()
-        end_model_to_time = time.time()
-        self.logger.debug(
-            f"Moved model '{cache_entry.key}' from {source_device} to"
-            f" {target_device} in {(end_model_to_time-start_model_to_time):.2f}s."
-            f"Estimated model size: {(cache_entry.size/GIG):.3f} GB."
-            f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
-        )
-
-        if (
-            snapshot_before is not None
-            and snapshot_after is not None
-            and snapshot_before.vram is not None
-            and snapshot_after.vram is not None
-        ):
-            vram_change = abs(snapshot_before.vram - snapshot_after.vram)
-
-            # If the estimated model size does not match the change in VRAM, log a warning.
-            if not math.isclose(
-                vram_change,
-                cache_entry.size,
-                rel_tol=0.1,
-                abs_tol=10 * MB,
-            ):
-                self.logger.debug(
-                    f"Moving model '{cache_entry.key}' from {source_device} to"
-                    f" {target_device} caused an unexpected change in VRAM usage. The model's"
-                    " estimated size may be incorrect. Estimated model size:"
-                    f" {(cache_entry.size/GIG):.3f} GB.\n"
-                    f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
-                )
-
    def print_cuda_stats(self) -> None:
        """Log CUDA diagnostics."""
        vram = "%4.2fG" % (torch.cuda.memory_allocated() / GIG)
@ -440,12 +320,43 @@ class ModelCache(ModelCacheBase[AnyModel]):
        while current_size + bytes_needed > maximum_size and pos < len(self._cache_stack):
            model_key = self._cache_stack[pos]
            cache_entry = self._cached_models[model_key]
+
+            refs = sys.getrefcount(cache_entry.model)
+
+            # HACK: This is a workaround for a memory-management issue that we haven't tracked down yet. We are directly
+            # going against the advice in the Python docs by using `gc.get_referrers(...)` in this way:
+            # https://docs.python.org/3/library/gc.html#gc.get_referrers
+
+            # manualy clear local variable references of just finished function calls
+            # for some reason python don't want to collect it even by gc.collect() immidiately
+            if refs > 2:
+                while True:
+                    cleared = False
+                    for referrer in gc.get_referrers(cache_entry.model):
+                        if type(referrer).__name__ == "frame":
+                            # RuntimeError: cannot clear an executing frame
+                            with suppress(RuntimeError):
+                                referrer.clear()
+                                cleared = True
+                                # break
+
+                    # repeat if referrers changes(due to frame clear), else exit loop
+                    if cleared:
+                        gc.collect()
+                    else:
+                        break
+
            device = cache_entry.model.device if hasattr(cache_entry.model, "device") else None
            self.logger.debug(
-                f"Model: {model_key}, locks: {cache_entry._locks}, device: {device}, loaded: {cache_entry.loaded}"
+                f"Model: {model_key}, locks: {cache_entry._locks}, device: {device}, loaded: {cache_entry.loaded},"
+                f" refs: {refs}"
            )

-            if not cache_entry.locked:
+            # Expected refs:
+            # 1 from cache_entry
+            # 1 from getrefcount function
+            # 1 from onnx runtime object
+            if not cache_entry.locked and refs <= (3 if "onnx" in model_key else 2):
                self.logger.debug(
                    f"Removing {model_key} from RAM cache to free at least {(size/GIG):.2f} GB (-{(cache_entry.size/GIG):.2f} GB)"
                )
--- a/invokeai/backend/model_manager/load/model_cache/model_locker.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_locker.py
@ -2,6 +2,7 @@
 Base class and implementation of a class that moves models in and out of VRAM.
 """

+import copy
 from typing import Optional

 import torch
@ -54,14 +55,13 @@ class ModelLocker(ModelLockerBase):
        # NOTE that the model has to have the to() method in order for this code to move it into GPU!
        self._cache_entry.lock()
        try:
-            if self._cache.lazy_offloading:
-                self._cache.offload_unlocked_models(self._cache_entry.size)
-
-            execution_device = self._cache.get_execution_device()
-            self._cache.move_model_to_device(self._cache_entry, execution_device)
+            # We wait for a gpu to be free - may raise a ValueError
+            self._execution_device = self._cache.get_execution_device()
+            self._cache.logger.debug(f"Locking {self._cache_entry.key} in {self._execution_device}")
+            model_in_gpu = copy.deepcopy(self._cache_entry.model)
+            if hasattr(model_in_gpu, "to"):
+                model_in_gpu.to(self._execution_device)
            self._cache_entry.loaded = True
-
-            self._cache.logger.debug(f"Locking {self._cache_entry.key} in {execution_device}")
            self._cache.print_cuda_stats()
        except torch.cuda.OutOfMemoryError:
            self._cache.logger.warning("Insufficient GPU memory to load model. Aborting")
@ -70,15 +70,11 @@ class ModelLocker(ModelLockerBase):
        except Exception:
            self._cache_entry.unlock()
            raise
-
-        return self.model
+        return model_in_gpu

    def unlock(self) -> None:
        """Call upon exit from context."""
        if not hasattr(self.model, "to"):
            return
-
        self._cache_entry.unlock()
-        if not self._cache.lazy_offloading:
-            self._cache.offload_unlocked_models(0)
-            self._cache.print_cuda_stats()
+        self._cache.print_cuda_stats()
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -54,6 +54,7 @@ def mock_services() -> InvocationServices:
        workflow_records=None,  # type: ignore
        tensors=None,  # type: ignore
        conditioning=None,  # type: ignore
+        performance_statistics=None,  # type: ignore
    )