add ram cache module and support files

2024-08-30 20:32:17 +00:00 · 2024-01-31 23:37:59 -05:00 · 2024-01-31 23:37:59 -05:00 · 231c12fd1e
commit 231c12fd1e
parent 66e2d1b346
14 changed files with 1334 additions and 6 deletions
--- a/invokeai/backend/model_manager/config.py
+++ b/invokeai/backend/model_manager/config.py
@ -152,6 +152,7 @@ class _DiffusersConfig(ModelConfigBase):
    format: Literal[ModelFormat.Diffusers] = ModelFormat.Diffusers
    repo_variant: Optional[ModelRepoVariant] = ModelRepoVariant.DEFAULT

+
 class LoRAConfig(ModelConfigBase):
    """Model config for LoRA/Lycoris models."""

@ -179,6 +180,7 @@ class ControlNetDiffusersConfig(_DiffusersConfig):
    type: Literal[ModelType.ControlNet] = ModelType.ControlNet
    format: Literal[ModelFormat.Diffusers] = ModelFormat.Diffusers

+
 class ControlNetCheckpointConfig(_CheckpointConfig):
    """Model config for ControlNet models (diffusers version)."""

@ -214,6 +216,7 @@ class MainDiffusersConfig(_DiffusersConfig, _MainConfig):
    prediction_type: SchedulerPredictionType = SchedulerPredictionType.Epsilon
    upcast_attention: bool = False

+
 class ONNXSD1Config(_MainConfig):
    """Model config for ONNX format models based on sd-1."""

--- a/invokeai/backend/model_manager/load/init.py
+++ b/invokeai/backend/model_manager/load/init.py
--- a/invokeai/backend/model_manager/load/load_base.py
+++ b/invokeai/backend/model_manager/load/load_base.py
@ -0,0 +1,193 @@
+# Copyright (c) 2024, Lincoln D. Stein and the InvokeAI Development Team
+"""
+Base class for model loading in InvokeAI.
+
+Use like this:
+
+  loader = AnyModelLoader(...)
+  loaded_model = loader.get_model('019ab39adfa1840455')
+  with loaded_model as model:  # context manager moves model into VRAM
+       # do something with loaded_model
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from logging import Logger
+from pathlib import Path
+from typing import Any, Callable, Dict, Optional, Type, Union
+
+import torch
+from diffusers import DiffusionPipeline
+from injector import inject
+
+from invokeai.app.services.config import InvokeAIAppConfig
+from invokeai.app.services.model_records import ModelRecordServiceBase
+from invokeai.backend.model_manager import AnyModelConfig, BaseModelType, ModelFormat, ModelType, SubModelType
+from invokeai.backend.model_manager.convert_cache import ModelConvertCacheBase
+from invokeai.backend.model_manager.onnx_runtime import IAIOnnxRuntimeModel
+from invokeai.backend.model_manager.ram_cache import ModelCacheBase
+
+AnyModel = Union[DiffusionPipeline, torch.nn.Module, IAIOnnxRuntimeModel]
+
+
+class ModelLockerBase(ABC):
+    """Base class for the model locker used by the loader."""
+
+    @abstractmethod
+    def lock(self) -> None:
+        """Lock the contained model and move it into VRAM."""
+        pass
+
+    @abstractmethod
+    def unlock(self) -> None:
+        """Unlock the contained model, and remove it from VRAM."""
+        pass
+
+    @property
+    @abstractmethod
+    def model(self) -> AnyModel:
+        """Return the model."""
+        pass
+
+
+@dataclass
+class LoadedModel:
+    """Context manager object that mediates transfer from RAM<->VRAM."""
+
+    config: AnyModelConfig
+    locker: ModelLockerBase
+
+    def __enter__(self) -> AnyModel:  # I think load_file() always returns a dict
+        """Context entry."""
+        self.locker.lock()
+        return self.model
+
+    def __exit__(self, *args: Any, **kwargs: Any) -> None:
+        """Context exit."""
+        self.locker.unlock()
+
+    @property
+    def model(self) -> AnyModel:
+        """Return the model without locking it."""
+        return self.locker.model()
+
+
+class ModelLoaderBase(ABC):
+    """Abstract base class for loading models into RAM/VRAM."""
+
+    @abstractmethod
+    def __init__(
+        self,
+        app_config: InvokeAIAppConfig,
+        logger: Logger,
+        ram_cache: ModelCacheBase,
+        convert_cache: ModelConvertCacheBase,
+    ):
+        """Initialize the loader."""
+        pass
+
+    @abstractmethod
+    def load_model(self, model_config: AnyModelConfig, submodel_type: Optional[SubModelType] = None) -> LoadedModel:
+        """
+        Return a model given its key.
+
+        Given a model key identified in the model configuration backend,
+        return a ModelInfo object that can be used to retrieve the model.
+
+        :param model_config: Model configuration, as returned by ModelConfigRecordStore
+        :param submodel_type: an ModelType enum indicating the portion of
+               the model to retrieve (e.g. ModelType.Vae)
+        """
+        pass
+
+    @abstractmethod
+    def get_size_fs(
+        self, config: AnyModelConfig, model_path: Path, submodel_type: Optional[SubModelType] = None
+    ) -> int:
+        """Return size in bytes of the model, calculated before loading."""
+        pass
+
+
+# TO DO: Better name?
+class AnyModelLoader:
+    """This class manages the model loaders and invokes the correct one to load a model of given base and type."""
+
+    # this tracks the loader subclasses
+    _registry: Dict[str, Type[ModelLoaderBase]] = {}
+
+    @inject
+    def __init__(
+        self,
+        store: ModelRecordServiceBase,
+        app_config: InvokeAIAppConfig,
+        logger: Logger,
+        ram_cache: ModelCacheBase,
+        convert_cache: ModelConvertCacheBase,
+    ):
+        """Store the provided ModelRecordServiceBase and empty the registry."""
+        self._store = store
+        self._app_config = app_config
+        self._logger = logger
+        self._ram_cache = ram_cache
+        self._convert_cache = convert_cache
+
+    def get_model(self, key: str, submodel_type: Optional[SubModelType] = None) -> LoadedModel:
+        """
+        Return a model given its key.
+
+        Given a model key identified in the model configuration backend,
+        return a ModelInfo object that can be used to retrieve the model.
+
+        :param key: model key, as known to the config backend
+        :param submodel_type: an ModelType enum indicating the portion of
+               the model to retrieve (e.g. ModelType.Vae)
+        """
+        model_config = self._store.get_model(key)
+        implementation = self.__class__.get_implementation(
+            base=model_config.base, type=model_config.type, format=model_config.format
+        )
+        return implementation(
+            app_config=self._app_config,
+            logger=self._logger,
+            ram_cache=self._ram_cache,
+            convert_cache=self._convert_cache,
+        ).load_model(model_config, submodel_type)
+
+    @staticmethod
+    def _to_registry_key(base: BaseModelType, type: ModelType, format: ModelFormat) -> str:
+        return "-".join([base.value, type.value, format.value])
+
+    @classmethod
+    def get_implementation(cls, base: BaseModelType, type: ModelType, format: ModelFormat) -> Type[ModelLoaderBase]:
+        """Get subclass of ModelLoaderBase registered to handle base and type."""
+        key1 = cls._to_registry_key(base, type, format)  # for a specific base type
+        key2 = cls._to_registry_key(BaseModelType.Any, type, format)  # with wildcard Any
+        implementation = cls._registry.get(key1) or cls._registry.get(key2)
+        if not implementation:
+            raise NotImplementedError(
+                "No subclass of LoadedModel is registered for base={base}, type={type}, format={format}"
+            )
+        return implementation
+
+    @classmethod
+    def register(
+        cls, type: ModelType, format: ModelFormat, base: BaseModelType = BaseModelType.Any
+    ) -> Callable[[Type[ModelLoaderBase]], Type[ModelLoaderBase]]:
+        """Define a decorator which registers the subclass of loader."""
+
+        def decorator(subclass: Type[ModelLoaderBase]) -> Type[ModelLoaderBase]:
+            print("Registering class", subclass.__name__)
+            key = cls._to_registry_key(base, type, format)
+            cls._registry[key] = subclass
+            return subclass
+
+        return decorator
+
+
+# in _init__.py will call something like
+# def configure_loader_dependencies(binder):
+#     binder.bind(ModelRecordServiceBase, ApiDependencies.invoker.services.model_records, scope=singleton)
+#     binder.bind(InvokeAIAppConfig, ApiDependencies.invoker.services.configuration, scope=singleton)
+# etc
+# injector = Injector(configure_loader_dependencies)
+# loader = injector.get(ModelFactory)
--- a/invokeai/backend/model_manager/load/load_default.py
+++ b/invokeai/backend/model_manager/load/load_default.py
@ -0,0 +1,168 @@
+# Copyright (c) 2024, Lincoln D. Stein and the InvokeAI Development Team
+"""Default implementation of model loading in InvokeAI."""
+
+import sys
+from logging import Logger
+from pathlib import Path
+from typing import Any, Dict, Optional, Tuple
+
+from diffusers import ModelMixin
+from diffusers.configuration_utils import ConfigMixin
+from injector import inject
+
+from invokeai.app.services.config import InvokeAIAppConfig
+from invokeai.backend.model_manager import AnyModelConfig, InvalidModelConfigException, ModelRepoVariant, SubModelType
+from invokeai.backend.model_manager.convert_cache import ModelConvertCacheBase
+from invokeai.backend.model_manager.load.load_base import AnyModel, LoadedModel, ModelLoaderBase
+from invokeai.backend.model_manager.load.model_util import calc_model_size_by_fs
+from invokeai.backend.model_manager.load.optimizations import skip_torch_weight_init
+from invokeai.backend.model_manager.ram_cache import ModelCacheBase, ModelLockerBase
+from invokeai.backend.util.devices import choose_torch_device, torch_dtype
+
+
+class ConfigLoader(ConfigMixin):
+    """Subclass of ConfigMixin for loading diffusers configuration files."""
+
+    @classmethod
+    def load_config(cls, *args: Any, **kwargs: Any) -> Dict[str, Any]:
+        """Load a diffusrs ConfigMixin configuration."""
+        cls.config_name = kwargs.pop("config_name")
+        # Diffusers doesn't provide typing info
+        return super().load_config(*args, **kwargs)  # type: ignore
+
+
+# TO DO: The loader is not thread safe!
+class ModelLoader(ModelLoaderBase):
+    """Default implementation of ModelLoaderBase."""
+
+    @inject  # can inject instances of each of the classes in the call signature
+    def __init__(
+        self,
+        app_config: InvokeAIAppConfig,
+        logger: Logger,
+        ram_cache: ModelCacheBase,
+        convert_cache: ModelConvertCacheBase,
+    ):
+        """Initialize the loader."""
+        self._app_config = app_config
+        self._logger = logger
+        self._ram_cache = ram_cache
+        self._convert_cache = convert_cache
+        self._torch_dtype = torch_dtype(choose_torch_device())
+        self._size: Optional[int] = None  # model size
+
+    def load_model(self, model_config: AnyModelConfig, submodel_type: Optional[SubModelType] = None) -> LoadedModel:
+        """
+        Return a model given its configuration.
+
+        Given a model's configuration as returned by the ModelRecordConfigStore service,
+        return a LoadedModel object that can be used for inference.
+
+        :param model config: Configuration record for this model
+        :param submodel_type: an ModelType enum indicating the portion of
+               the model to retrieve (e.g. ModelType.Vae)
+        """
+        if model_config.type == "main" and not submodel_type:
+            raise InvalidModelConfigException("submodel_type is required when loading a main model")
+
+        model_path, is_submodel_override = self._get_model_path(model_config, submodel_type)
+        if is_submodel_override:
+            submodel_type = None
+
+        if not model_path.exists():
+            raise InvalidModelConfigException(f"Files for model 'model_config.name' not found at {model_path}")
+
+        model_path = self._convert_if_needed(model_config, model_path, submodel_type)
+        locker = self._load_if_needed(model_config, model_path, submodel_type)
+        return LoadedModel(config=model_config, locker=locker)
+
+    # IMPORTANT: This needs to be overridden in the StableDiffusion subclass so as to handle vae overrides
+    # and submodels!!!!
+    def _get_model_path(
+        self, config: AnyModelConfig, submodel_type: Optional[SubModelType] = None
+    ) -> Tuple[Path, bool]:
+        model_base = self._app_config.models_path
+        return ((model_base / config.path).resolve(), False)
+
+    def _convert_if_needed(
+        self, config: AnyModelConfig, model_path: Path, submodel_type: Optional[SubModelType] = None
+    ) -> Path:
+        if not self._needs_conversion(config):
+            return model_path
+
+        self._convert_cache.make_room(self._size or self.get_size_fs(config, model_path, submodel_type))
+        cache_path: Path = self._convert_cache.cache_path(config.key)
+        if cache_path.exists():
+            return cache_path
+
+        self._convert_model(model_path, cache_path)
+        return cache_path
+
+    def _needs_conversion(self, config: AnyModelConfig) -> bool:
+        return False
+
+    def _load_if_needed(
+        self, config: AnyModelConfig, model_path: Path, submodel_type: Optional[SubModelType] = None
+    ) -> ModelLockerBase:
+        # TO DO: This is not thread safe!
+        if self._ram_cache.exists(config.key, submodel_type):
+            return self._ram_cache.get(config.key, submodel_type)
+
+        model_variant = getattr(config, "repo_variant", None)
+        self._ram_cache.make_room(self.get_size_fs(config, model_path, submodel_type))
+
+        # This is where the model is actually loaded!
+        with skip_torch_weight_init():
+            loaded_model = self._load_model(model_path, model_variant=model_variant, submodel_type=submodel_type)
+
+        self._ram_cache.put(
+            config.key,
+            submodel_type=submodel_type,
+            model=loaded_model,
+        )
+
+        return self._ram_cache.get(config.key, submodel_type)
+
+    def get_size_fs(
+        self, config: AnyModelConfig, model_path: Path, submodel_type: Optional[SubModelType] = None
+    ) -> int:
+        """Get the size of the model on disk."""
+        return calc_model_size_by_fs(
+            model_path=model_path,
+            subfolder=submodel_type.value if submodel_type else None,
+            variant=config.repo_variant if hasattr(config, "repo_variant") else None,
+        )
+
+    def _convert_model(self, model_path: Path, cache_path: Path) -> None:
+        raise NotImplementedError
+
+    def _load_model(
+        self,
+        model_path: Path,
+        model_variant: Optional[ModelRepoVariant] = None,
+        submodel_type: Optional[SubModelType] = None,
+    ) -> AnyModel:
+        raise NotImplementedError
+
+    def _load_diffusers_config(self, model_path: Path, config_name: str = "config.json") -> Dict[str, Any]:
+        return ConfigLoader.load_config(model_path, config_name=config_name)
+
+    # TO DO: Add exception handling
+    def _hf_definition_to_type(self, module: str, class_name: str) -> ModelMixin:  # fix with correct type
+        if module in ["diffusers", "transformers"]:
+            res_type = sys.modules[module]
+        else:
+            res_type = sys.modules["diffusers"].pipelines
+        result: ModelMixin = getattr(res_type, class_name)
+        return result
+
+    # TO DO: Add exception handling
+    def _get_hf_load_class(self, model_path: Path, submodel_type: Optional[SubModelType] = None) -> ModelMixin:
+        if submodel_type:
+            config = self._load_diffusers_config(model_path, config_name="model_index.json")
+            module, class_name = config[submodel_type.value]
+            return self._hf_definition_to_type(module=module, class_name=class_name)
+        else:
+            config = self._load_diffusers_config(model_path, config_name="config.json")
+            class_name = config["_class_name"]
+            return self._hf_definition_to_type(module="diffusers", class_name=class_name)
--- a/invokeai/backend/model_manager/load/memory_snapshot.py
+++ b/invokeai/backend/model_manager/load/memory_snapshot.py
@ -0,0 +1,100 @@
+import gc
+from typing import Optional
+
+import psutil
+import torch
+from typing_extensions import Self
+
+from invokeai.backend.model_management.libc_util import LibcUtil, Struct_mallinfo2
+
+GB = 2**30  # 1 GB
+
+
+class MemorySnapshot:
+    """A snapshot of RAM and VRAM usage. All values are in bytes."""
+
+    def __init__(self, process_ram: int, vram: Optional[int], malloc_info: Optional[Struct_mallinfo2]):
+        """Initialize a MemorySnapshot.
+
+        Most of the time, `MemorySnapshot` will be constructed with `MemorySnapshot.capture()`.
+
+        Args:
+            process_ram (int): CPU RAM used by the current process.
+            vram (Optional[int]): VRAM used by torch.
+            malloc_info (Optional[Struct_mallinfo2]): Malloc info obtained from LibcUtil.
+        """
+        self.process_ram = process_ram
+        self.vram = vram
+        self.malloc_info = malloc_info
+
+    @classmethod
+    def capture(cls, run_garbage_collector: bool = True) -> Self:
+        """Capture and return a MemorySnapshot.
+
+        Note: This function has significant overhead, particularly if `run_garbage_collector == True`.
+
+        Args:
+            run_garbage_collector (bool, optional): If true, gc.collect() will be run before checking the process RAM
+                usage. Defaults to True.
+
+        Returns:
+            MemorySnapshot
+        """
+        if run_garbage_collector:
+            gc.collect()
+
+        # According to the psutil docs (https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info), rss is
+        # supported on all platforms.
+        process_ram = psutil.Process().memory_info().rss
+
+        if torch.cuda.is_available():
+            vram = torch.cuda.memory_allocated()
+        else:
+            # TODO: We could add support for mps.current_allocated_memory() as well. Leaving out for now until we have
+            # time to test it properly.
+            vram = None
+
+        try:
+            malloc_info = LibcUtil().mallinfo2()  # type: ignore
+        except (OSError, AttributeError):
+            # OSError: This is expected in environments that do not have the 'libc.so.6' shared library.
+            # AttributeError: This is expected in environments that have `libc.so.6` but do not have the `mallinfo2` (e.g. glibc < 2.33)
+            # TODO: Does `mallinfo` work?
+            malloc_info = None
+
+        return cls(process_ram, vram, malloc_info)
+
+
+def get_pretty_snapshot_diff(snapshot_1: Optional[MemorySnapshot], snapshot_2: Optional[MemorySnapshot]) -> str:
+    """Get a pretty string describing the difference between two `MemorySnapshot`s."""
+
+    def get_msg_line(prefix: str, val1: int, val2: int) -> str:
+        diff = val2 - val1
+        return f"{prefix: <30} ({(diff/GB):+5.3f}): {(val1/GB):5.3f}GB -> {(val2/GB):5.3f}GB\n"
+
+    msg = ""
+
+    if snapshot_1 is None or snapshot_2 is None:
+        return msg
+
+    msg += get_msg_line("Process RAM", snapshot_1.process_ram, snapshot_2.process_ram)
+
+    if snapshot_1.malloc_info is not None and snapshot_2.malloc_info is not None:
+        msg += get_msg_line("libc mmap allocated", snapshot_1.malloc_info.hblkhd, snapshot_2.malloc_info.hblkhd)
+
+        msg += get_msg_line("libc arena used", snapshot_1.malloc_info.uordblks, snapshot_2.malloc_info.uordblks)
+
+        msg += get_msg_line("libc arena free", snapshot_1.malloc_info.fordblks, snapshot_2.malloc_info.fordblks)
+
+        libc_total_allocated_1 = snapshot_1.malloc_info.arena + snapshot_1.malloc_info.hblkhd
+        libc_total_allocated_2 = snapshot_2.malloc_info.arena + snapshot_2.malloc_info.hblkhd
+        msg += get_msg_line("libc total allocated", libc_total_allocated_1, libc_total_allocated_2)
+
+        libc_total_used_1 = snapshot_1.malloc_info.uordblks + snapshot_1.malloc_info.hblkhd
+        libc_total_used_2 = snapshot_2.malloc_info.uordblks + snapshot_2.malloc_info.hblkhd
+        msg += get_msg_line("libc total used", libc_total_used_1, libc_total_used_2)
+
+    if snapshot_1.vram is not None and snapshot_2.vram is not None:
+        msg += get_msg_line("VRAM", snapshot_1.vram, snapshot_2.vram)
+
+    return msg
--- a/invokeai/backend/model_manager/load/model_util.py
+++ b/invokeai/backend/model_manager/load/model_util.py
@ -0,0 +1,109 @@
+# Copyright (c) 2024 The InvokeAI Development Team
+"""Various utility functions needed by the loader and caching system."""
+
+import json
+from pathlib import Path
+from typing import Optional, Union
+
+import torch
+from diffusers import DiffusionPipeline
+
+from invokeai.backend.model_manager.onnx_runtime import IAIOnnxRuntimeModel
+
+
+def calc_model_size_by_data(model: Union[DiffusionPipeline, torch.nn.Module, IAIOnnxRuntimeModel]) -> int:
+    """Get size of a model in memory in bytes."""
+    if isinstance(model, DiffusionPipeline):
+        return _calc_pipeline_by_data(model)
+    elif isinstance(model, torch.nn.Module):
+        return _calc_model_by_data(model)
+    elif isinstance(model, IAIOnnxRuntimeModel):
+        return _calc_onnx_model_by_data(model)
+    else:
+        return 0
+
+
+def _calc_pipeline_by_data(pipeline: DiffusionPipeline) -> int:
+    res = 0
+    assert hasattr(pipeline, "components")
+    for submodel_key in pipeline.components.keys():
+        submodel = getattr(pipeline, submodel_key)
+        if submodel is not None and isinstance(submodel, torch.nn.Module):
+            res += _calc_model_by_data(submodel)
+    return res
+
+
+def _calc_model_by_data(model: torch.nn.Module) -> int:
+    mem_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
+    mem_bufs = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])
+    mem: int = mem_params + mem_bufs  # in bytes
+    return mem
+
+
+def _calc_onnx_model_by_data(model: IAIOnnxRuntimeModel) -> int:
+    tensor_size = model.tensors.size() * 2  # The session doubles this
+    mem = tensor_size  # in bytes
+    return mem
+
+
+def calc_model_size_by_fs(model_path: Path, subfolder: Optional[str] = None, variant: Optional[str] = None) -> int:
+    """Estimate the size of a model on disk in bytes."""
+    if subfolder is not None:
+        model_path = model_path / subfolder
+
+    # this can happen when, for example, the safety checker is not downloaded.
+    if not model_path.exists():
+        return 0
+
+    all_files = [f for f in model_path.iterdir() if (model_path / f).is_file()]
+
+    fp16_files = {f for f in all_files if ".fp16." in f.name or ".fp16-" in f.name}
+    bit8_files = {f for f in all_files if ".8bit." in f.name or ".8bit-" in f.name}
+    other_files = set(all_files) - fp16_files - bit8_files
+
+    if variant is None:
+        files = other_files
+    elif variant == "fp16":
+        files = fp16_files
+    elif variant == "8bit":
+        files = bit8_files
+    else:
+        raise NotImplementedError(f"Unknown variant: {variant}")
+
+    # try read from index if exists
+    index_postfix = ".index.json"
+    if variant is not None:
+        index_postfix = f".index.{variant}.json"
+
+    for file in files:
+        if not file.name.endswith(index_postfix):
+            continue
+        try:
+            with open(model_path / file, "r") as f:
+                index_data = json.loads(f.read())
+            return int(index_data["metadata"]["total_size"])
+        except Exception:
+            pass
+
+    # calculate files size if there is no index file
+    formats = [
+        (".safetensors",),  # safetensors
+        (".bin",),  # torch
+        (".onnx", ".pb"),  # onnx
+        (".msgpack",),  # flax
+        (".ckpt",),  # tf
+        (".h5",),  # tf2
+    ]
+
+    for file_format in formats:
+        model_files = [f for f in files if f.suffix in file_format]
+        if len(model_files) == 0:
+            continue
+
+        model_size = 0
+        for model_file in model_files:
+            file_stats = (model_path / model_file).stat()
+            model_size += file_stats.st_size
+        return model_size
+
+    return 0  # scheduler/feature_extractor/tokenizer - models without loading to gpu
--- a/invokeai/backend/model_manager/load/optimizations.py
+++ b/invokeai/backend/model_manager/load/optimizations.py
@ -0,0 +1,30 @@
+from contextlib import contextmanager
+
+import torch
+
+
+def _no_op(*args, **kwargs):
+    pass
+
+
+@contextmanager
+def skip_torch_weight_init():
+    """A context manager that monkey-patches several of the common torch layers (torch.nn.Linear, torch.nn.Conv1d, etc.)
+    to skip weight initialization.
+
+    By default, `torch.nn.Linear` and `torch.nn.ConvNd` layers initialize their weights (according to a particular
+    distribution) when __init__ is called. This weight initialization step can take a significant amount of time, and is
+    completely unnecessary if the intent is to load checkpoint weights from disk for the layer. This context manager
+    monkey-patches common torch layers to skip the weight initialization step.
+    """
+    torch_modules = [torch.nn.Linear, torch.nn.modules.conv._ConvNd, torch.nn.Embedding]
+    saved_functions = [m.reset_parameters for m in torch_modules]
+
+    try:
+        for torch_module in torch_modules:
+            torch_module.reset_parameters = _no_op
+
+        yield None
+    finally:
+        for torch_module, saved_function in zip(torch_modules, saved_functions, strict=True):
+            torch_module.reset_parameters = saved_function
--- a/invokeai/backend/model_manager/load/ram_cache/init.py
+++ b/invokeai/backend/model_manager/load/ram_cache/init.py
--- a/invokeai/backend/model_manager/load/ram_cache/ram_cache_base.py
+++ b/invokeai/backend/model_manager/load/ram_cache/ram_cache_base.py
@ -0,0 +1,145 @@
+# Copyright (c) 2024 Lincoln D. Stein and the InvokeAI Development team
+# TODO: Add Stalker's proper name to copyright
+"""
+Manage a RAM cache of diffusion/transformer models for fast switching.
+They are moved between GPU VRAM and CPU RAM as necessary. If the cache
+grows larger than a preset maximum, then the least recently used
+model will be cleared and (re)loaded from disk when next needed.
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from logging import Logger
+from typing import Dict, Optional
+
+import torch
+
+from invokeai.backend.model_manager import SubModelType
+from invokeai.backend.model_manager.load.load_base import AnyModel, ModelLockerBase
+
+
+@dataclass
+class CacheStats(object):
+    """Data object to record statistics on cache hits/misses."""
+
+    hits: int = 0  # cache hits
+    misses: int = 0  # cache misses
+    high_watermark: int = 0  # amount of cache used
+    in_cache: int = 0  # number of models in cache
+    cleared: int = 0  # number of models cleared to make space
+    cache_size: int = 0  # total size of cache
+    loaded_model_sizes: Dict[str, int] = field(default_factory=dict)
+
+
+@dataclass
+class CacheRecord:
+    """Elements of the cache."""
+
+    key: str
+    model: AnyModel
+    size: int
+    _locks: int = 0
+
+    def lock(self) -> None:
+        """Lock this record."""
+        self._locks += 1
+
+    def unlock(self) -> None:
+        """Unlock this record."""
+        self._locks -= 1
+        assert self._locks >= 0
+
+    @property
+    def locked(self) -> bool:
+        """Return true if record is locked."""
+        return self._locks > 0
+
+
+class ModelCacheBase(ABC):
+    """Virtual base class for RAM model cache."""
+
+    @property
+    @abstractmethod
+    def storage_device(self) -> torch.device:
+        """Return the storage device (e.g. "CPU" for RAM)."""
+        pass
+
+    @property
+    @abstractmethod
+    def execution_device(self) -> torch.device:
+        """Return the exection device (e.g. "cuda" for VRAM)."""
+        pass
+
+    @property
+    @abstractmethod
+    def lazy_offloading(self) -> bool:
+        """Return true if the cache is configured to lazily offload models in VRAM."""
+        pass
+
+    @abstractmethod
+    def offload_unlocked_models(self) -> None:
+        """Offload from VRAM any models not actively in use."""
+        pass
+
+    @abstractmethod
+    def move_model_to_device(self, cache_entry: CacheRecord, device: torch.device) -> None:
+        """Move model into the indicated device."""
+        pass
+
+    @property
+    @abstractmethod
+    def logger(self) -> Logger:
+        """Return the logger used by the cache."""
+        pass
+
+    @abstractmethod
+    def make_room(self, size: int) -> None:
+        """Make enough room in the cache to accommodate a new model of indicated size."""
+        pass
+
+    @abstractmethod
+    def put(
+        self,
+        key: str,
+        model: AnyModel,
+        submodel_type: Optional[SubModelType] = None,
+    ) -> None:
+        """Store model under key and optional submodel_type."""
+        pass
+
+    @abstractmethod
+    def get(
+        self,
+        key: str,
+        submodel_type: Optional[SubModelType] = None,
+    ) -> ModelLockerBase:
+        """
+        Retrieve model locker object using key and optional submodel_type.
+
+        This may return an UnknownModelException if the model is not in the cache.
+        """
+        pass
+
+    @abstractmethod
+    def exists(
+        self,
+        key: str,
+        submodel_type: Optional[SubModelType] = None,
+    ) -> bool:
+        """Return true if the model identified by key and submodel_type is in the cache."""
+        pass
+
+    @abstractmethod
+    def cache_size(self) -> int:
+        """Get the total size of the models currently cached."""
+        pass
+
+    @abstractmethod
+    def get_stats(self) -> CacheStats:
+        """Return cache hit/miss/size statistics."""
+        pass
+
+    @abstractmethod
+    def print_cuda_stats(self) -> None:
+        """Log debugging information on CUDA usage."""
+        pass
--- a/invokeai/backend/model_manager/load/ram_cache/ram_cache_default.py
+++ b/invokeai/backend/model_manager/load/ram_cache/ram_cache_default.py
@ -0,0 +1,332 @@
+# Copyright (c) 2024 Lincoln D. Stein and the InvokeAI Development team
+# TODO: Add Stalker's proper name to copyright
+"""
+Manage a RAM cache of diffusion/transformer models for fast switching.
+They are moved between GPU VRAM and CPU RAM as necessary. If the cache
+grows larger than a preset maximum, then the least recently used
+model will be cleared and (re)loaded from disk when next needed.
+
+The cache returns context manager generators designed to load the
+model into the GPU within the context, and unload outside the
+context. Use like this:
+
+   cache = ModelCache(max_cache_size=7.5)
+   with cache.get_model('runwayml/stable-diffusion-1-5') as SD1,
+          cache.get_model('stabilityai/stable-diffusion-2') as SD2:
+       do_something_in_GPU(SD1,SD2)
+
+
+"""
+
+import math
+import time
+from contextlib import suppress
+from logging import Logger
+from typing import Any, Dict, List, Optional
+
+import torch
+
+from invokeai.app.services.model_records import UnknownModelException
+from invokeai.backend.model_manager import SubModelType
+from invokeai.backend.model_manager.load.load_base import AnyModel, ModelLockerBase
+from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot, get_pretty_snapshot_diff
+from invokeai.backend.model_manager.load.model_util import calc_model_size_by_data
+from invokeai.backend.model_manager.load.ram_cache.ram_cache_base import CacheRecord, CacheStats, ModelCacheBase
+from invokeai.backend.util.devices import choose_torch_device
+from invokeai.backend.util.logging import InvokeAILogger
+
+if choose_torch_device() == torch.device("mps"):
+    from torch import mps
+
+# Maximum size of the cache, in gigs
+# Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously
+DEFAULT_MAX_CACHE_SIZE = 6.0
+
+# amount of GPU memory to hold in reserve for use by generations (GB)
+DEFAULT_MAX_VRAM_CACHE_SIZE = 2.75
+
+# actual size of a gig
+GIG = 1073741824
+
+# Size of a MB in bytes.
+MB = 2**20
+
+
+class ModelCache(ModelCacheBase):
+    """Implementation of ModelCacheBase."""
+
+    def __init__(
+        self,
+        max_cache_size: float = DEFAULT_MAX_CACHE_SIZE,
+        max_vram_cache_size: float = DEFAULT_MAX_VRAM_CACHE_SIZE,
+        execution_device: torch.device = torch.device("cuda"),
+        storage_device: torch.device = torch.device("cpu"),
+        precision: torch.dtype = torch.float16,
+        sequential_offload: bool = False,
+        lazy_offloading: bool = True,
+        sha_chunksize: int = 16777216,
+        log_memory_usage: bool = False,
+        logger: Optional[Logger] = None,
+    ):
+        """
+        Initialize the model RAM cache.
+
+        :param max_cache_size: Maximum size of the RAM cache [6.0 GB]
+        :param execution_device: Torch device to load active model into [torch.device('cuda')]
+        :param storage_device: Torch device to save inactive model in [torch.device('cpu')]
+        :param precision: Precision for loaded models [torch.float16]
+        :param lazy_offloading: Keep model in VRAM until another model needs to be loaded
+        :param sequential_offload: Conserve VRAM by loading and unloading each stage of the pipeline sequentially
+        :param log_memory_usage: If True, a memory snapshot will be captured before and after every model cache
+            operation, and the result will be logged (at debug level). There is a time cost to capturing the memory
+            snapshots, so it is recommended to disable this feature unless you are actively inspecting the model cache's
+            behaviour.
+        """
+        # allow lazy offloading only when vram cache enabled
+        self._lazy_offloading = lazy_offloading and max_vram_cache_size > 0
+        self._precision: torch.dtype = precision
+        self._max_cache_size: float = max_cache_size
+        self._max_vram_cache_size: float = max_vram_cache_size
+        self._execution_device: torch.device = execution_device
+        self._storage_device: torch.device = storage_device
+        self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__)
+        self._log_memory_usage = log_memory_usage
+
+        # used for stats collection
+        self.stats = None
+
+        self._cached_models: Dict[str, CacheRecord] = {}
+        self._cache_stack: List[str] = []
+
+    class ModelLocker(ModelLockerBase):
+        """Internal class that mediates movement in and out of GPU."""
+
+        def __init__(self, cache: ModelCacheBase, cache_entry: CacheRecord):
+            """
+            Initialize the model locker.
+
+            :param cache: The ModelCache object
+            :param cache_entry: The entry in the model cache
+            """
+            self._cache = cache
+            self._cache_entry = cache_entry
+
+        @property
+        def model(self) -> AnyModel:
+            """Return the model without moving it around."""
+            return self._cache_entry.model
+
+        def lock(self) -> Any:
+            """Move the model into the execution device (GPU) and lock it."""
+            if not hasattr(self.model, "to"):
+                return self.model
+
+            # NOTE that the model has to have the to() method in order for this code to move it into GPU!
+            self._cache_entry.lock()
+
+            try:
+                if self._cache.lazy_offloading:
+                    self._cache.offload_unlocked_models()
+
+                self._cache.move_model_to_device(self._cache_entry, self._cache.execution_device)
+
+                self._cache.logger.debug(f"Locking {self._cache_entry.key} in {self._cache.execution_device}")
+                self._cache.print_cuda_stats()
+
+            except Exception:
+                self._cache_entry.unlock()
+                raise
+            return self.model
+
+        def unlock(self) -> None:
+            """Call upon exit from context."""
+            if not hasattr(self.model, "to"):
+                return
+
+            self._cache_entry.unlock()
+            if not self._cache.lazy_offloading:
+                self._cache.offload_unlocked_models()
+                self._cache.print_cuda_stats()
+
+    @property
+    def logger(self) -> Logger:
+        """Return the logger used by the cache."""
+        return self._logger
+
+    @property
+    def lazy_offloading(self) -> bool:
+        """Return true if the cache is configured to lazily offload models in VRAM."""
+        return self._lazy_offloading
+
+    @property
+    def storage_device(self) -> torch.device:
+        """Return the storage device (e.g. "CPU" for RAM)."""
+        return self._storage_device
+
+    @property
+    def execution_device(self) -> torch.device:
+        """Return the exection device (e.g. "cuda" for VRAM)."""
+        return self._execution_device
+
+    def cache_size(self) -> int:
+        """Get the total size of the models currently cached."""
+        total = 0
+        for cache_record in self._cached_models.values():
+            total += cache_record.size
+        return total
+
+    def exists(
+        self,
+        key: str,
+        submodel_type: Optional[SubModelType] = None,
+    ) -> bool:
+        """Return true if the model identified by key and submodel_type is in the cache."""
+        key = self._make_cache_key(key, submodel_type)
+        return key in self._cached_models
+
+    def put(
+        self,
+        key: str,
+        model: AnyModel,
+        submodel_type: Optional[SubModelType] = None,
+    ) -> None:
+        """Store model under key and optional submodel_type."""
+        key = self._make_cache_key(key, submodel_type)
+        assert key not in self._cached_models
+
+        loaded_model_size = calc_model_size_by_data(model)
+        cache_record = CacheRecord(key, model, loaded_model_size)
+        self._cached_models[key] = cache_record
+        self._cache_stack.append(key)
+
+    def get(
+        self,
+        key: str,
+        submodel_type: Optional[SubModelType] = None,
+    ) -> ModelLockerBase:
+        """
+        Retrieve model using key and optional submodel_type.
+
+        This may return an UnknownModelException if the model is not in the cache.
+        """
+        key = self._make_cache_key(key, submodel_type)
+        if key not in self._cached_models:
+            raise UnknownModelException
+
+        # this moves the entry to the top (right end) of the stack
+        with suppress(Exception):
+            self._cache_stack.remove(key)
+        self._cache_stack.append(key)
+        cache_entry = self._cached_models[key]
+        return self.ModelLocker(
+            cache=self,
+            cache_entry=cache_entry,
+        )
+
+    def _capture_memory_snapshot(self) -> Optional[MemorySnapshot]:
+        if self._log_memory_usage:
+            return MemorySnapshot.capture()
+        return None
+
+    def _make_cache_key(self, model_key: str, submodel_type: Optional[SubModelType] = None) -> str:
+        if submodel_type:
+            return f"{model_key}:{submodel_type.value}"
+        else:
+            return model_key
+
+    def offload_unlocked_models(self) -> None:
+        """Move any unused models from VRAM."""
+        reserved = self._max_vram_cache_size * GIG
+        vram_in_use = torch.cuda.memory_allocated()
+        self.logger.debug(f"{(vram_in_use/GIG):.2f}GB VRAM used for models; max allowed={(reserved/GIG):.2f}GB")
+        for _, cache_entry in sorted(self._cached_models.items(), key=lambda x: x[1].size):
+            if vram_in_use <= reserved:
+                break
+            if not cache_entry.locked:
+                self.move_model_to_device(cache_entry, self.storage_device)
+
+                vram_in_use = torch.cuda.memory_allocated()
+                self.logger.debug(f"{(vram_in_use/GIG):.2f}GB VRAM used for models; max allowed={(reserved/GIG):.2f}GB")
+
+        torch.cuda.empty_cache()
+        if choose_torch_device() == torch.device("mps"):
+            mps.empty_cache()
+
+    # TO DO: Only reason to pass the CacheRecord rather than the model is to get the key and size
+    # for printing debugging messages. Revisit whether this is necessary
+    def move_model_to_device(self, cache_entry: CacheRecord, target_device: torch.device) -> None:
+        """Move model into the indicated device."""
+        # These attributes are not in the base class but in derived classes
+        assert hasattr(cache_entry.model, "device")
+        assert hasattr(cache_entry.model, "to")
+
+        source_device = cache_entry.model.device
+
+        # Note: We compare device types only so that 'cuda' == 'cuda:0'. This would need to be revised to support
+        # multi-GPU.
+        if torch.device(source_device).type == torch.device(target_device).type:
+            return
+
+        start_model_to_time = time.time()
+        snapshot_before = self._capture_memory_snapshot()
+        cache_entry.model.to(target_device)
+        snapshot_after = self._capture_memory_snapshot()
+        end_model_to_time = time.time()
+        self.logger.debug(
+            f"Moved model '{cache_entry.key}' from {source_device} to"
+            f" {target_device} in {(end_model_to_time-start_model_to_time):.2f}s.\n"
+            f"Estimated model size: {(cache_entry.size/GIG):.3f} GB.\n"
+            f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
+        )
+
+        if (
+            snapshot_before is not None
+            and snapshot_after is not None
+            and snapshot_before.vram is not None
+            and snapshot_after.vram is not None
+        ):
+            vram_change = abs(snapshot_before.vram - snapshot_after.vram)
+
+            # If the estimated model size does not match the change in VRAM, log a warning.
+            if not math.isclose(
+                vram_change,
+                cache_entry.size,
+                rel_tol=0.1,
+                abs_tol=10 * MB,
+            ):
+                self.logger.debug(
+                    f"Moving model '{cache_entry.key}' from {source_device} to"
+                    f" {target_device} caused an unexpected change in VRAM usage. The model's"
+                    " estimated size may be incorrect. Estimated model size:"
+                    f" {(cache_entry.size/GIG):.3f} GB.\n"
+                    f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
+                )
+
+    def print_cuda_stats(self) -> None:
+        """Log CUDA diagnostics."""
+        vram = "%4.2fG" % (torch.cuda.memory_allocated() / GIG)
+        ram = "%4.2fG" % self.cache_size()
+
+        cached_models = 0
+        loaded_models = 0
+        locked_models = 0
+        for cache_record in self._cached_models.values():
+            cached_models += 1
+            assert hasattr(cache_record.model, "device")
+            if cache_record.model.device is self.storage_device:
+                loaded_models += 1
+            if cache_record.locked:
+                locked_models += 1
+
+        self.logger.debug(
+            f"Current VRAM/RAM usage: {vram}/{ram}; cached_models/loaded_models/locked_models/ ="
+            f" {cached_models}/{loaded_models}/{locked_models}"
+        )
+
+    def get_stats(self) -> CacheStats:
+        """Return cache hit/miss/size statistics."""
+        raise NotImplementedError
+
+    def make_room(self, size: int) -> None:
+        """Make enough room in the cache to accommodate a new model of indicated size."""
+        raise NotImplementedError
--- a/invokeai/backend/model_manager/load/vae.py
+++ b/invokeai/backend/model_manager/load/vae.py
@ -0,0 +1,31 @@
+# Copyright (c) 2024, Lincoln D. Stein and the InvokeAI Development Team
+"""Class for VAE model loading in InvokeAI."""
+
+from pathlib import Path
+from typing import Dict, Optional
+
+import torch
+
+from invokeai.backend.model_manager import BaseModelType, ModelFormat, ModelRepoVariant, ModelType, SubModelType
+from invokeai.backend.model_manager.load.load_base import AnyModelLoader
+from invokeai.backend.model_manager.load.load_default import ModelLoader
+
+
+@AnyModelLoader.register(base=BaseModelType.Any, type=ModelType.Vae, format=ModelFormat.Diffusers)
+class VaeDiffusersModel(ModelLoader):
+    """Class to load VAE models."""
+
+    def _load_model(
+        self,
+        model_path: Path,
+        model_variant: Optional[ModelRepoVariant] = None,
+        submodel_type: Optional[SubModelType] = None,
+    ) -> Dict[str, torch.Tensor]:
+        if submodel_type is not None:
+            raise Exception("There are no submodels in VAEs")
+        vae_class = self._get_hf_load_class(model_path)
+        variant = model_variant.value if model_variant else ""
+        result: Dict[str, torch.Tensor] = vae_class.from_pretrained(
+            model_path, torch_dtype=self._torch_dtype, variant=variant
+        )  # type: ignore
+        return result
--- a/invokeai/backend/model_manager/onnx_runtime.py
+++ b/invokeai/backend/model_manager/onnx_runtime.py
@ -0,0 +1,216 @@
+# Copyright (c) 2024 The InvokeAI Development Team
+import os
+import sys
+from pathlib import Path
+from typing import Any, List, Optional, Tuple, Union
+
+import numpy as np
+import onnx
+from onnx import numpy_helper
+from onnxruntime import InferenceSession, SessionOptions, get_available_providers
+
+ONNX_WEIGHTS_NAME = "model.onnx"
+
+
+# NOTE FROM LS: This was copied from Stalker's original implementation.
+# I have not yet gone through and fixed all the type hints
+class IAIOnnxRuntimeModel:
+    class _tensor_access:
+        def __init__(self, model):  # type: ignore
+            self.model = model
+            self.indexes = {}
+            for idx, obj in enumerate(self.model.proto.graph.initializer):
+                self.indexes[obj.name] = idx
+
+        def __getitem__(self, key: str):  # type: ignore
+            value = self.model.proto.graph.initializer[self.indexes[key]]
+            return numpy_helper.to_array(value)
+
+        def __setitem__(self, key: str, value: np.ndarray):  # type: ignore
+            new_node = numpy_helper.from_array(value)
+            # set_external_data(new_node, location="in-memory-location")
+            new_node.name = key
+            # new_node.ClearField("raw_data")
+            del self.model.proto.graph.initializer[self.indexes[key]]
+            self.model.proto.graph.initializer.insert(self.indexes[key], new_node)
+            # self.model.data[key] = OrtValue.ortvalue_from_numpy(value)
+
+        # __delitem__
+
+        def __contains__(self, key: str) -> bool:
+            return self.indexes[key] in self.model.proto.graph.initializer
+
+        def items(self) -> List[Tuple[str, Any]]:  # fixme
+            raise NotImplementedError("tensor.items")
+            # return [(obj.name, obj) for obj in self.raw_proto]
+
+        def keys(self) -> List[str]:
+            return list(self.indexes.keys())
+
+        def values(self) -> List[Any]:  # fixme
+            raise NotImplementedError("tensor.values")
+            # return [obj for obj in self.raw_proto]
+
+        def size(self) -> int:
+            bytesSum = 0
+            for node in self.model.proto.graph.initializer:
+                bytesSum += sys.getsizeof(node.raw_data)
+            return bytesSum
+
+    class _access_helper:
+        def __init__(self, raw_proto):  # type: ignore
+            self.indexes = {}
+            self.raw_proto = raw_proto
+            for idx, obj in enumerate(raw_proto):
+                self.indexes[obj.name] = idx
+
+        def __getitem__(self, key: str):  # type: ignore
+            return self.raw_proto[self.indexes[key]]
+
+        def __setitem__(self, key: str, value):  # type: ignore
+            index = self.indexes[key]
+            del self.raw_proto[index]
+            self.raw_proto.insert(index, value)
+
+        # __delitem__
+
+        def __contains__(self, key: str) -> bool:
+            return key in self.indexes
+
+        def items(self) -> List[Tuple[str, Any]]:
+            return [(obj.name, obj) for obj in self.raw_proto]
+
+        def keys(self) -> List[str]:
+            return list(self.indexes.keys())
+
+        def values(self) -> List[Any]:  # fixme
+            return list(self.raw_proto)
+
+    def __init__(self, model_path: str, provider: Optional[str]):
+        self.path = model_path
+        self.session = None
+        self.provider = provider
+        """
+        self.data_path = self.path + "_data"
+        if not os.path.exists(self.data_path):
+            print(f"Moving model tensors to separate file: {self.data_path}")
+            tmp_proto = onnx.load(model_path, load_external_data=True)
+            onnx.save_model(tmp_proto, self.path, save_as_external_data=True, all_tensors_to_one_file=True, location=os.path.basename(self.data_path), size_threshold=1024, convert_attribute=False)
+            del tmp_proto
+            gc.collect()
+
+        self.proto = onnx.load(model_path, load_external_data=False)
+        """
+
+        self.proto = onnx.load(model_path, load_external_data=True)
+        # self.data = dict()
+        # for tensor in self.proto.graph.initializer:
+        #     name = tensor.name
+
+        #     if tensor.HasField("raw_data"):
+        #         npt = numpy_helper.to_array(tensor)
+        #         orv = OrtValue.ortvalue_from_numpy(npt)
+        #         # self.data[name] = orv
+        #         # set_external_data(tensor, location="in-memory-location")
+        #         tensor.name = name
+        #         # tensor.ClearField("raw_data")
+
+        self.nodes = self._access_helper(self.proto.graph.node)  # type: ignore
+        # self.initializers = self._access_helper(self.proto.graph.initializer)
+        # print(self.proto.graph.input)
+        # print(self.proto.graph.initializer)
+
+        self.tensors = self._tensor_access(self)  # type: ignore
+
+    # TODO: integrate with model manager/cache
+    def create_session(self, height=None, width=None):
+        if self.session is None or self.session_width != width or self.session_height != height:
+            # onnx.save(self.proto, "tmp.onnx")
+            # onnx.save_model(self.proto, "tmp.onnx", save_as_external_data=True, all_tensors_to_one_file=True, location="tmp.onnx_data", size_threshold=1024, convert_attribute=False)
+            # TODO: something to be able to get weight when they already moved outside of model proto
+            # (trimmed_model, external_data) = buffer_external_data_tensors(self.proto)
+            sess = SessionOptions()
+            # self._external_data.update(**external_data)
+            # sess.add_external_initializers(list(self.data.keys()), list(self.data.values()))
+            # sess.enable_profiling = True
+
+            # sess.intra_op_num_threads = 1
+            # sess.inter_op_num_threads = 1
+            # sess.execution_mode = ExecutionMode.ORT_SEQUENTIAL
+            # sess.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
+            # sess.enable_cpu_mem_arena = True
+            # sess.enable_mem_pattern = True
+            # sess.add_session_config_entry("session.intra_op.use_xnnpack_threadpool", "1") ########### It's the key code
+            self.session_height = height
+            self.session_width = width
+            if height and width:
+                sess.add_free_dimension_override_by_name("unet_sample_batch", 2)
+                sess.add_free_dimension_override_by_name("unet_sample_channels", 4)
+                sess.add_free_dimension_override_by_name("unet_hidden_batch", 2)
+                sess.add_free_dimension_override_by_name("unet_hidden_sequence", 77)
+                sess.add_free_dimension_override_by_name("unet_sample_height", self.session_height)
+                sess.add_free_dimension_override_by_name("unet_sample_width", self.session_width)
+                sess.add_free_dimension_override_by_name("unet_time_batch", 1)
+            providers = []
+            if self.provider:
+                providers.append(self.provider)
+            else:
+                providers = get_available_providers()
+            if "TensorrtExecutionProvider" in providers:
+                providers.remove("TensorrtExecutionProvider")
+            try:
+                self.session = InferenceSession(self.proto.SerializeToString(), providers=providers, sess_options=sess)
+            except Exception as e:
+                raise e
+            # self.session = InferenceSession("tmp.onnx", providers=[self.provider], sess_options=self.sess_options)
+            # self.io_binding = self.session.io_binding()
+
+    def release_session(self):
+        self.session = None
+        import gc
+
+        gc.collect()
+        return
+
+    def __call__(self, **kwargs):
+        if self.session is None:
+            raise Exception("You should call create_session before running model")
+
+        inputs = {k: np.array(v) for k, v in kwargs.items()}
+        # output_names = self.session.get_outputs()
+        # for k in inputs:
+        #     self.io_binding.bind_cpu_input(k, inputs[k])
+        # for name in output_names:
+        #     self.io_binding.bind_output(name.name)
+        # self.session.run_with_iobinding(self.io_binding, None)
+        # return self.io_binding.copy_outputs_to_cpu()
+        return self.session.run(None, inputs)
+
+    # compatability with diffusers load code
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_id: Union[str, Path],
+        subfolder: Optional[Union[str, Path]] = None,
+        file_name: Optional[str] = None,
+        provider: Optional[str] = None,
+        sess_options: Optional["SessionOptions"] = None,
+        **kwargs: Any,
+    ) -> Any:  # fixme
+        file_name = file_name or ONNX_WEIGHTS_NAME
+
+        if os.path.isdir(model_id):
+            model_path = model_id
+            if subfolder is not None:
+                model_path = os.path.join(model_path, subfolder)
+            model_path = os.path.join(model_path, file_name)
+
+        else:
+            model_path = model_id
+
+        # load model from local directory
+        if not os.path.isfile(model_path):
+            raise Exception(f"Model not found: {model_path}")
+
+        # TODO: session options
+        return cls(str(model_path), provider=provider)
--- a/invokeai/backend/model_manager/probe.py
+++ b/invokeai/backend/model_manager/probe.py
@ -18,9 +18,9 @@ from .config import (
    InvalidModelConfigException,
    ModelConfigFactory,
    ModelFormat,
+    ModelRepoVariant,
    ModelType,
    ModelVariantType,
-    ModelRepoVariant,
    SchedulerPredictionType,
 )
 from .hash import FastModelHash
@ -483,8 +483,8 @@ class FolderProbeBase(ProbeBase):

    def get_repo_variant(self) -> ModelRepoVariant:
        # get all files ending in .bin or .safetensors
-        weight_files = list(self.model_path.glob('**/*.safetensors'))
-        weight_files.extend(list(self.model_path.glob('**/*.bin')))
+        weight_files = list(self.model_path.glob("**/*.safetensors"))
+        weight_files.extend(list(self.model_path.glob("**/*.bin")))
        for x in weight_files:
            if ".fp16" in x.suffixes:
                return ModelRepoVariant.FP16
@ -496,6 +496,7 @@ class FolderProbeBase(ProbeBase):
                return ModelRepoVariant.ONNX
        return ModelRepoVariant.DEFAULT

+
 class PipelineFolderProbe(FolderProbeBase):
    def get_base_type(self) -> BaseModelType:
        with open(self.model_path / "unet" / "config.json", "r") as file:
@ -540,7 +541,6 @@ class PipelineFolderProbe(FolderProbeBase):
        except Exception:
            pass
        return ModelVariantType.Normal
-        


 class VaeFolderProbe(FolderProbeBase):
--- a/tests/test_model_probe.py
+++ b/tests/test_model_probe.py
@ -21,9 +21,10 @@ def test_get_base_type(vae_path: str, expected_type: BaseModelType, datadir: Pat
    base_type = probe.get_base_type()
    assert base_type == expected_type
    repo_variant = probe.get_repo_variant()
-    assert repo_variant == 'default'
+    assert repo_variant == "default"
+

 def test_repo_variant(datadir: Path):
    probe = VaeFolderProbe(datadir / "vae" / "taesdxl-fp16")
    repo_variant = probe.get_repo_variant()
-    assert repo_variant == 'fp16'
+    assert repo_variant == "fp16"