From 9c1d250665fc9018afaab88be20a8db5d3bd3a7a Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Wed, 20 Dec 2023 22:11:16 -0500
Subject: [PATCH 1/3] hacked in stable-fast; can generate one image before crashing

---
 .../backend/model_management/model_cache.py | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/invokeai/backend/model_management/model_cache.py b/invokeai/backend/model_management/model_cache.py
index 2a7f4b5a95..da850075aa 100644
--- a/invokeai/backend/model_management/model_cache.py
+++ b/invokeai/backend/model_management/model_cache.py
@@ -39,6 +39,14 @@ from .models import BaseModelType, ModelBase, ModelType, SubModelType
 if choose_torch_device() == torch.device("mps"):
     from torch import mps
 
+sfast_available = True
+if sfast_available:
+    from sfast.compilers.diffusion_pipeline_compiler import (compile,
+                                                             compile_unet,
+                                                             compile_vae,
+                                                             CompilationConfig
+                                                             )
+
 # Maximum size of the cache, in gigs
 # Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously
 DEFAULT_MAX_CACHE_SIZE = 6.0
@@ -276,6 +284,8 @@ class ModelCache(object):
             self._cache_stack.remove(key)
         self._cache_stack.append(key)
 
+        if sfast_available and submodel:
+            cache_entry.model = self._compile_model(cache_entry.model, submodel)
         return self.ModelLocker(self, key, cache_entry.model, gpu_load, cache_entry.size)
 
     def _move_model_to_device(self, key: str, target_device: torch.device):
@@ -322,6 +332,20 @@ class ModelCache(object):
                 f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
             )
 
+    def _compile_model(self, model, model_type):
+        config = CompilationConfig.Default()
+        config.enable_xformers = True
+        config.enable_triton = True
+        config.enable_jit_freeze = True
+        config.enable_cuda_graph = True
+        if model_type == SubModelType("unet"):
+            return compile_unet(model, config)
+        elif model_type == SubModelType("vae"):
+            return compile_vae(model, config)
+        else:
+            return model
+
+
     class ModelLocker(object):
         def __init__(self, cache, key, model, gpu_load, size_needed):
             """

From 6cb3031c09072ca9a4e28f0303641db4c48b1555 Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Wed, 20 Dec 2023 22:40:56 -0500
Subject: [PATCH 2/3] only compile model the first time :-)

---
 invokeai/backend/model_management/model_cache.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/invokeai/backend/model_management/model_cache.py b/invokeai/backend/model_management/model_cache.py
index da850075aa..8080d4cade 100644
--- a/invokeai/backend/model_management/model_cache.py
+++ b/invokeai/backend/model_management/model_cache.py
@@ -247,6 +247,9 @@ class ModelCache(object):
             snapshot_before = self._capture_memory_snapshot()
             with skip_torch_weight_init():
                 model = model_info.get_model(child_type=submodel, torch_dtype=self.precision)
+            if sfast_available and submodel:
+                model = self._compile_model(model, submodel)
+
             snapshot_after = self._capture_memory_snapshot()
             end_load_time = time.time()
 
@@ -284,8 +287,6 @@ class ModelCache(object):
             self._cache_stack.remove(key)
         self._cache_stack.append(key)
 
-        if sfast_available and submodel:
-            cache_entry.model = self._compile_model(cache_entry.model, submodel)
         return self.ModelLocker(self, key, cache_entry.model, gpu_load, cache_entry.size)
 
     def _move_model_to_device(self, key: str, target_device: torch.device):
@@ -336,7 +337,6 @@ class ModelCache(object):
         config = CompilationConfig.Default()
         config.enable_xformers = True
         config.enable_triton = True
-        config.enable_jit_freeze = True
         config.enable_cuda_graph = True
         if model_type == SubModelType("unet"):
             return compile_unet(model, config)

From e3ab074b9553e843637618224ec7cc2eaa722d22 Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Thu, 21 Dec 2023 16:10:52 -0500
Subject: [PATCH 3/3] probe for availability of stable-fast compiler and triton at startup time

---
 .../app/services/config/config_default.py    |  1 +
 .../backend/model_management/model_cache.py  | 45 ++++++++++++-------
 pyproject.toml                               |  1 +
 3 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/invokeai/app/services/config/config_default.py b/invokeai/app/services/config/config_default.py
index a55bcd3a21..32178e5a39 100644
--- a/invokeai/app/services/config/config_default.py
+++ b/invokeai/app/services/config/config_default.py
@@ -271,6 +271,7 @@ class InvokeAIAppConfig(InvokeAISettings):
     attention_slice_size: Literal["auto", "balanced", "max", 1, 2, 3, 4, 5, 6, 7, 8] = Field(default="auto", description='Slice size, valid when attention_type=="sliced"', json_schema_extra=Categories.Generation)
     force_tiled_decode : bool = Field(default=False, description="Whether to enable tiled VAE decode (reduces memory consumption with some performance penalty)", json_schema_extra=Categories.Generation)
     png_compress_level : int = Field(default=6, description="The compress_level setting of PIL.Image.save(), used for PNG encoding. All settings are lossless. 0 = fastest, largest filesize, 9 = slowest, smallest filesize", json_schema_extra=Categories.Generation)
+    stable_fast : bool = Field(default=True, description="Enable stable-fast performance optimizations, if the library is installed and functional", json_schema_extra=Categories.Generation)
 
     # QUEUE
     max_queue_size : int = Field(default=10000, gt=0, description="Maximum number of items in the session queue", json_schema_extra=Categories.Queue)
diff --git a/invokeai/backend/model_management/model_cache.py b/invokeai/backend/model_management/model_cache.py
index 8080d4cade..18bcae40aa 100644
--- a/invokeai/backend/model_management/model_cache.py
+++ b/invokeai/backend/model_management/model_cache.py
@@ -39,13 +39,29 @@ from .models import BaseModelType, ModelBase, ModelType, SubModelType
 if choose_torch_device() == torch.device("mps"):
     from torch import mps
 
-sfast_available = True
-if sfast_available:
-    from sfast.compilers.diffusion_pipeline_compiler import (compile,
-                                                             compile_unet,
-                                                             compile_vae,
-                                                             CompilationConfig
-                                                             )
+SFAST_AVAILABLE = False
+TRITON_AVAILABLE = False
+SFAST_CONFIG = None
+
+try:
+    import triton
+
+    TRITON_AVAILABLE = True
+except ImportError:
+    pass
+
+try:
+    from sfast.compilers.diffusion_pipeline_compiler import compile_unet, compile_vae, CompilationConfig
+
+    SFAST_CONFIG = CompilationConfig.Default()
+    SFAST_CONFIG.enable_xformers = True
+    SFAST_CONFIG.enable_cuda_graph = True
+    if TRITON_AVAILABLE:
+        SFAST_CONFIG.enable_triton = True
+    SFAST_AVAILABLE = True
+except ImportError:
+    pass
+
 
 # Maximum size of the cache, in gigs
 # Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously
@@ -247,7 +263,7 @@ class ModelCache(object):
             snapshot_before = self._capture_memory_snapshot()
             with skip_torch_weight_init():
                 model = model_info.get_model(child_type=submodel, torch_dtype=self.precision)
-            if sfast_available and submodel:
+            if SFAST_AVAILABLE and submodel:
                 model = self._compile_model(model, submodel)
 
             snapshot_after = self._capture_memory_snapshot()
@@ -333,18 +349,15 @@ class ModelCache(object):
                 f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
             )
 
-    def _compile_model(self, model, model_type):
-        config = CompilationConfig.Default()
-        config.enable_xformers = True
-        config.enable_triton = True
-        config.enable_cuda_graph = True
+    def _compile_model(self, model: Any, model_type: SubModelType) -> Any:
         if model_type == SubModelType("unet"):
-            return compile_unet(model, config)
+            self.logger.info("SFast-compiling unet model")
+            return compile_unet(model, SFAST_CONFIG)
         elif model_type == SubModelType("vae"):
-            return compile_vae(model, config)
+            self.logger.info("SFast-compiling vae model")
+            return compile_vae(model, SFAST_CONFIG)
         else:
             return model
-
 
     class ModelLocker(object):
         def __init__(self, cache, key, model, gpu_load, size_needed):
diff --git a/pyproject.toml b/pyproject.toml
index 98018dc7cb..0af7d96f3c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -113,6 +113,7 @@ dependencies = [
 "onnx" = ["onnxruntime"]
 "onnx-cuda" = ["onnxruntime-gpu"]
 "onnx-directml" = ["onnxruntime-directml"]
+"stable-fast" = ["stable-fast"]
 
 [project.scripts]
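
Note: the standalone sketch below is not part of the patch series; it only illustrates how the import-time availability probe from patch 3 and the per-submodel compile step from patch 2 fit together. It reuses the sfast entry points the patches themselves import (compile_unet, compile_vae, CompilationConfig); the maybe_compile() helper and the diffusers-style pipeline it operates on are hypothetical, for illustration only.

# Illustrative sketch only -- mirrors the module-level probe added in patch 3
# and the compile dispatch applied when a submodel is first loaded (patch 2).
SFAST_AVAILABLE = False
TRITON_AVAILABLE = False
SFAST_CONFIG = None

try:
    import triton  # noqa: F401  (presence check only)

    TRITON_AVAILABLE = True
except ImportError:
    pass

try:
    from sfast.compilers.diffusion_pipeline_compiler import CompilationConfig, compile_unet, compile_vae

    SFAST_CONFIG = CompilationConfig.Default()
    SFAST_CONFIG.enable_xformers = True
    SFAST_CONFIG.enable_cuda_graph = True
    if TRITON_AVAILABLE:
        SFAST_CONFIG.enable_triton = True
    SFAST_AVAILABLE = True
except ImportError:
    pass


def maybe_compile(pipe):
    """Compile the unet and vae of an already-loaded diffusers pipeline when stable-fast is usable."""
    if not SFAST_AVAILABLE:
        return pipe
    pipe.unet = compile_unet(pipe.unet, SFAST_CONFIG)
    pipe.vae = compile_vae(pipe.vae, SFAST_CONFIG)
    return pipe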