From 9c1d250665fc9018afaab88be20a8db5d3bd3a7a Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Wed, 20 Dec 2023 22:11:16 -0500
Subject: [PATCH 1/3] hacked in stable-fast; can generate one image before crashing

---
 .../backend/model_management/model_cache.py | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/invokeai/backend/model_management/model_cache.py b/invokeai/backend/model_management/model_cache.py
index 2a7f4b5a95..da850075aa 100644
--- a/invokeai/backend/model_management/model_cache.py
+++ b/invokeai/backend/model_management/model_cache.py
@@ -39,6 +39,14 @@ from .models import BaseModelType, ModelBase, ModelType, SubModelType
 if choose_torch_device() == torch.device("mps"):
     from torch import mps
 
+sfast_available = True
+if sfast_available:
+    from sfast.compilers.diffusion_pipeline_compiler import (compile,
+                                                             compile_unet,
+                                                             compile_vae,
+                                                             CompilationConfig
+                                                             )
+
 # Maximum size of the cache, in gigs
 # Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously
 DEFAULT_MAX_CACHE_SIZE = 6.0
@@ -276,6 +284,8 @@ class ModelCache(object):
             self._cache_stack.remove(key)
         self._cache_stack.append(key)
 
+        if sfast_available and submodel:
+            cache_entry.model = self._compile_model(cache_entry.model, submodel)
         return self.ModelLocker(self, key, cache_entry.model, gpu_load, cache_entry.size)
 
     def _move_model_to_device(self, key: str, target_device: torch.device):
@@ -322,6 +332,20 @@ class ModelCache(object):
                 f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
             )
 
+    def _compile_model(self, model, model_type):
+        config = CompilationConfig.Default()
+        config.enable_xformers = True
+        config.enable_triton = True
+        config.enable_jit_freeze = True
+        config.enable_cuda_graph = True
+        if model_type == SubModelType("unet"):
+            return compile_unet(model, config)
+        elif model_type == SubModelType("vae"):
+            return compile_vae(model, config)
+        else:
+            return model
+
+
     class ModelLocker(object):
         def __init__(self, cache, key, model, gpu_load, size_needed):
             """

From 6cb3031c09072ca9a4e28f0303641db4c48b1555 Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Wed, 20 Dec 2023 22:40:56 -0500
Subject: [PATCH 2/3] only compile model the first time :-)

---
 invokeai/backend/model_management/model_cache.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/invokeai/backend/model_management/model_cache.py b/invokeai/backend/model_management/model_cache.py
index da850075aa..8080d4cade 100644
--- a/invokeai/backend/model_management/model_cache.py
+++ b/invokeai/backend/model_management/model_cache.py
@@ -247,6 +247,9 @@ class ModelCache(object):
             snapshot_before = self._capture_memory_snapshot()
             with skip_torch_weight_init():
                 model = model_info.get_model(child_type=submodel, torch_dtype=self.precision)
+            if sfast_available and submodel:
+                model = self._compile_model(model, submodel)
+
             snapshot_after = self._capture_memory_snapshot()
             end_load_time = time.time()
 
@@ -284,8 +287,6 @@ class ModelCache(object):
             self._cache_stack.remove(key)
         self._cache_stack.append(key)
 
-        if sfast_available and submodel:
-            cache_entry.model = self._compile_model(cache_entry.model, submodel)
         return self.ModelLocker(self, key, cache_entry.model, gpu_load, cache_entry.size)
 
     def _move_model_to_device(self, key: str, target_device: torch.device):
@@ -336,7 +337,6 @@ class ModelCache(object):
         config = CompilationConfig.Default()
         config.enable_xformers = True
         config.enable_triton = True
-        config.enable_jit_freeze = True
         config.enable_cuda_graph = True
         if model_type == SubModelType("unet"):
             return compile_unet(model, config)

From e3ab074b9553e843637618224ec7cc2eaa722d22 Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Thu, 21 Dec 2023 16:10:52 -0500
Subject: [PATCH 3/3] probe for availability of stable-fast compiler and triton at startup time

---
 .../app/services/config/config_default.py    |  1 +
 .../backend/model_management/model_cache.py  | 45 ++++++++++++-------
 pyproject.toml                               |  1 +
 3 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/invokeai/app/services/config/config_default.py b/invokeai/app/services/config/config_default.py
index a55bcd3a21..32178e5a39 100644
--- a/invokeai/app/services/config/config_default.py
+++ b/invokeai/app/services/config/config_default.py
@@ -271,6 +271,7 @@ class InvokeAIAppConfig(InvokeAISettings):
     attention_slice_size: Literal["auto", "balanced", "max", 1, 2, 3, 4, 5, 6, 7, 8] = Field(default="auto", description='Slice size, valid when attention_type=="sliced"', json_schema_extra=Categories.Generation)
     force_tiled_decode : bool = Field(default=False, description="Whether to enable tiled VAE decode (reduces memory consumption with some performance penalty)", json_schema_extra=Categories.Generation)
     png_compress_level : int = Field(default=6, description="The compress_level setting of PIL.Image.save(), used for PNG encoding. All settings are lossless. 0 = fastest, largest filesize, 9 = slowest, smallest filesize", json_schema_extra=Categories.Generation)
+    stable_fast : bool = Field(default=True, description="Enable stable-fast performance optimizations, if the library is installed and functional", json_schema_extra=Categories.Generation)
 
     # QUEUE
     max_queue_size : int = Field(default=10000, gt=0, description="Maximum number of items in the session queue", json_schema_extra=Categories.Queue)
diff --git a/invokeai/backend/model_management/model_cache.py b/invokeai/backend/model_management/model_cache.py
index 8080d4cade..18bcae40aa 100644
--- a/invokeai/backend/model_management/model_cache.py
+++ b/invokeai/backend/model_management/model_cache.py
@@ -39,13 +39,29 @@ from .models import BaseModelType, ModelBase, ModelType, SubModelType
 if choose_torch_device() == torch.device("mps"):
     from torch import mps
 
-sfast_available = True
-if sfast_available:
-    from sfast.compilers.diffusion_pipeline_compiler import (compile,
-                                                             compile_unet,
-                                                             compile_vae,
-                                                             CompilationConfig
-                                                             )
+SFAST_AVAILABLE = False
+TRITON_AVAILABLE = False
+SFAST_CONFIG = None
+
+try:
+    import triton
+
+    TRITON_AVAILABLE = True
+except ImportError:
+    pass
+
+try:
+    from sfast.compilers.diffusion_pipeline_compiler import compile_unet, compile_vae, CompilationConfig
+
+    SFAST_CONFIG = CompilationConfig.Default()
+    SFAST_CONFIG.enable_xformers = True
+    SFAST_CONFIG.enable_cuda_graph = True
+    if TRITON_AVAILABLE:
+        SFAST_CONFIG.enable_triton = True
+    SFAST_AVAILABLE = True
+except ImportError:
+    pass
+
 
 # Maximum size of the cache, in gigs
 # Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously
@@ -247,7 +263,7 @@ class ModelCache(object):
             snapshot_before = self._capture_memory_snapshot()
             with skip_torch_weight_init():
                 model = model_info.get_model(child_type=submodel, torch_dtype=self.precision)
-            if sfast_available and submodel:
+            if SFAST_AVAILABLE and submodel:
                 model = self._compile_model(model, submodel)
 
             snapshot_after = self._capture_memory_snapshot()
@@ -333,18 +349,15 @@ class ModelCache(object):
                 f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
             )
 
-    def _compile_model(self, model, model_type):
-        config = CompilationConfig.Default()
-        config.enable_xformers = True
-        config.enable_triton = True
-        config.enable_cuda_graph = True
+    def _compile_model(self, model: Any, model_type: SubModelType) -> Any:
         if model_type == SubModelType("unet"):
-            return compile_unet(model, config)
+            self.logger.info("SFast-compiling unet model")
+            return compile_unet(model, SFAST_CONFIG)
         elif model_type == SubModelType("vae"):
-            return compile_vae(model, config)
+            self.logger.info("SFast-compiling vae model")
+            return compile_vae(model, SFAST_CONFIG)
         else:
             return model
-
 
     class ModelLocker(object):
         def __init__(self, cache, key, model, gpu_load, size_needed):
diff --git a/pyproject.toml b/pyproject.toml
index 98018dc7cb..0af7d96f3c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -113,6 +113,7 @@ dependencies = [
 "onnx" = ["onnxruntime"]
 "onnx-cuda" = ["onnxruntime-gpu"]
 "onnx-directml" = ["onnxruntime-directml"]
+"stable-fast" = ["stable-fast"]
 
 [project.scripts]
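
Note: the standalone sketch below is not part of the patch series; it only illustrates how the import-time availability probe from patch 3 and the per-submodel compile step from patch 2 fit together. It reuses the sfast entry points the patches themselves import (compile_unet, compile_vae, CompilationConfig); the maybe_compile() helper and the diffusers-style pipeline it operates on are hypothetical, for illustration only.

# Illustrative sketch only -- mirrors the module-level probe added in patch 3
# and the compile dispatch applied when a submodel is first loaded (patch 2).
SFAST_AVAILABLE = False
TRITON_AVAILABLE = False
SFAST_CONFIG = None

try:
    import triton  # noqa: F401  (presence check only)

    TRITON_AVAILABLE = True
except ImportError:
    pass

try:
    from sfast.compilers.diffusion_pipeline_compiler import CompilationConfig, compile_unet, compile_vae

    SFAST_CONFIG = CompilationConfig.Default()
    SFAST_CONFIG.enable_xformers = True
    SFAST_CONFIG.enable_cuda_graph = True
    if TRITON_AVAILABLE:
        SFAST_CONFIG.enable_triton = True
    SFAST_AVAILABLE = True
except ImportError:
    pass


def maybe_compile(pipe):
    """Compile the unet and vae of an already-loaded diffusers pipeline when stable-fast is usable."""
    if not SFAST_AVAILABLE:
        return pipe
    pipe.unet = compile_unet(pipe.unet, SFAST_CONFIG)
    pipe.vae = compile_vae(pipe.vae, SFAST_CONFIG)
    return pipe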