wired attention configuration into backend

2024-08-30 20:32:17 +00:00 · 2023-08-17 14:20:45 -04:00
parent ed38eaa10c
commit c19835c2d0
2 changed files with 20 additions and 8 deletions
--- a/invokeai/app/services/config/invokeai_config.py
+++ b/invokeai/app/services/config/invokeai_config.py
@ -227,9 +227,9 @@ class InvokeAIAppConfig(InvokeAISettings):
    version             : bool = Field(default=False, description="Show InvokeAI version and exit", category="Other")

    # CACHE
-    ram                 : Union[float, Literal["auto"]] = Field(default=6.0, gt=0, description="Maximum memory amount used by model cache for rapid switching (floating point number or 'auto')", category="Cache", )
-    vram                : Union[float, Literal["auto"]] = Field(default=0.25, ge=0, description="Amount of VRAM reserved for model storage (floating point number or 'auto')", category="Cache", )
-    lazy_offload        : bool = Field(default=True, description="Keep models in VRAM until their space is needed", category="Cache", )
+    ram                 : Union[float, Literal["auto"]] = Field(default=6.0, gt=0, description="Maximum memory amount used by model cache for rapid switching (floating point number or 'auto')", category="Model Cache", )
+    vram                : Union[float, Literal["auto"]] = Field(default=0.25, ge=0, description="Amount of VRAM reserved for model storage (floating point number or 'auto')", category="Model Cache", )
+    lazy_offload        : bool = Field(default=True, description="Keep models in VRAM until their space is needed", category="Model Cache", )

    # DEVICE
    device              : Literal[tuple(["auto", "cpu", "cuda", "cuda:1", "mps"])] = Field(default="auto", description="Generation device", category="Device", )
@ -246,8 +246,6 @@ class InvokeAIAppConfig(InvokeAISettings):
    free_gpu_mem        : Optional[bool] = Field(default=None, description="If true, purge model from GPU after each generation.", category='Memory/Performance')
    max_cache_size      : Optional[float] = Field(default=None, gt=0, description="Maximum memory amount used by model cache for rapid switching", category='Memory/Performance')
    max_vram_cache_size : Optional[float] = Field(default=None, ge=0, description="Amount of VRAM reserved for model storage", category='Memory/Performance')
-    precision           : Literal[tuple(['auto','float16','float32','autocast'])] = Field(default='auto',description='Floating point precision', category='Memory/Performance')
-    sequential_guidance : bool = Field(default=False, description="Whether to calculate guidance in serial instead of in parallel, lowering memory requirements", category='Memory/Performance')
    xformers_enabled    : bool = Field(default=True, description="Enable/disable memory-efficient attention", category='Memory/Performance')
    tiled_decode        : bool = Field(default=False, description="Whether to enable tiled VAE decode (reduces memory consumption with some performance penalty)", category='Memory/Performance')

--- a/invokeai/backend/stable_diffusion/diffusers_pipeline.py
+++ b/invokeai/backend/stable_diffusion/diffusers_pipeline.py
@ -2,10 +2,8 @@ from __future__ import annotations

 import dataclasses
 import inspect
-import math
-import secrets
 from dataclasses import dataclass, field
-from typing import Any, Callable, Generic, List, Optional, Type, Union
+from typing import Any, Callable, List, Optional, Union

 import PIL.Image
 import einops
@ -293,6 +291,22 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
        if xformers is available, use it, otherwise use sliced attention.
        """
        config = InvokeAIAppConfig.get_config()
+        if config.attention_type == "xformers":
+            self.enable_xformers_memory_efficient_attention()
+            return
+        elif config.attention_type == "sliced":
+            slice_size = config.attention_slice_size
+            if torch.backends.mps.is_available():  #  doesn't auto already do this?
+                slice_size = "max"
+            self.enable_attention_slicing(slice_size=slice_size)
+            return
+        elif config.attention_type == "normal":
+            self.disable_attention_slicing()
+            return
+        elif config.attention_type == "torch-sdp":
+            raise Exception("torch-sdp attention slicing not yet implemented")
+
+        # the remainder if this code is called when attention_type=='auto'
        if self.unet.device.type == "cuda":
            if is_xformers_available() and not config.disable_xformers:
                self.enable_xformers_memory_efficient_attention()