wired attention configuration into backend

This commit is contained in:
Lincoln Stein 2023-08-17 14:20:45 -04:00
parent ed38eaa10c
commit c19835c2d0
2 changed files with 20 additions and 8 deletions

View File

@ -227,9 +227,9 @@ class InvokeAIAppConfig(InvokeAISettings):
version : bool = Field(default=False, description="Show InvokeAI version and exit", category="Other")
# CACHE
ram : Union[float, Literal["auto"]] = Field(default=6.0, gt=0, description="Maximum memory amount used by model cache for rapid switching (floating point number or 'auto')", category="Cache", )
vram : Union[float, Literal["auto"]] = Field(default=0.25, ge=0, description="Amount of VRAM reserved for model storage (floating point number or 'auto')", category="Cache", )
lazy_offload : bool = Field(default=True, description="Keep models in VRAM until their space is needed", category="Cache", )
ram : Union[float, Literal["auto"]] = Field(default=6.0, gt=0, description="Maximum memory amount used by model cache for rapid switching (floating point number or 'auto')", category="Model Cache", )
vram : Union[float, Literal["auto"]] = Field(default=0.25, ge=0, description="Amount of VRAM reserved for model storage (floating point number or 'auto')", category="Model Cache", )
lazy_offload : bool = Field(default=True, description="Keep models in VRAM until their space is needed", category="Model Cache", )
# DEVICE
device : Literal[tuple(["auto", "cpu", "cuda", "cuda:1", "mps"])] = Field(default="auto", description="Generation device", category="Device", )
@ -246,8 +246,6 @@ class InvokeAIAppConfig(InvokeAISettings):
free_gpu_mem : Optional[bool] = Field(default=None, description="If true, purge model from GPU after each generation.", category='Memory/Performance')
max_cache_size : Optional[float] = Field(default=None, gt=0, description="Maximum memory amount used by model cache for rapid switching", category='Memory/Performance')
max_vram_cache_size : Optional[float] = Field(default=None, ge=0, description="Amount of VRAM reserved for model storage", category='Memory/Performance')
precision : Literal[tuple(['auto','float16','float32','autocast'])] = Field(default='auto',description='Floating point precision', category='Memory/Performance')
sequential_guidance : bool = Field(default=False, description="Whether to calculate guidance in serial instead of in parallel, lowering memory requirements", category='Memory/Performance')
xformers_enabled : bool = Field(default=True, description="Enable/disable memory-efficient attention", category='Memory/Performance')
tiled_decode : bool = Field(default=False, description="Whether to enable tiled VAE decode (reduces memory consumption with some performance penalty)", category='Memory/Performance')

View File

@ -2,10 +2,8 @@ from __future__ import annotations
import dataclasses
import inspect
import math
import secrets
from dataclasses import dataclass, field
from typing import Any, Callable, Generic, List, Optional, Type, Union
from typing import Any, Callable, List, Optional, Union
import PIL.Image
import einops
@ -293,6 +291,22 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
if xformers is available, use it, otherwise use sliced attention.
"""
config = InvokeAIAppConfig.get_config()
if config.attention_type == "xformers":
self.enable_xformers_memory_efficient_attention()
return
elif config.attention_type == "sliced":
slice_size = config.attention_slice_size
if torch.backends.mps.is_available(): # doesn't auto already do this?
slice_size = "max"
self.enable_attention_slicing(slice_size=slice_size)
return
elif config.attention_type == "normal":
self.disable_attention_slicing()
return
elif config.attention_type == "torch-sdp":
raise Exception("torch-sdp attention slicing not yet implemented")
# the remainder if this code is called when attention_type=='auto'
if self.unet.device.type == "cuda":
if is_xformers_available() and not config.disable_xformers:
self.enable_xformers_memory_efficient_attention()