Mirror of https://github.com/invoke-ai/InvokeAI
Disable autocast for cpu to fix error. Remove unused precision arg. (#518)
When running on CPU only (Intel), a call to torch.layer_norm would error with RuntimeError: expected scalar type BFloat16 but found Float. Also fixes buggy device handling in model.py. Tested with scripts/dream.py --full_precision on a CPU-only Intel laptop; works, but slow at ~10s/it.
parent 6665f4494f
commit 0bc6779361
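For context, a minimal sketch of the failure mode (a reconstruction assumed from the error text above, not code from the original report). Under CPU autocast, matmul-style ops run in bfloat16, so a float32 LayerNorm downstream sees mixed dtypes; newer PyTorch releases tolerate this combination, but builds current at the time did not:

```python
import torch

x = torch.randn(2, 16, 32)        # float32 activations
ln = torch.nn.LayerNorm(32)       # float32 weights

with torch.autocast(device_type='cpu'):
    y = x @ torch.randn(32, 32)   # CPU autocast computes this matmul in bfloat16
    out = ln(y)                   # historically raised on CPU:
                                  # RuntimeError: expected scalar type BFloat16 but found Float
```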
```diff
@@ -13,8 +13,9 @@ def choose_torch_device() -> str:
 def choose_autocast_device(device):
     '''Returns an autocast compatible device from a torch device'''
     device_type = device.type # this returns 'mps' on M1
-    # autocast only supports cuda or cpu
-    if device_type in ('cuda','cpu'):
+    if device_type == 'cuda':
         return device_type,autocast
+    elif device_type == 'cpu':
+        return device_type,nullcontext
     else:
         return 'cpu',nullcontext
```
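Both branches of the rewritten helper return a (device_type, context-manager factory) pair, so call sites stay uniform: nullcontext accepts the same single positional argument that autocast does. A runnable sketch of the post-commit helper with a hypothetical call site (the call-site code is an assumption, not part of this diff):

```python
import torch
from contextlib import nullcontext
from torch import autocast

def choose_autocast_device(device):
    '''Returns an autocast compatible device from a torch device'''
    device_type = device.type
    if device_type == 'cuda':
        return device_type, autocast     # real mixed precision on CUDA
    elif device_type == 'cpu':
        return device_type, nullcontext  # autocast disabled on CPU by this commit
    else:
        return 'cpu', nullcontext        # e.g. 'mps' falls through to a no-op

# hypothetical call site: both factories take one positional argument,
# since nullcontext(x) simply yields x on __enter__
device_type, scope = choose_autocast_device(torch.device('cpu'))
with scope(device_type):
    y = torch.nn.functional.layer_norm(torch.randn(4, 8), (8,))
```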
```diff
@@ -111,7 +111,6 @@ class Generate:
         height = 512,
         sampler_name = 'k_lms',
         ddim_eta = 0.0, # deterministic
-        precision = 'autocast',
         full_precision = False,
         strength = 0.75, # default in scripts/img2img.py
         seamless = False,
```
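Since precision was accepted but never consumed, removing it only affects keyword callers that passed it explicitly. A hypothetical call site before and after (argument values are illustrative):

```python
# before: the kwarg was accepted and silently ignored
# gr = Generate(sampler_name='k_lms', precision='autocast', full_precision=False)

# after: full_precision alone selects float32 vs. float16
gr = Generate(sampler_name='k_lms', full_precision=False)
```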
```diff
@@ -129,7 +128,6 @@ class Generate:
         self.sampler_name = sampler_name
         self.grid = grid
         self.ddim_eta = ddim_eta
-        self.precision = precision
         self.full_precision = True if choose_torch_device() == 'mps' else full_precision
         self.strength = strength
         self.seamless = seamless
```
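The surviving full_precision line force-enables float32 on MPS regardless of what the caller asked for. A sketch of the kind of downstream consumer this flag typically feeds (the surrounding load logic here is an assumption, not shown in the diff):

```python
# assumed shape of the code that reads self.full_precision later on
if self.full_precision:
    model = model.to(torch.float32)   # keep fp32 weights; forced on MPS above
else:
    model = model.half()              # fp16 weights to halve memory use on CUDA
```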
```diff
@@ -209,8 +209,7 @@ class AttnBlock(nn.Module):
 
         h_ = torch.zeros_like(k, device=q.device)
 
-        device_type = 'mps' if q.device.type == 'mps' else 'cuda'
-        if device_type == 'cuda':
+        if q.device.type == 'cuda':
             stats = torch.cuda.memory_stats(q.device)
             mem_active = stats['active_bytes.all.current']
             mem_reserved = stats['reserved_bytes.all.current']
```
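These allocator statistics feed the sliced-attention heuristic in the surrounding code (not part of this hunk): an estimate of reusable memory decides how many chunks the attention product is computed in. A simplified sketch of that estimate; the exact arithmetic is an assumption based on the common memory-efficient-attention pattern:

```python
import torch

def estimate_free_cuda_bytes(device: torch.device) -> int:
    stats = torch.cuda.memory_stats(device)
    mem_active = stats['active_bytes.all.current']      # bytes in live tensors
    mem_reserved = stats['reserved_bytes.all.current']  # bytes held by the caching allocator
    mem_free_cuda, _ = torch.cuda.mem_get_info(device)  # free bytes per the driver
    # reserved-but-inactive memory can be reused, so count it as free
    return mem_free_cuda + mem_reserved - mem_active
```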
```diff
@@ -612,9 +611,8 @@ class Decoder(nn.Module):
         del h3
 
         # prepare for up sampling
-        device_type = 'mps' if h.device.type == 'mps' else 'cuda'
         gc.collect()
-        if device_type == 'cuda':
+        if h.device.type == 'cuda':
             torch.cuda.empty_cache()
 
         # upsampling
```
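Taken together, the model.py changes settle on one idiom: branch on device.type directly and touch the CUDA allocator only when actually on CUDA. As a standalone sketch:

```python
import gc
import torch

def reclaim_memory(device: torch.device) -> None:
    gc.collect()                  # release Python-side references first
    if device.type == 'cuda':
        torch.cuda.empty_cache()  # then return cached blocks to the CUDA driver
```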