~7% speedup by switching to += in ldm.modules.attention (#569)

Tested on an 8GB NVIDIA eGPU setup, so YMMV.
Re-landed with a .clone() fix; context: #508.
Author: Mihai
Date:   2022-09-15 01:10:33 +03:00 (committed via GitHub)
parent 94a7b3cc07
commit dd3fff1d3e

@@ -281,9 +281,9 @@ class BasicTransformerBlock(nn.Module):
     def _forward(self, x, context=None):
         x = x.contiguous() if x.device.type == 'mps' else x
-        x = self.attn1(self.norm1(x)) + x
-        x = self.attn2(self.norm2(x), context=context) + x
-        x = self.ff(self.norm3(x)) + x
+        x += self.attn1(self.norm1(x.clone()))
+        x += self.attn2(self.norm2(x.clone()), context=context)
+        x += self.ff(self.norm3(x.clone()))
         return x
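
The commit doesn't restate why the .clone() is needed, but a common pitfall with an in-place residual add is PyTorch autograd's version tracking: x += ... mutates the same tensor that LayerNorm saved for its backward pass. A minimal sketch of that failure mode and of the patched pattern, using hypothetical stand-in modules rather than the repo's own classes:

import torch
import torch.nn as nn

torch.manual_seed(0)
norm = nn.LayerNorm(8)   # stand-in for self.norm1
proj = nn.Linear(8, 8)   # upstream layer, so x is a non-leaf tensor

x = proj(torch.randn(2, 8))

# Patched pattern: LayerNorm saves the clone for backward, so the
# in-place += on x does not modify anything autograd recorded.
x += norm(x.clone())
x.sum().backward()       # succeeds

# Without the clone, x += norm(x) mutates the tensor LayerNorm saved,
# and backward() raises "one of the variables needed for gradient
# computation has been modified by an inplace operation".

Under torch.no_grad() nothing is saved for backward, so during pure sampling the in-place add is just the memory/speed win the commit measures; the .clone() keeps the pattern valid when gradients are enabled.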