From dcf11a01ce8a9b44ade057c0ff1d5f46418a4e7b Mon Sep 17 00:00:00 2001
From: Ryan Dick
Date: Thu, 4 Apr 2024 11:33:27 -0400
Subject: [PATCH] Add TODO comment about performance bottleneck in LoRA loading code.

---
 invokeai/backend/lora.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/invokeai/backend/lora.py b/invokeai/backend/lora.py
index e2820e392b..e1e83e0d05 100644
--- a/invokeai/backend/lora.py
+++ b/invokeai/backend/lora.py
@@ -544,6 +544,10 @@ class LoRAModelRaw(RawModel):  # (torch.nn.Module):
 
         for layer_key, values in state_dict.items():
             layer = layer_cls(layer_key, values)
+            # TODO(ryand): This .to() call causes an implicit CUDA sync point in a tight loop. This is very slow (even
+            # slower than loading the weights from disk). We should ideally only be copying the weights once - right
+            # before they are used. Or, if we want to do this here, then setting non_blocking = True would probably
+            # help.
             layer.to(device=device, dtype=dtype)
             model.layers[layer_key] = layer
         return model
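
Note (not part of the patch): below is a minimal sketch of the non_blocking approach the TODO suggests, assuming the layer weights can be treated as a flat dict of CPU tensors. The helper name load_layers_non_blocking is hypothetical and is not part of InvokeAI's API; it only illustrates queuing all host-to-device copies and synchronizing once, instead of incurring an implicit sync on every loop iteration.

    import torch


    def load_layers_non_blocking(
        state_dict: dict[str, torch.Tensor],
        device: torch.device,
        dtype: torch.dtype,
    ) -> dict[str, torch.Tensor]:
        """Sketch: copy CPU tensors to `device` with one sync total, not one per tensor."""
        out: dict[str, torch.Tensor] = {}
        for key, tensor in state_dict.items():
            # non_blocking copies are only truly asynchronous from pinned (page-locked)
            # host memory; from regular pageable memory they fall back to blocking copies.
            src = tensor.pin_memory() if device.type == "cuda" else tensor
            out[key] = src.to(device=device, dtype=dtype, non_blocking=True)
        # Wait once after all transfers have been queued, rather than syncing inside the loop.
        if device.type == "cuda":
            torch.cuda.synchronize(device)
        return out

Note that pin_memory() itself performs a host-side copy, so whether this actually wins needs measuring; the TODO's preferred fix of copying the weights only once, right before they are used, would avoid the extra copy entirely.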