Cleanup vram after models offloading (#3826)
## What type of PR is this? (check all applicable)

- [ ] Refactor
- [ ] Feature
- [ ] Bug Fix
- [x] Optimization
- [ ] Documentation Update

## Have you discussed this change with the InvokeAI team?

- [x] Yes
- [ ] No, because:

## Description

There is no VRAM cleanup when models are offloaded, which leads to VRAM filling up and slower generation speed.
Commit 2a0a765ec4
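As a standalone illustration of the problem described above (not code from this PR), the sketch below shows that moving a model off the GPU does not by itself shrink the CUDA caching allocator's footprint; only after dropping references and calling `torch.cuda.empty_cache()` is the reserved memory returned to the driver. The toy `torch.nn.Linear` model and the `report` helper are assumptions made for the example.

```python
import gc
import torch

GIG = 2**30  # same GB conversion convention as the cache code

def report(tag: str) -> None:
    # memory_allocated: bytes held by live tensors on the GPU;
    # memory_reserved: bytes the caching allocator still keeps from the driver
    # (roughly what nvidia-smi reports for the process).
    allocated = torch.cuda.memory_allocated() / GIG
    reserved = torch.cuda.memory_reserved() / GIG
    print(f"{tag}: allocated={allocated:.2f} GB, reserved={reserved:.2f} GB")

if torch.cuda.is_available():
    model = torch.nn.Linear(8192, 8192).to("cuda")  # stand-in for a cached model
    report("loaded")

    model.to("cpu")                  # offload without any cleanup
    report("after offload")          # reserved memory typically stays high here

    gc.collect()                     # drop lingering references (mirrors the PR)
    torch.cuda.empty_cache()         # return cached blocks to the driver
    report("after gc + empty_cache")
```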
@@ -328,6 +328,25 @@ class ModelCache(object):
             refs = sys.getrefcount(cache_entry.model)
 
+            # manually clear local variable references of just-finished function calls;
+            # for some reason Python doesn't want to collect them immediately, even via gc.collect()
+            if refs > 2:
+                while True:
+                    cleared = False
+                    for referrer in gc.get_referrers(cache_entry.model):
+                        if type(referrer).__name__ == "frame":
+                            # RuntimeError: cannot clear an executing frame
+                            with suppress(RuntimeError):
+                                referrer.clear()
+                                cleared = True
+                                # break
+
+                    # repeat if the referrers changed (due to frame clearing), else exit the loop
+                    if cleared:
+                        gc.collect()
+                    else:
+                        break
+
             device = cache_entry.model.device if hasattr(cache_entry.model, "device") else None
             self.logger.debug(f"Model: {model_key}, locks: {cache_entry._locks}, device: {device}, loaded: {cache_entry.loaded}, refs: {refs}")
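For context on the referrer-clearing loop above (this note and sketch are editorial, not part of the commit): a stack frame from a call that has just finished, for example one kept alive by a saved traceback, can still hold a local reference to the model, so `gc.collect()` alone will not free it. The toy `BigModel` and `hold` names below are illustrative assumptions, not InvokeAI code.

```python
import gc
import sys
from contextlib import suppress

class BigModel:
    """Stand-in for a cached model object (illustrative only)."""

def hold(model):
    local_alias = model          # frame-local reference to the model
    raise RuntimeError("boom")   # the saved traceback keeps this frame alive

model = BigModel()
try:
    hold(model)
except RuntimeError as exc:
    tb = exc.__traceback__       # holding the traceback pins hold()'s frame

print(sys.getrefcount(model))    # > 2: module global, getrefcount's argument, frame locals

for referrer in gc.get_referrers(model):
    if type(referrer).__name__ == "frame":
        # an executing frame cannot be cleared; suppress() mirrors the guard above
        with suppress(RuntimeError):
            referrer.clear()     # drop the frame's locals, releasing the model

gc.collect()
print(sys.getrefcount(model))    # back down to 2 (global name + getrefcount's argument)
```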
@@ -363,6 +382,9 @@ class ModelCache(object):
                 self.logger.debug(f'GPU VRAM freed: {(mem.vram_used/GIG):.2f} GB')
                 vram_in_use += mem.vram_used  # note vram_used is negative
                 self.logger.debug(f'{(vram_in_use/GIG):.2f}GB VRAM used for models; max allowed={(reserved/GIG):.2f}GB')
 
+        gc.collect()
+        torch.cuda.empty_cache()
+
     def _local_model_hash(self, model_path: Union[str, Path]) -> str:
         sha = hashlib.sha256()
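The `mem.vram_used` value logged in the context lines above comes from a measurement helper defined outside this hunk; a minimal sketch of such a helper, with illustrative names (`VRAMDelta` is not the project's class), might look like the following. The delta is negative when the `with` block frees memory, which is what the "note vram_used is negative" comment refers to.

```python
import torch

class VRAMDelta:
    """Records the change in allocated CUDA memory across a `with` block."""

    def __enter__(self):
        self._start = torch.cuda.memory_allocated()
        return self

    def __exit__(self, *exc):
        # Negative when the block freed memory (e.g. a model offloaded to CPU).
        self.vram_used = torch.cuda.memory_allocated() - self._start
        return False  # never swallow exceptions

# Usage sketch (assuming `model` currently lives on the GPU):
# with VRAMDelta() as mem:
#     model.to("cpu")
# print(f"GPU VRAM freed: {(-mem.vram_used)/2**30:.2f} GB")
```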