Use half precision for reduced memory usage & faster speed

This allows users with 6 GB and 8 GB cards to run 512x512, and bigger GPUs to run even larger resolutions.
I compared the output in Beyond Compare: minor differences are detected at tolerance 3, but side by side they are not perceptible.
xra 2022-08-19 17:23:43 +09:00
parent 9eaef0c5a8
commit a5fb8469ed
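
For context, the change applies the standard PyTorch half-precision pattern: convert the weights to float16 with model.half() and run forward passes under torch.autocast, with contextlib.nullcontext as a no-op fallback when autocast is disabled. A minimal, self-contained sketch of that pattern (run_half and its arguments are illustrative names, not from this repository):

    from contextlib import nullcontext
    import torch
    from torch import autocast

    def run_half(model: torch.nn.Module, x: torch.Tensor, precision: str = "autocast"):
        # Same selection as in the diff below: autocast for mixed precision,
        # nullcontext as a do-nothing stand-in.
        precision_scope = autocast if precision == "autocast" else nullcontext
        model = model.half().cuda().eval()  # fp16 weights use ~half the VRAM
        x = x.half().cuda()                 # inputs must match the weight dtype
        with precision_scope("cuda"), torch.no_grad():
            return model(x)

For example, run_half(torch.nn.Linear(8, 8), torch.randn(2, 8)) runs the layer entirely in fp16.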


@@ -256,6 +256,8 @@ class T2I:
         model = self.load_model()  # will instantiate the model or return it from cache
+        precision_scope = autocast if self.precision=="autocast" else nullcontext
+
         # grid and individual are mutually exclusive, with individual taking priority.
         # not necessary, but needed for compatability with dream bot
         if (grid is None):
@@ -279,7 +281,8 @@
             assert os.path.isfile(init_img)
             init_image = self._load_img(init_img).to(self.device)
             init_image = repeat(init_image, '1 ... -> b ...', b=batch_size)
-            init_latent = model.get_first_stage_encoding(model.encode_first_stage(init_image))  # move to latent space
+            with precision_scope("cuda"):
+                init_latent = model.get_first_stage_encoding(model.encode_first_stage(init_image))  # move to latent space

             sampler.make_schedule(ddim_num_steps=steps, ddim_eta=ddim_eta, verbose=False)
@@ -292,7 +295,6 @@
                 t_enc = int(strength * steps)
                 print(f"target t_enc is {t_enc} steps")
-            precision_scope = autocast if self.precision=="autocast" else nullcontext

             images = list()
             seeds = list()
@@ -401,6 +403,7 @@
         m, u = model.load_state_dict(sd, strict=False)
         model.cuda()
         model.eval()
+        model.half()
         return model

     def _load_img(self,path):
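
The memory claim is easy to sanity-check outside this repository: .half() converts each fp32 parameter (4 bytes) to fp16 (2 bytes), halving parameter memory. A minimal sketch, with a single Linear layer standing in for the real model:

    import torch

    layer = torch.nn.Linear(4096, 4096)
    fp32_bytes = sum(p.numel() * p.element_size() for p in layer.parameters())

    layer.half()  # the same call the diff adds when the model is loaded
    fp16_bytes = sum(p.numel() * p.element_size() for p in layer.parameters())

    print(fp32_bytes / fp16_bytes)  # ~2.0: parameters take half the memory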