resize initial image to match requested width and height, preserving aspect ratio. Closes #210. Closes #207 (#214)

2024-08-30 20:32:17 +00:00 · 2022-08-30 15:26:02 -04:00 · 2022-08-30 15:26:02 -04:00 · a51e18ea98
commit a51e18ea98
parent 8bf321f6ae
4 changed files with 71 additions and 48 deletions
--- a/TODO.txt
+++ b/TODO.txt
@ -1,35 +0,0 @@
 Feature requests:
 1. "gobig" mode - split image into strips, scale up, add detail using  - DONE!
   img2img and reassemble with feathering. Issue #66.
   See https://github.com/jquesnelle/txt2imghd
 2. Port basujindal low VRAM optimizations. Issue #62
 3. Store images under folders named after the prompt. Issue #27.
 4. Some sort of automation for generating variations. Issues #32 and #47.
 5. Support for inpainting masks #68.
 6. Support for loading variations of the stable-diffusion
   weights #49
 7. Support for klms and other non-ddim samplers in img2img() #36 - DONE!
 8. Pass a shell command to open up an image viewer on the last
   batch of images generated #29.
 9. Change sampler and outdir after initialization #115
 Code Refactorization:
 1. Move the PNG file generation code out of simplet2i and into - DONE!
   separate module. txt2img() and img2img() should return Image
   objects, and parent code is responsible for filenaming logic.
 2. Refactor redundant code that is shared between txt2img() and - DONE!
   img2img().
 3. Experiment with replacing CompVis code with HuggingFace. - NOT WORTH IT!
--- a/ldm/dream/image_util.py
+++ b/ldm/dream/image_util.py
@ -0,0 +1,54 @@
 from PIL import Image
 class InitImageResizer():
     """Create resized, letterboxed copies of an image while preserving its aspect ratio."""
     def __init__(self,Image):
         # NOTE: the parameter name shadows the PIL ``Image`` module inside
         # this method only. It is kept unchanged so that callers passing
         # the image by keyword continue to work.
         self.image = Image
     @staticmethod
     def _fit_dimensions(src_width, src_height, width=None, height=None):
         """
         Compute destination and resize dimensions for a source image.
         Returns a tuple (width, height, rw, rh) where width x height is
         the size of the destination canvas and rw x rh is the size the
         source image is scaled to so that it fits inside the canvas.
         At least one of width and height must be provided; a missing
         value is derived from the source aspect ratio. All four values
         are floored to a multiple of 64, with a minimum of 64.
         """
         # Integer arithmetic is used throughout so that float rounding
         # cannot push a value just below a multiple of 64.
         if not height:
             # height missing: derive it; the image fills the canvas
             width = int(width)
             height = width * src_height // src_width
             rw, rh = width, height
         elif not width:
             # width missing: derive it; the image fills the canvas
             height = int(height)
             width = height * src_width // src_height
             rw, rh = width, height
         else:
             width, height = int(width), int(height)
             # Cross-multiplied comparison of the two aspect ratios
             # (avoids any division): is the canvas wider than the image?
             if width * src_height > height * src_width:
                 # canvas is relatively wider: height is the limiting side
                 rw, rh = height * src_width // src_height, height
             else:
                 # canvas is relatively taller (or equal): width limits
                 rw, rh = width, width * src_height // src_width
         # Floor to multiples of 64, but never down to zero: a 0-pixel
         # dimension cannot be resized or passed on.
         return tuple(max(64, x - x % 64) for x in (width, height, rw, rh))
     def resize(self,width=None,height=None) -> "Image.Image":
         """
         Return a copy of the image resized to width x height.
         The aspect ratio is maintained, with any excess space
         filled using black borders (i.e. letterboxed). If
         neither width nor height are provided, then returns
         a copy of the original image. If one or the other is
         provided, then the other will be calculated from the
         aspect ratio.
         Everything is floored to the nearest multiple of 64
         (with a minimum of 64) so that it can be passed to img2img()
         """
         im = self.image
         if not (width or height):
             return im.copy()
         # rw and rh are the resizing width and height for the image;
         # they maintain the aspect ratio, but may not completely fill up
         # the requested destination size
         width, height, rw, rh = self._fit_dimensions(
             im.width, im.height, width, height
         )
         # resize the original image so that it fits inside the dest
         resized_image = im.resize((rw, rh), resample=Image.Resampling.LANCZOS)
         # create new destination image of specified dimensions
         # and paste the resized image into it centered appropriately
         new_image = Image.new('RGB', (width, height))
         new_image.paste(resized_image, ((width - rw) // 2, (height - rh) // 2))
         return new_image
--- a/ldm/dream/readline.py
+++ b/ldm/dream/readline.py
@ -23,7 +23,7 @@ class Completer:
        buffer = readline.get_line_buffer()
        if text.startswith(('-I', '--init_img')):
-            return self._path_completions(text, state, ('.png'))
+            return self._path_completions(text, state, ('.png','.jpg','.jpeg'))
        if buffer.strip().endswith('cd') or text.startswith(('.', '/')):
            return self._path_completions(text, state, ())
--- a/ldm/simplet2i.py
+++ b/ldm/simplet2i.py
@ -27,6 +27,7 @@ from ldm.models.diffusion.ddim import DDIMSampler
 from ldm.models.diffusion.plms import PLMSSampler
 from ldm.models.diffusion.ksampler import KSampler
 from ldm.dream.pngwriter import PngWriter
 from ldm.dream.image_util import InitImageResizer
 """Simplified text to image API for stable diffusion/latent diffusion
@ -204,7 +205,6 @@ class T2I:
        skip_normalize=False,
        image_callback=None,
        step_callback=None,
        # these are specific to txt2img
        width=None,
        height=None,
        # these are specific to img2img
@ -270,8 +270,10 @@ class T2I:
        assert (
            0.0 <= strength <= 1.0
        ), 'can only work with strength in [0.0, 1.0]'
-        w = int(width / 64) * 64
+        w, h = map(
-        h = int(height / 64) * 64
+            lambda x: x - x % 64, (width, height)
        )  # resize to integer multiple of 64
        if h != height or w != width:
            print(
                f'Height and width must be multiples of 64. Resizing to {h}x{w}.'
@ -301,6 +303,8 @@ class T2I:
                    ddim_eta=ddim_eta,
                    skip_normalize=skip_normalize,
                    init_img=init_img,
                    width=width,
                    height=height,
                    strength=strength,
                    callback=step_callback,
                )
@ -441,6 +445,8 @@ class T2I:
        ddim_eta,
        skip_normalize,
        init_img,
        width,
        height,
        strength,
        callback, # Currently not implemented for img2img
    ):
@ -457,7 +463,7 @@ class T2I:
        else:
            sampler = self.sampler
-        init_image = self._load_img(init_img).to(self.device)
+        init_image = self._load_img(init_img,width,height).to(self.device)
        init_image = repeat(init_image, '1 ... -> b ...', b=batch_size)
        with precision_scope(self.device.type):
            init_latent = self.model.get_first_stage_encoding(
@ -616,17 +622,15 @@ class T2I:
            model.half()
        return model
-    def _load_img(self, path):
+    def _load_img(self, path, width, height):
        print(f'image path = {path}, cwd = {os.getcwd()}')
        with Image.open(path) as img:
            image = img.convert('RGB')
        print(f'loaded input image of size {image.width}x{image.height} from {path}')
        image = InitImageResizer(image).resize(width,height)
        print(f'resized input image to size {image.width}x{image.height}')
        w, h = image.size
        print(f'loaded input image of size ({w}, {h}) from {path}')
        w, h = map(
            lambda x: x - x % 32, (w, h)
        )  # resize to integer multiple of 32
        image = image.resize((w, h), resample=Image.Resampling.LANCZOS)
        image = np.array(image).astype(np.float32) / 255.0
        image = image[None].transpose(0, 3, 1, 2)
        image = torch.from_numpy(image)