InvokeAI/ldm/invoke/txt2mask.py

'''Makes available the Txt2Mask class, which assists in the automatic
assignment of masks via text prompt using clipseg.

Here is typical usage:

    from ldm.invoke.txt2mask import Txt2Mask, SegmentedGrayscale
    from PIL import Image

    txt2mask = Txt2Mask(self.device)
    segmented = txt2mask.segment(Image.open('/path/to/img.png'),'a bagel')

    # this will return a grayscale Image of the segmented data
    grayscale = segmented.to_grayscale()

    # this will return a semi-transparent image in which the
    # selected object(s) are transparent and the rest is at various
    # levels of opacity
    transparent = segmented.to_transparent()

    # this will return a masked image suitable for use in inpainting:
    mask = segmented.to_mask(threshold=0.5)

The threshold used in the call to to_mask() is a confidence cutoff
ranging from 0.0 to 1.0. Pixels whose confidence is at or above the
threshold are selected and come out black in the mask; all other
pixels come out white. The higher the threshold, the more confident
the algorithm must be before it selects a pixel. In limited testing,
I have found that values around 0.5 work fine.
'''

import torch
import numpy as  np
from transformers import AutoProcessor, CLIPSegForImageSegmentation
from PIL import Image, ImageOps
from torchvision import transforms
from ldm.invoke.globals import global_cache_dir

# Identifier of the pretrained clipseg checkpoint; Txt2Mask passes it to
# from_pretrained() for both the processor and the segmentation model.
CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'
# Side length, in pixels, of the square canvas that
# Txt2Mask._scale_and_crop() builds before handing the image to the model.
CLIPSEG_SIZE = 352

class SegmentedGrayscale(object):
    '''
    Pairs a source image with the heatmap computed for it and renders
    that heatmap as PIL images cut to the source image's dimensions.
    '''
    def __init__(self, image:Image, heatmap:torch.Tensor):
        self.image = image
        self.heatmap = heatmap

    def to_grayscale(self,invert:bool=False)->Image:
        '''
        Render the heatmap as an image with values scaled to 0..255.
        With invert=True the values are flipped (255 - value) before
        the image is built.
        '''
        levels = self.heatmap * 255
        if invert:
            levels = 255 - levels
        return self._rescale(Image.fromarray(np.uint8(levels)))

    def to_mask(self,threshold:float=0.5)->Image:
        '''
        Render a two-level mask: heatmap values below the threshold
        become 255, values at or above it become 0.
        '''
        below_cutoff = self.heatmap.lt(threshold).int()
        pixels = np.uint8(below_cutoff*255)
        return self._rescale(Image.fromarray(pixels,mode='L'))

    def to_transparent(self,invert:bool=False)->Image:
        '''
        Return a copy of the source image whose alpha channel is taken
        from the heatmap.
        '''
        # to_grayscale() is bright where the heatmap is high, but for
        # img2img the selected regions have to be the transparent ones,
        # so the flag is flipped before asking for the alpha channel.
        alpha = self.to_grayscale(not invert)
        result = self.image.copy()
        result.putalpha(alpha)
        return result

    def _rescale(self, heatmap:Image)->Image:
        '''
        Undo the scale-and-crop applied before inference: blow the
        352x352 heatmap up to a square as large as the source image's
        longer side, then cut it down to the source image's dimensions.
        '''
        width, height = self.image.width, self.image.height
        side = max(width, height)
        square = heatmap.resize((side, side), resample=Image.Resampling.LANCZOS)
        return square.crop((0, 0, width, height))

class Txt2Mask(object):
    '''
    Create new Txt2Mask object.

    The optional device argument ('cuda', 'mps' or 'cpu') is stored on the
    instance as self.device but is not otherwise used at this time: no
    device placement is performed on the model or its inputs. The refined
    argument is accepted for backward compatibility and is ignored.
    '''
    def __init__(self,device='cpu',refined=False):
        print('>> Initializing clipseg model for text to mask inference')

        # BUG: we are not doing anything with the device option at this time
        self.device = device
        self.processor = AutoProcessor.from_pretrained(CLIPSEG_MODEL,
                                                       cache_dir=global_cache_dir('hub')
                                                       )
        self.model = CLIPSegForImageSegmentation.from_pretrained(CLIPSEG_MODEL,
                                                                 cache_dir=global_cache_dir('hub')
                                                                 )

    @torch.no_grad()
    def segment(self, image, prompt:str) -> SegmentedGrayscale:
        '''
        Given a prompt string such as "a bagel", tries to identify the object in the
        provided image and returns a SegmentedGrayscale object in which the brighter
        pixels indicate where the object is inferred to be.

        image may be a PIL image or a string path; a path is opened and
        converted to RGB. The EXIF orientation is applied before inference,
        and the SegmentedGrayscale that is returned wraps that re-oriented
        image.
        '''
        if isinstance(image, str):
            image = Image.open(image).convert('RGB')

        image = ImageOps.exif_transpose(image)
        img = self._scale_and_crop(image)

        inputs = self.processor(text=[prompt],
                                images=[img],
                                padding=True,
                                return_tensors='pt')
        outputs = self.model(**inputs)
        # The transformers docs give the logits shape as
        # (batch_size, height, width), while SegmentedGrayscale passes the
        # heatmap straight to Image.fromarray(). Drop a singleton batch axis
        # if one is present; squeeze(0) leaves an already 2-D tensor alone.
        # NOTE(review): confirm against the installed transformers version.
        heatmap = torch.sigmoid(outputs.logits).squeeze(0)
        return SegmentedGrayscale(image, heatmap)

    def _scale_and_crop(self, image:Image.Image)->Image.Image:
        '''
        Fit the image onto a black CLIPSEG_SIZE x CLIPSEG_SIZE RGB canvas.

        The image is scaled so that its longer side spans the canvas, with
        the aspect ratio preserved, and is pasted at the top-left corner.
        Whatever part of the canvas the image does not cover stays black.
        '''
        scale = CLIPSEG_SIZE / max(image.width, image.height)
        # round() rather than int(): float error can leave the longer side
        # one pixel short of CLIPSEG_SIZE when truncating. Each side is also
        # clamped to 1..CLIPSEG_SIZE so that a very elongated image never
        # asks resize() for a zero-pixel side.
        target_size = (
            min(CLIPSEG_SIZE, max(1, round(scale * image.width))),
            min(CLIPSEG_SIZE, max(1, round(scale * image.height))),
        )
        scaled_image = Image.new('RGB',(CLIPSEG_SIZE,CLIPSEG_SIZE))
        scaled_image.paste(
            image.resize(target_size, resample=Image.Resampling.LANCZOS),
            box=(0,0)
        )
        return scaled_image
clipseg library and environment in place 2022-10-16 20:45:07 +00:00			`'''Makes available the Txt2Mask class, which assists in the automatic`
			`assignment of masks via text prompt using clipseg.`

			`Here is typical usage:`
Global replace [ \t]+$, add "GB" (#1751) * "GB" * Replace [ \t]+$ global Co-authored-by: Lincoln Stein <lincoln.stein@gmail.com> 2022-12-19 16:36:39 +00:00
clipseg library and environment in place 2022-10-16 20:45:07 +00:00			`from ldm.invoke.txt2mask import Txt2Mask, SegmentedGrayscale`
			`from PIL import Image`

			`txt2mask = Txt2Mask(self.device)`
			`segmented = txt2mask.segment(Image.open('/path/to/img.png'),'a bagel')`
Global replace [ \t]+$, add "GB" (#1751) * "GB" * Replace [ \t]+$ global Co-authored-by: Lincoln Stein <lincoln.stein@gmail.com> 2022-12-19 16:36:39 +00:00
clipseg library and environment in place 2022-10-16 20:45:07 +00:00			`# this will return a grayscale Image of the segmented data`
			`grayscale = segmented.to_grayscale()`

			`# this will return a semi-transparent image in which the`
			`# selected object(s) are opaque and the rest is at various`
			`# levels of transparency`
			`transparent = segmented.to_transparent()`

			`# this will return a masked image suitable for use in inpainting:`
			`mask = segmented.to_mask(threshold=0.5)`

			`The threshold used in the call to to_mask() selects pixels for use in`
			`the mask that exceed the indicated confidence threshold. Values range`
			`from 0.0 to 1.0. The higher the threshold, the more confident the`
			`algorithm is. In limited testing, I have found that values around 0.5`
			`work fine.`
			`'''`

			`import torch`
			`import numpy as np`
remove dependency on original clipseg library - This replaces the original clipseg library with the transformers version from HuggingFace. - This should make it possible to register InvokeAI at PyPi and do a fully automated pip-based install. - Minor regression: it is no longer possible to specify which device the clipseg model will be loaded into, and it will reside in CPU. However, performance is more than acceptable. 2023-01-26 14:35:16 +00:00			`from transformers import AutoProcessor, CLIPSegForImageSegmentation`
add !mask command to view output of clipseg - The !mask command takes an image path, a text prompt, and (optionally) a masking threshold. It creates a mask over the region indicated by the prompt, and outputs several files that show which regions will be masked by the chosen prompt and threshold. - The mask images should not be passed directly to img2img because they are designed for visualization only. Instead, use the --text_mask option to pass the selected prompt and threshold. - See docs/features/INPAINTING.md for details. 2022-10-20 06:33:07 +00:00			`from PIL import Image, ImageOps`
clipseg library and environment in place 2022-10-16 20:45:07 +00:00			`from torchvision import transforms`
remove dependency on original clipseg library - This replaces the original clipseg library with the transformers version from HuggingFace. - This should make it possible to register InvokeAI at PyPi and do a fully automated pip-based install. - Minor regression: it is no longer possible to specify which device the clipseg model will be loaded into, and it will reside in CPU. However, performance is more than acceptable. 2023-01-26 14:35:16 +00:00			`from ldm.invoke.globals import global_cache_dir`
clipseg library and environment in place 2022-10-16 20:45:07 +00:00
remove dependency on original clipseg library - This replaces the original clipseg library with the transformers version from HuggingFace. - This should make it possible to register InvokeAI at PyPi and do a fully automated pip-based install. - Minor regression: it is no longer possible to specify which device the clipseg model will be loaded into, and it will reside in CPU. However, performance is more than acceptable. 2023-01-26 14:35:16 +00:00			`CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'`
add clipseg support for creating inpaint masks from text On the command line, the new option is --text_mask or -tm. Example: ``` invoke> a baseball -I /path/to/still_life.png -tm orange ``` This will find the orange fruit in the still life painting and replace it with an image of a baseball. 2022-10-17 03:30:24 +00:00			`CLIPSEG_SIZE = 352`
clipseg library and environment in place 2022-10-16 20:45:07 +00:00
			`class SegmentedGrayscale(object):`
			`def __init__(self, image:Image, heatmap:torch.Tensor):`
			`self.heatmap = heatmap`
			`self.image = image`
Global replace [ \t]+$, add "GB" (#1751) * "GB" * Replace [ \t]+$ global Co-authored-by: Lincoln Stein <lincoln.stein@gmail.com> 2022-12-19 16:36:39 +00:00
Option to directly invert the grayscale heatmap Theoretically less work inverting the image while it's small but I can't measure a significant difference. Though, handy option to have in some cases. 2022-11-01 20:11:19 +00:00			`def to_grayscale(self,invert:bool=False)->Image:`
Option to directly invert the grayscale heatmap - fix 2022-11-01 23:21:27 +00:00			`return self._rescale(Image.fromarray(np.uint8(255 - self.heatmap * 255 if invert else self.heatmap * 255)))`
clipseg library and environment in place 2022-10-16 20:45:07 +00:00
			`def to_mask(self,threshold:float=0.5)->Image:`
			`discrete_heatmap = self.heatmap.lt(threshold).int()`
add clipseg support for creating inpaint masks from text On the command line, the new option is --text_mask or -tm. Example: ``` invoke> a baseball -I /path/to/still_life.png -tm orange ``` This will find the orange fruit in the still life painting and replace it with an image of a baseball. 2022-10-17 03:30:24 +00:00			`return self._rescale(Image.fromarray(np.uint8(discrete_heatmap*255),mode='L'))`
clipseg library and environment in place 2022-10-16 20:45:07 +00:00
add !mask command to view output of clipseg - The !mask command takes an image path, a text prompt, and (optionally) a masking threshold. It creates a mask over the region indicated by the prompt, and outputs several files that show which regions will be masked by the chosen prompt and threshold. - The mask images should not be passed directly to img2img because they are designed for visualization only. Instead, use the --text_mask option to pass the selected prompt and threshold. - See docs/features/INPAINTING.md for details. 2022-10-20 06:33:07 +00:00			`def to_transparent(self,invert:bool=False)->Image:`
clipseg library and environment in place 2022-10-16 20:45:07 +00:00			`transparent_image = self.image.copy()`
add !mask command to view output of clipseg - The !mask command takes an image path, a text prompt, and (optionally) a masking threshold. It creates a mask over the region indicated by the prompt, and outputs several files that show which regions will be masked by the chosen prompt and threshold. - The mask images should not be passed directly to img2img because they are designed for visualization only. Instead, use the --text_mask option to pass the selected prompt and threshold. - See docs/features/INPAINTING.md for details. 2022-10-20 06:33:07 +00:00			`# For img2img, we want the selected regions to be transparent,`
Option to directly invert the grayscale heatmap Theoretically less work inverting the image while it's small but I can't measure a significant difference. Though, handy option to have in some cases. 2022-11-01 20:11:19 +00:00			`# but to_grayscale() returns the opposite. Thus invert.`
Update txt2mask.py 2022-11-01 20:37:33 +00:00			`gs = self.to_grayscale(not invert)`
add !mask command to view output of clipseg - The !mask command takes an image path, a text prompt, and (optionally) a masking threshold. It creates a mask over the region indicated by the prompt, and outputs several files that show which regions will be masked by the chosen prompt and threshold. - The mask images should not be passed directly to img2img because they are designed for visualization only. Instead, use the --text_mask option to pass the selected prompt and threshold. - See docs/features/INPAINTING.md for details. 2022-10-20 06:33:07 +00:00			`transparent_image.putalpha(gs)`
clipseg library and environment in place 2022-10-16 20:45:07 +00:00			`return transparent_image`

add clipseg support for creating inpaint masks from text On the command line, the new option is --text_mask or -tm. Example: ``` invoke> a baseball -I /path/to/still_life.png -tm orange ``` This will find the orange fruit in the still life painting and replace it with an image of a baseball. 2022-10-17 03:30:24 +00:00			`# unscales and uncrops the 352x352 heatmap so that it matches the image again`
			`def _rescale(self, heatmap:Image)->Image:`
			`size = self.image.width if (self.image.width > self.image.height) else self.image.height`
			`resized_image = heatmap.resize(`
			`(size,size),`
			`resample=Image.Resampling.LANCZOS`
			`)`
			`return resized_image.crop((0,0,self.image.width,self.image.height))`

clipseg library and environment in place 2022-10-16 20:45:07 +00:00			`class Txt2Mask(object):`
			`'''`
			`Create new Txt2Mask object. The optional device argument can be one of`
			`'cuda', 'mps' or 'cpu'.`
			`'''`
Optional refined model for Txt2Mask Don't merge right now, just wanted to show the necessary changes 2022-11-01 23:33:46 +00:00			`def __init__(self,device='cpu',refined=False):`
add clipseg support for creating inpaint masks from text On the command line, the new option is --text_mask or -tm. Example: ``` invoke> a baseball -I /path/to/still_life.png -tm orange ``` This will find the orange fruit in the still life painting and replace it with an image of a baseball. 2022-10-17 03:30:24 +00:00			`print('>> Initializing clipseg model for text to mask inference')`
remove dependency on original clipseg library - This replaces the original clipseg library with the transformers version from HuggingFace. - This should make it possible to register InvokeAI at PyPi and do a fully automated pip-based install. - Minor regression: it is no longer possible to specify which device the clipseg model will be loaded into, and it will reside in CPU. However, performance is more than acceptable. 2023-01-26 14:35:16 +00:00
			`# BUG: we are not doing anything with the device option at this time`
add clipseg support for creating inpaint masks from text On the command line, the new option is --text_mask or -tm. Example: ``` invoke> a baseball -I /path/to/still_life.png -tm orange ``` This will find the orange fruit in the still life painting and replace it with an image of a baseball. 2022-10-17 03:30:24 +00:00			`self.device = device`
remove dependency on original clipseg library - This replaces the original clipseg library with the transformers version from HuggingFace. - This should make it possible to register InvokeAI at PyPi and do a fully automated pip-based install. - Minor regression: it is no longer possible to specify which device the clipseg model will be loaded into, and it will reside in CPU. However, performance is more than acceptable. 2023-01-26 14:35:16 +00:00			`self.processor = AutoProcessor.from_pretrained(CLIPSEG_MODEL,`
			`cache_dir=global_cache_dir('hub')`
			`)`
			`self.model = CLIPSegForImageSegmentation.from_pretrained(CLIPSEG_MODEL,`
			`cache_dir=global_cache_dir('hub')`
			`)`
clipseg library and environment in place 2022-10-16 20:45:07 +00:00
			`@torch.no_grad()`
add !mask command to view output of clipseg - The !mask command takes an image path, a text prompt, and (optionally) a masking threshold. It creates a mask over the region indicated by the prompt, and outputs several files that show which regions will be masked by the chosen prompt and threshold. - The mask images should not be passed directly to img2img because they are designed for visualization only. Instead, use the --text_mask option to pass the selected prompt and threshold. - See docs/features/INPAINTING.md for details. 2022-10-20 06:33:07 +00:00			`def segment(self, image, prompt:str) -> SegmentedGrayscale:`
clipseg library and environment in place 2022-10-16 20:45:07 +00:00			`'''`
			`Given a prompt string such as "a bagel", tries to identify the object in the`
			`provided image and returns a SegmentedGrayscale object in which the brighter`
			`pixels indicate where the object is inferred to be.`
			`'''`
			`transform = transforms.Compose([`
			`transforms.ToTensor(),`
			`transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),`
add clipseg support for creating inpaint masks from text On the command line, the new option is --text_mask or -tm. Example: ``` invoke> a baseball -I /path/to/still_life.png -tm orange ``` This will find the orange fruit in the still life painting and replace it with an image of a baseball. 2022-10-17 03:30:24 +00:00			`transforms.Resize((CLIPSEG_SIZE, CLIPSEG_SIZE)), # must be multiple of 64...`
clipseg library and environment in place 2022-10-16 20:45:07 +00:00			`])`
add clipseg support for creating inpaint masks from text On the command line, the new option is --text_mask or -tm. Example: ``` invoke> a baseball -I /path/to/still_life.png -tm orange ``` This will find the orange fruit in the still life painting and replace it with an image of a baseball. 2022-10-17 03:30:24 +00:00
add !mask command to view output of clipseg - The !mask command takes an image path, a text prompt, and (optionally) a masking threshold. It creates a mask over the region indicated by the prompt, and outputs several files that show which regions will be masked by the chosen prompt and threshold. - The mask images should not be passed directly to img2img because they are designed for visualization only. Instead, use the --text_mask option to pass the selected prompt and threshold. - See docs/features/INPAINTING.md for details. 2022-10-20 06:33:07 +00:00			`if type(image) is str:`
			`image = Image.open(image).convert('RGB')`

			`image = ImageOps.exif_transpose(image)`
add clipseg support for creating inpaint masks from text On the command line, the new option is --text_mask or -tm. Example: ``` invoke> a baseball -I /path/to/still_life.png -tm orange ``` This will find the orange fruit in the still life painting and replace it with an image of a baseball. 2022-10-17 03:30:24 +00:00			`img = self._scale_and_crop(image)`

remove dependency on original clipseg library - This replaces the original clipseg library with the transformers version from HuggingFace. - This should make it possible to register InvokeAI at PyPi and do a fully automated pip-based install. - Minor regression: it is no longer possible to specify which device the clipseg model will be loaded into, and it will reside in CPU. However, performance is more than acceptable. 2023-01-26 14:35:16 +00:00			`inputs = self.processor(text=[prompt],`
registration of mask images was off due to typo - Problem found and fixed by @spezialspezial - Closes #2470 2023-02-03 22:32:35 +00:00			`images=[img],`
remove dependency on original clipseg library - This replaces the original clipseg library with the transformers version from HuggingFace. - This should make it possible to register InvokeAI at PyPi and do a fully automated pip-based install. - Minor regression: it is no longer possible to specify which device the clipseg model will be loaded into, and it will reside in CPU. However, performance is more than acceptable. 2023-01-26 14:35:16 +00:00			`padding=True,`
			`return_tensors='pt')`
			`outputs = self.model(**inputs)`
simplified code a bit 2023-01-26 14:46:34 +00:00			`heatmap = torch.sigmoid(outputs.logits)`
clipseg library and environment in place 2022-10-16 20:45:07 +00:00			`return SegmentedGrayscale(image, heatmap)`

add clipseg support for creating inpaint masks from text On the command line, the new option is --text_mask or -tm. Example: ``` invoke> a baseball -I /path/to/still_life.png -tm orange ``` This will find the orange fruit in the still life painting and replace it with an image of a baseball. 2022-10-17 03:30:24 +00:00			`def _scale_and_crop(self, image:Image)->Image:`
			`scaled_image = Image.new('RGB',(CLIPSEG_SIZE,CLIPSEG_SIZE))`
			`if image.width > image.height: # width is constraint`
			`scale = CLIPSEG_SIZE / image.width`
			`else:`
			`scale = CLIPSEG_SIZE / image.height`
			`scaled_image.paste(`
			`image.resize(`
			`(int(scale * image.width),`
			`int(scale * image.height)`
			`),`
			`resample=Image.Resampling.LANCZOS`
			`),box=(0,0)`
			`)`
			`return scaled_image`