"""Makes available the Txt2Mask class, which assists in the automatic
|
|
assignment of masks via text prompt using clipseg.
|
|
|
|
Here is typical usage:
|
|
|
|
from invokeai.backend.image_util.txt2mask import Txt2Mask, SegmentedGrayscale
|
|
from PIL import Image
|
|
|
|
txt2mask = Txt2Mask(self.device)
|
|
segmented = txt2mask.segment(Image.open('/path/to/img.png'),'a bagel')
|
|
|
|
# this will return a grayscale Image of the segmented data
|
|
grayscale = segmented.to_grayscale()
|
|
|
|
# this will return a semi-transparent image in which the
|
|
# selected object(s) are opaque and the rest is at various
|
|
# levels of transparency
|
|
transparent = segmented.to_transparent()
|
|
|
|
# this will return a masked image suitable for use in inpainting:
|
|
mask = segmented.to_mask(threshold=0.5)
|
|
|
|
The threshold used in the call to to_mask() selects pixels for use in
|
|
the mask that exceed the indicated confidence threshold. Values range
|
|
from 0.0 to 1.0. The higher the threshold, the more confident the
|
|
algorithm is. In limited testing, I have found that values around 0.5
|
|
work fine.
|
|
"""
|
|
|
|
import numpy as np
import torch
from PIL import Image, ImageOps
from transformers import AutoProcessor, CLIPSegForImageSegmentation

import invokeai.backend.util.logging as logger
from invokeai.app.services.config import get_invokeai_config

CLIPSEG_MODEL = "CIDAS/clipseg-rd64-refined"
CLIPSEG_SIZE = 352


class SegmentedGrayscale(object):
    def __init__(self, image: Image, heatmap: torch.Tensor):
        self.heatmap = heatmap
        self.image = image

    def to_grayscale(self, invert: bool = False) -> Image:
        return self._rescale(
            Image.fromarray(
                np.uint8(255 - self.heatmap * 255 if invert else self.heatmap * 255)
            )
        )

    def to_mask(self, threshold: float = 0.5) -> Image:
        # pixels whose confidence falls below the threshold become white (255),
        # the remainder black (0)
        discrete_heatmap = self.heatmap.lt(threshold).int()
        return self._rescale(
            Image.fromarray(np.uint8(discrete_heatmap * 255), mode="L")
        )

    def to_transparent(self, invert: bool = False) -> Image:
        transparent_image = self.image.copy()
        # For img2img, we want the selected regions to be transparent,
        # but to_grayscale() returns the opposite. Thus invert.
        gs = self.to_grayscale(not invert)
        transparent_image.putalpha(gs)
        return transparent_image

    # unscales and uncrops the 352x352 heatmap so that it matches the image again
    def _rescale(self, heatmap: Image) -> Image:
        size = (
            self.image.width
            if (self.image.width > self.image.height)
            else self.image.height
        )
        resized_image = heatmap.resize((size, size), resample=Image.Resampling.LANCZOS)
        return resized_image.crop((0, 0, self.image.width, self.image.height))


class Txt2Mask(object):
    """
    Create new Txt2Mask object. The optional device argument can be one of
    'cuda', 'mps' or 'cpu'.
    """

    def __init__(self, device="cpu", refined=False):
        logger.info("Initializing clipseg model for text to mask inference")
        config = get_invokeai_config()

        # BUG: we are not doing anything with the device option at this time
        self.device = device
        self.processor = AutoProcessor.from_pretrained(
            CLIPSEG_MODEL, cache_dir=config.cache_dir
        )
        self.model = CLIPSegForImageSegmentation.from_pretrained(
            CLIPSEG_MODEL, cache_dir=config.cache_dir
        )

    @torch.no_grad()
    def segment(self, image, prompt: str) -> SegmentedGrayscale:
        """
        Given a prompt string such as "a bagel", tries to identify the object in the
        provided image and returns a SegmentedGrayscale object in which the brighter
        pixels indicate where the object is inferred to be.
        """
        if type(image) is str:
            image = Image.open(image).convert("RGB")

        image = ImageOps.exif_transpose(image)
        img = self._scale_and_crop(image)

        inputs = self.processor(
            text=[prompt], images=[img], padding=True, return_tensors="pt"
        )
        outputs = self.model(**inputs)
        heatmap = torch.sigmoid(outputs.logits)
        return SegmentedGrayscale(image, heatmap)

    def _scale_and_crop(self, image: Image) -> Image:
        scaled_image = Image.new("RGB", (CLIPSEG_SIZE, CLIPSEG_SIZE))
        if image.width > image.height:  # width is the constraining dimension
            scale = CLIPSEG_SIZE / image.width
        else:
            scale = CLIPSEG_SIZE / image.height
        scaled_image.paste(
            image.resize(
                (int(scale * image.width), int(scale * image.height)),
                resample=Image.Resampling.LANCZOS,
            ),
            box=(0, 0),
        )
        return scaled_image
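

# A minimal usage sketch, not part of the original module: it assumes a local
# image at 'test.png' and a working InvokeAI configuration so that the clipseg
# weights can be downloaded to the configured cache directory.
if __name__ == "__main__":
    txt2mask = Txt2Mask(device="cpu")                    # device is currently ignored (see BUG above)
    segmented = txt2mask.segment("test.png", "a bagel")  # hypothetical input image path
    segmented.to_grayscale().save("heatmap.png")         # confidence map of the prompt match
    segmented.to_mask(threshold=0.5).save("mask.png")    # thresholded mask for inpainting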