Mirror of https://github.com/invoke-ai/InvokeAI, synced 2024-08-30 20:32:17 +00:00

Commit 9d19213b8a: Merge branch 'development' of github.com:lstein/stable-diffusion into asymmetric-tiling
docs/assets/still-life-inpainted.png: BIN, new file, 338 KiB (binary file not shown)
docs/assets/still-life-scaled.jpg: BIN, new file, 59 KiB (binary file not shown)
@@ -85,6 +85,7 @@ overridden on a per-prompt basis (see [List of prompt arguments](#list-of-prompt
 | `--from_file <path>` | | `None` | Read list of prompts from a file. Use `-` to read from standard input |
 | `--model <modelname>` | | `stable-diffusion-1.4` | Loads model specified in configs/models.yaml. Currently one of "stable-diffusion-1.4" or "laion400m" |
 | `--full_precision` | `-F` | `False` | Run in slower full-precision mode. Needed for Macintosh M1/M2 hardware and some older video cards. |
+| `--png_compression <0-9>` | `-z<0-9>` | 6 | Select level of compression for output files, from 0 (no compression) to 9 (max compression) |
 | `--web` | | `False` | Start in web server mode |
 | `--host <ip addr>` | | `localhost` | Which network interface web server should listen on. Set to 0.0.0.0 to listen on any. |
 | `--port <port>` | | `9090` | Which port web server should listen for requests on. |
@@ -153,6 +154,7 @@ Here are the invoke> command that apply to txt2img:
 | --seed <int> | -S<int> | None | Set the random seed for the next series of images. This can be used to recreate an image generated previously.|
 | --sampler <sampler>| -A<sampler>| k_lms | Sampler to use. Use -h to get list of available samplers. |
 | --hires_fix | | | Larger images often have duplication artefacts. This option suppresses duplicates by generating the image at low res, and then using img2img to increase the resolution |
+| --png_compression <0-9> | -z<0-9> | 6 | Select level of compression for output files, from 0 (no compression) to 9 (max compression) |
 | --grid | -g | False | Turn on grid mode to return a single image combining all the images generated by this prompt |
 | --individual | -i | True | Turn off grid mode (deprecated; leave off --grid instead) |
 | --outdir <path> | -o<path> | outputs/img_samples | Temporarily change the location of these images |
@@ -211,11 +213,35 @@ accepts additional options:
 [Inpainting](./INPAINTING.md) for details.
 
 inpainting accepts all the arguments used for txt2img and img2img, as
-well as the --mask (-M) argument:
+well as the --mask (-M) and --text_mask (-tm) arguments:
 
 | Argument <img width="100" align="right"/> | Shortcut | Default | Description |
 |--------------------|------------|---------------------|--------------|
 | `--init_mask <path>` | `-M<path>` | `None` |Path to an image the same size as the initial_image, with areas for inpainting made transparent.|
+| `--text_mask <prompt> [<float>]` | `-tm <prompt> [<float>]` | <none> | Create a mask from a text prompt describing part of the image|
+
+`--text_mask` (short form `-tm`) is a way to generate a mask using a
+text description of the part of the image to replace. For example, if
+you have an image of a breakfast plate with a bagel, toast and
+scrambled eggs, you can selectively mask the bagel and replace it with
+a piece of cake this way:
+
+~~~
+invoke> a piece of cake -I /path/to/breakfast.png -tm bagel
+~~~
+
+The algorithm uses <a
+href="https://github.com/timojl/clipseg">clipseg</a> to classify
+different regions of the image. The classifier puts out a confidence
+score for each region it identifies. Generally regions that score
+above 0.5 are reliable, but if you are getting too much or too little
+masking you can adjust the threshold down (to get more mask), or up
+(to get less). In this example, by passing `-tm` a higher value, we
+are insisting on a more stringent classification.
+
+~~~
+invoke> a piece of cake -I /path/to/breakfast.png -tm bagel 0.6
+~~~
 
 # Other Commands
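The threshold behaviour described in the documentation above can be pictured with a tiny, self-contained sketch (not part of the commit); the confidence values below are made up, but they show why lowering the threshold grows the masked area and raising it shrinks it:

```
# Toy clipseg-style confidence map: one score per pixel, in [0, 1].
import numpy as np

heatmap = np.array([[0.10, 0.40, 0.70],
                    [0.20, 0.55, 0.90],
                    [0.05, 0.30, 0.60]])

for threshold in (0.5, 0.6):
    mask = heatmap >= threshold          # pixels considered part of the prompt
    print(threshold, int(mask.sum()), 'masked pixels')
# 0.5 -> 4 masked pixels, 0.6 -> 3 masked pixels: a higher -tm value means a tighter mask
```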
@@ -34,7 +34,46 @@ original unedited image and the masked (partially transparent) image:
 invoke> "man with cat on shoulder" -I./images/man.png -M./images/man-transparent.png
 ```
 
-We are hoping to get rid of the need for this workaround in an upcoming release.
+## **Masking using Text**
+
+You can also create a mask using a text prompt to select the part of
+the image you want to alter, using the <a
+href="https://github.com/timojl/clipseg">clipseg</a> algorithm. This
+works on any image, not just ones generated by InvokeAI.
+
+The `--text_mask` (short form `-tm`) option takes two arguments. The
+first argument is a text description of the part of the image you wish
+to mask (paint over). If the text description contains a space, you must
+surround it with quotation marks. The optional second argument is the
+minimum threshold for the mask classifier's confidence score, described
+in more detail below.
+
+To see how this works in practice, here's an image of a still life
+painting that I got off the web.
+
+<img src="../assets/still-life-scaled.jpg">
+
+You can selectively mask out the
+orange and replace it with a baseball in this way:
+
+~~~
+invoke> a baseball -I /path/to/still_life.png -tm orange
+~~~
+
+<img src="../assets/still-life-inpainted.png">
+
+The clipseg classifier produces a confidence score for each region it
+identifies. Generally regions that score above 0.5 are reliable, but
+if you are getting too much or too little masking you can adjust the
+threshold down (to get more mask), or up (to get less). In this
+example, by passing `-tm` a higher value, we are insisting on a tighter
+mask. However, if you make it too high, the orange may not be picked
+up at all!
+
+~~~
+invoke> a baseball -I /path/to/breakfast.png -tm orange 0.6
+~~~
 
 ### Inpainting is not changing the masked region enough!
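If you want to check exactly what a text prompt selects before committing to an inpainting run, the new `ldm.invoke.txt2mask` module introduced later in this commit can be driven directly. A minimal sketch (not part of the diff), assuming the clipseg weights have already been fetched, e.g. by `scripts/preload_models.py`:

```
from PIL import Image
from ldm.invoke.txt2mask import Txt2Mask

# Run on CPU so this works without a GPU; 'cuda' or 'mps' are also accepted.
txt2mask = Txt2Mask(device='cpu')
segmented = txt2mask.segment(Image.open('still_life.png'), 'an orange')

segmented.to_grayscale().save('orange_heatmap.png')      # brighter = higher confidence
segmented.to_transparent().save('orange_selection.png')  # selected region stays opaque
```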
@@ -57,6 +57,7 @@ dependencies:
   - -e git+https://github.com/openai/CLIP.git@main#egg=clip
   - -e git+https://github.com/Birch-san/k-diffusion.git@mps#egg=k_diffusion
   - -e git+https://github.com/TencentARC/GFPGAN.git#egg=gfpgan
+  - -e git+https://github.com/invoke-ai/clipseg.git#egg=clipseg
   - -e .
 variables:
   PYTORCH_ENABLE_MPS_FALLBACK: 1
@@ -37,4 +37,5 @@ dependencies:
   - -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
   - -e git+https://github.com/Birch-san/k-diffusion.git@mps#egg=k_diffusion
   - -e git+https://github.com/TencentARC/GFPGAN.git#egg=gfpgan
+  - -e git+https://github.com/invoke-ai/clipseg.git#egg=clipseg
   - -e .
File diff suppressed because one or more lines are too long.

frontend/dist/index.html: vendored, 2 lines changed
@@ -6,7 +6,7 @@
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
     <title>InvokeAI - A Stable Diffusion Toolkit</title>
     <link rel="shortcut icon" type="icon" href="/assets/favicon.0d253ced.ico" />
-    <script type="module" crossorigin src="/assets/index.ea68b5f5.js"></script>
+    <script type="module" crossorigin src="/assets/index.89883620.js"></script>
     <link rel="stylesheet" href="/assets/index.58175ea1.css">
   </head>
@@ -22,9 +22,9 @@ import * as InvokeAI from '../invokeai';
  * some new action to handle whatever data was sent from the server.
  */
 export const socketioMiddleware = () => {
-  const { hostname, port } = new URL(window.location.href);
+  const { origin } = new URL(window.location.href);
 
-  const socketio = io(`http://${hostname}:${port}`, {
+  const socketio = io(origin, {
     timeout: 60000,
   });
@@ -35,7 +35,8 @@ from ldm.invoke.devices import choose_torch_device, choose_precision
 from ldm.invoke.conditioning import get_uc_and_c
 from ldm.invoke.model_cache import ModelCache
 from ldm.invoke.seamless import configure_model_padding
+from ldm.invoke.txt2mask import Txt2Mask, SegmentedGrayscale
 
 def fix_func(orig):
     if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
         def new_func(*args, **kw):
@@ -190,6 +191,7 @@ class Generate:
         self.esrgan = esrgan
         self.free_gpu_mem = free_gpu_mem
         self.size_matters = True  # used to warn once about large image sizes and VRAM
+        self.txt2mask = None
 
         # Note that in previous versions, there was an option to pass the
         # device to Generate(). However the device was then ignored, so
@@ -269,6 +271,7 @@ class Generate:
             # these are specific to img2img and inpaint
             init_img = None,
             init_mask = None,
+            text_mask = None,
             fit = False,
             strength = None,
             init_color = None,
@@ -301,6 +304,8 @@ class Generate:
            seamless // whether the generated image should tile
            hires_fix // whether the Hires Fix should be applied during generation
            init_img // path to an initial image
+           init_mask // path to a mask for the initial image
+           text_mask // a text string that will be used to guide clipseg generation of the init_mask
            strength // strength for noising/unnoising init_img. 0.0 preserves image exactly, 1.0 replaces it completely
            facetool_strength // strength for GFPGAN/CodeFormer. 0.0 preserves image exactly, 1.0 replaces it completely
            ddim_eta // image randomness (eta=0.0 means the same seed always produces the same image)
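For programmatic use, the parameter list above suggests a call along the following lines. This is a hedged sketch rather than part of the commit: the enclosing method name (`prompt2image`) is an assumption based on the documented parameters and does not appear in this hunk.

```
from ldm.generate import Generate

gen = Generate()
# NOTE: method name assumed from context; adjust if the actual API differs.
results = gen.prompt2image(
    prompt    = 'a piece of cake',
    init_img  = '/path/to/breakfast.png',
    text_mask = ['bagel', 0.6],   # description, plus optional confidence threshold
    strength  = 0.75,
)
```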
@@ -407,6 +412,7 @@ class Generate:
                 width,
                 height,
                 fit=fit,
+                text_mask=text_mask,
             )
 
             # TODO: Hacky selection of operation to perform. Needs to be refactored.
|
|||||||
width,
|
width,
|
||||||
height,
|
height,
|
||||||
fit=False,
|
fit=False,
|
||||||
|
text_mask=None,
|
||||||
):
|
):
|
||||||
init_image = None
|
init_image = None
|
||||||
init_mask = None
|
init_mask = None
|
||||||
if not img:
|
if not img:
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
image = self._load_img(
|
image = self._load_img(img)
|
||||||
img,
|
|
||||||
width,
|
|
||||||
height,
|
|
||||||
)
|
|
||||||
|
|
||||||
if image.width < self.width and image.height < self.height:
|
if image.width < self.width and image.height < self.height:
|
||||||
print(f'>> WARNING: img2img and inpainting may produce unexpected results with initial images smaller than {self.width}x{self.height} in both dimensions')
|
print(f'>> WARNING: img2img and inpainting may produce unexpected results with initial images smaller than {self.width}x{self.height} in both dimensions')
|
||||||
@@ -650,10 +653,12 @@ class Generate:
             init_image = self._create_init_image(image,width,height,fit=fit)   # this returns a torch tensor
 
         if mask:
-            mask_image = self._load_img(
-                mask, width, height)  # this returns an Image
+            mask_image = self._load_img(mask)  # this returns an Image
             init_mask = self._create_init_mask(mask_image,width,height,fit=fit)
 
+        elif text_mask:
+            init_mask = self._txt2mask(image, text_mask, width, height, fit=fit)
+
         return init_image, init_mask
 
     def _make_base(self):
@@ -832,7 +837,7 @@ class Generate:
 
         print(msg)
 
-    def _load_img(self, img, width, height)->Image:
+    def _load_img(self, img)->Image:
         if isinstance(img, Image.Image):
             image = img
             print(
@@ -894,6 +899,29 @@ class Generate:
             mask = ImageOps.invert(mask)
         return mask
 
+    # TODO: The latter part of this method repeats code from _create_init_mask()
+    def _txt2mask(self, image:Image, text_mask:list, width, height, fit=True) -> Image:
+        prompt = text_mask[0]
+        confidence_level = text_mask[1] if len(text_mask)>1 else 0.5
+        if self.txt2mask is None:
+            self.txt2mask = Txt2Mask(device = self.device)
+
+        segmented = self.txt2mask.segment(image, prompt)
+        mask = segmented.to_mask(float(confidence_level))
+        mask = mask.convert('RGB')
+        # now we adjust the size
+        if fit:
+            mask = self._fit_image(mask, (width, height))
+        else:
+            mask = self._squeeze_image(mask)
+        mask = mask.resize((mask.width//downsampling, mask.height //
+                            downsampling), resample=Image.Resampling.NEAREST)
+        mask = np.array(mask)
+        mask = mask.astype(np.float32) / 255.0
+        mask = mask[None].transpose(0, 3, 1, 2)
+        mask = torch.from_numpy(mask)
+        return mask.to(self.device)
+
     def _has_transparency(self, image):
         if image.info.get("transparency", None) is not None:
             return True
@@ -378,6 +378,14 @@ class Args(object):
             default='stable-diffusion-1.4',
             help='Indicates which diffusion model to load. (currently "stable-diffusion-1.4" (default) or "laion400m")',
         )
+        model_group.add_argument(
+            '--png_compression','-z',
+            type=int,
+            default=6,
+            choices=range(0,9),
+            dest='png_compression',
+            help='level of PNG compression, from 0 (none) to 9 (maximum). Default is 6.'
+        )
         model_group.add_argument(
             '--sampler',
             '-A',
@@ -649,6 +657,14 @@ class Args(object):
             dest='save_intermediates',
             help='Save every nth intermediate image into an "intermediates" directory within the output directory'
         )
+        render_group.add_argument(
+            '--png_compression','-z',
+            type=int,
+            default=6,
+            choices=range(0,10),
+            dest='png_compression',
+            help='level of PNG compression, from 0 (none) to 9 (maximum). Default is 6.'
+        )
         img2img_group.add_argument(
             '-I',
             '--init_img',
@@ -661,6 +677,14 @@ class Args(object):
             type=str,
             help='Path to input mask for inpainting mode (supersedes width and height)',
         )
+        img2img_group.add_argument(
+            '-tm',
+            '--text_mask',
+            nargs='+',
+            type=str,
+            help='Use the clipseg classifier to generate the mask area for inpainting. Provide a description of the area to mask ("a mug"), optionally followed by the confidence level threshold (0-1.0; defaults to 0.5).',
+            default=None,
+        )
         img2img_group.add_argument(
             '--init_color',
             type=str,
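A standalone sketch (not the project's parser) of how `nargs='+'` makes `-tm` accept a description plus an optional threshold, matching the way `Generate._txt2mask()` later reads `text_mask[0]` and `text_mask[1]`:

```
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-tm', '--text_mask', nargs='+', type=str, default=None)

print(parser.parse_args(['-tm', 'bagel']).text_mask)         # ['bagel']
print(parser.parse_args(['-tm', 'bagel', '0.6']).text_mask)  # ['bagel', '0.6']
```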
@@ -74,3 +74,4 @@ class Txt2Img(Generator):
         if self.perlin > 0.0:
             x = (1-self.perlin)*x + self.perlin*self.get_perlin_noise(width // self.downsampling_factor, height // self.downsampling_factor)
         return x
+
@@ -33,13 +33,13 @@ class PngWriter:
 
     # saves image named _image_ to outdir/name, writing metadata from prompt
     # returns full path of output
-    def save_image_and_prompt_to_png(self, image, dream_prompt, name, metadata=None):
+    def save_image_and_prompt_to_png(self, image, dream_prompt, name, metadata=None, compress_level=6):
         path = os.path.join(self.outdir, name)
         info = PngImagePlugin.PngInfo()
         info.add_text('Dream', dream_prompt)
         if metadata:
             info.add_text('sd-metadata', json.dumps(metadata))
-        image.save(path, 'PNG', pnginfo=info)
+        image.save(path, 'PNG', pnginfo=info, compress_level=compress_level)
         return path
 
     def retrieve_metadata(self,img_basename):
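As an aside (not part of the diff), Pillow's `compress_level` is purely a size/speed trade-off: PNG stays lossless at every level, only the file size and the time spent compressing change. A quick illustration:

```
from io import BytesIO
from PIL import Image

img = Image.new('RGB', (512, 512), 'white')
for level in (0, 6, 9):
    buf = BytesIO()
    img.save(buf, 'PNG', compress_level=level)
    print(level, buf.tell(), 'bytes')   # higher level -> smaller file, slower save
```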
@@ -53,6 +53,8 @@ COMMANDS = (
     '--log_tokenization','-t',
     '--hires_fix',
     '--inpaint_replace','-r',
+    '--png_compression','-z',
+    '--text_mask','-tm',
     '!fix','!fetch','!history','!search','!clear',
     '!models','!switch','!import_model','!edit_model'
     )
ldm/invoke/txt2mask.py: new file, 122 lines

@@ -0,0 +1,122 @@
'''Makes available the Txt2Mask class, which assists in the automatic
assignment of masks via text prompt using clipseg.

Here is typical usage:

    from ldm.invoke.txt2mask import Txt2Mask, SegmentedGrayscale
    from PIL import Image

    txt2mask = Txt2Mask(self.device)
    segmented = txt2mask.segment(Image.open('/path/to/img.png'),'a bagel')

    # this will return a grayscale Image of the segmented data
    grayscale = segmented.to_grayscale()

    # this will return a semi-transparent image in which the
    # selected object(s) are opaque and the rest is at various
    # levels of transparency
    transparent = segmented.to_transparent()

    # this will return a masked image suitable for use in inpainting:
    mask = segmented.to_mask(threshold=0.5)

The threshold used in the call to to_mask() selects pixels for use in
the mask that exceed the indicated confidence threshold. Values range
from 0.0 to 1.0. The higher the threshold, the more confident the
algorithm is. In limited testing, I have found that values around 0.5
work fine.
'''

import torch
import numpy as np
from models.clipseg import CLIPDensePredT
from einops import rearrange, repeat
from PIL import Image
from torchvision import transforms

CLIP_VERSION = 'ViT-B/16'
CLIPSEG_WEIGHTS = 'src/clipseg/weights/rd64-uni.pth'
CLIPSEG_SIZE = 352

class SegmentedGrayscale(object):
    def __init__(self, image:Image, heatmap:torch.Tensor):
        self.heatmap = heatmap
        self.image = image

    def to_grayscale(self)->Image:
        return self._rescale(Image.fromarray(np.uint8(self.heatmap*255)))

    def to_mask(self,threshold:float=0.5)->Image:
        discrete_heatmap = self.heatmap.lt(threshold).int()
        return self._rescale(Image.fromarray(np.uint8(discrete_heatmap*255),mode='L'))

    def to_transparent(self)->Image:
        transparent_image = self.image.copy()
        transparent_image.putalpha(self.to_grayscale())
        return transparent_image

    # unscales and uncrops the 352x352 heatmap so that it matches the image again
    def _rescale(self, heatmap:Image)->Image:
        size = self.image.width if (self.image.width > self.image.height) else self.image.height
        resized_image = heatmap.resize(
            (size,size),
            resample=Image.Resampling.LANCZOS
        )
        return resized_image.crop((0,0,self.image.width,self.image.height))

class Txt2Mask(object):
    '''
    Create new Txt2Mask object. The optional device argument can be one of
    'cuda', 'mps' or 'cpu'.
    '''
    def __init__(self,device='cpu'):
        print('>> Initializing clipseg model for text to mask inference')
        self.device = device
        self.model = CLIPDensePredT(version=CLIP_VERSION, reduce_dim=64, )
        self.model.eval()
        # initially we keep everything in cpu to conserve space
        self.model.to('cpu')
        self.model.load_state_dict(torch.load(CLIPSEG_WEIGHTS, map_location=torch.device('cpu')), strict=False)

    @torch.no_grad()
    def segment(self, image:Image, prompt:str) -> SegmentedGrayscale:
        '''
        Given a prompt string such as "a bagel", tries to identify the object in the
        provided image and returns a SegmentedGrayscale object in which the brighter
        pixels indicate where the object is inferred to be.
        '''
        self._to_device(self.device)
        prompts = [prompt]   # right now we operate on just a single prompt at a time

        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            transforms.Resize((CLIPSEG_SIZE, CLIPSEG_SIZE)),  # must be multiple of 64...
        ])

        img = self._scale_and_crop(image)
        img = transform(img).unsqueeze(0)

        preds = self.model(img.repeat(len(prompts),1,1,1), prompts)[0]
        heatmap = torch.sigmoid(preds[0][0]).cpu()
        self._to_device('cpu')
        return SegmentedGrayscale(image, heatmap)

    def _to_device(self, device):
        self.model.to(device)

    def _scale_and_crop(self, image:Image)->Image:
        scaled_image = Image.new('RGB',(CLIPSEG_SIZE,CLIPSEG_SIZE))
        if image.width > image.height:  # width is constraint
            scale = CLIPSEG_SIZE / image.width
        else:
            scale = CLIPSEG_SIZE / image.height
        scaled_image.paste(
            image.resize(
                (int(scale * image.width),
                 int(scale * image.height)
                ),
                resample=Image.Resampling.LANCZOS
            ),box=(0,0)
        )
        return scaled_image
@@ -1353,7 +1353,7 @@ class LatentDiffusion(DDPM):
         num_downs = self.first_stage_model.encoder.num_resolutions - 1
         rescale_latent = 2 ** (num_downs)
 
-        # get top left postions of patches as conforming for the bbbox tokenizer, therefore we
+        # get top left positions of patches as conforming for the bbbox tokenizer, therefore we
         # need to rescale the tl patch coordinates to be in between (0,1)
         tl_patch_coordinates = [
             (
@@ -64,7 +64,8 @@ def make_ddim_timesteps(
 ):
     if ddim_discr_method == 'uniform':
         c = num_ddpm_timesteps // num_ddim_timesteps
-        ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
+        # ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
+        ddim_timesteps = (np.arange(0, num_ddim_timesteps) * c).astype(int)
     elif ddim_discr_method == 'quad':
         ddim_timesteps = (
             (
@@ -81,8 +82,8 @@ def make_ddim_timesteps(
 
     # assert ddim_timesteps.shape[0] == num_ddim_timesteps
     # add one to get the final alpha values right (the ones from first scale to data during sampling)
-    # steps_out = ddim_timesteps + 1
-    steps_out = ddim_timesteps
+    steps_out = ddim_timesteps + 1
+    # steps_out = ddim_timesteps
 
     if verbose:
         print(f'Selected timesteps for ddim sampler: {steps_out}')
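A small numeric check (not from the commit) of what the two hunks above change: the new schedule always yields exactly `num_ddim_timesteps` entries, whereas the old `range()`-based version can produce one extra step when `num_ddpm_timesteps` is not a multiple of the stride, and `steps_out` is again offset by one, as the restored `+ 1` line indicates.

```
import numpy as np

num_ddpm_timesteps, num_ddim_timesteps = 1000, 30
c = num_ddpm_timesteps // num_ddim_timesteps                 # 33

old = np.asarray(list(range(0, num_ddpm_timesteps, c)))      # 31 values: 0, 33, ..., 990
new = (np.arange(0, num_ddim_timesteps) * c).astype(int)     # 30 values: 0, 33, ..., 957

print(len(old), len(new))   # 31 30
print((new + 1)[:3])        # [ 1 34 67] -- the steps actually handed to the sampler
```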
@@ -22,4 +22,5 @@ transformers==4.19.2
 -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
 -e git+https://github.com/lstein/k-diffusion.git@master#egg=k-diffusion
 -e git+https://github.com/TencentARC/GFPGAN.git#egg=gfpgan
+-3 git+https://github.com/invoke-ai/clipseg.git#egg=clipseg
 -e .
@@ -35,3 +35,4 @@ realesrgan
 git+https://github.com/openai/CLIP.git@main#egg=clip
 git+https://github.com/Birch-san/k-diffusion.git@mps#egg=k-diffusion
 git+https://github.com/TencentARC/GFPGAN.git#egg=gfpgan
+git+https://github.com/invoke-ai/clipseg.git#egg=clipseg
@@ -95,7 +95,10 @@ def main():
         "\n* Initialization done! Awaiting your command (-h for help, 'q' to quit)"
     )
 
-    main_loop(gen, opt, infile)
+    try:
+        main_loop(gen, opt, infile)
+    except KeyboardInterrupt:
+        print("\ngoodbye!")
 
 # TODO: main_loop() has gotten busy. Needs to be refactored.
 def main_loop(gen, opt, infile):
@@ -270,6 +273,7 @@ def main_loop(gen, opt, infile):
                     model_hash = gen.model_hash,
                 ),
                 name = filename,
+                compress_level = opt.png_compression,
             )
 
             # update rfc metadata
@@ -10,28 +10,31 @@ import sys
 import transformers
 import os
 import warnings
+import torch
 import urllib.request
+import zipfile
+import traceback
 
 transformers.logging.set_verbosity_error()
 
 # this will preload the Bert tokenizer fles
-print('preloading bert tokenizer...', end='')
-tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
+print('Loading bert tokenizer (ignore deprecation errors)...', end='')
+with warnings.catch_warnings():
+    warnings.filterwarnings('ignore', category=DeprecationWarning)
+    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
 print('...success')
+sys.stdout.flush()
 
 # this will download requirements for Kornia
-print('preloading Kornia requirements...', end='')
+print('Loading Kornia requirements...', end='')
 with warnings.catch_warnings():
     warnings.filterwarnings('ignore', category=DeprecationWarning)
     import kornia
 print('...success')
 
 version = 'openai/clip-vit-large-patch14'
 
-print('preloading CLIP model...',end='')
 sys.stdout.flush()
+print('Loading CLIP model...',end='')
 tokenizer = CLIPTokenizer.from_pretrained(version)
 transformer = CLIPTextModel.from_pretrained(version)
 print('...success')
@@ -61,7 +64,6 @@ if gfpgan:
         FaceRestoreHelper(1, det_model='retinaface_resnet50')
         print('...success')
     except Exception:
-        import traceback
         print('Error loading ESRGAN:')
         print(traceback.format_exc())
 
@@ -89,13 +91,11 @@ if gfpgan:
         urllib.request.urlretrieve(model_url,model_dest)
         print('...success')
     except Exception:
-        import traceback
         print('Error loading GFPGAN:')
         print(traceback.format_exc())
 
 print('preloading CodeFormer model file...',end='')
 try:
-    import urllib.request
     model_url = 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth'
     model_dest = 'ldm/invoke/restoration/codeformer/weights/codeformer.pth'
     if not os.path.exists(model_dest):
@@ -103,7 +103,27 @@ try:
         os.makedirs(os.path.dirname(model_dest), exist_ok=True)
         urllib.request.urlretrieve(model_url,model_dest)
 except Exception:
-    import traceback
     print('Error loading CodeFormer:')
     print(traceback.format_exc())
 print('...success')
+
+print('Loading clipseq model for text-based masking...',end='')
+try:
+    model_url  = 'https://owncloud.gwdg.de/index.php/s/ioHbRzFx6th32hn/download'
+    model_dest = 'src/clipseg/clipseg_weights.zip'
+    if not os.path.exists(model_dest):
+        os.makedirs(os.path.dirname(model_dest), exist_ok=True)
+        urllib.request.urlretrieve(model_url,model_dest)
+        with zipfile.ZipFile(model_dest,'r') as zip:
+            zip.extractall('src/clipseg')
+        os.rename('src/clipseg/clipseg_weights','src/clipseg/weights')
+        from models.clipseg import CLIPDensePredT
+        model = CLIPDensePredT(version='ViT-B/16', reduce_dim=64, )
+        model.eval()
+        model.load_state_dict(torch.load('src/clipseg/weights/rd64-uni-refined.pth'), strict=False)
+except Exception:
+    print('Error installing clipseg model:')
+    print(traceback.format_exc())
+print('...success')