Mirror of https://github.com/invoke-ai/InvokeAI
Remove dependency on original clipseg library for text masking (#2425)
- This replaces the original clipseg library with the transformers version from HuggingFace.
- This should make it possible to register InvokeAI on PyPI and do a fully automated pip-based install.
- Minor regression: it is no longer possible to specify which device the clipseg model will be loaded onto, and it will reside on the CPU. However, performance is more than acceptable.
This commit is contained in: 046abb634e
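For context, the HuggingFace pattern that this commit adopts in place of the standalone clipseg library looks roughly like the sketch below. It is illustrative only: the image path and prompt are placeholders, not part of the commit.

# Minimal sketch of the transformers-based CLIPSeg path adopted by this commit.
# 'photo.png' and the prompt are placeholders.
import torch
from PIL import Image
from transformers import AutoProcessor, CLIPSegForImageSegmentation

CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'

processor = AutoProcessor.from_pretrained(CLIPSEG_MODEL)            # tokenizer + image preprocessor
model = CLIPSegForImageSegmentation.from_pretrained(CLIPSEG_MODEL)  # loads on CPU by default

image = Image.open('photo.png').convert('RGB')
inputs = processor(text=['a bagel'], images=[image], padding=True, return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)
heatmap = torch.sigmoid(outputs.logits)  # brighter values = more likely part of the object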
@@ -30,7 +30,7 @@ from huggingface_hub.utils._errors import RevisionNotFoundError
 from omegaconf import OmegaConf
 from omegaconf.dictconfig import DictConfig
 from tqdm import tqdm
-from transformers import CLIPTokenizer, CLIPTextModel
+from transformers import CLIPTokenizer, CLIPTextModel, AutoProcessor, CLIPSegForImageSegmentation
 
 from ldm.invoke.globals import Globals, global_cache_dir
 from ldm.invoke.readline import generic_completer
@@ -601,31 +601,10 @@ def download_codeformer():
 #---------------------------------------------
 def download_clipseg():
     print('Installing clipseg model for text-based masking...',end='', file=sys.stderr)
-    import zipfile
+    CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'
     try:
-        model_url = 'https://owncloud.gwdg.de/index.php/s/ioHbRzFx6th32hn/download'
-        model_dest = os.path.join(Globals.root,'models/clipseg/clipseg_weights')
-        weights_zip = 'models/clipseg/weights.zip'
-
-        if not os.path.exists(model_dest):
-            os.makedirs(os.path.dirname(model_dest), exist_ok=True)
-        if not os.path.exists(f'{model_dest}/rd64-uni-refined.pth'):
-            dest = os.path.join(Globals.root,weights_zip)
-            request.urlretrieve(model_url,dest)
-            with zipfile.ZipFile(dest,'r') as zip:
-                zip.extractall(os.path.join(Globals.root,'models/clipseg'))
-            os.remove(dest)
-
-        from clipseg.clipseg import CLIPDensePredT
-        model = CLIPDensePredT(version='ViT-B/16', reduce_dim=64, )
-        model.eval()
-        model.load_state_dict(
-            torch.load(
-                os.path.join(Globals.root,'models/clipseg/clipseg_weights/rd64-uni-refined.pth'),
-                map_location=torch.device('cpu')
-            ),
-            strict=False,
-        )
+        download_from_hf(AutoProcessor,CLIPSEG_MODEL)
+        download_from_hf(CLIPSegForImageSegmentation,CLIPSEG_MODEL)
     except Exception:
         print('Error installing clipseg model:')
         print(traceback.format_exc())

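Note that download_from_hf() is defined elsewhere in the configure script and is not part of this diff. A plausible minimal sketch, assuming it simply warms the shared HuggingFace cache via from_pretrained, might be:

# Hypothetical helper -- the real download_from_hf is not shown in this diff.
# Assumes it forwards to from_pretrained with the shared 'hub' cache directory.
from ldm.invoke.globals import global_cache_dir

def download_from_hf(model_class, model_name: str):
    return model_class.from_pretrained(model_name, cache_dir=global_cache_dir('hub'))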
@@ -71,7 +71,14 @@ def global_cache_dir(subdir:Union[str,Path]='')->Path:
     is provided, it will be appended to the end of the path, allowing
     for huggingface-style conventions:
         global_cache_dir('diffusers')
        global_cache_dir('hub')
+    Current HuggingFace documentation (mid-Jan 2023) indicates that
+    transformers models will be cached into a "transformers" subdirectory,
+    but in practice they seem to go into "hub". But if needed:
+        global_cache_dir('transformers')
+    One other caveat is that HuggingFace is moving some diffusers models
+    into the "hub" subdirectory as well, so this will need to be revisited
+    from time to time.
     '''
     home: str = os.getenv('HF_HOME')

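The body of global_cache_dir() is not shown in this hunk; based on the docstring above, an illustrative sketch (assuming the usual HuggingFace fallbacks when HF_HOME is unset) could look like:

# Illustrative only -- not the actual implementation from globals.py.
import os
from pathlib import Path

def global_cache_dir_sketch(subdir: str = '') -> Path:
    home = os.getenv('HF_HOME')
    if home is None:
        # assumed fallback, mirroring HuggingFace's default cache location
        home = os.path.join(os.getenv('XDG_CACHE_HOME', '~/.cache'), 'huggingface')
    return Path(home).expanduser() / subdir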
@@ -29,16 +29,12 @@ work fine.
 
 import torch
 import numpy as np
-import os
-from clipseg.clipseg import CLIPDensePredT
-from einops import rearrange, repeat
+from transformers import AutoProcessor, CLIPSegForImageSegmentation
 from PIL import Image, ImageOps
 from torchvision import transforms
-from ldm.invoke.globals import Globals
+from ldm.invoke.globals import global_cache_dir
 
-CLIP_VERSION = 'ViT-B/16'
-CLIPSEG_WEIGHTS = 'models/clipseg/clipseg_weights/rd64-uni.pth'
-CLIPSEG_WEIGHTS_REFINED = 'models/clipseg/clipseg_weights/rd64-uni-refined.pth'
+CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'
 CLIPSEG_SIZE = 352
 
 class SegmentedGrayscale(object):
@@ -77,16 +73,15 @@ class Txt2Mask(object):
     '''
     def __init__(self,device='cpu',refined=False):
         print('>> Initializing clipseg model for text to mask inference')
+
+        # BUG: we are not doing anything with the device option at this time
         self.device = device
-        self.model = CLIPDensePredT(version=CLIP_VERSION, reduce_dim=64, complex_trans_conv=refined)
-        self.model.eval()
-        # initially we keep everything in cpu to conserve space
-        self.model.to('cpu')
-        self.model.load_state_dict(torch.load(os.path.join(Globals.root,CLIPSEG_WEIGHTS_REFINED)
-                                              if refined
-                                              else os.path.join(Globals.root,CLIPSEG_WEIGHTS),
-                                              map_location=torch.device('cpu')), strict=False
-                                   )
+        self.processor = AutoProcessor.from_pretrained(CLIPSEG_MODEL,
+                                                       cache_dir=global_cache_dir('hub')
+                                                       )
+        self.model = CLIPSegForImageSegmentation.from_pretrained(CLIPSEG_MODEL,
+                                                                 cache_dir=global_cache_dir('hub')
+                                                                 )
 
     @torch.no_grad()
     def segment(self, image, prompt:str) -> SegmentedGrayscale:
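As the commit message and the BUG comment note, the device argument is currently ignored and the model stays on the CPU. If device handling were restored later, it would amount to moving the transformers model (and its processor inputs) explicitly, as in this standalone sketch, which is not part of the commit:

# Sketch only -- this commit deliberately leaves the model on CPU.
import torch
from transformers import CLIPSegForImageSegmentation

model = CLIPSegForImageSegmentation.from_pretrained('CIDAS/clipseg-rd64-refined')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)  # tensors returned by the processor would need .to(device) as well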
@@ -95,9 +90,6 @@ class Txt2Mask(object):
         provided image and returns a SegmentedGrayscale object in which the brighter
         pixels indicate where the object is inferred to be.
         '''
-        self._to_device(self.device)
-        prompts = [prompt] # right now we operate on just a single prompt at a time
-
         transform = transforms.Compose([
             transforms.ToTensor(),
             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
@@ -111,14 +103,14 @@ class Txt2Mask(object):
         img = self._scale_and_crop(image)
         img = transform(img).unsqueeze(0)
 
-        preds = self.model(img.repeat(len(prompts),1,1,1), prompts)[0]
-        heatmap = torch.sigmoid(preds[0][0]).cpu()
-        self._to_device('cpu')
+        inputs = self.processor(text=[prompt],
+                                images=[image],
+                                padding=True,
+                                return_tensors='pt')
+        outputs = self.model(**inputs)
+        heatmap = torch.sigmoid(outputs.logits)
         return SegmentedGrayscale(image, heatmap)
 
-    def _to_device(self, device):
-        self.model.to(device)
-
     def _scale_and_crop(self, image:Image)->Image:
         scaled_image = Image.new('RGB',(CLIPSEG_SIZE,CLIPSEG_SIZE))
         if image.width > image.height: # width is constraint

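Putting the pieces together, the new inference path can be exercised roughly as follows. This is a usage sketch only: the module path ldm.invoke.txt2mask, the image file, the prompt, and the .heatmap attribute on the returned object are assumptions, not confirmed by this diff.

# Usage sketch, assuming an InvokeAI checkout with the CLIPSeg weights already cached.
from PIL import Image
from ldm.invoke.txt2mask import Txt2Mask  # assumed module path

masker = Txt2Mask(device='cpu')  # device is ignored at the moment (see BUG note above)
segmented = masker.segment(Image.open('photo.png').convert('RGB'), 'a bagel')
print(segmented.heatmap.shape)   # assumed attribute: sigmoid heatmap, brighter = more likely the object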
@@ -36,8 +36,7 @@ classifiers = [
 dependencies = [
     "accelerate",
     "albumentations",
-    "clip_anytorch", # replaceing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip",
-    "clipseg @ https://github.com/invoke-ai/clipseg/archive/relaxed-python-requirement.zip", # is this still necesarry with diffusers?
+    "clip_anytorch", # replacing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip",
     "datasets",
     "diffusers[torch]~=0.11",
     "dnspython==2.2.1",
@@ -53,7 +52,7 @@ dependencies = [
     "huggingface-hub>=0.11.1",
     "imageio",
     "imageio-ffmpeg",
-    "k-diffusion", # replaceing "k-diffusion @ https://github.com/Birch-san/k-diffusion/archive/refs/heads/mps.zip",
+    "k-diffusion", # replacing "k-diffusion @ https://github.com/Birch-san/k-diffusion/archive/refs/heads/mps.zip",
     "kornia",
     "npyscreen",
     "numpy~=1.23",