Remove dependency on original clipseg library for text masking (#2425)

- This replaces the original clipseg library with the transformers
version from HuggingFace.
- This should make it possible to register InvokeAI on PyPI and do a
  fully automated pip-based install.
- Minor regression: it is no longer possible to specify which device the
  clipseg model will be loaded onto; it now always resides on the CPU.
  However, performance is more than acceptable.
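For reference, the transformers-based code path that replaces clipseg boils down to the following minimal sketch (a standalone illustration only; the image path and prompt are placeholders, and the real implementation is in the txt2mask diff below):

    # Minimal sketch of the HuggingFace transformers CLIPSeg flow adopted here.
    # 'photo.png' and the prompt are placeholder inputs for illustration.
    import torch
    from PIL import Image
    from transformers import AutoProcessor, CLIPSegForImageSegmentation

    CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'

    processor = AutoProcessor.from_pretrained(CLIPSEG_MODEL)
    model = CLIPSegForImageSegmentation.from_pretrained(CLIPSEG_MODEL)  # stays on the CPU

    image = Image.open('photo.png').convert('RGB')
    inputs = processor(text=['a bagel'], images=[image], padding=True, return_tensors='pt')

    with torch.no_grad():
        outputs = model(**inputs)

    # Brighter (higher) values mark pixels the model associates with the prompt.
    heatmap = torch.sigmoid(outputs.logits)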
Lincoln Stein 2023-01-26 12:14:13 -05:00 committed by GitHub
commit 046abb634e
4 changed files with 30 additions and 53 deletions


@@ -30,7 +30,7 @@ from huggingface_hub.utils._errors import RevisionNotFoundError
 from omegaconf import OmegaConf
 from omegaconf.dictconfig import DictConfig
 from tqdm import tqdm
-from transformers import CLIPTokenizer, CLIPTextModel
+from transformers import CLIPTokenizer, CLIPTextModel, AutoProcessor, CLIPSegForImageSegmentation
 from ldm.invoke.globals import Globals, global_cache_dir
 from ldm.invoke.readline import generic_completer
@@ -601,31 +601,10 @@ def download_codeformer():
 #---------------------------------------------
 def download_clipseg():
     print('Installing clipseg model for text-based masking...',end='', file=sys.stderr)
-    import zipfile
+    CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'
     try:
-        model_url = 'https://owncloud.gwdg.de/index.php/s/ioHbRzFx6th32hn/download'
-        model_dest = os.path.join(Globals.root,'models/clipseg/clipseg_weights')
-        weights_zip = 'models/clipseg/weights.zip'
-        if not os.path.exists(model_dest):
-            os.makedirs(os.path.dirname(model_dest), exist_ok=True)
-        if not os.path.exists(f'{model_dest}/rd64-uni-refined.pth'):
-            dest = os.path.join(Globals.root,weights_zip)
-            request.urlretrieve(model_url,dest)
-            with zipfile.ZipFile(dest,'r') as zip:
-                zip.extractall(os.path.join(Globals.root,'models/clipseg'))
-            os.remove(dest)
-        from clipseg.clipseg import CLIPDensePredT
-        model = CLIPDensePredT(version='ViT-B/16', reduce_dim=64, )
-        model.eval()
-        model.load_state_dict(
-            torch.load(
-                os.path.join(Globals.root,'models/clipseg/clipseg_weights/rd64-uni-refined.pth'),
-                map_location=torch.device('cpu')
-            ),
-            strict=False,
-        )
+        download_from_hf(AutoProcessor,CLIPSEG_MODEL)
+        download_from_hf(CLIPSegForImageSegmentation,CLIPSEG_MODEL)
     except Exception:
         print('Error installing clipseg model:')
         print(traceback.format_exc())
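Note that download_from_hf is defined elsewhere in this file and is not part of the hunk. Assuming it is a thin wrapper around from_pretrained that targets InvokeAI's shared cache (an assumption, not shown in this diff), its role is roughly:

    # Hedged sketch of a download_from_hf-style helper: pre-fetch a model class
    # into the shared HuggingFace cache so later loads do not hit the network.
    # The name and signature are assumptions; only the from_pretrained/cache_dir
    # usage is taken from the surrounding code.
    from ldm.invoke.globals import global_cache_dir

    def download_from_hf(model_class, model_name: str):
        return model_class.from_pretrained(model_name,
                                           cache_dir=global_cache_dir('hub'))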


@@ -71,7 +71,14 @@ def global_cache_dir(subdir:Union[str,Path]='')->Path:
     is provided, it will be appended to the end of the path, allowing
     for huggingface-style conventions:
         global_cache_dir('diffusers')
+        global_cache_dir('hub')
+    Current HuggingFace documentation (mid-Jan 2023) indicates that
+    transformers models will be cached into a "transformers" subdirectory,
+    but in practice they seem to go into "hub". But if needed:
+        global_cache_dir('transformers')
+    One other caveat is that HuggingFace is moving some diffusers models
+    into the "hub" subdirectory as well, so this will need to be revisited
+    from time to time.
     '''
     home: str = os.getenv('HF_HOME')
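Only the first line of the function body appears in this hunk. A hedged sketch of the behavior the docstring describes (not the actual implementation) would be:

    # Hedged sketch of the documented behavior, NOT the real function body:
    # resolve the HuggingFace cache root from HF_HOME (falling back to the
    # conventional ~/.cache/huggingface) and append an optional subdirectory
    # such as 'hub' or 'diffusers'.
    import os
    from pathlib import Path
    from typing import Union

    def cache_dir_sketch(subdir: Union[str, Path] = '') -> Path:
        home = os.getenv('HF_HOME') or os.path.expanduser('~/.cache/huggingface')
        return Path(home, subdir)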


@@ -29,16 +29,12 @@ work fine.
 import torch
 import numpy as np
 import os
-from clipseg.clipseg import CLIPDensePredT
-from einops import rearrange, repeat
+from transformers import AutoProcessor, CLIPSegForImageSegmentation
 from PIL import Image, ImageOps
 from torchvision import transforms
-from ldm.invoke.globals import Globals
+from ldm.invoke.globals import global_cache_dir

-CLIP_VERSION = 'ViT-B/16'
-CLIPSEG_WEIGHTS = 'models/clipseg/clipseg_weights/rd64-uni.pth'
-CLIPSEG_WEIGHTS_REFINED = 'models/clipseg/clipseg_weights/rd64-uni-refined.pth'
+CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'
 CLIPSEG_SIZE = 352

 class SegmentedGrayscale(object):
@@ -77,16 +73,15 @@ class Txt2Mask(object):
     '''
     def __init__(self,device='cpu',refined=False):
         print('>> Initializing clipseg model for text to mask inference')
+        # BUG: we are not doing anything with the device option at this time
         self.device = device
-        self.model = CLIPDensePredT(version=CLIP_VERSION, reduce_dim=64, complex_trans_conv=refined)
-        self.model.eval()
-        # initially we keep everything in cpu to conserve space
-        self.model.to('cpu')
-        self.model.load_state_dict(torch.load(os.path.join(Globals.root,CLIPSEG_WEIGHTS_REFINED)
-                                              if refined
-                                              else os.path.join(Globals.root,CLIPSEG_WEIGHTS),
-                                              map_location=torch.device('cpu')), strict=False
-                                   )
+        self.processor = AutoProcessor.from_pretrained(CLIPSEG_MODEL,
+                                                       cache_dir=global_cache_dir('hub')
+                                                       )
+        self.model = CLIPSegForImageSegmentation.from_pretrained(CLIPSEG_MODEL,
+                                                                 cache_dir=global_cache_dir('hub')
+                                                                 )

     @torch.no_grad()
     def segment(self, image, prompt:str) -> SegmentedGrayscale:
@@ -95,9 +90,6 @@ class Txt2Mask(object):
         provided image and returns a SegmentedGrayscale object in which the brighter
         pixels indicate where the object is inferred to be.
         '''
-        self._to_device(self.device)
-        prompts = [prompt] # right now we operate on just a single prompt at a time
         transform = transforms.Compose([
             transforms.ToTensor(),
             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
@@ -111,14 +103,14 @@ class Txt2Mask(object):
         img = self._scale_and_crop(image)
-        img = transform(img).unsqueeze(0)
-        preds = self.model(img.repeat(len(prompts),1,1,1), prompts)[0]
-        heatmap = torch.sigmoid(preds[0][0]).cpu()
-        self._to_device('cpu')
+        inputs = self.processor(text=[prompt],
+                                images=[image],
+                                padding=True,
+                                return_tensors='pt')
+        outputs = self.model(**inputs)
+        heatmap = torch.sigmoid(outputs.logits)
         return SegmentedGrayscale(image, heatmap)

     def _to_device(self, device):
         self.model.to(device)

     def _scale_and_crop(self, image:Image)->Image:
         scaled_image = Image.new('RGB',(CLIPSEG_SIZE,CLIPSEG_SIZE))
         if image.width > image.height: # width is constraint
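Putting the pieces together, typical use of the rewritten class looks roughly like this (assuming the module lives at ldm.invoke.txt2mask; the SegmentedGrayscale helpers such as to_mask() sit outside this diff and are taken on faith from the module docstring):

    # Rough usage sketch of the transformers-backed Txt2Mask. The image path is
    # a placeholder; to_mask(threshold=0.5) is assumed from the module docstring
    # ("values around 0.5 work fine") and is not shown in this diff.
    from PIL import Image
    from ldm.invoke.txt2mask import Txt2Mask

    txt2mask = Txt2Mask(device='cpu')   # the device argument is currently ignored (see BUG note)
    segmented = txt2mask.segment(Image.open('photo.png').convert('RGB'), 'a bagel')
    mask = segmented.to_mask(threshold=0.5)   # assumed SegmentedGrayscale helper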


@@ -36,8 +36,7 @@ classifiers = [
 dependencies = [
   "accelerate",
   "albumentations",
-  "clip_anytorch",  # replaceing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip",
-  "clipseg @ https://github.com/invoke-ai/clipseg/archive/relaxed-python-requirement.zip",  # is this still necesarry with diffusers?
+  "clip_anytorch",  # replacing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip",
   "datasets",
   "diffusers[torch]~=0.11",
   "dnspython==2.2.1",
@@ -53,7 +52,7 @@ dependencies = [
   "huggingface-hub>=0.11.1",
   "imageio",
   "imageio-ffmpeg",
-  "k-diffusion",  # replaceing "k-diffusion @ https://github.com/Birch-san/k-diffusion/archive/refs/heads/mps.zip",
+  "k-diffusion",  # replacing "k-diffusion @ https://github.com/Birch-san/k-diffusion/archive/refs/heads/mps.zip",
   "kornia",
   "npyscreen",
   "numpy~=1.23",