Remove dependency on original clipseg library for text masking (#2425)

- This replaces the original clipseg library with the transformers
version from HuggingFace.
- This should make it possible to register InvokeAI on PyPI and do a
fully automated pip-based install.
- Minor regression: it is no longer possible to specify the device on
which the clipseg model is loaded; it now always resides on the CPU.
However, performance is more than acceptable.
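For reference, a minimal sketch of the transformers-based pattern this commit switches to (the model id comes from the diff below; the image path and prompt are only illustrative):

import torch
from PIL import Image
from transformers import AutoProcessor, CLIPSegForImageSegmentation

CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'

# Processor and weights are pulled from the HuggingFace Hub; no manual
# download of clipseg checkpoints is needed.
processor = AutoProcessor.from_pretrained(CLIPSEG_MODEL)
model = CLIPSegForImageSegmentation.from_pretrained(CLIPSEG_MODEL)

image = Image.open('photo.png').convert('RGB')   # illustrative path
inputs = processor(text=['a red hat'], images=[image],
                   padding=True, return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)                    # runs on the CPU by default
heatmap = torch.sigmoid(outputs.logits)          # brighter = stronger match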
commit 046abb634e
Lincoln Stein, 2023-01-26 12:14:13 -05:00 (committed by GitHub)
4 changed files with 30 additions and 53 deletions

View File

@@ -30,7 +30,7 @@ from huggingface_hub.utils._errors import RevisionNotFoundError
 from omegaconf import OmegaConf
 from omegaconf.dictconfig import DictConfig
 from tqdm import tqdm
-from transformers import CLIPTokenizer, CLIPTextModel
+from transformers import CLIPTokenizer, CLIPTextModel, AutoProcessor, CLIPSegForImageSegmentation

 from ldm.invoke.globals import Globals, global_cache_dir
 from ldm.invoke.readline import generic_completer
@@ -601,31 +601,10 @@ def download_codeformer():
 #---------------------------------------------
 def download_clipseg():
     print('Installing clipseg model for text-based masking...',end='', file=sys.stderr)
-    import zipfile
+    CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'
     try:
-        model_url = 'https://owncloud.gwdg.de/index.php/s/ioHbRzFx6th32hn/download'
-        model_dest = os.path.join(Globals.root,'models/clipseg/clipseg_weights')
-        weights_zip = 'models/clipseg/weights.zip'
-        if not os.path.exists(model_dest):
-            os.makedirs(os.path.dirname(model_dest), exist_ok=True)
-        if not os.path.exists(f'{model_dest}/rd64-uni-refined.pth'):
-            dest = os.path.join(Globals.root,weights_zip)
-            request.urlretrieve(model_url,dest)
-            with zipfile.ZipFile(dest,'r') as zip:
-                zip.extractall(os.path.join(Globals.root,'models/clipseg'))
-            os.remove(dest)
-            from clipseg.clipseg import CLIPDensePredT
-            model = CLIPDensePredT(version='ViT-B/16', reduce_dim=64, )
-            model.eval()
-            model.load_state_dict(
-                torch.load(
-                    os.path.join(Globals.root,'models/clipseg/clipseg_weights/rd64-uni-refined.pth'),
-                    map_location=torch.device('cpu')
-                ),
-                strict=False,
-            )
+        download_from_hf(AutoProcessor,CLIPSEG_MODEL)
+        download_from_hf(CLIPSegForImageSegmentation,CLIPSEG_MODEL)
     except Exception:
         print('Error installing clipseg model:')
         print(traceback.format_exc())
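The download_from_hf() helper itself is not shown in this diff. Assuming it merely pre-populates the HuggingFace cache via from_pretrained(), a plausible sketch (not the actual implementation) would be:

# Hypothetical sketch of download_from_hf(); assumes it only pre-caches a
# HuggingFace class into InvokeAI's cache directory and returns the instance.
def download_from_hf(model_class, model_name: str):
    return model_class.from_pretrained(
        model_name,
        cache_dir=global_cache_dir('hub'),   # matches the cache used in txt2mask.py
    )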

View File

@@ -71,7 +71,14 @@ def global_cache_dir(subdir:Union[str,Path]='')->Path:
     is provided, it will be appended to the end of the path, allowing
     for huggingface-style conventions:
          global_cache_dir('diffusers')
+         global_cache_dir('hub')
+    Current HuggingFace documentation (mid-Jan 2023) indicates that
+    transformers models will be cached into a "transformers" subdirectory,
+    but in practice they seem to go into "hub". But if needed:
          global_cache_dir('transformers')
+    One other caveat is that HuggingFace is moving some diffusers models
+    into the "hub" subdirectory as well, so this will need to be revisited
+    from time to time.
     '''
     home: str = os.getenv('HF_HOME')
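The "hub" convention noted in the docstring above is what the rest of this commit actually relies on, e.g. (model id and call taken from the txt2mask.py change below):

# Transformers weights are cached under global_cache_dir('hub').
model = CLIPSegForImageSegmentation.from_pretrained(
    'CIDAS/clipseg-rd64-refined',
    cache_dir=global_cache_dir('hub'),
)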

View File

@@ -29,16 +29,12 @@ work fine.
 import torch
 import numpy as np
-import os
-from clipseg.clipseg import CLIPDensePredT
-from einops import rearrange, repeat
+from transformers import AutoProcessor, CLIPSegForImageSegmentation
 from PIL import Image, ImageOps
 from torchvision import transforms
-from ldm.invoke.globals import Globals
+from ldm.invoke.globals import global_cache_dir

-CLIP_VERSION = 'ViT-B/16'
-CLIPSEG_WEIGHTS = 'models/clipseg/clipseg_weights/rd64-uni.pth'
-CLIPSEG_WEIGHTS_REFINED = 'models/clipseg/clipseg_weights/rd64-uni-refined.pth'
+CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'
 CLIPSEG_SIZE = 352

 class SegmentedGrayscale(object):
@@ -77,16 +73,15 @@ class Txt2Mask(object):
     '''
     def __init__(self,device='cpu',refined=False):
         print('>> Initializing clipseg model for text to mask inference')
+
+        # BUG: we are not doing anything with the device option at this time
         self.device = device
-        self.model = CLIPDensePredT(version=CLIP_VERSION, reduce_dim=64, complex_trans_conv=refined)
-        self.model.eval()
-        # initially we keep everything in cpu to conserve space
-        self.model.to('cpu')
-        self.model.load_state_dict(torch.load(os.path.join(Globals.root,CLIPSEG_WEIGHTS_REFINED)
-                                              if refined
-                                              else os.path.join(Globals.root,CLIPSEG_WEIGHTS),
-                                              map_location=torch.device('cpu')), strict=False
-                                   )
+        self.processor = AutoProcessor.from_pretrained(CLIPSEG_MODEL,
+                                                       cache_dir=global_cache_dir('hub')
+                                                       )
+        self.model = CLIPSegForImageSegmentation.from_pretrained(CLIPSEG_MODEL,
+                                                                 cache_dir=global_cache_dir('hub')
+                                                                 )

     @torch.no_grad()
     def segment(self, image, prompt:str) -> SegmentedGrayscale:
@@ -95,9 +90,6 @@ class Txt2Mask(object):
         provided image and returns a SegmentedGrayscale object in which the brighter
         pixels indicate where the object is inferred to be.
         '''
-        self._to_device(self.device)
-        prompts = [prompt] # right now we operate on just a single prompt at a time
-
         transform = transforms.Compose([
                 transforms.ToTensor(),
                 transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
@@ -111,14 +103,14 @@ class Txt2Mask(object):
         img = self._scale_and_crop(image)
         img = transform(img).unsqueeze(0)

-        preds = self.model(img.repeat(len(prompts),1,1,1), prompts)[0]
-        heatmap = torch.sigmoid(preds[0][0]).cpu()
-        self._to_device('cpu')
+        inputs = self.processor(text=[prompt],
+                                images=[image],
+                                padding=True,
+                                return_tensors='pt')
+        outputs = self.model(**inputs)
+        heatmap = torch.sigmoid(outputs.logits)
         return SegmentedGrayscale(image, heatmap)

-    def _to_device(self, device):
-        self.model.to(device)
-
     def _scale_and_crop(self, image:Image)->Image:
         scaled_image = Image.new('RGB',(CLIPSEG_SIZE,CLIPSEG_SIZE))
         if image.width > image.height: # width is constraint
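A usage sketch of the rewritten class (the module path, image path, and prompt are illustrative; to_grayscale() is assumed to be the pre-existing SegmentedGrayscale accessor, which this commit does not touch):

from PIL import Image
from ldm.invoke.txt2mask import Txt2Mask   # assumed module path

txt2mask = Txt2Mask()                      # clipseg now always stays on the CPU
segmented = txt2mask.segment(Image.open('photo.png').convert('RGB'), 'a red hat')
mask = segmented.to_grayscale()            # assumed accessor returning a PIL grayscale heatmap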

View File

@@ -36,8 +36,7 @@ classifiers = [
 dependencies = [
   "accelerate",
   "albumentations",
-  "clip_anytorch",  # replaceing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip",
-  "clipseg @ https://github.com/invoke-ai/clipseg/archive/relaxed-python-requirement.zip",  # is this still necesarry with diffusers?
+  "clip_anytorch",  # replacing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip",
   "datasets",
   "diffusers[torch]~=0.11",
   "dnspython==2.2.1",
@@ -53,7 +52,7 @@ dependencies = [
   "huggingface-hub>=0.11.1",
   "imageio",
   "imageio-ffmpeg",
-  "k-diffusion",  # replaceing "k-diffusion @ https://github.com/Birch-san/k-diffusion/archive/refs/heads/mps.zip",
+  "k-diffusion",  # replacing "k-diffusion @ https://github.com/Birch-san/k-diffusion/archive/refs/heads/mps.zip",
   "kornia",
   "npyscreen",
   "numpy~=1.23",