remove dependency on original clipseg library

- This replaces the original clipseg library with the transformers
  version from HuggingFace.
- This should make it possible to register InvokeAI on PyPI and do
  a fully automated pip-based install.
- Minor regression: it is no longer possible to specify which device
  the clipseg model will be loaded onto; it will reside on the CPU.
  However, performance is more than acceptable.
Lincoln Stein 2023-01-26 09:35:16 -05:00
parent eb2ca4970b
commit 288e31fc60
4 changed files with 26 additions and 54 deletions
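
For orientation, here is a minimal sketch of the transformers-based CLIPSeg flow this commit switches to. It uses the CIDAS/clipseg-rd64-refined model ID that appears in the diffs below; the image path and prompt are illustrative, and this is not the exact InvokeAI code (see the txt2mask.py diff for that):

import torch
from PIL import Image
from transformers import AutoProcessor, CLIPSegForImageSegmentation

CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'

# both objects are fetched from the HuggingFace Hub and cached locally
processor = AutoProcessor.from_pretrained(CLIPSEG_MODEL)
model = CLIPSegForImageSegmentation.from_pretrained(CLIPSEG_MODEL)   # stays on the CPU

image = Image.open('photo.png').convert('RGB')
inputs = processor(text=['a bagel'], images=[image], padding=True, return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)
heatmap = torch.sigmoid(outputs.logits)   # brighter values = where the object is inferred to be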


@@ -30,7 +30,7 @@ from huggingface_hub.utils._errors import RevisionNotFoundError
from omegaconf import OmegaConf
from omegaconf.dictconfig import DictConfig
from tqdm import tqdm
from transformers import CLIPTokenizer, CLIPTextModel
from transformers import CLIPTokenizer, CLIPTextModel, AutoProcessor, CLIPSegForImageSegmentation
from ldm.invoke.globals import Globals, global_cache_dir
from ldm.invoke.readline import generic_completer
@@ -601,31 +601,10 @@ def download_codeformer():
#---------------------------------------------
def download_clipseg():
    print('Installing clipseg model for text-based masking...',end='', file=sys.stderr)
    import zipfile
    CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'
    try:
        model_url = 'https://owncloud.gwdg.de/index.php/s/ioHbRzFx6th32hn/download'
        model_dest = os.path.join(Globals.root,'models/clipseg/clipseg_weights')
        weights_zip = 'models/clipseg/weights.zip'
        if not os.path.exists(model_dest):
            os.makedirs(os.path.dirname(model_dest), exist_ok=True)
        if not os.path.exists(f'{model_dest}/rd64-uni-refined.pth'):
            dest = os.path.join(Globals.root,weights_zip)
            request.urlretrieve(model_url,dest)
            with zipfile.ZipFile(dest,'r') as zip:
                zip.extractall(os.path.join(Globals.root,'models/clipseg'))
            os.remove(dest)
        from clipseg.clipseg import CLIPDensePredT
        model = CLIPDensePredT(version='ViT-B/16', reduce_dim=64, )
        model.eval()
        model.load_state_dict(
            torch.load(
                os.path.join(Globals.root,'models/clipseg/clipseg_weights/rd64-uni-refined.pth'),
                map_location=torch.device('cpu')
            ),
            strict=False,
        )
        download_from_hf(AutoProcessor,CLIPSEG_MODEL)
        download_from_hf(CLIPSegForImageSegmentation,CLIPSEG_MODEL)
    except Exception:
        print('Error installing clipseg model:')
        print(traceback.format_exc())
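
The two download_from_hf() calls above are now the whole install step: they simply pre-populate the shared HuggingFace cache. The helper itself is not shown in this diff; a plausible shape, assuming it just wraps from_pretrained() with the shared cache directory, would be:

from ldm.invoke.globals import global_cache_dir

def download_from_hf(model_class, model_name: str, **kwargs):
    # assumed behaviour: download (or reuse) the model files in the shared HF cache
    return model_class.from_pretrained(model_name,
                                       cache_dir=global_cache_dir('hub'),
                                       **kwargs)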


@@ -71,7 +71,8 @@ def global_cache_dir(subdir:Union[str,Path]='')->Path:
    is provided, it will be appended to the end of the path, allowing
    for huggingface-style conventions:
         global_cache_dir('diffusers')
         global_cache_dir('transformers')
         global_cache_dir('hub')
         global_cache_dir('transformers') # not used?
    '''
    home: str = os.getenv('HF_HOME')
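
The docstring change above records that the 'hub' subdirectory is now the one used for these downloads. As a hypothetical illustration of the resolution behaviour the docstring describes (the real function lives in ldm/invoke/globals.py and may differ in detail):

import os
from pathlib import Path

def resolve_cache_dir(root: str, subdir: str = '') -> Path:
    # hypothetical helper: honour HF_HOME when set, otherwise fall back to a
    # models directory under the given InvokeAI root
    home = os.getenv('HF_HOME')
    if home is not None:
        return Path(home, subdir)
    return Path(root, 'models', subdir)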


@@ -29,16 +29,12 @@ work fine.
import torch
import numpy as np
import os
from clipseg.clipseg import CLIPDensePredT
from einops import rearrange, repeat
from transformers import AutoProcessor, CLIPSegForImageSegmentation
from PIL import Image, ImageOps
from torchvision import transforms
from ldm.invoke.globals import Globals
from ldm.invoke.globals import global_cache_dir
CLIP_VERSION = 'ViT-B/16'
CLIPSEG_WEIGHTS = 'models/clipseg/clipseg_weights/rd64-uni.pth'
CLIPSEG_WEIGHTS_REFINED = 'models/clipseg/clipseg_weights/rd64-uni-refined.pth'
CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'
CLIPSEG_SIZE = 352
class SegmentedGrayscale(object):
@@ -77,16 +73,15 @@ class Txt2Mask(object):
    '''
    def __init__(self,device='cpu',refined=False):
        print('>> Initializing clipseg model for text to mask inference')
        # BUG: we are not doing anything with the device option at this time
        self.device = device
        self.model = CLIPDensePredT(version=CLIP_VERSION, reduce_dim=64, complex_trans_conv=refined)
        self.model.eval()
        # initially we keep everything in cpu to conserve space
        self.model.to('cpu')
        self.model.load_state_dict(torch.load(os.path.join(Globals.root,CLIPSEG_WEIGHTS_REFINED)
                                              if refined
                                              else os.path.join(Globals.root,CLIPSEG_WEIGHTS),
                                              map_location=torch.device('cpu')), strict=False
                                   )
        self.processor = AutoProcessor.from_pretrained(CLIPSEG_MODEL,
                                                       cache_dir=global_cache_dir('hub')
                                                       )
        self.model = CLIPSegForImageSegmentation.from_pretrained(CLIPSEG_MODEL,
                                                                 cache_dir=global_cache_dir('hub')
                                                                 )
    @torch.no_grad()
    def segment(self, image, prompt:str) -> SegmentedGrayscale:
@@ -95,9 +90,6 @@ class Txt2Mask(object):
        provided image and returns a SegmentedGrayscale object in which the brighter
        pixels indicate where the object is inferred to be.
        '''
        self._to_device(self.device)
        prompts = [prompt] # right now we operate on just a single prompt at a time
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
@@ -111,14 +103,15 @@
        img = self._scale_and_crop(image)
        img = transform(img).unsqueeze(0)
        preds = self.model(img.repeat(len(prompts),1,1,1), prompts)[0]
        heatmap = torch.sigmoid(preds[0][0]).cpu()
        self._to_device('cpu')
        inputs = self.processor(text=[prompt],
                                images=[image],
                                padding=True,
                                return_tensors='pt')
        outputs = self.model(**inputs)
        preds = outputs.logits
        heatmap = torch.sigmoid(preds)
        return SegmentedGrayscale(image, heatmap)
    def _to_device(self, device):
        self.model.to(device)
    def _scale_and_crop(self, image:Image)->Image:
        scaled_image = Image.new('RGB',(CLIPSEG_SIZE,CLIPSEG_SIZE))
        if image.width > image.height: # width is constraint
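
From the caller's point of view, usage of Txt2Mask is unchanged by the rewrite; a short example, assuming the module path ldm.invoke.txt2mask and an illustrative image path and prompt (the SegmentedGrayscale helpers already in the module, not shown in this diff, turn the heatmap into a usable mask):

from PIL import Image
from ldm.invoke.txt2mask import Txt2Mask

txt2mask = Txt2Mask(device='cpu')   # device is currently ignored; the model stays on the CPU
image = Image.open('outputs/000001.png').convert('RGB')
segmented = txt2mask.segment(image, 'a bagel')   # SegmentedGrayscale wrapping the sigmoid heatmap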


@@ -36,8 +36,7 @@ classifiers = [
dependencies = [
"accelerate",
"albumentations",
"clip_anytorch", # replaceing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip",
"clipseg @ https://github.com/invoke-ai/clipseg/archive/relaxed-python-requirement.zip", # is this still necesarry with diffusers?
"clip_anytorch", # replacing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip",
"datasets",
"diffusers[torch]~=0.11",
"dnspython==2.2.1",
@@ -53,7 +52,7 @@ dependencies = [
"huggingface-hub>=0.11.1",
"imageio",
"imageio-ffmpeg",
"k-diffusion", # replaceing "k-diffusion @ https://github.com/Birch-san/k-diffusion/archive/refs/heads/mps.zip",
"k-diffusion", # replacing "k-diffusion @ https://github.com/Birch-san/k-diffusion/archive/refs/heads/mps.zip",
"kornia",
"npyscreen",
"numpy~=1.23",