diff --git a/ldm/invoke/configure_invokeai.py b/ldm/invoke/configure_invokeai.py index 70f80f7846..c2212f3178 100755 --- a/ldm/invoke/configure_invokeai.py +++ b/ldm/invoke/configure_invokeai.py @@ -30,7 +30,7 @@ from huggingface_hub.utils._errors import RevisionNotFoundError from omegaconf import OmegaConf from omegaconf.dictconfig import DictConfig from tqdm import tqdm -from transformers import CLIPTokenizer, CLIPTextModel +from transformers import CLIPTokenizer, CLIPTextModel, AutoProcessor, CLIPSegForImageSegmentation from ldm.invoke.globals import Globals, global_cache_dir from ldm.invoke.readline import generic_completer @@ -601,31 +601,10 @@ def download_codeformer(): #--------------------------------------------- def download_clipseg(): print('Installing clipseg model for text-based masking...',end='', file=sys.stderr) - import zipfile + CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined' try: - model_url = 'https://owncloud.gwdg.de/index.php/s/ioHbRzFx6th32hn/download' - model_dest = os.path.join(Globals.root,'models/clipseg/clipseg_weights') - weights_zip = 'models/clipseg/weights.zip' - - if not os.path.exists(model_dest): - os.makedirs(os.path.dirname(model_dest), exist_ok=True) - if not os.path.exists(f'{model_dest}/rd64-uni-refined.pth'): - dest = os.path.join(Globals.root,weights_zip) - request.urlretrieve(model_url,dest) - with zipfile.ZipFile(dest,'r') as zip: - zip.extractall(os.path.join(Globals.root,'models/clipseg')) - os.remove(dest) - - from clipseg.clipseg import CLIPDensePredT - model = CLIPDensePredT(version='ViT-B/16', reduce_dim=64, ) - model.eval() - model.load_state_dict( - torch.load( - os.path.join(Globals.root,'models/clipseg/clipseg_weights/rd64-uni-refined.pth'), - map_location=torch.device('cpu') - ), - strict=False, - ) + download_from_hf(AutoProcessor,CLIPSEG_MODEL) + download_from_hf(CLIPSegForImageSegmentation,CLIPSEG_MODEL) except Exception: print('Error installing clipseg model:') print(traceback.format_exc()) diff --git a/ldm/invoke/globals.py b/ldm/invoke/globals.py index 538ff17703..9db54fe62d 100644 --- a/ldm/invoke/globals.py +++ b/ldm/invoke/globals.py @@ -71,7 +71,14 @@ def global_cache_dir(subdir:Union[str,Path]='')->Path: is provided, it will be appended to the end of the path, allowing for huggingface-style conventions: global_cache_dir('diffusers') + global_cache_dir('hub') + Current HuggingFace documentation (mid-Jan 2023) indicates that + transformers models will be cached into a "transformers" subdirectory, + but in practice they seem to go into "hub". But if needed: global_cache_dir('transformers') + One other caveat is that HuggingFace is moving some diffusers models + into the "hub" subdirectory as well, so this will need to be revisited + from time to time. ''' home: str = os.getenv('HF_HOME') diff --git a/ldm/invoke/txt2mask.py b/ldm/invoke/txt2mask.py index aed299ab9a..ccb7b1c604 100644 --- a/ldm/invoke/txt2mask.py +++ b/ldm/invoke/txt2mask.py @@ -29,16 +29,12 @@ work fine. import torch import numpy as np -import os -from clipseg.clipseg import CLIPDensePredT -from einops import rearrange, repeat +from transformers import AutoProcessor, CLIPSegForImageSegmentation from PIL import Image, ImageOps from torchvision import transforms -from ldm.invoke.globals import Globals +from ldm.invoke.globals import global_cache_dir -CLIP_VERSION = 'ViT-B/16' -CLIPSEG_WEIGHTS = 'models/clipseg/clipseg_weights/rd64-uni.pth' -CLIPSEG_WEIGHTS_REFINED = 'models/clipseg/clipseg_weights/rd64-uni-refined.pth' +CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined' CLIPSEG_SIZE = 352 class SegmentedGrayscale(object): @@ -77,16 +73,15 @@ class Txt2Mask(object): ''' def __init__(self,device='cpu',refined=False): print('>> Initializing clipseg model for text to mask inference') + + # BUG: we are not doing anything with the device option at this time self.device = device - self.model = CLIPDensePredT(version=CLIP_VERSION, reduce_dim=64, complex_trans_conv=refined) - self.model.eval() - # initially we keep everything in cpu to conserve space - self.model.to('cpu') - self.model.load_state_dict(torch.load(os.path.join(Globals.root,CLIPSEG_WEIGHTS_REFINED) - if refined - else os.path.join(Globals.root,CLIPSEG_WEIGHTS), - map_location=torch.device('cpu')), strict=False - ) + self.processor = AutoProcessor.from_pretrained(CLIPSEG_MODEL, + cache_dir=global_cache_dir('hub') + ) + self.model = CLIPSegForImageSegmentation.from_pretrained(CLIPSEG_MODEL, + cache_dir=global_cache_dir('hub') + ) @torch.no_grad() def segment(self, image, prompt:str) -> SegmentedGrayscale: @@ -95,9 +90,6 @@ class Txt2Mask(object): provided image and returns a SegmentedGrayscale object in which the brighter pixels indicate where the object is inferred to be. ''' - self._to_device(self.device) - prompts = [prompt] # right now we operate on just a single prompt at a time - transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), @@ -111,14 +103,14 @@ class Txt2Mask(object): img = self._scale_and_crop(image) img = transform(img).unsqueeze(0) - preds = self.model(img.repeat(len(prompts),1,1,1), prompts)[0] - heatmap = torch.sigmoid(preds[0][0]).cpu() - self._to_device('cpu') + inputs = self.processor(text=[prompt], + images=[image], + padding=True, + return_tensors='pt') + outputs = self.model(**inputs) + heatmap = torch.sigmoid(outputs.logits) return SegmentedGrayscale(image, heatmap) - def _to_device(self, device): - self.model.to(device) - def _scale_and_crop(self, image:Image)->Image: scaled_image = Image.new('RGB',(CLIPSEG_SIZE,CLIPSEG_SIZE)) if image.width > image.height: # width is constraint diff --git a/pyproject.toml b/pyproject.toml index 8359f14e6d..19603e15f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,8 +36,7 @@ classifiers = [ dependencies = [ "accelerate", "albumentations", - "clip_anytorch", # replaceing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip", - "clipseg @ https://github.com/invoke-ai/clipseg/archive/relaxed-python-requirement.zip", # is this still necesarry with diffusers? + "clip_anytorch", # replacing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip", "datasets", "diffusers[torch]~=0.11", "dnspython==2.2.1", @@ -53,7 +52,7 @@ dependencies = [ "huggingface-hub>=0.11.1", "imageio", "imageio-ffmpeg", - "k-diffusion", # replaceing "k-diffusion @ https://github.com/Birch-san/k-diffusion/archive/refs/heads/mps.zip", + "k-diffusion", # replacing "k-diffusion @ https://github.com/Birch-san/k-diffusion/archive/refs/heads/mps.zip", "kornia", "npyscreen", "numpy~=1.23",