Remove dependency on original clipseg library for text masking (#2425)

- This replaces the original clipseg library with the transformers
version from HuggingFace.
- This should make it possible to register InvokeAI on PyPI and do a
fully automated pip-based install.
- Minor regression: it is no longer possible to specify the device on
which the clipseg model is loaded; it now always resides on the CPU.
However, performance is more than acceptable.
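For reference, a minimal sketch of the transformers-based pattern this commit switches to (the model id comes from the diff below; the image path and prompt are only illustrative):

import torch
from PIL import Image
from transformers import AutoProcessor, CLIPSegForImageSegmentation

CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'

# Processor and weights are pulled from the HuggingFace Hub; no manual
# download of clipseg checkpoints is needed.
processor = AutoProcessor.from_pretrained(CLIPSEG_MODEL)
model = CLIPSegForImageSegmentation.from_pretrained(CLIPSEG_MODEL)

image = Image.open('photo.png').convert('RGB')   # illustrative path
inputs = processor(text=['a red hat'], images=[image],
                   padding=True, return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)                    # runs on the CPU by default
heatmap = torch.sigmoid(outputs.logits)          # brighter = stronger match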
commit 046abb634e
Lincoln Stein, 2023-01-26 12:14:13 -05:00 (committed by GitHub)
4 changed files with 30 additions and 53 deletions

View File

@@ -30,7 +30,7 @@ from huggingface_hub.utils._errors import RevisionNotFoundError
 from omegaconf import OmegaConf
 from omegaconf.dictconfig import DictConfig
 from tqdm import tqdm
-from transformers import CLIPTokenizer, CLIPTextModel
+from transformers import CLIPTokenizer, CLIPTextModel, AutoProcessor, CLIPSegForImageSegmentation

 from ldm.invoke.globals import Globals, global_cache_dir
 from ldm.invoke.readline import generic_completer
@@ -601,31 +601,10 @@ def download_codeformer():
 #---------------------------------------------
 def download_clipseg():
     print('Installing clipseg model for text-based masking...',end='', file=sys.stderr)
-    import zipfile
+    CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'
     try:
-        model_url = 'https://owncloud.gwdg.de/index.php/s/ioHbRzFx6th32hn/download'
-        model_dest = os.path.join(Globals.root,'models/clipseg/clipseg_weights')
-        weights_zip = 'models/clipseg/weights.zip'
-        if not os.path.exists(model_dest):
-            os.makedirs(os.path.dirname(model_dest), exist_ok=True)
-        if not os.path.exists(f'{model_dest}/rd64-uni-refined.pth'):
-            dest = os.path.join(Globals.root,weights_zip)
-            request.urlretrieve(model_url,dest)
-            with zipfile.ZipFile(dest,'r') as zip:
-                zip.extractall(os.path.join(Globals.root,'models/clipseg'))
-            os.remove(dest)
-            from clipseg.clipseg import CLIPDensePredT
-            model = CLIPDensePredT(version='ViT-B/16', reduce_dim=64, )
-            model.eval()
-            model.load_state_dict(
-                torch.load(
-                    os.path.join(Globals.root,'models/clipseg/clipseg_weights/rd64-uni-refined.pth'),
-                    map_location=torch.device('cpu')
-                ),
-                strict=False,
-            )
+        download_from_hf(AutoProcessor,CLIPSEG_MODEL)
+        download_from_hf(CLIPSegForImageSegmentation,CLIPSEG_MODEL)
     except Exception:
         print('Error installing clipseg model:')
         print(traceback.format_exc())
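The download_from_hf() helper itself is not shown in this diff. Assuming it merely pre-populates the HuggingFace cache via from_pretrained(), a plausible sketch (not the actual implementation) would be:

# Hypothetical sketch of download_from_hf(); assumes it only pre-caches a
# HuggingFace class into InvokeAI's cache directory and returns the instance.
def download_from_hf(model_class, model_name: str):
    return model_class.from_pretrained(
        model_name,
        cache_dir=global_cache_dir('hub'),   # matches the cache used in txt2mask.py
    )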

View File

@@ -71,7 +71,14 @@ def global_cache_dir(subdir:Union[str,Path]='')->Path:
     is provided, it will be appended to the end of the path, allowing
     for huggingface-style conventions:
          global_cache_dir('diffusers')
+         global_cache_dir('hub')
+    Current HuggingFace documentation (mid-Jan 2023) indicates that
+    transformers models will be cached into a "transformers" subdirectory,
+    but in practice they seem to go into "hub". But if needed:
          global_cache_dir('transformers')
+    One other caveat is that HuggingFace is moving some diffusers models
+    into the "hub" subdirectory as well, so this will need to be revisited
+    from time to time.
     '''
     home: str = os.getenv('HF_HOME')
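The "hub" convention noted in the docstring above is what the rest of this commit actually relies on, e.g. (model id and call taken from the txt2mask.py change below):

# Transformers weights are cached under global_cache_dir('hub').
model = CLIPSegForImageSegmentation.from_pretrained(
    'CIDAS/clipseg-rd64-refined',
    cache_dir=global_cache_dir('hub'),
)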

View File

@@ -29,16 +29,12 @@ work fine.
 import torch
 import numpy as np
-import os
-from clipseg.clipseg import CLIPDensePredT
-from einops import rearrange, repeat
+from transformers import AutoProcessor, CLIPSegForImageSegmentation
 from PIL import Image, ImageOps
 from torchvision import transforms
-from ldm.invoke.globals import Globals
+from ldm.invoke.globals import global_cache_dir

-CLIP_VERSION = 'ViT-B/16'
-CLIPSEG_WEIGHTS = 'models/clipseg/clipseg_weights/rd64-uni.pth'
-CLIPSEG_WEIGHTS_REFINED = 'models/clipseg/clipseg_weights/rd64-uni-refined.pth'
+CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'
 CLIPSEG_SIZE = 352

 class SegmentedGrayscale(object):
@@ -77,16 +73,15 @@ class Txt2Mask(object):
     '''
     def __init__(self,device='cpu',refined=False):
         print('>> Initializing clipseg model for text to mask inference')
+
+        # BUG: we are not doing anything with the device option at this time
         self.device = device
-        self.model = CLIPDensePredT(version=CLIP_VERSION, reduce_dim=64, complex_trans_conv=refined)
-        self.model.eval()
-        # initially we keep everything in cpu to conserve space
-        self.model.to('cpu')
-        self.model.load_state_dict(torch.load(os.path.join(Globals.root,CLIPSEG_WEIGHTS_REFINED)
-                                              if refined
-                                              else os.path.join(Globals.root,CLIPSEG_WEIGHTS),
-                                              map_location=torch.device('cpu')), strict=False
-                                   )
+        self.processor = AutoProcessor.from_pretrained(CLIPSEG_MODEL,
+                                                       cache_dir=global_cache_dir('hub')
+                                                       )
+        self.model = CLIPSegForImageSegmentation.from_pretrained(CLIPSEG_MODEL,
+                                                                 cache_dir=global_cache_dir('hub')
+                                                                 )

     @torch.no_grad()
     def segment(self, image, prompt:str) -> SegmentedGrayscale:
@@ -95,9 +90,6 @@ class Txt2Mask(object):
         provided image and returns a SegmentedGrayscale object in which the brighter
         pixels indicate where the object is inferred to be.
         '''
-        self._to_device(self.device)
-        prompts = [prompt] # right now we operate on just a single prompt at a time
-
         transform = transforms.Compose([
                 transforms.ToTensor(),
                 transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
@@ -111,14 +103,14 @@ class Txt2Mask(object):
         img = self._scale_and_crop(image)
         img = transform(img).unsqueeze(0)

-        preds = self.model(img.repeat(len(prompts),1,1,1), prompts)[0]
-        heatmap = torch.sigmoid(preds[0][0]).cpu()
-        self._to_device('cpu')
+        inputs = self.processor(text=[prompt],
+                                images=[image],
+                                padding=True,
+                                return_tensors='pt')
+        outputs = self.model(**inputs)
+        heatmap = torch.sigmoid(outputs.logits)
         return SegmentedGrayscale(image, heatmap)

-    def _to_device(self, device):
-        self.model.to(device)
-
     def _scale_and_crop(self, image:Image)->Image:
         scaled_image = Image.new('RGB',(CLIPSEG_SIZE,CLIPSEG_SIZE))
         if image.width > image.height: # width is constraint
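A usage sketch of the rewritten class (the module path, image path, and prompt are illustrative; to_grayscale() is assumed to be the pre-existing SegmentedGrayscale accessor, which this commit does not touch):

from PIL import Image
from ldm.invoke.txt2mask import Txt2Mask   # assumed module path

txt2mask = Txt2Mask()                      # clipseg now always stays on the CPU
segmented = txt2mask.segment(Image.open('photo.png').convert('RGB'), 'a red hat')
mask = segmented.to_grayscale()            # assumed accessor returning a PIL grayscale heatmap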

View File

@@ -36,8 +36,7 @@ classifiers = [
 dependencies = [
   "accelerate",
   "albumentations",
-  "clip_anytorch",  # replaceing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip",
-  "clipseg @ https://github.com/invoke-ai/clipseg/archive/relaxed-python-requirement.zip",  # is this still necesarry with diffusers?
+  "clip_anytorch",  # replacing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip",
   "datasets",
   "diffusers[torch]~=0.11",
   "dnspython==2.2.1",
@@ -53,7 +52,7 @@ dependencies = [
   "huggingface-hub>=0.11.1",
   "imageio",
   "imageio-ffmpeg",
-  "k-diffusion",  # replaceing "k-diffusion @ https://github.com/Birch-san/k-diffusion/archive/refs/heads/mps.zip",
+  "k-diffusion",  # replacing "k-diffusion @ https://github.com/Birch-san/k-diffusion/archive/refs/heads/mps.zip",
   "kornia",
   "npyscreen",
   "numpy~=1.23",