From 288e31fc6096fb55dc245a2103d7ef528b436635 Mon Sep 17 00:00:00 2001
From: Lincoln Stein
Date: Thu, 26 Jan 2023 09:35:16 -0500
Subject: [PATCH 1/3] remove dependency on original clipseg library

- This replaces the original clipseg library with the transformers
  version from HuggingFace.

- This should make it possible to register InvokeAI on PyPI and do a
  fully automated pip-based install.

- Minor regression: it is no longer possible to specify which device
  the clipseg model will be loaded onto, so it will reside on the CPU.
  However, performance is more than acceptable.
---
 ldm/invoke/configure_invokeai.py | 29 +++------------------
 ldm/invoke/globals.py            |  3 ++-
 ldm/invoke/txt2mask.py           | 43 +++++++++++++-------------------
 pyproject.toml                   |  5 ++--
 4 files changed, 26 insertions(+), 54 deletions(-)

diff --git a/ldm/invoke/configure_invokeai.py b/ldm/invoke/configure_invokeai.py
index 70f80f7846..c2212f3178 100755
--- a/ldm/invoke/configure_invokeai.py
+++ b/ldm/invoke/configure_invokeai.py
@@ -30,7 +30,7 @@ from huggingface_hub.utils._errors import RevisionNotFoundError
 from omegaconf import OmegaConf
 from omegaconf.dictconfig import DictConfig
 from tqdm import tqdm
-from transformers import CLIPTokenizer, CLIPTextModel
+from transformers import CLIPTokenizer, CLIPTextModel, AutoProcessor, CLIPSegForImageSegmentation

 from ldm.invoke.globals import Globals, global_cache_dir
 from ldm.invoke.readline import generic_completer
@@ -601,31 +601,10 @@ def download_codeformer():
 #---------------------------------------------
 def download_clipseg():
     print('Installing clipseg model for text-based masking...',end='', file=sys.stderr)
-    import zipfile
+    CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'
     try:
-        model_url = 'https://owncloud.gwdg.de/index.php/s/ioHbRzFx6th32hn/download'
-        model_dest = os.path.join(Globals.root,'models/clipseg/clipseg_weights')
-        weights_zip = 'models/clipseg/weights.zip'
-
-        if not os.path.exists(model_dest):
-            os.makedirs(os.path.dirname(model_dest), exist_ok=True)
-        if not os.path.exists(f'{model_dest}/rd64-uni-refined.pth'):
-            dest = os.path.join(Globals.root,weights_zip)
-            request.urlretrieve(model_url,dest)
-            with zipfile.ZipFile(dest,'r') as zip:
-                zip.extractall(os.path.join(Globals.root,'models/clipseg'))
-            os.remove(dest)
-
-        from clipseg.clipseg import CLIPDensePredT
-        model = CLIPDensePredT(version='ViT-B/16', reduce_dim=64, )
-        model.eval()
-        model.load_state_dict(
-            torch.load(
-                os.path.join(Globals.root,'models/clipseg/clipseg_weights/rd64-uni-refined.pth'),
-                map_location=torch.device('cpu')
-            ),
-            strict=False,
-        )
+        download_from_hf(AutoProcessor,CLIPSEG_MODEL)
+        download_from_hf(CLIPSegForImageSegmentation,CLIPSEG_MODEL)
     except Exception:
         print('Error installing clipseg model:')
         print(traceback.format_exc())
diff --git a/ldm/invoke/globals.py b/ldm/invoke/globals.py
index 538ff17703..c995c42028 100644
--- a/ldm/invoke/globals.py
+++ b/ldm/invoke/globals.py
@@ -71,7 +71,8 @@ def global_cache_dir(subdir:Union[str,Path]='')->Path:
     is provided, it will be appended to the end of the path, allowing
     for huggingface-style conventions:
          global_cache_dir('diffusers')
-         global_cache_dir('transformers')
+         global_cache_dir('hub')
+         global_cache_dir('transformers') # not used?
     '''

     home: str = os.getenv('HF_HOME')
diff --git a/ldm/invoke/txt2mask.py b/ldm/invoke/txt2mask.py
index aed299ab9a..ef9313ad2b 100644
--- a/ldm/invoke/txt2mask.py
+++ b/ldm/invoke/txt2mask.py
@@ -29,16 +29,12 @@ work fine.
 import torch
 import numpy as np
-import os
-from clipseg.clipseg import CLIPDensePredT
-from einops import rearrange, repeat
+from transformers import AutoProcessor, CLIPSegForImageSegmentation
 from PIL import Image, ImageOps
 from torchvision import transforms
-from ldm.invoke.globals import Globals
+from ldm.invoke.globals import global_cache_dir

-CLIP_VERSION = 'ViT-B/16'
-CLIPSEG_WEIGHTS = 'models/clipseg/clipseg_weights/rd64-uni.pth'
-CLIPSEG_WEIGHTS_REFINED = 'models/clipseg/clipseg_weights/rd64-uni-refined.pth'
+CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'
 CLIPSEG_SIZE = 352

 class SegmentedGrayscale(object):
@@ -77,16 +73,15 @@ class Txt2Mask(object):
     '''
     def __init__(self,device='cpu',refined=False):
         print('>> Initializing clipseg model for text to mask inference')
+
+        # BUG: we are not doing anything with the device option at this time
         self.device = device
-        self.model = CLIPDensePredT(version=CLIP_VERSION, reduce_dim=64, complex_trans_conv=refined)
-        self.model.eval()
-        # initially we keep everything in cpu to conserve space
-        self.model.to('cpu')
-        self.model.load_state_dict(torch.load(os.path.join(Globals.root,CLIPSEG_WEIGHTS_REFINED)
-                                              if refined
-                                              else os.path.join(Globals.root,CLIPSEG_WEIGHTS),
-                                              map_location=torch.device('cpu')), strict=False
-                                   )
+        self.processor = AutoProcessor.from_pretrained(CLIPSEG_MODEL,
+                                                       cache_dir=global_cache_dir('hub')
+                                                       )
+        self.model = CLIPSegForImageSegmentation.from_pretrained(CLIPSEG_MODEL,
+                                                                 cache_dir=global_cache_dir('hub')
+                                                                 )

     @torch.no_grad()
     def segment(self, image, prompt:str) -> SegmentedGrayscale:
@@ -95,9 +90,6 @@ class Txt2Mask(object):
         provided image and returns a SegmentedGrayscale object in which the
         brighter pixels indicate where the object is inferred to be.
         '''
-        self._to_device(self.device)
-        prompts = [prompt] # right now we operate on just a single prompt at a time
-
         transform = transforms.Compose([
             transforms.ToTensor(),
             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
@@ -111,14 +103,15 @@ class Txt2Mask(object):

         img = self._scale_and_crop(image)
         img = transform(img).unsqueeze(0)
-        preds = self.model(img.repeat(len(prompts),1,1,1), prompts)[0]
-        heatmap = torch.sigmoid(preds[0][0]).cpu()
-        self._to_device('cpu')
+        inputs = self.processor(text=[prompt],
+                                images=[image],
+                                padding=True,
+                                return_tensors='pt')
+        outputs = self.model(**inputs)
+        preds = outputs.logits
+        heatmap = torch.sigmoid(preds)
         return SegmentedGrayscale(image, heatmap)

-    def _to_device(self, device):
-        self.model.to(device)
-
     def _scale_and_crop(self, image:Image)->Image:
         scaled_image = Image.new('RGB',(CLIPSEG_SIZE,CLIPSEG_SIZE))
         if image.width > image.height: # width is constraint
diff --git a/pyproject.toml b/pyproject.toml
index 8359f14e6d..19603e15f1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,8 +36,7 @@ classifiers = [
 dependencies = [
   "accelerate",
   "albumentations",
-  "clip_anytorch",  # replaceing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip",
-  "clipseg @ https://github.com/invoke-ai/clipseg/archive/relaxed-python-requirement.zip",  # is this still necesarry with diffusers?
+  "clip_anytorch",  # replacing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip",
   "datasets",
   "diffusers[torch]~=0.11",
   "dnspython==2.2.1",
@@ -53,7 +52,7 @@ dependencies = [
   "huggingface-hub>=0.11.1",
   "imageio",
   "imageio-ffmpeg",
-  "k-diffusion",  # replaceing "k-diffusion @ https://github.com/Birch-san/k-diffusion/archive/refs/heads/mps.zip",
+  "k-diffusion",  # replacing "k-diffusion @ https://github.com/Birch-san/k-diffusion/archive/refs/heads/mps.zip",
   "kornia",
   "npyscreen",
   "numpy~=1.23",
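
Note: the code path introduced in PATCH 1/3 is the standard transformers workflow for CLIPSeg, so it can be sanity-checked outside of InvokeAI. Below is a minimal sketch of that workflow; the image path and prompt are illustrative only, and the sigmoid-over-logits step mirrors what the patched segment() method does:

    import torch
    from PIL import Image
    from transformers import AutoProcessor, CLIPSegForImageSegmentation

    CLIPSEG_MODEL = 'CIDAS/clipseg-rd64-refined'

    # from_pretrained() pulls the files into the HuggingFace cache on
    # first use, which is what download_clipseg() pre-populates at
    # install time
    processor = AutoProcessor.from_pretrained(CLIPSEG_MODEL)
    model = CLIPSegForImageSegmentation.from_pretrained(CLIPSEG_MODEL)

    image = Image.open('photo.png').convert('RGB')   # illustrative input
    inputs = processor(text=['a red hat'], images=[image],
                       padding=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    heatmap = torch.sigmoid(outputs.logits)          # 352x352 relevance map
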
+ "clip_anytorch", # replacing "clip @ https://github.com/openai/CLIP/archive/eaa22acb90a5876642d0507623e859909230a52d.zip", "datasets", "diffusers[torch]~=0.11", "dnspython==2.2.1", @@ -53,7 +52,7 @@ dependencies = [ "huggingface-hub>=0.11.1", "imageio", "imageio-ffmpeg", - "k-diffusion", # replaceing "k-diffusion @ https://github.com/Birch-san/k-diffusion/archive/refs/heads/mps.zip", + "k-diffusion", # replacing "k-diffusion @ https://github.com/Birch-san/k-diffusion/archive/refs/heads/mps.zip", "kornia", "npyscreen", "numpy~=1.23", From a49d546125f29056062824a9de45eafd8defb1b7 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Thu, 26 Jan 2023 09:46:34 -0500 Subject: [PATCH 2/3] simplified code a bit --- ldm/invoke/txt2mask.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ldm/invoke/txt2mask.py b/ldm/invoke/txt2mask.py index ef9313ad2b..ccb7b1c604 100644 --- a/ldm/invoke/txt2mask.py +++ b/ldm/invoke/txt2mask.py @@ -108,8 +108,7 @@ class Txt2Mask(object): padding=True, return_tensors='pt') outputs = self.model(**inputs) - preds = outputs.logits - heatmap = torch.sigmoid(preds) + heatmap = torch.sigmoid(outputs.logits) return SegmentedGrayscale(image, heatmap) def _scale_and_crop(self, image:Image)->Image: From cbd967cbc416872b417fe7999902fb7e1979fcad Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Thu, 26 Jan 2023 11:48:03 -0500 Subject: [PATCH 3/3] add documentation caveat about location of HF cached models --- ldm/invoke/globals.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ldm/invoke/globals.py b/ldm/invoke/globals.py index c995c42028..9db54fe62d 100644 --- a/ldm/invoke/globals.py +++ b/ldm/invoke/globals.py @@ -72,7 +72,13 @@ def global_cache_dir(subdir:Union[str,Path]='')->Path: for huggingface-style conventions: global_cache_dir('diffusers') global_cache_dir('hub') - global_cache_dir('transformers') # not used? + Current HuggingFace documentation (mid-Jan 2023) indicates that + transformers models will be cached into a "transformers" subdirectory, + but in practice they seem to go into "hub". But if needed: + global_cache_dir('transformers') + One other caveat is that HuggingFace is moving some diffusers models + into the "hub" subdirectory as well, so this will need to be revisited + from time to time. ''' home: str = os.getenv('HF_HOME')