mirror of https://github.com/invoke-ai/InvokeAI
synced 2024-08-30 20:32:17 +00:00

added customized patches and updated the README

This commit is contained in:
parent d39f5b51a8
commit d6124c44a3

README.md (+91)
@@ -1,4 +1,95 @@

# Stable Diffusion
This is a fork of CompVis/stable-diffusion, the wonderful open source
text-to-image generator.

The original has been modified in several minor ways:

## Simplified API for text to image generation

There is now a simplified API for text to image generation, which
lets you create images from a prompt in just three lines of code:

~~~~
from ldm.simplet2i import T2I
model = T2I()
model.text2image("a unicorn in manhattan")
~~~~

Please see ldm/simplet2i.py for more information.
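
As a slightly fuller sketch (method and argument names are taken from
ldm/simplet2i.py as added below; the particular values are illustrative,
not canonical defaults):

~~~~
from ldm.simplet2i import T2I

# defaults are set once at construction time; the slow model load is deferred
model = T2I(width=512, height=512, steps=50, sampler='plms', cfg_scale=7.5)
model.load_model()                        # optional: txt2img() loads lazily if needed
model.txt2img("a unicorn in manhattan")   # per-call keyword overrides also accepted
~~~~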

## Interactive command-line interface similar to the Discord bot

There is now a command-line script, located in scripts/dream.py, which
provides an interactive interface to image generation similar to
the "dream mothership" bot that Stability AI provided on its Discord
server. The advantage of this is that the lengthy model
initialization only happens once. After that, image generation is
fast.

Note that this has only been tested in the Linux environment!

~~~~
(ldm) ~/stable-diffusion$ ./scripts/dream.py
* Initializing, be patient...

Loading model from models/ldm/text2img-large/model.ckpt
LatentDiffusion: Running in eps-prediction mode
DiffusionWrapper has 872.30 M params.
making attention of type 'vanilla' with 512 in_channels
Working with z of shape (1, 4, 32, 32) = 4096 dimensions.
making attention of type 'vanilla' with 512 in_channels
Loading Bert tokenizer from "models/bert"
setting sampler to plms

* Initialization done! Awaiting your command...
dream> ashley judd riding a camel -n2
Outputs:
outputs/txt2img-samples/00009.png: "ashley judd riding a camel" -n2 -S 416354203
outputs/txt2img-samples/00010.png: "ashley judd riding a camel" -n2 -S 1362479620
~~~~

Command-line arguments ("./scripts/dream.py -h") allow you to change
various defaults, and select between the mature stable-diffusion
weights (512x512) and the older (256x256) latent diffusion weights
(laion400m).
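
As an illustration, here is one possible invocation using the flags defined
in scripts/dream.py below (the flag names are real; the output directory is
just an example):

~~~~
(ldm) ~/stable-diffusion$ ./scripts/dream.py --laion400m --sampler ddim -o outputs/ld-samples
~~~~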

## No need for internet connectivity when loading the model

My development machine is a GPU node in a high-performance compute
cluster which has no connection to the internet. During model
initialization, stable-diffusion tries to download the Bert tokenizer
model from huggingface.co. This obviously didn't work for me.

Rather than set up a local Hugging Face hub, I found the most
expedient thing to do was to download the Bert tokenizer in advance
and patch stable-diffusion to read it from the local disk. The steps
to do this are:

~~~~
(ldm) ~/stable-diffusion$ mkdir ./models/bert
(ldm) ~/stable-diffusion$ python3
>>> from transformers import BertTokenizerFast
>>> model = BertTokenizerFast.from_pretrained("bert-base-uncased")
>>> model.save_pretrained("./models/bert")
~~~~

(Make sure you are in the stable-diffusion directory when you do
this!)

If you don't like this change, just copy over the file
ldm/modules/encoders/modules.py from the CompVis/stable-diffusion
repository.
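
For reference, the patched lines in BERTTokenizer.__init__ (the full diff to
ldm/modules/encoders/modules.py appears further down) are:

~~~~
fn = 'models/bert'
print(f'Loading Bert tokenizer from "{fn}"')
# self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
self.tokenizer = BertTokenizerFast.from_pretrained(fn, local_files_only=True)
~~~~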

## Minor fixes

I added the requirement for torchmetrics to environment.yaml.
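
That amounts to one extra dependency line; the exact section and version pin
within environment.yaml may differ from this sketch:

~~~~
# in environment.yaml
dependencies:
  - pip:
    - torchmetrics
~~~~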

## Installation and support

Follow the directions from the original README, which starts below, to
configure the environment and install requirements. For support,
please use this repository's GitHub Issues tracking service.

Author: Lincoln D. Stein <lincoln.stein@gmail.com>

# Original README from CompVis/stable-diffusion

*Stable Diffusion was made possible thanks to a collaboration with [Stability AI](https://stability.ai/) and [Runway](https://runwayml.com/) and builds upon our previous work:*

[**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)<br/>

ldm/models/diffusion/ddim.py
@@ -135,7 +135,7 @@ class DDIMSampler(object):

         total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
         print(f"Running DDIM Sampling with {total_steps} timesteps")

-        iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)
+        iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps, dynamic_ncols=True)

         for i, step in enumerate(iterator):
             index = total_steps - i - 1
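
The only change here is the dynamic_ncols=True argument, which this commit
adds to several tqdm progress bars (the ddpm.py and plms.py hunks below
repeat it). A minimal standalone sketch of the effect, assuming nothing
beyond tqdm itself:

~~~~
from time import sleep
from tqdm import tqdm

# with dynamic_ncols=True, tqdm re-measures the terminal width on every
# refresh, so the bar stays intact if the window is resized mid-run
for _ in tqdm(range(100), desc='demo', dynamic_ncols=True):
    sleep(0.01)
~~~~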

@@ -238,4 +238,4 @@ class DDIMSampler(object):

         x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
                                       unconditional_guidance_scale=unconditional_guidance_scale,
                                       unconditional_conditioning=unconditional_conditioning)
         return x_dec

ldm/models/diffusion/ddpm.py
@@ -255,7 +255,7 @@ class DDPM(pl.LightningModule):

         b = shape[0]
         img = torch.randn(shape, device=device)
         intermediates = [img]
-        for i in tqdm(reversed(range(0, self.num_timesteps)), desc='Sampling t', total=self.num_timesteps):
+        for i in tqdm(reversed(range(0, self.num_timesteps)), desc='Sampling t', total=self.num_timesteps, dynamic_ncols=True):
             img = self.p_sample(img, torch.full((b,), i, device=device, dtype=torch.long),
                                 clip_denoised=self.clip_denoised)
             if i % self.log_every_t == 0 or i == self.num_timesteps - 1:

ldm/models/diffusion/plms.py
@@ -92,7 +92,7 @@ class PLMSSampler(object):

         # sampling
         C, H, W = shape
         size = (batch_size, C, H, W)
-        print(f'Data shape for PLMS sampling is {size}')
+        # print(f'Data shape for PLMS sampling is {size}')

         samples, intermediates = self.plms_sampling(conditioning, size,
                                                     callback=callback,

@@ -134,9 +134,9 @@ class PLMSSampler(object):

         intermediates = {'x_inter': [img], 'pred_x0': [img]}
         time_range = list(reversed(range(0, timesteps))) if ddim_use_original_steps else np.flip(timesteps)
         total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
-        print(f"Running PLMS Sampling with {total_steps} timesteps")
+        # print(f"Running PLMS Sampling with {total_steps} timesteps")

-        iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps)
+        iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps, dynamic_ncols=True)
         old_eps = []

         for i, step in enumerate(iterator):

ldm/modules/encoders/modules.py
@@ -55,7 +55,10 @@ class BERTTokenizer(AbstractEncoder):

     def __init__(self, device="cuda", vq_interface=True, max_length=77):
         super().__init__()
         from transformers import BertTokenizerFast  # TODO: add to requirements
-        self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
+        fn = 'models/bert'
+        print(f'Loading Bert tokenizer from "{fn}"')
+        # self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
+        self.tokenizer = BertTokenizerFast.from_pretrained(fn, local_files_only=True)
         self.device = device
         self.vq_interface = vq_interface
         self.max_length = max_length

@@ -231,4 +234,5 @@ class FrozenClipImageEmbedder(nn.Module):

 if __name__ == "__main__":
     from ldm.util import count_params
     model = FrozenCLIPEmbedder()
     count_params(model, verbose=True)

ldm/simplet2i.py (new file, +258)
@@ -0,0 +1,258 @@
"""Simplified text to image API for stable diffusion/latent diffusion
|
||||||
|
|
||||||
|
Example Usage:
|
||||||
|
|
||||||
|
from ldm.simplet2i import T2I
|
||||||
|
# Create an object with default values
|
||||||
|
t2i = T2I(outdir = <path> // outputs/txt2img-samples
|
||||||
|
model = <path> // models/ldm/stable-diffusion-v1/model.ckpt
|
||||||
|
config = <path> // default="configs/stable-diffusion/v1-inference.yaml
|
||||||
|
batch = <integer> // 1
|
||||||
|
steps = <integer> // 50
|
||||||
|
seed = <integer> // current system time
|
||||||
|
sampler = ['ddim','plms'] // ddim
|
||||||
|
grid = <boolean> // false
|
||||||
|
width = <integer> // image width, multiple of 64 (512)
|
||||||
|
height = <integer> // image height, multiple of 64 (512)
|
||||||
|
cfg_scale = <float> // unconditional guidance scale (7.5)
|
||||||
|
fixed_code = <boolean> // False
|
||||||
|
)
|
||||||
|
# do the slow model initialization
|
||||||
|
t2i.load_model()
|
||||||
|
|
||||||
|
# Do the fast inference & image generation. Any options passed here
|
||||||
|
# override the default values assigned during class initialization
|
||||||
|
# Will call load_model() if the model was not previously loaded.
|
||||||
|
t2i.txt2img(prompt = <string> // required
|
||||||
|
// the remaining option arguments override constructur value when present
|
||||||
|
outdir = <path>
|
||||||
|
iterations = <integer>
|
||||||
|
batch = <integer>
|
||||||
|
steps = <integer>
|
||||||
|
seed = <integer>
|
||||||
|
sampler = ['ddim','plms']
|
||||||
|
grid = <boolean>
|
||||||
|
width = <integer>
|
||||||
|
height = <integer>
|
||||||
|
cfg_scale = <float>
|
||||||
|
) -> boolean
|
||||||
|
|
||||||
|
"""

import torch
import numpy as np
import random
import sys
import os
from omegaconf import OmegaConf
from PIL import Image
from tqdm import tqdm, trange
from itertools import islice
from einops import rearrange
from torchvision.utils import make_grid
from pytorch_lightning import seed_everything
from torch import autocast
from contextlib import contextmanager, nullcontext
from time import time
from math import sqrt

from ldm.util import instantiate_from_config
from ldm.models.diffusion.ddim import DDIMSampler
from ldm.models.diffusion.plms import PLMSSampler

class T2I:
    """T2I class
    Attributes
    ----------
    outdir
    model
    config
    iterations
    batch
    steps
    seed
    sampler
    grid
    width
    height
    cfg_scale
    fixed_code
    latent_channels
    downsampling_factor
    precision
    """
    def __init__(self,
                 outdir="outputs/txt2img-samples",
                 batch=1,
                 iterations=1,
                 width=256,    # change to 512 for stable diffusion
                 height=256,   # change to 512 for stable diffusion
                 grid=False,
                 steps=50,
                 seed=None,
                 cfg_scale=7.5,
                 weights="models/ldm/stable-diffusion-v1/model.ckpt",
                 config="configs/latent-diffusion/txt2img-1p4B-eval.yaml",
                 sampler="plms",
                 latent_channels=4,
                 downsampling_factor=8,
                 ddim_eta=0.0,  # deterministic
                 fixed_code=False,
                 precision='autocast'):
        self.outdir = outdir
        self.batch = batch
        self.iterations = iterations
        self.width = width
        self.height = height
        self.grid = grid
        self.steps = steps
        self.cfg_scale = cfg_scale
        self.weights = weights
        self.config = config
        self.sampler_name = sampler
        self.fixed_code = fixed_code
        self.latent_channels = latent_channels
        self.downsampling_factor = downsampling_factor
        self.ddim_eta = ddim_eta
        self.precision = precision
        self.model = None   # empty for now
        self.sampler = None
        if seed is None:
            self.seed = self._new_seed()
        else:
            self.seed = seed
    def txt2img(self, prompt, outdir=None, batch=None, iterations=None,
                steps=None, seed=None, grid=None, width=None, height=None,
                cfg_scale=None, ddim_eta=None):
        """generate an image from the prompt, writing iteration images into the outdir"""
        outdir     = outdir or self.outdir
        steps      = steps or self.steps
        seed       = seed or self.seed
        width      = width or self.width
        height     = height or self.height
        cfg_scale  = cfg_scale or self.cfg_scale
        ddim_eta   = ddim_eta or self.ddim_eta
        batch      = batch or self.batch
        iterations = iterations or self.iterations
        if batch > 1:
            iterations = 1

        model = self.load_model()  # will instantiate the model or return it from cache

        if grid is None:
            grid = self.grid
        data = [batch * [prompt]]

        # make directories and establish names for the output files
        os.makedirs(outdir, exist_ok=True)
        base_count = len(os.listdir(outdir)) - 1

        start_code = None
        if self.fixed_code:
            start_code = torch.randn([batch,
                                      self.latent_channels,
                                      height // self.downsampling_factor,
                                      width // self.downsampling_factor],
                                     device=self.device)

        precision_scope = autocast if self.precision == "autocast" else nullcontext
        sampler = self.sampler
        images = list()
        seeds = list()

        with torch.no_grad():
            with precision_scope("cuda"):
                with model.ema_scope():
                    all_samples = list()
                    for n in trange(iterations, desc="Sampling"):
                        seed_everything(seed)
                        for prompts in tqdm(data, desc="data", dynamic_ncols=True):
                            uc = None
                            if cfg_scale != 1.0:
                                uc = model.get_learned_conditioning(batch * [""])
                            if isinstance(prompts, tuple):
                                prompts = list(prompts)
                            c = model.get_learned_conditioning(prompts)
                            shape = [self.latent_channels, height // self.downsampling_factor, width // self.downsampling_factor]
                            samples_ddim, _ = sampler.sample(S=steps,
                                                             conditioning=c,
                                                             batch_size=batch,
                                                             shape=shape,
                                                             verbose=False,
                                                             unconditional_guidance_scale=cfg_scale,
                                                             unconditional_conditioning=uc,
                                                             eta=ddim_eta,
                                                             x_T=start_code)

                            x_samples_ddim = model.decode_first_stage(samples_ddim)
                            x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)

                            for x_sample in x_samples_ddim:
                                if grid:
                                    all_samples.append(x_samples_ddim)
                                    seeds.append(seed)
                                else:
                                    x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
                                    filename = os.path.join(outdir, f"{base_count:05}.png")
                                    Image.fromarray(x_sample.astype(np.uint8)).save(filename)
                                    images.append([filename, seed])
                                    base_count += 1
                        seed = self._new_seed()

                    if grid:
                        n_rows = int(sqrt(batch * iterations))
                        # save as grid
                        grid = torch.stack(all_samples, 0)
                        grid = rearrange(grid, 'n b c h w -> (n b) c h w')
                        grid = make_grid(grid, nrow=n_rows)

                        # to image
                        grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy()
                        filename = os.path.join(outdir, f"{base_count:05}.png")
                        Image.fromarray(grid.astype(np.uint8)).save(filename)
                        for s in seeds:
                            images.append([filename, s])

        return images
    def _new_seed(self):
        self.seed = random.randrange(0, np.iinfo(np.uint32).max)
        return self.seed

    def load_model(self):
        """Load and initialize the model from configuration variables passed at object creation time"""
        if self.model is None:
            seed_everything(self.seed)
            try:
                config = OmegaConf.load(self.config)
                self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
                model = self._load_model_from_config(config, self.weights)
                self.model = model.to(self.device)
            except AttributeError:
                raise SystemExit

            if self.sampler_name == 'plms':
                print("setting sampler to plms")
                self.sampler = PLMSSampler(self.model)
            elif self.sampler_name == 'ddim':
                print("setting sampler to ddim")
                self.sampler = DDIMSampler(self.model)
            else:
                print(f"unsupported sampler {self.sampler_name}, defaulting to plms")
                self.sampler = PLMSSampler(self.model)

        return self.model

    def _load_model_from_config(self, config, ckpt):
        print(f"Loading model from {ckpt}")
        pl_sd = torch.load(ckpt, map_location="cpu")
        if "global_step" in pl_sd:
            print(f"Global Step: {pl_sd['global_step']}")
        sd = pl_sd["state_dict"]
        model = instantiate_from_config(config.model)
        m, u = model.load_state_dict(sd, strict=False)
        model.cuda()
        model.eval()
        return model
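
To round out the file, a minimal sketch of consuming what txt2img() returns,
i.e. a list of [filename, seed] pairs per the code above:

~~~~
from ldm.simplet2i import T2I

t2i = T2I()
for filename, seed in t2i.txt2img("a unicorn in manhattan", iterations=2):
    print(f"{filename} (seed {seed})")
~~~~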

ldm/util.py
@@ -12,8 +12,6 @@ from queue import Queue

 from inspect import isfunction
 from PIL import Image, ImageDraw, ImageFont


 def log_txt_as_img(wh, xc, size=10):
     # wh a tuple of (width, height)
     # xc a list of captions to plot

scripts/dream.py (new executable file, +144)
@@ -0,0 +1,144 @@
#!/usr/bin/env python

import readline
import argparse
import shlex
import atexit
from os import path

def main():
    arg_parser = create_argv_parser()
    opt = arg_parser.parse_args()
    if opt.laion400m:
        # defaults suitable to the older latent diffusion weights
        width = 256
        height = 256
        config = "configs/latent-diffusion/txt2img-1p4B-eval.yaml"
        weights = "models/ldm/text2img-large/model.ckpt"
    else:
        # some defaults suitable for stable diffusion weights
        width = 512
        height = 512
        config = "configs/stable-diffusion/v1-inference.yaml"
        weights = "models/ldm/stable-diffusion-v1/model.ckpt"

    # command line history will be stored in a file called "~/.dream_history"
    load_history()

    print("* Initializing, be patient...\n")
    from pytorch_lightning import logging
    from ldm.simplet2i import T2I

    # create a simple text2image object with a handful of
    # defaults passed on the command line.
    # additional parameters will be added (or overridden) during
    # the user input loop
    t2i = T2I(width=width,
              height=height,
              batch=opt.batch,
              outdir=opt.outdir,
              sampler=opt.sampler,
              weights=weights,
              config=config)

    # gets rid of annoying messages about random seed
    logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

    # preload the model
    t2i.load_model()
    print("\n* Initialization done! Awaiting your command...")

    log_path = path.join(opt.outdir, "dream_log.txt")
    with open(log_path, 'a') as log:
        cmd_parser = create_cmd_parser()
        main_loop(t2i, cmd_parser, log)
        log.close()

def main_loop(t2i, parser, log):
    while True:
        try:
            command = input("dream> ")
        except EOFError:
            print("goodbye!")
            break

        # split the line into the prompt (everything up to the first switch)
        # and the list of switches
        elements = shlex.split(command)
        switches = ['']
        switches_started = False

        for el in elements:
            if el[0] == '-' and not switches_started:
                switches_started = True
            if switches_started:
                switches.append(el)
            else:
                switches[0] += el
                switches[0] += ' '
        switches[0] = switches[0][:len(switches[0]) - 1]
        try:
            opt = parser.parse_args(switches)
        except SystemExit:
            parser.print_help()
            continue
        results = t2i.txt2img(**vars(opt))
        print("Outputs:")
        for r in results:
            log_message = " ".join([' ', str(r[0]) + ':',
                                    f'"{switches[0]}"',
                                    *switches[1:], f'-S {r[1]}'])
            print(log_message)
            log.write(log_message + "\n")
            log.flush()

def create_argv_parser():
    parser = argparse.ArgumentParser(description="Parse script's command line args")
    parser.add_argument("--laion400m",
                        "--latent_diffusion",
                        "-l",
                        dest='laion400m',
                        action='store_true',
                        help="fall back to the latent diffusion (LAION-400M) weights and config")
    parser.add_argument('-n', '--iterations',
                        type=int,
                        default=1,
                        help="number of samplings to perform (one image per sampling)")
    parser.add_argument('-b', '--batch',
                        type=int,
                        default=1,
                        help="number of images to produce per sampling (currently broken)")
    parser.add_argument('--sampler',
                        choices=['plms', 'ddim'],
                        default='plms',
                        help="which sampler to use")
    parser.add_argument('-o',
                        '--outdir',
                        type=str,
                        default="outputs/txt2img-samples",
                        help="directory in which to place generated images and a log of prompts and seeds")
    return parser


def create_cmd_parser():
    parser = argparse.ArgumentParser(description="Parse terminal input in a discord 'dreambot' fashion")
    parser.add_argument('prompt')
    parser.add_argument('-s', '--steps', type=int, help="number of steps")
    parser.add_argument('-S', '--seed', type=int, help="image seed")
    parser.add_argument('-n', '--iterations', type=int, default=1, help="number of samplings to perform")
    parser.add_argument('-b', '--batch', type=int, default=1, help="number of images to produce per sampling (currently broken)")
    parser.add_argument('-W', '--width', type=int, help="image width, multiple of 64")
    parser.add_argument('-H', '--height', type=int, help="image height, multiple of 64")
    parser.add_argument('-C', '--cfg_scale', type=float, help="prompt configuration scale (7.5)")
    parser.add_argument('-g', '--grid', action='store_true', help="generate a grid")
    return parser

def load_history():
    histfile = path.join(path.expanduser('~'), ".dream_history")
    try:
        readline.read_history_file(histfile)
        readline.set_history_length(1000)
    except FileNotFoundError:
        pass
    atexit.register(readline.write_history_file, histfile)

if __name__ == "__main__":
    main()
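
For illustration, a dream> line exercising several of the per-prompt switches
defined in create_cmd_parser() above (hypothetical prompt and values, not a
captured session):

~~~~
dream> a watercolor painting of a fox -s75 -W640 -H448 -C7.5 -g
~~~~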