From 58ea3bf4c8e5dc5475c17fa17bb94988f133fd1d Mon Sep 17 00:00:00 2001
From: Kevin Turner <83819+keturn@users.noreply.github.com>
Date: Wed, 9 Nov 2022 11:33:19 -0800
Subject: [PATCH] spike: proof of concept using diffusers for txt2img

---
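With this change the pipeline reports each denoising step through an optional callback instead of discarding intermediates. A rough sketch of how a caller is expected to consume the new API (the prompt, device placement, and the report_progress helper below are illustrative assumptions, not part of this patch):

import torch
from ldm.invoke.generator.diffusers_pipeline import (
    PipelineIntermediateState, StableDiffusionGeneratorPipeline)

def report_progress(state):
    # Each yielded value is passed to the callback; guard in case the final
    # yield is the finished pipeline output rather than an intermediate state.
    if isinstance(state, PipelineIntermediateState):
        print(f"run {state.run_id}: step {state.step} (timestep {state.timestep})")

pipeline = StableDiffusionGeneratorPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    revision="fp16", torch_dtype=torch.float16,
    safety_checker=None,
)
pipeline.unet.to("cuda")
pipeline.vae.to("cuda")

output = pipeline(
    "an astronaut riding a horse",   # illustrative prompt
    height=512, width=512,
    num_inference_steps=50, guidance_scale=7.5,
    callback=report_progress,
)
image = pipeline.numpy_to_pil(output.images)[0]
image.save("astronaut.png")

The same callback plumbing is what txt2img.py uses below to pass step_callback through image_from_embeddings.
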
 .../environment-lin-cuda.yml                 | 10 +++
 .../requirements-base.txt                    |  2 +-
 ldm/invoke/generator/diffusers_pipeline.py   | 36 +++++++---
 ldm/invoke/generator/txt2img.py              | 66 ++++++++++---------
 4 files changed, 75 insertions(+), 39 deletions(-)

diff --git a/environments-and-requirements/environment-lin-cuda.yml b/environments-and-requirements/environment-lin-cuda.yml
index 17b2cddc95..64bbabe902 100644
--- a/environments-and-requirements/environment-lin-cuda.yml
+++ b/environments-and-requirements/environment-lin-cuda.yml
@@ -12,6 +12,7 @@ dependencies:
   - pytorch=1.12.1
   - cudatoolkit=11.6
   - pip:
+    - accelerate~=0.13
     - albumentations==0.4.3
     - dependency_injector==4.40.0
     - diffusers==0.6.0
@@ -39,6 +40,15 @@ dependencies:
     - torch-fidelity==0.3.0
     - torchmetrics==0.7.0
     - transformers==4.21.3
+    - diffusers~=0.7
+    - torchmetrics==0.7.0
+    - flask==2.1.3
+    - flask_socketio==5.3.0
+    - flask_cors==3.0.10
+    - dependency_injector==4.40.0
+    - eventlet
+    - getpass_asterisk
+    - kornia==0.6.0
     - git+https://github.com/openai/CLIP.git@main#egg=clip
     - git+https://github.com/Birch-san/k-diffusion.git@mps#egg=k-diffusion
     - git+https://github.com/invoke-ai/clipseg.git@relaxed-python-requirement#egg=clipseg
diff --git a/environments-and-requirements/requirements-base.txt b/environments-and-requirements/requirements-base.txt
index 9c306c42ca..8b7d83aa94 100644
--- a/environments-and-requirements/requirements-base.txt
+++ b/environments-and-requirements/requirements-base.txt
@@ -1,7 +1,7 @@
 # pip will resolve the version which matches torch
 albumentations
 dependency_injector==4.40.0
-diffusers
+diffusers[torch]~=0.7
 einops
 eventlet
 facexlib
diff --git a/ldm/invoke/generator/diffusers_pipeline.py b/ldm/invoke/generator/diffusers_pipeline.py
index b13f85b645..d4d60b761a 100644
--- a/ldm/invoke/generator/diffusers_pipeline.py
+++ b/ldm/invoke/generator/diffusers_pipeline.py
@@ -1,6 +1,6 @@
 import secrets
 from dataclasses import dataclass
-from typing import List, Optional, Union
+from typing import List, Optional, Union, Callable
 
 import torch
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
@@ -131,6 +131,7 @@ class StableDiffusionGeneratorPipeline(DiffusionPipeline):
         guidance_scale: Optional[float] = 7.5,
         generator: Optional[torch.Generator] = None,
         latents: Optional[torch.FloatTensor] = None,
+        callback: Optional[Callable[[PipelineIntermediateState], None]] = None,
         **extra_step_kwargs,
     ):
         r"""
@@ -172,7 +173,22 @@ class StableDiffusionGeneratorPipeline(DiffusionPipeline):
                 prompt, height=height, width=width, num_inference_steps=num_inference_steps,
                 guidance_scale=guidance_scale, generator=generator, latents=latents,
                 **extra_step_kwargs):
-            pass  # discarding intermediates
+            if callback is not None:
+                callback(result)
+        if result is None:
+            raise AssertionError("why was that an empty generator?")
+        return result
+
+    def image_from_embeddings(self, latents: torch.Tensor, num_inference_steps: int,
+                              text_embeddings: torch.Tensor, guidance_scale: float,
+                              *, callback: Callable[[PipelineIntermediateState], None]=None, run_id=None,
+                              **extra_step_kwargs) -> StableDiffusionPipelineOutput:
+        self.scheduler.set_timesteps(num_inference_steps)
+        result = None
+        for result in self.generate_from_embeddings(
+                latents, text_embeddings, guidance_scale, run_id, **extra_step_kwargs):
+            if callback is not None:
+                callback(result)
         if result is None:
             raise AssertionError("why was that an empty generator?")
         return result
@@ -199,9 +215,6 @@ class StableDiffusionGeneratorPipeline(DiffusionPipeline):
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
 
-        if run_id is None:
-            run_id = secrets.token_urlsafe(self.ID_LENGTH)
-
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
@@ -209,16 +222,23 @@ class StableDiffusionGeneratorPipeline(DiffusionPipeline):
         text_embeddings = self.get_text_embeddings(prompt, opposing_prompt, do_classifier_free_guidance, batch_size)\
             .to(self.unet.device)
         self.scheduler.set_timesteps(num_inference_steps)
-        latents = self.prepare_latents(latents, batch_size, height, width,
-                                       generator, self.unet.dtype)
+        latents = self.prepare_latents(latents, batch_size, height, width, generator, self.unet.dtype)
+        yield from self.generate_from_embeddings(latents, text_embeddings, guidance_scale, run_id, **extra_step_kwargs)
+
+    def generate_from_embeddings(self, latents: torch.Tensor, text_embeddings: torch.Tensor, guidance_scale: float,
+                                 run_id: str = None, **extra_step_kwargs):
+        if run_id is None:
+            run_id = secrets.token_urlsafe(self.ID_LENGTH)
         yield PipelineIntermediateState(run_id=run_id, step=-1, timestep=self.scheduler.num_train_timesteps,
                                         latents=latents)
+        # NOTE: Depends on scheduler being already initialized!
         for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
             step_output = self.step(t, latents, guidance_scale, text_embeddings, **extra_step_kwargs)
             latents = step_output.prev_sample
+            predicted_original = getattr(step_output, 'pred_original_sample', None)
             yield PipelineIntermediateState(run_id=run_id, step=i, timestep=int(t), latents=latents,
-                                            predicted_original=step_output.pred_original_sample)
+                                            predicted_original=predicted_original)
         # https://discuss.huggingface.co/t/memory-usage-by-later-pipeline-stages/23699
         torch.cuda.empty_cache()
diff --git a/ldm/invoke/generator/txt2img.py b/ldm/invoke/generator/txt2img.py
index a04207259b..a882b15671 100644
--- a/ldm/invoke/generator/txt2img.py
+++ b/ldm/invoke/generator/txt2img.py
@@ -1,10 +1,11 @@
 '''
 ldm.invoke.generator.txt2img inherits from ldm.invoke.generator
 '''
-
+import PIL.Image
 import torch
 
-from ldm.invoke.generator.base import Generator
+from .base import Generator
+from .diffusers_pipeline import StableDiffusionGeneratorPipeline
 
 
 class Txt2Img(Generator):
@@ -13,7 +14,8 @@ class Txt2Img(Generator):
 
     @torch.no_grad()
     def get_make_image(self,prompt,sampler,steps,cfg_scale,ddim_eta,
-                       conditioning,width,height,step_callback=None,threshold=0.0,perlin=0.0,**kwargs):
+                       conditioning,width,height,step_callback=None,threshold=0.0,perlin=0.0,
+                       **kwargs):
         """
         Returns a function returning an image derived from the prompt and the initial image
         Return value depends on the seed at the time you call it
@@ -22,38 +24,42 @@ class Txt2Img(Generator):
         self.perlin = perlin
         uc, c, extra_conditioning_info = conditioning
 
-        @torch.no_grad()
-        def make_image(x_T):
-            shape = [
-                self.latent_channels,
-                height // self.downsampling_factor,
-                width // self.downsampling_factor,
-            ]
+        # FIXME: this should probably be either passed in to __init__ instead of model & precision,
+        #    or be constructed in __init__ from those inputs.
+        pipeline = StableDiffusionGeneratorPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5",
+            revision="fp16", torch_dtype=torch.float16,
+            safety_checker=None,  # TODO
+            # scheduler=sampler + ddim_eta,  # TODO
+            # TODO: local_files_only=True
+        )
+        pipeline.unet.to("cuda")
+        pipeline.vae.to("cuda")
 
-            if self.free_gpu_mem and self.model.model.device != self.model.device:
-                self.model.model.to(self.model.device)
-
-            sampler.make_schedule(ddim_num_steps=steps, ddim_eta=ddim_eta, verbose=False)
+        def make_image(x_T) -> PIL.Image.Image:
+            # FIXME: restore free_gpu_mem functionality
+            # if self.free_gpu_mem and self.model.model.device != self.model.device:
+            #     self.model.model.to(self.model.device)
 
-            samples, _ = sampler.sample(
-                batch_size = 1,
-                S = steps,
-                x_T = x_T,
-                conditioning = c,
-                shape = shape,
-                verbose = False,
-                unconditional_guidance_scale = cfg_scale,
-                unconditional_conditioning = uc,
-                extra_conditioning_info = extra_conditioning_info,
-                eta = ddim_eta,
-                img_callback = step_callback,
-                threshold = threshold,
+            # FIXME: how the embeddings are combined should be internal to the pipeline
+            combined_text_embeddings = torch.cat([uc, c])
+
+            pipeline_output = pipeline.image_from_embeddings(
+                latents=x_T,
+                num_inference_steps=steps,
+                text_embeddings=combined_text_embeddings,
+                guidance_scale=cfg_scale,
+                callback=step_callback,
+                # TODO: extra_conditioning_info = extra_conditioning_info,
+                # TODO: eta = ddim_eta,
+                # TODO: threshold = threshold,
             )
 
-            if self.free_gpu_mem:
-                self.model.model.to("cpu")
+            # FIXME: restore free_gpu_mem functionality
+            # if self.free_gpu_mem:
+            #     self.model.model.to("cpu")
 
-            return self.sample_to_image(samples)
+            return pipeline.numpy_to_pil(pipeline_output.images)[0]
 
         return make_image
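
The torch.cat([uc, c]) above stacks the unconditioned and conditioned embeddings in the [uncond, cond] order that classifier-free guidance conventionally uses (the Imagen guidance weight `w` referenced in diffusers_pipeline.py). For reference, a minimal sketch of the combination the pipeline's denoising step is assumed to perform with those stacked embeddings; guided_noise_prediction is an illustrative name, not code from this patch:

import torch
from diffusers.models import UNet2DConditionModel

def guided_noise_prediction(unet: UNet2DConditionModel, latents: torch.Tensor, t: torch.Tensor,
                            text_embeddings: torch.Tensor, guidance_scale: float) -> torch.Tensor:
    # text_embeddings holds the stacked [unconditioned, conditioned] pair built in make_image().
    latent_input = torch.cat([latents] * 2)  # evaluate both halves in a single UNet pass
    noise_pred = unet(latent_input, t, encoder_hidden_states=text_embeddings).sample
    noise_uncond, noise_cond = noise_pred.chunk(2)
    # guidance_scale is the guidance weight w; 1.0 is equivalent to no guidance.
    return noise_uncond + guidance_scale * (noise_cond - noise_uncond)

As the FIXME in make_image() notes, folding this combination into the pipeline itself would let callers hand over uc and c separately instead of pre-concatenating them.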