Merge branch 'development' into development

2024-08-30 20:32:17 +00:00 · 2022-09-18 15:04:01 -04:00 · 2022-09-18 15:04:01 -04:00 · 4f926fc470
commit 4f926fc470
parent e1f0ee819d a0a9b12daf
23 changed files with 1064 additions and 67 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1,7 @@
 # ignore default image save location and model symbolic link
 outputs/
 models/ldm/stable-diffusion-v1/model.ckpt
+ldm/restoration/codeformer/weights

 # ignore a directory which serves as a place for initial images
 inputs/
--- a/README.md
+++ b/README.md
@ -105,9 +105,10 @@ To run in full-precision mode, start `dream.py` with the `--full_precision` flag
 - [Seamless Tiling](docs/features/OTHER.md#seamless-tiling)
 - [Google Colab](docs/features/OTHER.md#google-colab)
 - [Web Server](docs/features/WEB.md)
- [Reading Prompts From File](docs/features/OTHER.md#reading-prompts-from-a-file)
+- [Reading Prompts From File](docs/features/PROMPTS.md#reading-prompts-from-a-file)
 - [Shortcut: Reusing Seeds](docs/features/OTHER.md#shortcuts-reusing-seeds)
- [Weighted Prompts](docs/features/OTHER.md#weighted-prompts)
+- [Weighted Prompts](docs/features/PROMPTS.md#weighted-prompts)
+- [Negative/Unconditioned Prompts](docs/features/PROMPTS.md#negative-and-unconditioned-prompts)
 - [Variations](docs/features/VARIATIONS.md)
 - [Personalizing Text-to-Image Generation](docs/features/TEXTUAL_INVERSION.md)
 - [Simplified API for text to image generation](docs/features/OTHER.md#simplified-api)
--- a/backend/modules/parameters.py
+++ b/backend/modules/parameters.py
@ -40,6 +40,8 @@ def parameters_to_command(params):
        switches.append(f'-I {params["init_img"]}')
    if 'init_mask' in params and len(params['init_mask']) > 0:
        switches.append(f'-M {params["init_mask"]}')
+    if 'init_color' in params and len(params['init_color']) > 0:
+        switches.append(f'--init_color {params["init_color"]}')
    if 'strength' in params and 'init_img' in params:
        switches.append(f'-f {params["strength"]}')
        if 'fit' in params and params["fit"] == True:
@ -129,6 +131,11 @@ def create_cmd_parser():
        type=str,
        help='Path to input mask for inpainting mode (supersedes width and height)',
    )
+    parser.add_argument(
+        '--init_color',
+        type=str,
+        help='Path to reference image for color correction (used for repeated img2img and inpainting)'
+    )
    parser.add_argument(
        '-T',
        '-fit',
--- a/docs/assets/negative_prompt_walkthru/step1.png
+++ b/docs/assets/negative_prompt_walkthru/step1.png
--- a/docs/assets/negative_prompt_walkthru/step2.png
+++ b/docs/assets/negative_prompt_walkthru/step2.png
--- a/docs/assets/negative_prompt_walkthru/step3.png
+++ b/docs/assets/negative_prompt_walkthru/step3.png
--- a/docs/assets/negative_prompt_walkthru/step4.png
+++ b/docs/assets/negative_prompt_walkthru/step4.png
--- a/docs/features/CLI.md
+++ b/docs/features/CLI.md
@ -154,11 +154,17 @@ vary greatly depending on what is in the image. We also ask to --fit the image i
 than 640x480. Otherwise the image size will be identical to the provided photo and you may run out
 of memory if it is large.

+Repeated chaining of img2img on an image can result in significant color shifts
+in the output, especially if run with lower strength. Color correction can be
+run against a reference image to fix this issue. Use the original input image to the
+chain as the reference image for each step in the chain.
+
 In addition to the command-line options recognized by txt2img, img2img accepts additional options:

 | Argument           | Shortcut  | Default | Description                                                                                                                                |
 | ------------------ | --------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
 | --init_img <path>   | -I<path>  | None    | Path to the initialization image                                                                                                           |
+| --init_color <path> |           | None    | Path to reference image for color correction               |
 | --fit               | -F        | False   | Scale the image to fit into the specified -H and -W dimensions                                                                             |
 | --strength <float>  | -s<float> | 0.75    | How hard to try to match the prompt to the initial image. Ranges from 0.0-0.99, with higher values replacing the initial image completely. |

--- a/docs/features/OTHER.md
+++ b/docs/features/OTHER.md
@ -28,32 +28,6 @@ dream> "pond garden with lotus by claude monet" --seamless -s100 -n4

 ---

-## **Reading Prompts from a File**
-
-You can automate `dream.py` by providing a text file with the prompts you want to run, one line per
-prompt. The text file must be composed with a text editor (e.g. Notepad) and not a word processor.
-Each line should look like what you would type at the dream> prompt:
-
-```bash
-a beautiful sunny day in the park, children playing -n4 -C10
-stormy weather on a mountain top, goats grazing     -s100
-innovative packaging for a squid's dinner           -S137038382
-```
-
-Then pass this file's name to `dream.py` when you invoke it:
-
-```bash
-(ldm) ~/stable-diffusion$ python3 scripts/dream.py --from_file "path/to/prompts.txt"
-```
-
-You may read a series of prompts from standard input by providing a filename of `-`:
-
-```bash
-(ldm) ~/stable-diffusion$ echo "a beautiful day" | python3 scripts/dream.py --from_file -
-```
-
---
-
 ## **Shortcuts: Reusing Seeds**

 Since it is so common to reuse seeds while refining a prompt, there is now a shortcut as of version
@ -79,22 +53,6 @@ outputs/img-samples/000040.3498014304.png: "a cute child playing hopscotch" -G1.

 ---

-## **Weighted Prompts**
-
-You may weight different sections of the prompt to tell the sampler to attach different levels of
-priority to them, by adding `:(number)` to the end of the section you wish to up- or downweight. For
-example consider this prompt:
-
-```bash
-tabby cat:0.25 white duck:0.75 hybrid
-```
-
-This will tell the sampler to invest 25% of its effort on the tabby cat aspect of the image and 75%
-on the white duck aspect (surprisingly, this example actually works). The prompt weights can use any
-combination of integers and floating point numbers, and they do not need to add up to 1.
-
---
-
 ## **Simplified API**

 For programmers who wish to incorporate stable-diffusion into other products, this repository
--- a/docs/features/PROMPTS.md
+++ b/docs/features/PROMPTS.md
@ -0,0 +1,96 @@
+# Prompting Features
+
+## **Reading Prompts from a File**
+
+You can automate `dream.py` by providing a text file with the prompts you want to run, one line per
+prompt. The text file must be composed with a text editor (e.g. Notepad) and not a word processor.
+Each line should look like what you would type at the dream> prompt:
+
+```bash
+a beautiful sunny day in the park, children playing -n4 -C10
+stormy weather on a mountain top, goats grazing     -s100
+innovative packaging for a squid's dinner           -S137038382
+```
+
+Then pass this file's name to `dream.py` when you invoke it:
+
+```bash
+(ldm) ~/stable-diffusion$ python3 scripts/dream.py --from_file "path/to/prompts.txt"
+```
+
+You may read a series of prompts from standard input by providing a filename of `-`:
+
+```bash
+(ldm) ~/stable-diffusion$ echo "a beautiful day" | python3 scripts/dream.py --from_file -
+```
+
+---
+
+## **Weighted Prompts**
+
+You may weight different sections of the prompt to tell the sampler to attach different levels of
+priority to them, by adding `:(number)` to the end of the section you wish to up- or downweight. For
+example consider this prompt:
+
+```bash
+tabby cat:0.25 white duck:0.75 hybrid
+```
+
+This will tell the sampler to invest 25% of its effort on the tabby cat aspect of the image and 75%
+on the white duck aspect (surprisingly, this example actually works). The prompt weights can use any
+combination of integers and floating point numbers, and they do not need to add up to 1.
+
+---
+
+## **Negative and Unconditioned Prompts**
+
+Stable Diffusion will try to ignore any words placed between a pair of square brackets when it generates an image.
+
+```bash
+this is a test prompt [not really] to make you understand [cool] how this works.
+```
+
+In the above statement, the words `not really cool` will be ignored by Stable Diffusion.
+
+Here's a prompt that depicts what it does.
+
+original prompt: 
+
+```bash
+"A fantastical translucent poney made of water and foam, ethereal, radiant, hyperalism, scottish folklore, digital painting, artstation, concept art, smooth, 8 k frostbite 3 engine, ultra detailed, art by artgerm and greg rutkowski and magali villeneuve" -s 20 -W 512 -H 768 -C 7.5 -A k_euler_a -S 1654590180
+```
+
+![step1](../assets/negative_prompt_walkthru/step1.png)
+
+That image has a woman, so if we want the horse without a rider, we can influence the image not to have a woman by putting [woman] in the prompt, like this:
+
+```bash
+"A fantastical translucent poney made of water and foam, ethereal, radiant, hyperalism, scottish folklore, digital painting, artstation, concept art, smooth, 8 k frostbite 3 engine, ultra detailed, art by artgerm and greg rutkowski and magali villeneuve [woman]" -s 20 -W 512 -H 768 -C 7.5 -A k_euler_a -S 1654590180
+```
+
+![step2](../assets/negative_prompt_walkthru/step2.png)
+
+That's nice - but say we also don't want the image to be quite so blue. We can add "blue" to the list of negative prompts, so it's now [woman blue]:
+
+```bash
+"A fantastical translucent poney made of water and foam, ethereal, radiant, hyperalism, scottish folklore, digital painting, artstation, concept art, smooth, 8 k frostbite 3 engine, ultra detailed, art by artgerm and greg rutkowski and magali villeneuve [woman blue]" -s 20 -W 512 -H 768 -C 7.5 -A k_euler_a -S 1654590180
+```
+
+![step3](../assets/negative_prompt_walkthru/step3.png)
+
+
+Getting close - but there's no sense in having a saddle when our horse doesn't have a rider, so we'll add one more negative prompt: [woman blue saddle].
+
+```bash
+"A fantastical translucent poney made of water and foam, ethereal, radiant, hyperalism, scottish folklore, digital painting, artstation, concept art, smooth, 8 k frostbite 3 engine, ultra detailed, art by artgerm and greg rutkowski and magali villeneuve [woman blue saddle]" -s 20 -W 512 -H 768 -C 7.5 -A k_euler_a -S 1654590180
+```
+
+![step4](../assets/negative_prompt_walkthru/step4.png)
+
+
+Notes about this feature:
+
+* The only requirement for words to be ignored is that they are in between a pair of square brackets.
+* You can provide multiple words within the same bracket.
+* You can provide multiple brackets with multiple words in different places of your prompt. That works just fine.
+* To improve typical anatomy problems, you can add negative prompts like [bad anatomy, extra legs, extra arms, extra fingers, poorly drawn hands, poorly drawn feet, disfigured, out of frame, tiling, bad art, deformed, mutated].
--- a/docs/features/UPSCALE.md
+++ b/docs/features/UPSCALE.md
@ -97,3 +97,39 @@ the base images.
 If you wish to stop during the image generation but want to upscale or face restore a particular
 generated image, pass it again with the same prompt and generated seed along with the `-U` and `-G`
 prompt arguments to perform those actions.
+
+## CodeFormer Support
+
+This repo also allows you to perform face restoration using
+[CodeFormer](https://github.com/sczhou/CodeFormer).
+
+To set up CodeFormer, you need to download the models, just as with GFPGAN. You can do
+this either by running `preload_models.py` or by manually downloading the
+[model file](https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth) and
+saving it to `ldm/restoration/codeformer/weights` folder.
+
+You can use `-ft` prompt argument to swap between CodeFormer and the default GFPGAN. The above
+mentioned `-G` prompt argument will allow you to control the strength of the restoration effect.
+
+### **Usage:**
+
+The following command will perform face restoration with CodeFormer instead of the default gfpgan.
+
+`<prompt> -G 0.8 -ft codeformer`
+
+**Other Options:**
+
+- `-cf` - cf or CodeFormer Fidelity takes values between `0` and `1`. 0 produces high quality
+  results but low accuracy and 1 produces lower quality results but higher accuracy to your original
+  face.
+
+The following command will perform face restoration with CodeFormer. CodeFormer will output a result
+that closely matches the input face.
+
+`<prompt> -G 1.0 -ft codeformer -cf 0.9`
+
+The following command will perform face restoration with CodeFormer. CodeFormer will output a result
+that is the best restoration possible. This may deviate slightly from the original face. This is an
+excellent option to use in situations when there is very little facial data to work with.
+
+`<prompt> -G 1.0 -ft codeformer -cf 0.1`
--- a/docs/features/VARIATIONS.md
+++ b/docs/features/VARIATIONS.md
@ -102,6 +102,7 @@ generate more variations around the almost-but-not-quite image. We do the
 latter, using both the `-V` (combining) and `-v` (variation strength) options.
 Note that we use `-n6` to generate 6 variations:

+```bash
 dream> "prompt" -S3357757885 -V3647897225,0.1,1614299449,0.1 -v0.05 -n6
 Outputs:
 ./outputs/Xena/000004.3279757577.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 3647897225:0.1,1614299449:0.1,3279757577:0.05 -S3357757885
--- a/docs/other/CONTRIBUTORS.md
+++ b/docs/other/CONTRIBUTORS.md
@ -2,15 +2,16 @@
 title: Contributors
 ---

-The list of all the amazing people who have contributed to the various features that you get to experience in this fork.
+The list of all the amazing people who have contributed to the various features that you get to
+experience in this fork.

 We thank them for all of their time and hard work.

-## __Original Author:__
+## **Original Author:**

 - [Lincoln D. Stein](mailto:lincoln.stein@gmail.com)

-## __Contributions by:__
+## **Contributions by:**

 - [Sean McLellan](https://github.com/Oceanswave)
 - [Kevin Gibbons](https://github.com/bakkot)
@ -52,8 +53,9 @@ We thank them for all of their time and hard work.
 - [Doggettx](https://github.com/doggettx)
 - [Matthias Wild](https://github.com/mauwii)
 - [Kyle Schouviller](https://github.com/kyle0654)
+- [rabidcopy](https://github.com/rabidcopy)

-## __Original CompVis Authors:__
+## **Original CompVis Authors:**

 - [Robin Rombach](https://github.com/rromb)
 - [Patrick von Platen](https://github.com/patrickvonplaten)
@ -65,4 +67,5 @@ We thank them for all of their time and hard work.

 ---

-_If you have contributed and don't see your name on the list of contributors, please let one of the collaborators know about the omission, or feel free to make a pull request._
+_If you have contributed and don't see your name on the list of contributors, please let one of the
+collaborators know about the omission, or feel free to make a pull request._
--- a/ldm/dream/args.py
+++ b/ldm/dream/args.py
@ -181,6 +181,10 @@ class Args(object):
            switches.append('--seamless')
        if a['init_img'] and len(a['init_img'])>0:
            switches.append(f'-I {a["init_img"]}')
+        if a['init_mask'] and len(a['init_mask'])>0:
+            switches.append(f'-M {a["init_mask"]}')
+        if a['init_color'] and len(a['init_color'])>0:
+            switches.append(f'--init_color {a["init_color"]}')
        if a['fit']:
            switches.append(f'--fit')
        if a['init_img'] and a['strength'] and a['strength']>0:
@ -493,6 +497,11 @@ class Args(object):
            type=str,
            help='Path to input mask for inpainting mode (supersedes width and height)',
        )
+        img2img_group.add_argument(
+            '--init_color',
+            type=str,
+            help='Path to reference image for color correction (used for repeated img2img and inpainting)'
+        )
        img2img_group.add_argument(
            '-T',
            '-fit',
@ -507,6 +516,12 @@ class Args(object):
            help='Strength for noising/unnoising. 0.0 preserves image exactly, 1.0 replaces it completely',
            default=0.75,
        )
+        postprocessing_group.add_argument(
+            '-ft',
+            '--facetool',
+            type=str,
+            help='Select the face restoration AI to use: gfpgan, codeformer',
+        )
        postprocessing_group.add_argument(
            '-G',
            '--gfpgan_strength',
@ -514,6 +529,13 @@ class Args(object):
            help='The strength at which to apply the GFPGAN model to the result, in order to improve faces.',
            default=0,
        )
+        postprocessing_group.add_argument(
+            '-cf',
+            '--codeformer_fidelity',
+            type=float,
+            help='Takes values between 0 and 1. 0 produces high quality but low accuracy. 1 produces high accuracy but low quality.',
+            default=0.75
+        )
        postprocessing_group.add_argument(
            '-U',
            '--upscale',
@ -654,6 +676,8 @@ def metadata_loads(metadata):
            # repack the prompt and variations
            image['prompt']     = ','.join([':'.join([x['prompt'],   str(x['weight'])]) for x in image['prompt']])
            image['variations'] = ','.join([':'.join([str(x['seed']),str(x['weight'])]) for x in image['variations']])
+            # fix a bit of semantic drift here
+            image['sampler_name']=image.pop('sampler')
            opt = Args()
            opt._cmd_switches = Namespace(**image)
            results.append(opt)
--- a/ldm/dream/conditioning.py
+++ b/ldm/dream/conditioning.py
@ -13,7 +13,20 @@ import re
 import torch

 def get_uc_and_c(prompt, model, log_tokens=False, skip_normalize=False):
-    uc = model.get_learned_conditioning([''])
+    # Extract Unconditioned Words From Prompt
+    unconditioned_words = ''
+    unconditional_regex = r'\[(.*?)\]'
+    unconditionals = re.findall(unconditional_regex, prompt)
+
+    if len(unconditionals) > 0:
+        unconditioned_words = ' '.join(unconditionals)
+
+        # Remove Unconditioned Words From Prompt
+        unconditional_regex_compile = re.compile(unconditional_regex)
+        clean_prompt = unconditional_regex_compile.sub(' ', prompt)
+        prompt = re.sub(' +', ' ', clean_prompt)
+
+    uc = model.get_learned_conditioning([unconditioned_words])

    # get weighted sub-prompts
    weighted_subprompts = split_weighted_subprompts(
@ -34,6 +47,7 @@ def get_uc_and_c(prompt, model, log_tokens=False, skip_normalize=False):
    else:   # just standard 1 prompt
        log_tokenization(prompt, model, log_tokens)
        c = model.get_learned_conditioning([prompt])
+        uc = model.get_learned_conditioning([unconditioned_words])
    return (uc, c)

 def split_weighted_subprompts(text, skip_normalize=False)->list:
--- a/ldm/dream/readline.py
+++ b/ldm/dream/readline.py
@ -22,7 +22,8 @@ class Completer:
    def complete(self, text, state):
        buffer = readline.get_line_buffer()

-        if text.startswith(('-I', '--init_img','-M','--init_mask')):
+        if text.startswith(('-I', '--init_img','-M','--init_mask',
+                            '--init_color')):
            return self._path_completions(text, state, ('.png','.jpg','.jpeg'))

        if buffer.strip().endswith('cd') or text.startswith(('.', '/')):
@ -57,6 +58,8 @@ class Completer:
            path = text.replace('--init_mask=', '', 1).lstrip()
        elif text.startswith('-M'):
            path = text.replace('-M', '', 1).lstrip()
+        elif text.startswith('--init_color='):
+            path = text.replace('--init_color=', '', 1).lstrip()
        else:
            path = text

@ -100,6 +103,7 @@ if readline_available:
                '--individual','-i',
                '--init_img','-I',
                '--init_mask','-M',
+                '--init_color',
                '--strength','-f',
                '--variants','-v',
                '--outdir','-o',
--- a/ldm/generate.py
+++ b/ldm/generate.py
@ -15,6 +15,8 @@ import traceback
 import transformers
 import io
 import hashlib
+import cv2
+import skimage

 from omegaconf import OmegaConf
 from PIL import Image, ImageOps
@ -220,11 +222,14 @@ class Generate:
            init_mask        = None,
            fit              = False,
            strength         = None,
+            init_color       = None,
            # these are specific to embiggen (which also relies on img2img args)
            embiggen       =    None,
            embiggen_tiles =    None,
            # these are specific to GFPGAN/ESRGAN
+            facetool         = None,
            gfpgan_strength  = 0,
+            codeformer_fidelity = None,
            save_original    = False,
            upscale          = None,
            # Set this True to handle KeyboardInterrupt internally
@ -362,10 +367,17 @@ class Generate:
                embiggen_tiles = embiggen_tiles,
            )

+            if init_color:
+                self.correct_colors(image_list           = results,
+                                    reference_image_path = init_color,
+                                    image_callback       = image_callback)
+
            if upscale is not None or gfpgan_strength > 0:
                self.upscale_and_reconstruct(results,
                                             upscale        = upscale,
+                                             facetool       = facetool,
                                             strength       = gfpgan_strength,
+                                             codeformer_fidelity = codeformer_fidelity,
                                             save_original  = save_original,
                                             image_callback = image_callback)

@ -475,16 +487,43 @@ class Generate:

        return self.model

+    def correct_colors(self,
+                       image_list,
+                       reference_image_path,
+                       image_callback = None):
+        reference_image = Image.open(reference_image_path)
+        correction_target = cv2.cvtColor(np.asarray(reference_image),
+                                         cv2.COLOR_RGB2LAB)
+        for r in image_list:
+            image, seed = r
+            image = cv2.cvtColor(np.asarray(image),
+                                 cv2.COLOR_RGB2LAB)
+            image = skimage.exposure.match_histograms(image,
+                                                      correction_target,
+                                                      channel_axis=2)
+            image = Image.fromarray(
+                cv2.cvtColor(image, cv2.COLOR_LAB2RGB).astype("uint8")
+            )
+            if image_callback is not None:
+                image_callback(image, seed)
+            else:
+                r[0] = image
+
    def upscale_and_reconstruct(self,
                                image_list,
+                                facetool      = 'gfpgan',
                                upscale       = None,
                                strength      =  0.0,
+                                codeformer_fidelity = 0.75,
                                save_original = False,
                                image_callback = None):
        try:
            if upscale is not None:
                from ldm.gfpgan.gfpgan_tools import real_esrgan_upscale
            if strength > 0:
+                if facetool == 'codeformer':
+                    from ldm.restoration.codeformer.codeformer import CodeFormerRestoration
+                else:
                    from ldm.gfpgan.gfpgan_tools import run_gfpgan
        except (ModuleNotFoundError, ImportError):
            print(traceback.format_exc(), file=sys.stderr)
@ -504,6 +543,9 @@ class Generate:
                        seed,
                    )
                if strength > 0:
+                    if facetool == 'codeformer':
+                        image = CodeFormerRestoration().process(image=image, strength=strength, device=self.device, seed=seed, fidelity=codeformer_fidelity)
+                    else:
                        image = run_gfpgan(
                            image, strength, seed, 1
                        )
--- a/ldm/restoration/codeformer/codeformer.py
+++ b/ldm/restoration/codeformer/codeformer.py
@ -0,0 +1,76 @@
+import os
+import torch
+import numpy as np
+import warnings
+
+pretrained_model_url = 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth'
+
+class CodeFormerRestoration():
+    def __init__(self) -> None:
+        pass
+
+    def process(self, image, strength, device, seed=None, fidelity=0.75):
+        if seed is not None:
+            print(f'>> CodeFormer - Restoring Faces for image seed:{seed}')
+        with warnings.catch_warnings():
+            warnings.filterwarnings('ignore', category=DeprecationWarning)
+            warnings.filterwarnings('ignore', category=UserWarning)
+
+            from basicsr.utils.download_util import load_file_from_url
+            from basicsr.utils import img2tensor, tensor2img
+            from facexlib.utils.face_restoration_helper import FaceRestoreHelper
+            from ldm.restoration.codeformer.codeformer_arch import CodeFormer
+            from torchvision.transforms.functional import normalize
+            from PIL import Image
+            
+            cf_class = CodeFormer
+            
+            cf = cf_class(dim_embd=512, codebook_size=1024, n_head=8, n_layers=9, connect_list=['32', '64', '128', '256']).to(device)
+            
+            checkpoint_path = load_file_from_url(url=pretrained_model_url, model_dir=os.path.abspath('ldm/restoration/codeformer/weights'), progress=True)
+            checkpoint = torch.load(checkpoint_path)['params_ema']
+            cf.load_state_dict(checkpoint)
+            cf.eval()
+
+            image = image.convert('RGB')
+
+            face_helper = FaceRestoreHelper(upscale_factor=1, use_parse=True, device=device)
+            face_helper.clean_all()
+            face_helper.read_image(np.array(image, dtype=np.uint8))
+            face_helper.get_face_landmarks_5(resize=640, eye_dist_threshold=5)
+            face_helper.align_warp_face()
+
+            for idx, cropped_face in enumerate(face_helper.cropped_faces):
+                cropped_face_t = img2tensor(cropped_face / 255., bgr2rgb=True, float32=True)
+                normalize(cropped_face_t, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)
+                cropped_face_t = cropped_face_t.unsqueeze(0).to(device)
+
+                try:
+                    with torch.no_grad():
+                        output = cf(cropped_face_t, w=fidelity, adain=True)[0]
+                        restored_face = tensor2img(output.squeeze(0), rgb2bgr=True, min_max=(-1, 1))
+                    del output
+                    torch.cuda.empty_cache()
+                except RuntimeError as error:
+                    print(f'\tFailed inference for CodeFormer: {error}.')
+                    restored_face = cropped_face
+
+                restored_face = restored_face.astype('uint8')
+                face_helper.add_restored_face(restored_face)
+
+
+            face_helper.get_inverse_affine(None)
+
+            restored_img = face_helper.paste_faces_to_input_image()
+
+            res = Image.fromarray(restored_img)
+
+            if strength < 1.0:
+                # Resize the image to the new image if the sizes have changed
+                if restored_img.size != image.size:
+                    image = image.resize(res.size)
+                res = Image.blend(image, res, strength)
+
+            cf = None
+
+            return res
--- a/ldm/restoration/codeformer/codeformer_arch.py
+++ b/ldm/restoration/codeformer/codeformer_arch.py
@ -0,0 +1,276 @@
+import math
+import numpy as np
+import torch
+from torch import nn, Tensor
+import torch.nn.functional as F
+from typing import Optional, List
+
+from ldm.restoration.codeformer.vqgan_arch import *
+from basicsr.utils import get_root_logger
+from basicsr.utils.registry import ARCH_REGISTRY
+
+def calc_mean_std(feat, eps=1e-5):
+    """Return the per-channel mean and std of a 4D feature map.
+
+    Statistics are taken over all trailing positions of each
+    (batch, channel) slice; they feed adaptive_instance_normalization.
+
+    Args:
+        feat (Tensor): 4D tensor; the first two dims are used as batch
+            and channel.
+        eps (float): Small value added to the variance before the square
+            root to avoid divide-by-zero. Default: 1e-5.
+
+    Returns:
+        tuple: ``(mean, std)``, each viewed as (b, c, 1, 1).
+    """
+    dims = feat.size()
+    assert len(dims) == 4, 'The input feature should be 4D tensor.'
+    batch, channels = dims[:2]
+    flat = feat.view(batch, channels, -1)
+    feat_std = (flat.var(dim=2) + eps).sqrt().view(batch, channels, 1, 1)
+    feat_mean = flat.mean(dim=2).view(batch, channels, 1, 1)
+    return feat_mean, feat_std
+
+
+def adaptive_instance_normalization(content_feat, style_feat):
+    """Adaptive instance normalization.
+
+    Whitens ``content_feat`` with its own per-channel mean/std and then
+    re-scales and re-shifts it with the per-channel mean/std of
+    ``style_feat``, so the reference features take on the color and
+    illumination statistics of the degraded features.
+
+    Args:
+        content_feat (Tensor): The reference feature.
+        style_feat (Tensor): The degraded feature.
+
+    Returns:
+        Tensor: ``content_feat`` re-normalized, same size as ``content_feat``.
+    """
+    target_size = content_feat.size()
+    content_mean, content_std = calc_mean_std(content_feat)
+    style_mean, style_std = calc_mean_std(style_feat)
+    whitened = (content_feat - content_mean.expand(target_size)) / content_std.expand(target_size)
+    return whitened * style_std.expand(target_size) + style_mean.expand(target_size)
+
+
+class PositionEmbeddingSine(nn.Module):
+    """Fixed 2D sine/cosine position embedding for image feature maps.
+
+    A generalization of the "Attention is all you need" positional encoding
+    to two spatial axes: each axis gets ``num_pos_feats`` channels of
+    interleaved sin/cos values, and the y channels are concatenated before
+    the x channels.
+
+    Args:
+        num_pos_feats (int): Channels produced per axis (output has twice this).
+        temperature (float): Base of the geometric frequency progression.
+        normalize (bool): If True, rescale positions to ``[0, scale]``.
+        scale (float | None): Range used when normalizing; defaults to 2*pi.
+
+    Raises:
+        ValueError: If ``scale`` is given while ``normalize`` is False.
+    """
+
+    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
+        super().__init__()
+        if scale is not None and normalize is False:
+            raise ValueError("normalize should be True if scale is passed")
+        self.num_pos_feats = num_pos_feats
+        self.temperature = temperature
+        self.normalize = normalize
+        self.scale = 2 * math.pi if scale is None else scale
+
+    def forward(self, x, mask=None):
+        """Return the embedding for ``x`` as a (B, 2*num_pos_feats, H, W) tensor.
+
+        ``mask`` is a boolean (B, H, W) tensor whose True entries are
+        excluded from the running position count; all-False when omitted.
+        """
+        if mask is None:
+            mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
+        valid = ~mask
+        # 1-based running count of valid positions along each axis
+        y_embed = valid.cumsum(1, dtype=torch.float32)
+        x_embed = valid.cumsum(2, dtype=torch.float32)
+        if self.normalize:
+            eps = 1e-6
+            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+
+        freq_idx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+        dim_t = self.temperature ** (2 * (freq_idx // 2) / self.num_pos_feats)
+
+        def _interleave(embed):
+            # even channels -> sin, odd channels -> cos, interleaved back together
+            scaled = embed[:, :, :, None] / dim_t
+            return torch.stack(
+                (scaled[:, :, :, 0::2].sin(), scaled[:, :, :, 1::2].cos()), dim=4
+            ).flatten(3)
+
+        pos = torch.cat((_interleave(y_embed), _interleave(x_embed)), dim=3)
+        return pos.permute(0, 3, 1, 2)
+
+def _get_activation_fn(activation):
+    """Return the functional activation named by ``activation``.
+
+    Args:
+        activation (str): One of "relu", "gelu" or "glu".
+
+    Returns:
+        The matching function from ``torch.nn.functional``.
+
+    Raises:
+        RuntimeError: If ``activation`` is not one of the supported names.
+    """
+    if activation == "relu":
+        return F.relu
+    if activation == "gelu":
+        return F.gelu
+    if activation == "glu":
+        return F.glu
+    # The message lists every accepted name, including "glu".
+    raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.")
+
+
+class TransformerSALayer(nn.Module):
+    """Pre-norm transformer layer: self-attention followed by an MLP.
+
+    Both sub-layers apply LayerNorm to their input and add their output
+    back onto the residual stream.
+
+    Args:
+        embed_dim (int): Token embedding width.
+        nhead (int): Number of attention heads.
+        dim_mlp (int): Hidden width of the feed-forward MLP.
+        dropout (float): Dropout probability used in attention and MLP.
+        activation (str): Name understood by ``_get_activation_fn``.
+    """
+
+    def __init__(self, embed_dim, nhead=8, dim_mlp=2048, dropout=0.0, activation="gelu"):
+        super().__init__()
+        self.self_attn = nn.MultiheadAttention(embed_dim, nhead, dropout=dropout)
+        # Implementation of Feedforward model - MLP
+        self.linear1 = nn.Linear(embed_dim, dim_mlp)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_mlp, embed_dim)
+
+        self.norm1 = nn.LayerNorm(embed_dim)
+        self.norm2 = nn.LayerNorm(embed_dim)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+
+        self.activation = _get_activation_fn(activation)
+
+    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
+        """Add ``pos`` to ``tensor``; return ``tensor`` unchanged when ``pos`` is None."""
+        if pos is None:
+            return tensor
+        return tensor + pos
+
+    def forward(self, tgt,
+                tgt_mask: Optional[Tensor] = None,
+                tgt_key_padding_mask: Optional[Tensor] = None,
+                query_pos: Optional[Tensor] = None):
+        """Apply self-attention and the MLP to ``tgt`` and return the result.
+
+        ``query_pos`` is added to the queries and keys only; the values are
+        the normalized ``tgt`` without positional information.
+        """
+        # self attention
+        normed = self.norm1(tgt)
+        qk = self.with_pos_embed(normed, query_pos)
+        attn_out, _ = self.self_attn(qk, qk, value=normed, attn_mask=tgt_mask,
+                                     key_padding_mask=tgt_key_padding_mask)
+        tgt = tgt + self.dropout1(attn_out)
+
+        # ffn
+        hidden = self.activation(self.linear1(self.norm2(tgt)))
+        return tgt + self.dropout2(self.linear2(self.dropout(hidden)))
+
+class Fuse_sft_block(nn.Module):
+    """Fuse encoder features into decoder features with a scale/shift modulation.
+
+    The concatenated encoder and decoder features pass through a ResBlock;
+    two small conv branches then predict a per-pixel scale and shift that
+    modulate the decoder features. The modulation is added as a residual
+    weighted by ``w``.
+
+    Args:
+        in_ch (int): Channels of each input feature map.
+        out_ch (int): Channels of the fused output.
+    """
+
+    @staticmethod
+    def _conv_branch(in_ch, out_ch):
+        # conv -> LeakyReLU -> conv, shared layout of the scale and shift heads
+        return nn.Sequential(
+            nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
+            nn.LeakyReLU(0.2, True),
+            nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1))
+
+    def __init__(self, in_ch, out_ch):
+        super().__init__()
+        self.encode_enc = ResBlock(2*in_ch, out_ch)
+        self.scale = self._conv_branch(in_ch, out_ch)
+        self.shift = self._conv_branch(in_ch, out_ch)
+
+    def forward(self, enc_feat, dec_feat, w=1):
+        """Return ``dec_feat + w * (dec_feat * scale + shift)``."""
+        fused = self.encode_enc(torch.cat([enc_feat, dec_feat], dim=1))
+        scale = self.scale(fused)
+        shift = self.shift(fused)
+        return dec_feat + w * (dec_feat * scale + shift)
+
+
+@ARCH_REGISTRY.register()
+class CodeFormer(VQAutoEncoder):
+    """VQAutoEncoder extended with a transformer that predicts codebook indices.
+
+    The encoder output is linearly embedded, passed through ``n_layers``
+    self-attention layers and projected to per-position logits over the
+    codebook. The top-scoring code at each position is looked up in
+    ``self.quantize`` and decoded by ``self.generator``. When ``w > 0``,
+    encoder features are fused back into the generator through
+    ``self.fuse_convs_dict``.
+    """
+    def __init__(self, dim_embd=512, n_head=8, n_layers=9, 
+                codebook_size=1024, latent_size=256,
+                connect_list=['32', '64', '128', '256'],
+                fix_modules=['quantize','generator']):
+        """Build the auto-encoder, the transformer and the fusion blocks.
+
+        Args:
+            dim_embd: Transformer embedding width.
+            n_head: Attention heads per transformer layer.
+            n_layers: Number of transformer layers.
+            codebook_size: Number of codebook entries (also the logit width).
+            latent_size: Number of rows of the learned position embedding.
+            connect_list: Feature-map sizes (as strings) at which encoder
+                features are fused into the generator.
+            fix_modules: Attribute names whose parameters are frozen;
+                ``None`` freezes nothing.
+        """
+        # NOTE(review): the list defaults above are shared between calls; they
+        # are only read in this class, never mutated.
+        # Positional args map onto VQAutoEncoder.__init__:
+        # img_size=512, nf=64, ch_mult, quantizer='nearest', res_blocks=2,
+        # attn_resolutions=[16], codebook_size.
+        super(CodeFormer, self).__init__(512, 64, [1, 2, 2, 4, 4, 8], 'nearest',2, [16], codebook_size)
+
+        # Freeze the requested sub-modules (no gradients for their parameters).
+        if fix_modules is not None:
+            for module in fix_modules:
+                for param in getattr(self, module).parameters():
+                    param.requires_grad = False
+
+        self.connect_list = connect_list
+        self.n_layers = n_layers
+        self.dim_embd = dim_embd
+        self.dim_mlp = dim_embd*2
+
+        # Learned position embedding, one row per latent position.
+        self.position_emb = nn.Parameter(torch.zeros(latent_size, self.dim_embd))
+        # presumably 256 is the encoder's emb_dim (the VQAutoEncoder default) — confirm
+        self.feat_emb = nn.Linear(256, self.dim_embd)
+
+        # transformer
+        self.ft_layers = nn.Sequential(*[TransformerSALayer(embed_dim=dim_embd, nhead=n_head, dim_mlp=self.dim_mlp, dropout=0.0) 
+                                    for _ in range(self.n_layers)])
+
+        # logits_predict head
+        self.idx_pred_layer = nn.Sequential(
+            nn.LayerNorm(dim_embd),
+            nn.Linear(dim_embd, codebook_size, bias=False))
+        
+        # Channel count of the feature map at each spatial size (keyed by size).
+        self.channels = {
+            '16': 512,
+            '32': 256,
+            '64': 256,
+            '128': 128,
+            '256': 128,
+            '512': 64,
+        }
+
+        # Both dicts map a feature-map size to an index into
+        # self.encoder.blocks / self.generator.blocks (see forward).
+        # after second residual block for > 16, before attn layer for ==16
+        self.fuse_encoder_block = {'512':2, '256':5, '128':8, '64':11, '32':14, '16':18}
+        # after first residual block for > 16, before attn layer for ==16
+        self.fuse_generator_block = {'16':6, '32': 9, '64':12, '128':15, '256':18, '512':21}
+
+        # fuse_convs_dict
+        self.fuse_convs_dict = nn.ModuleDict()
+        for f_size in self.connect_list:
+            in_ch = self.channels[f_size]
+            self.fuse_convs_dict[f_size] = Fuse_sft_block(in_ch, in_ch)
+
+    def _init_weights(self, module):
+        """Initialize ``module`` in place.
+
+        Linear/Embedding weights are drawn from N(0, 0.02) and Linear biases
+        zeroed; LayerNorm is reset to weight 1 and bias 0. This method is not
+        called from within this class.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            module.weight.data.normal_(mean=0.0, std=0.02)
+            if isinstance(module, nn.Linear) and module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+    def forward(self, x, w=0, detach_16=True, code_only=False, adain=False):
+        """Encode ``x``, predict codebook indices and decode them.
+
+        Args:
+            x: Input image batch fed to ``self.encoder``.
+            w: Fusion weight; encoder features are fused into the generator
+                only when ``w > 0``.
+            detach_16: If True, detach the quantized features before decoding.
+            code_only: If True, return right after the logits are computed.
+            adain: If True, match the quantized features' channel statistics
+                to those of the encoder output.
+
+        Returns:
+            ``(logits, lq_feat)`` when ``code_only`` is True, otherwise
+            ``(out, logits, lq_feat)`` where ``out`` is the generator output.
+        """
+        # ################### Encoder #####################
+        enc_feat_dict = {}
+        out_list = [self.fuse_encoder_block[f_size] for f_size in self.connect_list]
+        for i, block in enumerate(self.encoder.blocks):
+            x = block(x) 
+            if i in out_list:
+                # keyed by the feature map's width so the generator can look it up
+                enc_feat_dict[str(x.shape[-1])] = x.clone()
+
+        lq_feat = x
+        # ################# Transformer ###################
+        # quant_feat, codebook_loss, quant_stats = self.quantize(lq_feat)
+        # (latent_size, dim_embd) -> (latent_size, B, dim_embd)
+        pos_emb = self.position_emb.unsqueeze(1).repeat(1,x.shape[0],1)
+        # BCHW -> BC(HW) -> (HW)BC
+        feat_emb = self.feat_emb(lq_feat.flatten(2).permute(2,0,1))
+        query_emb = feat_emb
+        # Transformer encoder
+        for layer in self.ft_layers:
+            query_emb = layer(query_emb, query_pos=pos_emb)
+
+        # output logits
+        logits = self.idx_pred_layer(query_emb) # (hw)bn
+        logits = logits.permute(1,0,2) # (hw)bn -> b(hw)n
+
+        if code_only: # for training stage II
+            # logits doesn't need softmax before cross_entropy loss
+            return logits, lq_feat
+
+        # ################# Quantization ###################
+        # if self.training:
+        #     quant_feat = torch.einsum('btn,nc->btc', [soft_one_hot, self.quantize.embedding.weight])
+        #     # b(hw)c -> bc(hw) -> bchw
+        #     quant_feat = quant_feat.permute(0,2,1).view(lq_feat.shape)
+        # ------------
+        soft_one_hot = F.softmax(logits, dim=2)
+        # keep only the single most likely code per position
+        _, top_idx = torch.topk(soft_one_hot, 1, dim=2)
+        # NOTE(review): the latent grid is hard-coded as 16x16 with 256 channels
+        quant_feat = self.quantize.get_codebook_feat(top_idx, shape=[x.shape[0],16,16,256])
+        # preserve gradients
+        # quant_feat = lq_feat + (quant_feat - lq_feat).detach()
+
+        if detach_16:
+            quant_feat = quant_feat.detach() # for training stage III
+        if adain:
+            quant_feat = adaptive_instance_normalization(quant_feat, lq_feat)
+
+        # ################## Generator ####################
+        x = quant_feat
+        fuse_list = [self.fuse_generator_block[f_size] for f_size in self.connect_list]
+
+        for i, block in enumerate(self.generator.blocks):
+            x = block(x) 
+            if i in fuse_list: # fuse after i-th block
+                f_size = str(x.shape[-1])
+                if w>0:
+                    # encoder features are detached: no gradient flows back through them
+                    x = self.fuse_convs_dict[f_size](enc_feat_dict[f_size].detach(), x, w)
+        out = x
+        # logits doesn't need softmax before cross_entropy loss
+        return out, logits, lq_feat
--- a/ldm/restoration/codeformer/vqgan_arch.py
+++ b/ldm/restoration/codeformer/vqgan_arch.py
@ -0,0 +1,435 @@
+'''
+VQGAN code, adapted from the original created by the Unleashing Transformers authors:
+https://github.com/samb-t/unleashing-transformers/blob/master/models/vqgan.py
+
+'''
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import copy
+from basicsr.utils import get_root_logger
+from basicsr.utils.registry import ARCH_REGISTRY
+
+def normalize(in_channels):
+    """Return the GroupNorm layer (32 groups, affine) used throughout this module."""
+    return nn.GroupNorm(32, in_channels, eps=1e-6, affine=True)
+    
+
+@torch.jit.script
+def swish(x):
+    """Swish activation: ``x * sigmoid(x)``."""
+    gate = torch.sigmoid(x)
+    return x * gate
+
+
+#  Define VQVAE classes
+class VectorQuantizer(nn.Module):
+    """Nearest-neighbour vector quantizer over a learned codebook.
+
+    Args:
+        codebook_size: Number of embeddings in the codebook.
+        emb_dim: Dimension of each embedding.
+        beta: Weight of the second term of the quantization loss
+            (commitment cost).
+    """
+    def __init__(self, codebook_size, emb_dim, beta):
+        super(VectorQuantizer, self).__init__()
+        self.codebook_size = codebook_size  # number of embeddings
+        self.emb_dim = emb_dim  # dimension of embedding
+        self.beta = beta  # commitment cost used in loss term, beta * ||z_e(x)-sg[e]||^2
+        self.embedding = nn.Embedding(self.codebook_size, self.emb_dim)
+        # start from small uniform values scaled by the codebook size
+        self.embedding.weight.data.uniform_(-1.0 / self.codebook_size, 1.0 / self.codebook_size)
+
+    def forward(self, z):
+        """Quantize ``z`` (batch, channel, height, width) to its nearest codes.
+
+        Returns:
+            ``(z_q, loss, stats)``: the quantized tensor in the input layout
+            (with straight-through gradients), the quantization loss, and a
+            dict with ``perplexity``, ``min_encodings``,
+            ``min_encoding_indices``, ``min_encoding_scores`` and
+            ``mean_distance``.
+        """
+        # reshape z -> (batch, height, width, channel) and flatten
+        z = z.permute(0, 2, 3, 1).contiguous()
+        z_flattened = z.view(-1, self.emb_dim)
+
+        # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
+        d = (z_flattened ** 2).sum(dim=1, keepdim=True) + (self.embedding.weight**2).sum(1) - \
+            2 * torch.matmul(z_flattened, self.embedding.weight.t())
+
+        mean_distance = torch.mean(d)
+        # find closest encodings
+        # min_encoding_indices = torch.argmin(d, dim=1).unsqueeze(1)
+        min_encoding_scores, min_encoding_indices = torch.topk(d, 1, dim=1, largest=False)
+        # [0-1], higher score, higher confidence
+        min_encoding_scores = torch.exp(-min_encoding_scores/10)
+
+        # one-hot rows selecting the nearest code of every flattened vector
+        min_encodings = torch.zeros(min_encoding_indices.shape[0], self.codebook_size).to(z)
+        min_encodings.scatter_(1, min_encoding_indices, 1)
+
+        # get quantized latent vectors
+        z_q = torch.matmul(min_encodings, self.embedding.weight).view(z.shape)
+        # compute loss for embedding
+        loss = torch.mean((z_q.detach()-z)**2) + self.beta * torch.mean((z_q - z.detach()) ** 2)
+        # preserve gradients
+        z_q = z + (z_q - z).detach()
+
+        # perplexity
+        e_mean = torch.mean(min_encodings, dim=0)
+        perplexity = torch.exp(-torch.sum(e_mean * torch.log(e_mean + 1e-10)))
+        # reshape back to match original input shape
+        z_q = z_q.permute(0, 3, 1, 2).contiguous()
+
+        return z_q, loss, {
+            "perplexity": perplexity,
+            "min_encodings": min_encodings,
+            "min_encoding_indices": min_encoding_indices,
+            "min_encoding_scores": min_encoding_scores,
+            "mean_distance": mean_distance
+            }
+
+    def get_codebook_feat(self, indices, shape):
+        """Look up codebook vectors for ``indices``.
+
+        Args:
+            indices: Integer code indices; flattened to one index per row.
+            shape: ``(batch, height, width, channel)`` to reshape the result
+                to before permuting to channel-first, or ``None`` to return
+                the flat ``(num_indices, emb_dim)`` matrix.
+        """
+        # input indices: batch*token_num -> (batch*token_num)*1
+        # shape: batch, height, width, channel
+        indices = indices.view(-1,1)
+        # one-hot matrix created with the dtype/device of ``indices``
+        min_encodings = torch.zeros(indices.shape[0], self.codebook_size).to(indices)
+        min_encodings.scatter_(1, indices, 1)
+        # get quantized latent vectors
+        z_q = torch.matmul(min_encodings.float(), self.embedding.weight)
+
+        if shape is not None:  # reshape back to match original input shape
+            z_q = z_q.view(shape).permute(0, 3, 1, 2).contiguous()
+
+        return z_q
+
+
+class GumbelQuantizer(nn.Module):
+    """Quantizer that samples codebook entries with the Gumbel-softmax trick.
+
+    Args:
+        codebook_size: Number of embeddings.
+        emb_dim: Dimension of each embedding.
+        num_hiddens: Channels of the incoming feature map.
+        straight_through: Use hard (one-hot) samples while training.
+        kl_weight: Weight of the KL-to-uniform-prior term.
+        temp_init: Gumbel-softmax temperature.
+    """
+
+    def __init__(self, codebook_size, emb_dim, num_hiddens, straight_through=False, kl_weight=5e-4, temp_init=1.0):
+        super().__init__()
+        self.codebook_size = codebook_size  # number of embeddings
+        self.emb_dim = emb_dim  # dimension of embedding
+        self.straight_through = straight_through
+        self.temperature = temp_init
+        self.kl_weight = kl_weight
+        # 1x1 conv projecting the last encoder layer to per-code logits
+        self.proj = nn.Conv2d(num_hiddens, codebook_size, 1)
+        self.embed = nn.Embedding(codebook_size, emb_dim)
+
+    def forward(self, z):
+        """Return ``(z_q, kl_term, {"min_encoding_indices": ...})`` for ``z``."""
+        # outside training the samples are always hard one-hot vectors
+        hard = True if not self.training else self.straight_through
+
+        logits = self.proj(z)
+        soft_one_hot = F.gumbel_softmax(logits, tau=self.temperature, dim=1, hard=hard)
+        z_q = torch.einsum("b n h w, n d -> b d h w", soft_one_hot, self.embed.weight)
+
+        # + kl divergence to the prior loss
+        qy = F.softmax(logits, dim=1)
+        kl = torch.sum(qy * torch.log(qy * self.codebook_size + 1e-10), dim=1).mean()
+        diff = self.kl_weight * kl
+
+        stats = {
+            "min_encoding_indices": soft_one_hot.argmax(dim=1)
+        }
+        return z_q, diff, stats
+
+
+class Downsample(nn.Module):
+    """Halve the spatial size with a stride-2 3x3 convolution."""
+
+    def __init__(self, in_channels):
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
+
+    def forward(self, x):
+        # zero-pad one pixel on the right and bottom only; the conv itself is unpadded
+        padded = F.pad(x, (0, 1, 0, 1), mode="constant", value=0)
+        return self.conv(padded)
+
+
+class Upsample(nn.Module):
+    """Double the spatial size by nearest-neighbour interpolation, then a 3x3 conv."""
+
+    def __init__(self, in_channels):
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, x):
+        enlarged = F.interpolate(x, scale_factor=2.0, mode="nearest")
+        return self.conv(enlarged)
+
+
+class ResBlock(nn.Module):
+    """Residual block of two (GroupNorm -> swish -> 3x3 conv) stages.
+
+    When the channel count changes, the skip path goes through a 1x1
+    convolution (``conv_out``) so it can be added to the main path.
+
+    Args:
+        in_channels (int): Channels of the input feature map.
+        out_channels (int | None): Channels of the output; ``None`` keeps
+            ``in_channels``.
+    """
+
+    def __init__(self, in_channels, out_channels=None):
+        super(ResBlock, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = in_channels if out_channels is None else out_channels
+        # Build every layer from the resolved self.out_channels so that the
+        # documented default out_channels=None does not reach GroupNorm/Conv2d.
+        self.norm1 = normalize(self.in_channels)
+        self.conv1 = nn.Conv2d(self.in_channels, self.out_channels, kernel_size=3, stride=1, padding=1)
+        self.norm2 = normalize(self.out_channels)
+        self.conv2 = nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1, padding=1)
+        if self.in_channels != self.out_channels:
+            self.conv_out = nn.Conv2d(self.in_channels, self.out_channels, kernel_size=1, stride=1, padding=0)
+
+    def forward(self, x_in):
+        """Return the block output for ``x_in`` (main path plus skip path)."""
+        x = x_in
+        x = self.norm1(x)
+        x = swish(x)
+        x = self.conv1(x)
+        x = self.norm2(x)
+        x = swish(x)
+        x = self.conv2(x)
+        if self.in_channels != self.out_channels:
+            # project the skip connection to the new channel count
+            x_in = self.conv_out(x_in)
+
+        return x + x_in
+
+
+class AttnBlock(nn.Module):
+    """Single-head self-attention over the spatial positions of a feature map.
+
+    Queries, keys and values come from 1x1 convolutions of the normalized
+    input; the attended result is projected by another 1x1 convolution and
+    added back onto the input.
+    """
+
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+
+        self.norm = normalize(in_channels)
+        # all four projections are 1x1 convolutions that keep the channel count
+        pointwise = dict(kernel_size=1, stride=1, padding=0)
+        self.q = torch.nn.Conv2d(in_channels, in_channels, **pointwise)
+        self.k = torch.nn.Conv2d(in_channels, in_channels, **pointwise)
+        self.v = torch.nn.Conv2d(in_channels, in_channels, **pointwise)
+        self.proj_out = torch.nn.Conv2d(in_channels, in_channels, **pointwise)
+
+    def forward(self, x):
+        normed = self.norm(x)
+        q = self.q(normed)
+        k = self.k(normed)
+        v = self.v(normed)
+
+        # compute attention
+        b, c, h, w = q.shape
+        q = q.reshape(b, c, h*w).permute(0, 2, 1)
+        k = k.reshape(b, c, h*w)
+        attn = torch.bmm(q, k)
+        attn = attn * (int(c)**(-0.5))
+        attn = F.softmax(attn, dim=2)
+
+        # attend to values
+        v = v.reshape(b, c, h*w)
+        attended = torch.bmm(v, attn.permute(0, 2, 1)).reshape(b, c, h, w)
+
+        return x + self.proj_out(attended)
+
+
+class Encoder(nn.Module):
+    """Convolutional encoder stored as a flat ``nn.ModuleList`` of blocks.
+
+    Block order: an initial 3x3 conv; for each entry of ``ch_mult``,
+    ``num_res_blocks`` ResBlocks (each followed by an AttnBlock when the
+    current resolution is in ``attn_resolutions``) and, except after the
+    last level, a Downsample; then ResBlock, AttnBlock, ResBlock; finally a
+    normalization layer and a 3x3 conv to ``emb_dim`` channels.
+
+    Args:
+        in_channels: Channels of the input image.
+        nf: Base channel count, multiplied by each ``ch_mult`` entry.
+        emb_dim: Channels of the encoder output.
+        ch_mult: Channel multiplier per resolution level.
+        num_res_blocks: ResBlocks per resolution level.
+        resolution: Input resolution, used to track ``curr_res``.
+        attn_resolutions: Resolutions at which AttnBlocks are inserted.
+    """
+    def __init__(self, in_channels, nf, emb_dim, ch_mult, num_res_blocks, resolution, attn_resolutions):
+        super().__init__()
+        self.nf = nf
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.attn_resolutions = attn_resolutions
+
+        curr_res = self.resolution
+        # input multiplier of level i is the output multiplier of level i-1 (1 for level 0)
+        in_ch_mult = (1,)+tuple(ch_mult)
+
+        blocks = []
+        # initial convolution
+        blocks.append(nn.Conv2d(in_channels, nf, kernel_size=3, stride=1, padding=1))
+
+        # residual and downsampling blocks, with attention on smaller res (16x16)
+        for i in range(self.num_resolutions):
+            block_in_ch = nf * in_ch_mult[i]
+            block_out_ch = nf * ch_mult[i]
+            for _ in range(self.num_res_blocks):
+                blocks.append(ResBlock(block_in_ch, block_out_ch))
+                block_in_ch = block_out_ch
+                if curr_res in attn_resolutions:
+                    blocks.append(AttnBlock(block_in_ch))
+
+            # no downsampling after the last level
+            if i != self.num_resolutions - 1:
+                blocks.append(Downsample(block_in_ch))
+                curr_res = curr_res // 2
+
+        # non-local attention block
+        blocks.append(ResBlock(block_in_ch, block_in_ch))
+        blocks.append(AttnBlock(block_in_ch))
+        blocks.append(ResBlock(block_in_ch, block_in_ch))
+
+        # normalise and convert to latent size
+        blocks.append(normalize(block_in_ch))
+        blocks.append(nn.Conv2d(block_in_ch, emb_dim, kernel_size=3, stride=1, padding=1))
+        self.blocks = nn.ModuleList(blocks)
+
+    def forward(self, x):
+        """Apply every block in order and return the result."""
+        for block in self.blocks:
+            x = block(x)
+            
+        return x
+
+
+class Generator(nn.Module):
+    """Convolutional decoder stored as a flat ``nn.ModuleList`` of blocks.
+
+    Block order: an initial 3x3 conv from ``emb_dim`` channels; ResBlock,
+    AttnBlock, ResBlock; then, walking ``ch_mult`` in reverse,
+    ``res_blocks`` ResBlocks per level (each followed by an AttnBlock when
+    the current resolution is in ``attn_resolutions``) and, except after
+    the final level, an Upsample; finally a normalization layer and a 3x3
+    conv to 3 output channels.
+
+    Args:
+        nf: Base channel count, multiplied by each ``ch_mult`` entry.
+        emb_dim: Channels of the latent input.
+        ch_mult: Channel multiplier per resolution level.
+        res_blocks: ResBlocks per resolution level.
+        img_size: Output resolution, used to derive the starting resolution.
+        attn_resolutions: Resolutions at which AttnBlocks are inserted.
+    """
+    def __init__(self, nf, emb_dim, ch_mult, res_blocks, img_size, attn_resolutions):
+        super().__init__()
+        self.nf = nf 
+        self.ch_mult = ch_mult 
+        self.num_resolutions = len(self.ch_mult)
+        self.num_res_blocks = res_blocks
+        self.resolution = img_size 
+        self.attn_resolutions = attn_resolutions
+        self.in_channels = emb_dim
+        self.out_channels = 3
+        block_in_ch = self.nf * self.ch_mult[-1]
+        # start at the lowest resolution: img_size halved once per level transition
+        curr_res = self.resolution // 2 ** (self.num_resolutions-1)
+
+        blocks = []
+        # initial conv
+        blocks.append(nn.Conv2d(self.in_channels, block_in_ch, kernel_size=3, stride=1, padding=1))
+
+        # non-local attention block
+        blocks.append(ResBlock(block_in_ch, block_in_ch))
+        blocks.append(AttnBlock(block_in_ch))
+        blocks.append(ResBlock(block_in_ch, block_in_ch))
+
+        for i in reversed(range(self.num_resolutions)):
+            block_out_ch = self.nf * self.ch_mult[i]
+
+            for _ in range(self.num_res_blocks):
+                blocks.append(ResBlock(block_in_ch, block_out_ch))
+                block_in_ch = block_out_ch
+
+                if curr_res in self.attn_resolutions:
+                    blocks.append(AttnBlock(block_in_ch))
+
+            # no upsampling after the final (highest-resolution) level
+            if i != 0:
+                blocks.append(Upsample(block_in_ch))
+                curr_res = curr_res * 2
+
+        blocks.append(normalize(block_in_ch))
+        blocks.append(nn.Conv2d(block_in_ch, self.out_channels, kernel_size=3, stride=1, padding=1))
+
+        self.blocks = nn.ModuleList(blocks)
+   
+
+    def forward(self, x):
+        """Apply every block in order and return the result."""
+        for block in self.blocks:
+            x = block(x)
+            
+        return x
+
+  
+@ARCH_REGISTRY.register()
+class VQAutoEncoder(nn.Module):
+    """Encoder -> quantizer -> generator auto-encoder.
+
+    Args:
+        img_size: Input/output resolution.
+        nf: Base channel count.
+        ch_mult: Channel multiplier per resolution level.
+        quantizer: "nearest" (VectorQuantizer) or "gumbel" (GumbelQuantizer).
+        res_blocks: ResBlocks per resolution level.
+        attn_resolutions: Resolutions at which attention blocks are used.
+        codebook_size: Number of codebook entries.
+        emb_dim: Dimension of each codebook entry / latent channel count.
+        beta: Commitment cost for the "nearest" quantizer.
+        gumbel_straight_through: ``straight_through`` for the "gumbel" quantizer.
+        gumbel_kl_weight: ``kl_weight`` for the "gumbel" quantizer.
+        model_path: Optional checkpoint to load; must contain a
+            ``params_ema`` or ``params`` state dict.
+
+    Raises:
+        ValueError: If the checkpoint holds neither ``params_ema`` nor ``params``.
+    """
+
+    def __init__(self, img_size, nf, ch_mult, quantizer="nearest", res_blocks=2, attn_resolutions=[16], codebook_size=1024, emb_dim=256,
+                beta=0.25, gumbel_straight_through=False, gumbel_kl_weight=1e-8, model_path=None):
+        super().__init__()
+        logger = get_root_logger()
+        self.in_channels = 3
+        self.nf = nf
+        self.n_blocks = res_blocks
+        self.codebook_size = codebook_size
+        self.embed_dim = emb_dim
+        self.ch_mult = ch_mult
+        self.resolution = img_size
+        self.attn_resolutions = attn_resolutions
+        self.quantizer_type = quantizer
+        self.encoder = Encoder(
+            self.in_channels,
+            self.nf,
+            self.embed_dim,
+            self.ch_mult,
+            self.n_blocks,
+            self.resolution,
+            self.attn_resolutions
+        )
+        # NOTE: any other quantizer name leaves self.quantize undefined.
+        if self.quantizer_type == "nearest":
+            self.beta = beta #0.25
+            self.quantize = VectorQuantizer(self.codebook_size, self.embed_dim, self.beta)
+        elif self.quantizer_type == "gumbel":
+            self.gumbel_num_hiddens = emb_dim
+            self.straight_through = gumbel_straight_through
+            self.kl_weight = gumbel_kl_weight
+            self.quantize = GumbelQuantizer(
+                self.codebook_size,
+                self.embed_dim,
+                self.gumbel_num_hiddens,
+                self.straight_through,
+                self.kl_weight
+            )
+        self.generator = Generator(
+            self.nf,
+            self.embed_dim,
+            self.ch_mult,
+            self.n_blocks,
+            self.resolution,
+            self.attn_resolutions
+        )
+
+        if model_path is not None:
+            # NOTE: torch.load unpickles the file; only load trusted checkpoints.
+            # The checkpoint is read once and reused for the key lookup and the load.
+            chkpt = torch.load(model_path, map_location='cpu')
+            if 'params_ema' in chkpt:
+                self.load_state_dict(chkpt['params_ema'])
+                logger.info(f'vqgan is loaded from: {model_path} [params_ema]')
+            elif 'params' in chkpt:
+                self.load_state_dict(chkpt['params'])
+                logger.info(f'vqgan is loaded from: {model_path} [params]')
+            else:
+                raise ValueError('Wrong params!')
+
+
+    def forward(self, x):
+        """Return ``(reconstruction, codebook_loss, quant_stats)`` for ``x``."""
+        x = self.encoder(x)
+        quant, codebook_loss, quant_stats = self.quantize(x)
+        x = self.generator(quant)
+        return x, codebook_loss, quant_stats
+
+
+
+# patch based discriminator
+@ARCH_REGISTRY.register()
+class VQGANDiscriminator(nn.Module):
+    """Convolutional discriminator producing a 1-channel prediction map.
+
+    Args:
+        nc: Channels of the input image.
+        ndf: Base number of filters.
+        n_layers: Number of stride-2 stages (including the first conv).
+        model_path: Optional checkpoint to load; must contain a
+            ``params_d`` or ``params`` state dict.
+
+    Raises:
+        ValueError: If the checkpoint holds neither ``params_d`` nor ``params``.
+    """
+
+    def __init__(self, nc=3, ndf=64, n_layers=4, model_path=None):
+        super().__init__()
+
+        layers = [nn.Conv2d(nc, ndf, kernel_size=4, stride=2, padding=1), nn.LeakyReLU(0.2, True)]
+        ndf_mult = 1
+        ndf_mult_prev = 1
+        for n in range(1, n_layers):  # gradually increase the number of filters
+            ndf_mult_prev = ndf_mult
+            ndf_mult = min(2 ** n, 8)  # filter multiplier is capped at 8
+            layers += [
+                nn.Conv2d(ndf * ndf_mult_prev, ndf * ndf_mult, kernel_size=4, stride=2, padding=1, bias=False),
+                nn.BatchNorm2d(ndf * ndf_mult),
+                nn.LeakyReLU(0.2, True)
+            ]
+
+        ndf_mult_prev = ndf_mult
+        ndf_mult = min(2 ** n_layers, 8)
+
+        # one more conv stage at stride 1
+        layers += [
+            nn.Conv2d(ndf * ndf_mult_prev, ndf * ndf_mult, kernel_size=4, stride=1, padding=1, bias=False),
+            nn.BatchNorm2d(ndf * ndf_mult),
+            nn.LeakyReLU(0.2, True)
+        ]
+
+        layers += [
+            nn.Conv2d(ndf * ndf_mult, 1, kernel_size=4, stride=1, padding=1)]  # output 1 channel prediction map
+        self.main = nn.Sequential(*layers)
+
+        if model_path is not None:
+            # NOTE: torch.load unpickles the file; only load trusted checkpoints.
+            # The checkpoint is read once and reused for the key lookup and the load.
+            chkpt = torch.load(model_path, map_location='cpu')
+            if 'params_d' in chkpt:
+                self.load_state_dict(chkpt['params_d'])
+            elif 'params' in chkpt:
+                self.load_state_dict(chkpt['params'])
+            else:
+                raise ValueError('Wrong params!')
+
+    def forward(self, x):
+        """Return the prediction map for ``x``."""
+        return self.main(x)
--- a/requirements.txt
+++ b/requirements.txt
@ -14,6 +14,7 @@ pillow
 pip>=22
 pudb
 pytorch-lightning
+scikit-image>=0.19
 streamlit
 # "CompVis/taming-transformers" IS NOT INSTALLABLE
 # This is a drop-in replacement
--- a/scripts/dream.py
+++ b/scripts/dream.py
@ -191,11 +191,7 @@ def main_loop(gen, opt, infile):
            else:
                opt.with_variations = None

-        if opt.outdir:
-            if not os.path.exists(opt.outdir):
-                os.makedirs(opt.outdir)
-            current_outdir = opt.outdir
-        elif opt.prompt_as_dir:
+        if opt.prompt_as_dir:
            # sanitize the prompt to a valid folder name
            subdir = path_filter.sub('_', opt.prompt)[:name_max].rstrip(' .')

@ -210,6 +206,8 @@ def main_loop(gen, opt, infile):
            if not os.path.exists(current_outdir):
                os.makedirs(current_outdir)
        else:
+            if not os.path.exists(opt.outdir):
+                os.makedirs(opt.outdir)
            current_outdir = opt.outdir

        # Here is where the images are actually generated!
@ -271,7 +269,7 @@ def main_loop(gen, opt, infile):
                filename   = f'{prefix}.{first_seed}.png'
                formatted_dream_prompt  = opt.dream_prompt_str(seed=first_seed,grid=True,iterations=len(grid_images))
                formatted_dream_prompt += f' # {grid_seeds}'
-                metadata = metadata.dumps(
+                metadata = metadata_dumps(
                    opt,
                    seeds      = grid_seeds,
                    weights    = gen.weights,
--- a/scripts/preload_models.py
+++ b/scripts/preload_models.py
@ -87,11 +87,29 @@ if gfpgan:

    try:
        import urllib.request
-        model_path = 'https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pth'
+        model_url  = 'https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pth'
        model_dest = 'src/gfpgan/experiments/pretrained_models/GFPGANv1.3.pth'
+
+        if not os.path.exists(model_dest):
            print('downloading gfpgan model file...')
-            urllib.request.urlretrieve(model_path,model_dest)
+            urllib.request.urlretrieve(model_url,model_dest)  # the URL now lives in model_url; model_path is no longer assigned
    except Exception:
        import traceback
        print('Error loading GFPGAN:')
        print(traceback.format_exc())
+print('...success')
+
+print('preloading CodeFormer model file...')
+try:
+    import urllib.request
+    model_url  = 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth'
+    model_dest = 'ldm/restoration/codeformer/weights/codeformer.pth'
+    if not os.path.exists(model_dest):  # skip the download when the weights are already on disk
+        print('downloading codeformer model file...')
+        os.makedirs(os.path.dirname(model_dest), exist_ok=True)
+        urllib.request.urlretrieve(model_url, model_dest)  # must be model_url: this section never assigns model_path
+except Exception:  # best-effort preload: report the failure and carry on
+    import traceback
+    print('Error loading CodeFormer:')
+    print(traceback.format_exc())
+print('...success')