mirror of
https://github.com/invoke-ai/InvokeAI
synced 2024-08-30 20:32:17 +00:00
add support for controlnet & sdxl conversion - not fully working
This commit is contained in:
parent
907ff165be
commit
5607794dbb
@ -55,6 +55,7 @@ from invokeai.frontend.install.widgets import (
|
||||
from invokeai.backend.install.legacy_arg_parsing import legacy_parser
|
||||
from invokeai.backend.install.model_install_backend import (
|
||||
hf_download_from_pretrained,
|
||||
hf_download_with_resume,
|
||||
InstallSelections,
|
||||
ModelInstall,
|
||||
)
|
||||
@ -204,6 +205,13 @@ def download_conversion_models():
|
||||
pipeline = CLIPTextModel.from_pretrained(repo_id, subfolder="text_encoder", **kwargs)
|
||||
pipeline.save_pretrained(target_dir / 'stable-diffusion-2-clip' / 'text_encoder', safe_serialization=True)
|
||||
|
||||
repo_id = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
|
||||
_, model_name = repo_id.split('/')
|
||||
tokenizer_2 = CLIPTokenizer.from_pretrained(repo_id, **kwargs)
|
||||
tokenizer_2.save_pretrained(target_dir / model_name, safe_serialization=True)
|
||||
# for some reason config.json never downloads
|
||||
hf_download_with_resume(repo_id, target_dir / model_name, "config.json")
|
||||
|
||||
# VAE
|
||||
logger.info('Downloading stable diffusion VAE')
|
||||
vae = AutoencoderKL.from_pretrained('stabilityai/sd-vae-ft-mse', **kwargs)
|
||||
|
@ -58,7 +58,15 @@ LEGACY_CONFIGS = {
|
||||
SchedulerPredictionType.Epsilon: 'v2-inpainting-inference.yaml',
|
||||
SchedulerPredictionType.VPrediction: 'v2-inpainting-inference-v.yaml',
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
BaseModelType.StableDiffusionXL: {
|
||||
ModelVariantType.Normal: 'sd_xl_base.yaml',
|
||||
},
|
||||
|
||||
BaseModelType.StableDiffusionXLRefiner: {
|
||||
ModelVariantType.Normal: 'sd_xl_refiner.yaml',
|
||||
},
|
||||
}
|
||||
|
||||
@dataclass
|
||||
@ -329,6 +337,7 @@ class ModelInstall(object):
|
||||
description = str(description),
|
||||
model_format = info.format,
|
||||
)
|
||||
legacy_conf = None
|
||||
if info.model_type == ModelType.Main:
|
||||
attributes.update(dict(variant = info.variant_type,))
|
||||
if info.format=="checkpoint":
|
||||
@ -343,11 +352,17 @@ class ModelInstall(object):
|
||||
except KeyError:
|
||||
legacy_conf = Path(self.config.legacy_conf_dir, 'v1-inference.yaml') # best guess
|
||||
|
||||
attributes.update(
|
||||
dict(
|
||||
config = str(legacy_conf)
|
||||
)
|
||||
if info.model_type == ModelType.ControlNet and info.format=="checkpoint":
|
||||
possible_conf = path.with_suffix('.yaml')
|
||||
if possible_conf.exists():
|
||||
legacy_conf = str(self.relative_to_root(possible_conf))
|
||||
|
||||
if legacy_conf:
|
||||
attributes.update(
|
||||
dict(
|
||||
config = str(legacy_conf)
|
||||
)
|
||||
)
|
||||
return attributes
|
||||
|
||||
def relative_to_root(self, path: Path)->Path:
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -673,6 +673,7 @@ class ModelManager(object):
|
||||
|
||||
self.models[model_key] = model_config
|
||||
self.commit()
|
||||
|
||||
return AddModelResult(
|
||||
name = model_name,
|
||||
model_type = model_type,
|
||||
@ -840,7 +841,7 @@ class ModelManager(object):
|
||||
Returns the preamble for the config file.
|
||||
"""
|
||||
return textwrap.dedent(
|
||||
"""\
|
||||
"""
|
||||
# This file describes the alternative machine learning models
|
||||
# available to InvokeAI script.
|
||||
#
|
||||
|
@ -253,7 +253,8 @@ class PipelineCheckpointProbe(CheckpointProbeBase):
|
||||
return BaseModelType.StableDiffusion1
|
||||
if key_name in state_dict and state_dict[key_name].shape[-1] == 1024:
|
||||
return BaseModelType.StableDiffusion2
|
||||
# TODO: Verify that this is correct! Need an XL checkpoint file for this.
|
||||
# TODO: This is just a guess based on N=1
|
||||
key_name = 'model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_k.weight'
|
||||
if key_name in state_dict and state_dict[key_name].shape[-1] == 2048:
|
||||
return BaseModelType.StableDiffusionXL
|
||||
raise InvalidModelException("Cannot determine base type")
|
||||
|
@ -1,7 +1,8 @@
|
||||
import os
|
||||
import torch
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
from pathlib import Path
|
||||
from typing import Optional, Literal
|
||||
from .base import (
|
||||
ModelBase,
|
||||
ModelConfigBase,
|
||||
@ -15,6 +16,7 @@ from .base import (
|
||||
InvalidModelException,
|
||||
ModelNotFoundException,
|
||||
)
|
||||
from invokeai.app.services.config import InvokeAIAppConfig
|
||||
|
||||
class ControlNetModelFormat(str, Enum):
|
||||
Checkpoint = "checkpoint"
|
||||
@ -24,8 +26,12 @@ class ControlNetModel(ModelBase):
|
||||
#model_class: Type
|
||||
#model_size: int
|
||||
|
||||
class Config(ModelConfigBase):
|
||||
model_format: ControlNetModelFormat
|
||||
class DiffusersConfig(ModelConfigBase):
|
||||
model_format: Literal[ControlNetModelFormat.Diffusers]
|
||||
|
||||
class CheckpointConfig(ModelConfigBase):
|
||||
model_format: Literal[ControlNetModelFormat.Checkpoint]
|
||||
config: str
|
||||
|
||||
def __init__(self, model_path: str, base_model: BaseModelType, model_type: ModelType):
|
||||
assert model_type == ModelType.ControlNet
|
||||
@ -99,13 +105,51 @@ class ControlNetModel(ModelBase):
|
||||
|
||||
@classmethod
|
||||
def convert_if_required(
|
||||
cls,
|
||||
model_path: str,
|
||||
output_path: str,
|
||||
config: ModelConfigBase,
|
||||
base_model: BaseModelType,
|
||||
) -> str:
|
||||
if cls.detect_format(model_path) == ControlNetModelFormat.Checkpoint:
|
||||
return _convert_controlnet_ckpt_and_cache(
|
||||
model_path = model_path,
|
||||
model_config = config.config,
|
||||
output_path = output_path,
|
||||
base_model = base_model,
|
||||
)
|
||||
else:
|
||||
return model_path
|
||||
|
||||
@classmethod
|
||||
def _convert_controlnet_ckpt_and_cache(
|
||||
cls,
|
||||
model_path: str,
|
||||
output_path: str,
|
||||
config: ModelConfigBase, # empty config or config of parent model
|
||||
base_model: BaseModelType,
|
||||
) -> str:
|
||||
if cls.detect_format(model_path) != ControlNetModelFormat.Diffusers:
|
||||
raise NotImplementedError("Checkpoint controlnet models currently unsupported")
|
||||
else:
|
||||
return model_path
|
||||
model_config: ControlNetModel.CheckpointConfig,
|
||||
) -> str:
|
||||
"""
|
||||
Convert the controlnet from checkpoint format to diffusers format,
|
||||
cache it to disk, and return Path to converted
|
||||
file. If already on disk then just returns Path.
|
||||
"""
|
||||
app_config = InvokeAIAppConfig.get_config()
|
||||
weights = app_config.root_path / model_path
|
||||
output_path = Path(output_path)
|
||||
|
||||
# return cached version if it exists
|
||||
if output_path.exists():
|
||||
return output_path
|
||||
|
||||
# to avoid circular import errors
|
||||
from ..convert_ckpt_to_diffusers import convert_controlnet_to_diffusers
|
||||
convert_controlnet_to_diffusers(
|
||||
weights,
|
||||
output_path,
|
||||
original_config_file = app_config.root_path / model_config,
|
||||
image_size = 512,
|
||||
scan_needed = True,
|
||||
from_safetensors = weights.suffix == ".safetensors"
|
||||
)
|
||||
return output_path
|
||||
|
@ -48,7 +48,7 @@ class StableDiffusionXLModel(DiffusersModel):
|
||||
if model_format == StableDiffusionXLModelFormat.Checkpoint:
|
||||
if ckpt_config_path:
|
||||
ckpt_config = OmegaConf.load(ckpt_config_path)
|
||||
ckpt_config["model"]["params"]["unet_config"]["params"]["in_channels"]
|
||||
in_channels = ckpt_config["model"]["params"]["unet_config"]["params"]["in_channels"]
|
||||
|
||||
else:
|
||||
checkpoint = read_checkpoint_meta(path)
|
||||
@ -109,6 +109,13 @@ class StableDiffusionXLModel(DiffusersModel):
|
||||
base_model: BaseModelType,
|
||||
) -> str:
|
||||
if isinstance(config, cls.CheckpointConfig):
|
||||
raise NotImplementedError('conversion of SDXL checkpoint models to diffusers format is not yet supported')
|
||||
from invokeai.backend.model_management.models.stable_diffusion import _convert_ckpt_and_cache
|
||||
return _convert_ckpt_and_cache(
|
||||
version=base_model,
|
||||
model_config=config,
|
||||
output_path=output_path,
|
||||
model_type='SDXL',
|
||||
no_safetensors=True, # giving errors for some reason
|
||||
)
|
||||
else:
|
||||
return model_path
|
||||
|
@ -15,6 +15,7 @@ from .base import (
|
||||
classproperty,
|
||||
InvalidModelException,
|
||||
)
|
||||
from .sdxl import StableDiffusionXLModel
|
||||
from invokeai.app.services.config import InvokeAIAppConfig
|
||||
from omegaconf import OmegaConf
|
||||
|
||||
@ -235,42 +236,16 @@ class StableDiffusion2Model(DiffusersModel):
|
||||
else:
|
||||
return model_path
|
||||
|
||||
def _select_ckpt_config(version: BaseModelType, variant: ModelVariantType):
|
||||
ckpt_configs = {
|
||||
BaseModelType.StableDiffusion1: {
|
||||
ModelVariantType.Normal: "v1-inference.yaml",
|
||||
ModelVariantType.Inpaint: "v1-inpainting-inference.yaml",
|
||||
},
|
||||
BaseModelType.StableDiffusion2: {
|
||||
ModelVariantType.Normal: "v2-inference-v.yaml", # best guess, as we can't differentiate with base(512)
|
||||
ModelVariantType.Inpaint: "v2-inpainting-inference.yaml",
|
||||
ModelVariantType.Depth: "v2-midas-inference.yaml",
|
||||
},
|
||||
# note that these .yaml files don't yet exist!
|
||||
BaseModelType.StableDiffusionXL: {
|
||||
ModelVariantType.Normal: "xl-inference-v.yaml",
|
||||
ModelVariantType.Inpaint: "xl-inpainting-inference.yaml",
|
||||
ModelVariantType.Depth: "xl-midas-inference.yaml",
|
||||
}
|
||||
}
|
||||
|
||||
app_config = InvokeAIAppConfig.get_config()
|
||||
try:
|
||||
config_path = app_config.legacy_conf_path / ckpt_configs[version][variant]
|
||||
if config_path.is_relative_to(app_config.root_path):
|
||||
config_path = config_path.relative_to(app_config.root_path)
|
||||
return str(config_path)
|
||||
|
||||
except:
|
||||
return None
|
||||
|
||||
|
||||
# TODO: rework
|
||||
# Note that convert_ckpt_to_diffuses does not currently support conversion of SDXL models
|
||||
def _convert_ckpt_and_cache(
|
||||
version: BaseModelType,
|
||||
model_config: Union[StableDiffusion1Model.CheckpointConfig, StableDiffusion2Model.CheckpointConfig],
|
||||
output_path: str,
|
||||
version: BaseModelType,
|
||||
model_config: Union[StableDiffusion1Model.CheckpointConfig,
|
||||
StableDiffusion2Model.CheckpointConfig,
|
||||
StableDiffusionXLModel.CheckpointConfig,
|
||||
],
|
||||
output_path: str,
|
||||
use_save_model: bool=False,
|
||||
**kwargs,
|
||||
) -> str:
|
||||
"""
|
||||
Convert the checkpoint model indicated in mconfig into a
|
||||
@ -298,5 +273,42 @@ def _convert_ckpt_and_cache(
|
||||
original_config_file=config_file,
|
||||
extract_ema=True,
|
||||
scan_needed=True,
|
||||
from_safetensors = weights.suffix == ".safetensors",
|
||||
**kwargs,
|
||||
)
|
||||
return output_path
|
||||
|
||||
def _select_ckpt_config(version: BaseModelType, variant: ModelVariantType):
|
||||
ckpt_configs = {
|
||||
BaseModelType.StableDiffusion1: {
|
||||
ModelVariantType.Normal: "v1-inference.yaml",
|
||||
ModelVariantType.Inpaint: "v1-inpainting-inference.yaml",
|
||||
},
|
||||
BaseModelType.StableDiffusion2: {
|
||||
ModelVariantType.Normal: "v2-inference-v.yaml", # best guess, as we can't differentiate with base(512)
|
||||
ModelVariantType.Inpaint: "v2-inpainting-inference.yaml",
|
||||
ModelVariantType.Depth: "v2-midas-inference.yaml",
|
||||
},
|
||||
BaseModelType.StableDiffusionXL: {
|
||||
ModelVariantType.Normal: "sd_xl_base.yaml",
|
||||
ModelVariantType.Inpaint: None,
|
||||
ModelVariantType.Depth: None,
|
||||
},
|
||||
BaseModelType.StableDiffusionXLRefiner: {
|
||||
ModelVariantType.Normal: "sd_xl_refiner.yaml",
|
||||
ModelVariantType.Inpaint: None,
|
||||
ModelVariantType.Depth: None,
|
||||
},
|
||||
}
|
||||
|
||||
app_config = InvokeAIAppConfig.get_config()
|
||||
try:
|
||||
config_path = app_config.legacy_conf_path / ckpt_configs[version][variant]
|
||||
if config_path.is_relative_to(app_config.root_path):
|
||||
config_path = config_path.relative_to(app_config.root_path)
|
||||
return str(config_path)
|
||||
|
||||
except:
|
||||
return None
|
||||
|
||||
|
||||
|
98
invokeai/configs/stable-diffusion/sd_xl_base.yaml
Normal file
98
invokeai/configs/stable-diffusion/sd_xl_base.yaml
Normal file
@ -0,0 +1,98 @@
|
||||
model:
|
||||
target: sgm.models.diffusion.DiffusionEngine
|
||||
params:
|
||||
scale_factor: 0.13025
|
||||
disable_first_stage_autocast: True
|
||||
|
||||
denoiser_config:
|
||||
target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
|
||||
params:
|
||||
num_idx: 1000
|
||||
|
||||
weighting_config:
|
||||
target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
|
||||
scaling_config:
|
||||
target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
|
||||
discretization_config:
|
||||
target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
|
||||
|
||||
network_config:
|
||||
target: sgm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
adm_in_channels: 2816
|
||||
num_classes: sequential
|
||||
use_checkpoint: True
|
||||
in_channels: 4
|
||||
out_channels: 4
|
||||
model_channels: 320
|
||||
attention_resolutions: [4, 2]
|
||||
num_res_blocks: 2
|
||||
channel_mult: [1, 2, 4]
|
||||
num_head_channels: 64
|
||||
use_spatial_transformer: True
|
||||
use_linear_in_transformer: True
|
||||
transformer_depth: [1, 2, 10] # note: the first is unused (due to attn_res starting at 2) 32, 16, 8 --> 64, 32, 16
|
||||
context_dim: 2048
|
||||
spatial_transformer_attn_type: softmax-xformers
|
||||
legacy: False
|
||||
|
||||
conditioner_config:
|
||||
target: sgm.modules.GeneralConditioner
|
||||
params:
|
||||
emb_models:
|
||||
# crossattn cond
|
||||
- is_trainable: False
|
||||
input_key: txt
|
||||
target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
|
||||
params:
|
||||
layer: hidden
|
||||
layer_idx: 11
|
||||
# crossattn and vector cond
|
||||
- is_trainable: False
|
||||
input_key: txt
|
||||
target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
|
||||
params:
|
||||
arch: ViT-bigG-14
|
||||
version: laion2b_s39b_b160k
|
||||
freeze: True
|
||||
layer: penultimate
|
||||
always_return_pooled: True
|
||||
legacy: False
|
||||
# vector cond
|
||||
- is_trainable: False
|
||||
input_key: original_size_as_tuple
|
||||
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
|
||||
params:
|
||||
outdim: 256 # multiplied by two
|
||||
# vector cond
|
||||
- is_trainable: False
|
||||
input_key: crop_coords_top_left
|
||||
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
|
||||
params:
|
||||
outdim: 256 # multiplied by two
|
||||
# vector cond
|
||||
- is_trainable: False
|
||||
input_key: target_size_as_tuple
|
||||
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
|
||||
params:
|
||||
outdim: 256 # multiplied by two
|
||||
|
||||
first_stage_config:
|
||||
target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
|
||||
params:
|
||||
embed_dim: 4
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
attn_type: vanilla-xformers
|
||||
double_z: true
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult: [1, 2, 4, 4]
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
91
invokeai/configs/stable-diffusion/sd_xl_refiner.yaml
Normal file
91
invokeai/configs/stable-diffusion/sd_xl_refiner.yaml
Normal file
@ -0,0 +1,91 @@
|
||||
model:
|
||||
target: sgm.models.diffusion.DiffusionEngine
|
||||
params:
|
||||
scale_factor: 0.13025
|
||||
disable_first_stage_autocast: True
|
||||
|
||||
denoiser_config:
|
||||
target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
|
||||
params:
|
||||
num_idx: 1000
|
||||
|
||||
weighting_config:
|
||||
target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
|
||||
scaling_config:
|
||||
target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
|
||||
discretization_config:
|
||||
target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
|
||||
|
||||
network_config:
|
||||
target: sgm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
adm_in_channels: 2560
|
||||
num_classes: sequential
|
||||
use_checkpoint: True
|
||||
in_channels: 4
|
||||
out_channels: 4
|
||||
model_channels: 384
|
||||
attention_resolutions: [4, 2]
|
||||
num_res_blocks: 2
|
||||
channel_mult: [1, 2, 4, 4]
|
||||
num_head_channels: 64
|
||||
use_spatial_transformer: True
|
||||
use_linear_in_transformer: True
|
||||
transformer_depth: 4
|
||||
context_dim: [1280, 1280, 1280, 1280] # 1280
|
||||
spatial_transformer_attn_type: softmax-xformers
|
||||
legacy: False
|
||||
|
||||
conditioner_config:
|
||||
target: sgm.modules.GeneralConditioner
|
||||
params:
|
||||
emb_models:
|
||||
# crossattn and vector cond
|
||||
- is_trainable: False
|
||||
input_key: txt
|
||||
target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
|
||||
params:
|
||||
arch: ViT-bigG-14
|
||||
version: laion2b_s39b_b160k
|
||||
legacy: False
|
||||
freeze: True
|
||||
layer: penultimate
|
||||
always_return_pooled: True
|
||||
# vector cond
|
||||
- is_trainable: False
|
||||
input_key: original_size_as_tuple
|
||||
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
|
||||
params:
|
||||
outdim: 256 # multiplied by two
|
||||
# vector cond
|
||||
- is_trainable: False
|
||||
input_key: crop_coords_top_left
|
||||
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
|
||||
params:
|
||||
outdim: 256 # multiplied by two
|
||||
# vector cond
|
||||
- is_trainable: False
|
||||
input_key: aesthetic_score
|
||||
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
|
||||
params:
|
||||
outdim: 256 # multiplied by one
|
||||
|
||||
first_stage_config:
|
||||
target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
|
||||
params:
|
||||
embed_dim: 4
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
attn_type: vanilla-xformers
|
||||
double_z: true
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult: [1, 2, 4, 4]
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
Loading…
Reference in New Issue
Block a user