diff --git a/invokeai/app/invocations/controlnet_image_processors.py b/invokeai/app/invocations/controlnet_image_processors.py
index ec064a00be..187784b29e 100644
--- a/invokeai/app/invocations/controlnet_image_processors.py
+++ b/invokeai/app/invocations/controlnet_image_processors.py
@@ -7,14 +7,13 @@ from typing import Literal, Optional, Union, List
 from PIL import Image, ImageFilter, ImageOps
 from pydantic import BaseModel, Field

-from ..models.image import ImageField, ImageType
+from ..models.image import ImageField, ImageType, ImageCategory
 from .baseinvocation import (
     BaseInvocation,
     BaseInvocationOutput,
     InvocationContext,
     InvocationConfig,
 )
-
 from controlnet_aux import (
     CannyDetector,
     HEDdetector,
@@ -26,10 +25,11 @@ from controlnet_aux import (
     OpenposeDetector,
     PidiNetDetector,
     ContentShuffleDetector,
-    # ZoeDetector, # FIXME: uncomment once ZoeDetector is availabel in official controlnet_aux release
+    ZoeDetector,
+    MediapipeFaceDetector,
 )

-from .image import ImageOutput, build_image_output, PILInvocationConfig
+from .image import ImageOutput, PILInvocationConfig

 CONTROLNET_DEFAULT_MODELS = [
     ###########################################
@@ -161,33 +161,41 @@ class ImageProcessorInvocation(BaseInvocation, PILInvocationConfig):
         return image

     def invoke(self, context: InvocationContext) -> ImageOutput:
-        raw_image = context.services.images.get(
+
+        raw_image = context.services.images.get_pil_image(
             self.image.image_type, self.image.image_name
         )
         # image type should be PIL.PngImagePlugin.PngImageFile ?
         processed_image = self.run_processor(raw_image)
+
+        # FIXME: what happened to image metadata?
+        # metadata = context.services.metadata.build_metadata(
+        #     session_id=context.graph_execution_state_id, node=self
+        # )
+
         # currently can't see processed image in node UI without a showImage node,
         # so for now setting image_type to RESULT instead of INTERMEDIATE so will get saved in gallery
-        # image_type = ImageType.INTERMEDIATE
-        image_type = ImageType.RESULT
-        image_name = context.services.images.create_name(
-            context.graph_execution_state_id, self.id
+        image_dto = context.services.images.create(
+            image=processed_image,
+            image_type=ImageType.RESULT,
+            image_category=ImageCategory.GENERAL,
+            session_id=context.graph_execution_state_id,
+            node_id=self.id,
+            is_intermediate=self.is_intermediate
         )
-        metadata = context.services.metadata.build_metadata(
-            session_id=context.graph_execution_state_id, node=self
-        )
-        context.services.images.save(image_type, image_name, processed_image, metadata)

         """Builds an ImageOutput and its ImageField"""
         processed_image_field = ImageField(
-            image_name=image_name,
-            image_type=image_type,
+            image_name=image_dto.image_name,
+            image_type=image_dto.image_type,
         )
         return ImageOutput(
             image=processed_image_field,
-            width=processed_image.width,
-            height=processed_image.height,
-            mode=processed_image.mode,
+            # width=processed_image.width,
+            width = image_dto.width,
+            # height=processed_image.height,
+            height = image_dto.height,
+            # mode=processed_image.mode,
         )


@@ -392,18 +400,17 @@ class ContentShuffleImageProcessorInvocation(ImageProcessorInvocation, PILInvoca
         return processed_image


-# # FIXME: ZoeDetector was implemented _after_ most recent official release of controlnet_aux (v0.0.3)
-# # so it is commented out until a new release is made
-# class ZoeDepthImageProcessorInvocation(ImageProcessorInvocation, PILInvocationConfig):
-#     """Applies Zoe depth processing to image"""
-#     # fmt: off
"zoe_depth_image_processor" -# # fmt: on -# -# def run_processor(self, image): -# zoe_depth_processor = ZoeDetector.from_pretrained("lllyasviel/Annotators") -# processed_image = zoe_depth_processor(image) -# return processed_image +# should work with controlnet_aux >= 0.0.4 and timm <= 0.6.13 +class ZoeDepthImageProcessorInvocation(ImageProcessorInvocation, PILInvocationConfig): + """Applies Zoe depth processing to image""" + # fmt: off + type: Literal["zoe_depth_image_processor"] = "zoe_depth_image_processor" + # fmt: on + + def run_processor(self, image): + zoe_depth_processor = ZoeDetector.from_pretrained("lllyasviel/Annotators") + processed_image = zoe_depth_processor(image) + return processed_image class MediapipeFaceProcessorInvocation(ImageProcessorInvocation, PILInvocationConfig): diff --git a/invokeai/app/invocations/latent.py b/invokeai/app/invocations/latent.py index 0bda8fb5d7..4975b7b578 100644 --- a/invokeai/app/invocations/latent.py +++ b/invokeai/app/invocations/latent.py @@ -6,10 +6,11 @@ from typing import Literal, Optional, Union, List from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, validator import torch from invokeai.app.invocations.util.choose_model import choose_model +from invokeai.app.models.image import ImageCategory from invokeai.app.util.misc import SEED_MAX, get_random_seed from invokeai.app.util.step_callback import stable_diffusion_step_callback @@ -27,9 +28,9 @@ from ...backend.stable_diffusion.diffusers_pipeline import ControlNetData from .baseinvocation import BaseInvocation, BaseInvocationOutput, InvocationContext, InvocationConfig import numpy as np -from ..services.image_storage import ImageType +from ..services.image_file_storage import ImageType from .baseinvocation import BaseInvocation, InvocationContext -from .image import ImageField, ImageOutput, build_image_output +from .image import ImageField, ImageOutput from .compel import ConditioningField from ...backend.stable_diffusion import PipelineIntermediateState from diffusers.schedulers import SchedulerMixin as Scheduler @@ -146,12 +147,17 @@ class NoiseInvocation(BaseInvocation): }, } + @validator("seed", pre=True) + def modulo_seed(cls, v): + """Returns the seed modulo SEED_MAX to ensure it is within the valid range.""" + return v % SEED_MAX + def invoke(self, context: InvocationContext) -> NoiseOutput: device = torch.device(choose_torch_device()) noise = get_noise(self.width, self.height, device, self.seed) name = f'{context.graph_execution_state_id}__{self.id}' - context.services.latents.set(name, noise) + context.services.latents.save(name, noise) return build_noise_output(latents_name=name, latents=noise) @@ -168,19 +174,18 @@ class TextToLatentsInvocation(BaseInvocation): noise: Optional[LatentsField] = Field(description="The noise to use") steps: int = Field(default=10, gt=0, description="The number of steps to use to generate the image") cfg_scale: float = Field(default=7.5, gt=0, description="The Classifier-Free Guidance, higher values may result in a result closer to the prompt", ) - scheduler: SAMPLER_NAME_VALUES = Field(default="lms", description="The scheduler to use" ) + scheduler: SAMPLER_NAME_VALUES = Field(default="euler", description="The scheduler to use" ) model: str = Field(default="", description="The model to use (currently ignored)") - seamless: bool = Field(default=False, description="Whether or not to generate an image that can tile 
without seams", ) - seamless_axes: str = Field(default="", description="The axes to tile the image on, 'x' and/or 'y'") - progress_images: bool = Field(default=False, description="Whether or not to produce progress images during generation", ) - control: Union[ControlField, List[ControlField]] = Field(default=None, description="The controlnet(s) to use") + control: Union[ControlField, list[ControlField]] = Field(default=None, description="The control to use") + # seamless: bool = Field(default=False, description="Whether or not to generate an image that can tile without seams", ) + # seamless_axes: str = Field(default="", description="The axes to tile the image on, 'x' and/or 'y'") # fmt: on # Schema customisation class Config(InvocationConfig): schema_extra = { "ui": { - "tags": ["latents"], + "tags": ["latents", "image"], "type_hints": { "model": "model", "control": "control", @@ -209,17 +214,17 @@ class TextToLatentsInvocation(BaseInvocation): scheduler_name=self.scheduler ) - if isinstance(model, DiffusionPipeline): - for component in [model.unet, model.vae]: - configure_model_padding(component, - self.seamless, - self.seamless_axes - ) - else: - configure_model_padding(model, - self.seamless, - self.seamless_axes - ) + # if isinstance(model, DiffusionPipeline): + # for component in [model.unet, model.vae]: + # configure_model_padding(component, + # self.seamless, + # self.seamless_axes + # ) + # else: + # configure_model_padding(model, + # self.seamless, + # self.seamless_axes + # ) return model @@ -292,7 +297,9 @@ class TextToLatentsInvocation(BaseInvocation): torch_dtype=model.unet.dtype).to(model.device) control_models.append(control_model) control_image_field = control_info.image - input_image = context.services.images.get(control_image_field.image_type, control_image_field.image_name) + input_image = context.services.images.get_pil_image(control_image_field.image_type, + control_image_field.image_name) + # self.image.image_type, self.image.image_name # FIXME: still need to test with different widths, heights, devices, dtypes # and add in batch_size, num_images_per_prompt? # and do real check for classifier_free_guidance? 
@@ -348,7 +355,7 @@ class TextToLatentsInvocation(BaseInvocation):
         torch.cuda.empty_cache()

         name = f'{context.graph_execution_state_id}__{self.id}'
-        context.services.latents.set(name, result_latents)
+        context.services.latents.save(name, result_latents)
         return build_latents_output(latents_name=name, latents=result_latents)


@@ -361,6 +368,18 @@ class LatentsToLatentsInvocation(TextToLatentsInvocation):
     latents: Optional[LatentsField] = Field(description="The latents to use as a base image")
     strength: float = Field(default=0.5, description="The strength of the latents to use")

+    # Schema customisation
+    class Config(InvocationConfig):
+        schema_extra = {
+            "ui": {
+                "tags": ["latents"],
+                "type_hints": {
+                    "model": "model",
+                    "control": "control",
+                }
+            },
+        }
+
     def invoke(self, context: InvocationContext) -> LatentsOutput:
         noise = context.services.latents.get(self.noise.latents_name)
         latent = context.services.latents.get(self.latents.latents_name)
@@ -402,7 +421,7 @@ class LatentsToLatentsInvocation(TextToLatentsInvocation):
         torch.cuda.empty_cache()

         name = f'{context.graph_execution_state_id}__{self.id}'
-        context.services.latents.set(name, result_latents)
+        context.services.latents.save(name, result_latents)
         return build_latents_output(latents_name=name, latents=result_latents)


@@ -439,20 +458,30 @@ class LatentsToImageInvocation(BaseInvocation):
         np_image = model.decode_latents(latents)
         image = model.numpy_to_pil(np_image)[0]

-        image_type = ImageType.RESULT
-        image_name = context.services.images.create_name(
-            context.graph_execution_state_id, self.id
-        )
-
-        metadata = context.services.metadata.build_metadata(
-            session_id=context.graph_execution_state_id, node=self
-        )
+        # what happened to metadata?
+        # metadata = context.services.metadata.build_metadata(
+        #     session_id=context.graph_execution_state_id, node=self

         torch.cuda.empty_cache()

-        context.services.images.save(image_type, image_name, image, metadata)
-        return build_image_output(
-            image_type=image_type, image_name=image_name, image=image
+        # new (post Image service refactor) way of using services to save image
+        # and gnenerate unique image_name
+        image_dto = context.services.images.create(
+            image=image,
+            image_type=ImageType.RESULT,
+            image_category=ImageCategory.GENERAL,
+            session_id=context.graph_execution_state_id,
+            node_id=self.id,
+            is_intermediate=self.is_intermediate
+        )
+
+        return ImageOutput(
+            image=ImageField(
+                image_name=image_dto.image_name,
+                image_type=image_dto.image_type,
+            ),
+            width=image_dto.width,
+            height=image_dto.height,
         )


@@ -487,7 +516,8 @@ class ResizeLatentsInvocation(BaseInvocation):
         torch.cuda.empty_cache()

         name = f"{context.graph_execution_state_id}__{self.id}"
-        context.services.latents.set(name, resized_latents)
+        # context.services.latents.set(name, resized_latents)
+        context.services.latents.save(name, resized_latents)
         return build_latents_output(latents_name=name, latents=resized_latents)


@@ -517,7 +547,8 @@ class ScaleLatentsInvocation(BaseInvocation):
         torch.cuda.empty_cache()

         name = f"{context.graph_execution_state_id}__{self.id}"
-        context.services.latents.set(name, resized_latents)
+        # context.services.latents.set(name, resized_latents)
+        context.services.latents.save(name, resized_latents)
         return build_latents_output(latents_name=name, latents=resized_latents)


@@ -541,7 +572,10 @@ class ImageToLatentsInvocation(BaseInvocation):

     @torch.no_grad()
     def invoke(self, context: InvocationContext) -> LatentsOutput:
-        image = context.services.images.get(
+        # image = context.services.images.get(
+        #     self.image.image_type, self.image.image_name
+        # )
+        image = context.services.images.get_pil_image(
             self.image.image_type, self.image.image_name
         )

@@ -561,5 +595,6 @@ class ImageToLatentsInvocation(BaseInvocation):
         )

         name = f"{context.graph_execution_state_id}__{self.id}"
-        context.services.latents.set(name, latents)
+        # context.services.latents.set(name, latents)
+        context.services.latents.save(name, latents)
         return build_latents_output(latents_name=name, latents=latents)
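Note: the @validator("seed", pre=True) added to NoiseInvocation above wraps any incoming seed into the valid range before the field's own bounds are checked. A minimal self-contained sketch of that behaviour (NoiseSettings and the SEED_MAX value here are illustrative stand-ins; the real constant is imported from invokeai.app.util.misc):

from pydantic import BaseModel, Field, validator

SEED_MAX = 2**32 - 1  # illustrative stand-in for invokeai.app.util.misc.SEED_MAX

class NoiseSettings(BaseModel):
    seed: int = Field(default=0, ge=0, le=SEED_MAX)

    @validator("seed", pre=True)
    def modulo_seed(cls, v):
        # pre=True runs before the ge/le checks, so an out-of-range seed is
        # wrapped via modulo instead of failing validation
        return v % SEED_MAX

print(NoiseSettings(seed=SEED_MAX + 7).seed)  # prints 7 rather than raising a ValidationError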