diff --git a/invokeai/app/invocations/ip_adapter.py b/invokeai/app/invocations/ip_adapter.py
index 2874c92701..603a85148d 100644
--- a/invokeai/app/invocations/ip_adapter.py
+++ b/invokeai/app/invocations/ip_adapter.py
@@ -1,5 +1,5 @@
 from builtins import float
-from typing import List, Union
+from typing import List, Literal, Union
 
 from pydantic import BaseModel, Field, field_validator, model_validator
 from typing_extensions import Self
@@ -49,12 +49,15 @@ class IPAdapterOutput(BaseInvocationOutput):
     ip_adapter: IPAdapterField = OutputField(description=FieldDescriptions.ip_adapter, title="IP-Adapter")
 
 
+CLIP_VISION_MODEL_MAP = {"ViT-H": "ip_adapter_sd_image_encoder", "ViT-G": "ip_adapter_sdxl_image_encoder"}
+
+
 @invocation("ip_adapter", title="IP-Adapter", tags=["ip_adapter", "control"], category="ip_adapter", version="1.2.2")
 class IPAdapterInvocation(BaseInvocation):
     """Collects IP-Adapter info to pass to other nodes."""
 
     # Inputs
-    image: Union[ImageField, List[ImageField]] = InputField(description="The IP-Adapter image prompt(s).")
+    image: Union[ImageField, List[ImageField]] = InputField(description="The IP-Adapter image prompt(s).", ui_order=1)
     ip_adapter_model: ModelIdentifierField = InputField(
         description="The IP-Adapter model.",
         title="IP-Adapter Model",
@@ -62,7 +65,9 @@ class IPAdapterInvocation(BaseInvocation):
         ui_order=-1,
         ui_type=UIType.IPAdapterModel,
     )
-
+    clip_vision_model: Literal["ViT-H", "ViT-G"] = InputField(
+        description="CLIP Vision model to use", default="ViT-H", ui_order=2
+    )
     weight: Union[float, List[float]] = InputField(
         default=1, description="The weight given to the IP-Adapter", title="Weight"
     )
@@ -89,12 +94,12 @@ class IPAdapterInvocation(BaseInvocation):
         ip_adapter_info = context.models.get_config(self.ip_adapter_model.key)
         assert isinstance(ip_adapter_info, (IPAdapterDiffusersConfig, IPAdapterCheckpointConfig))
 
-        image_encoder_model_id = (
-            ip_adapter_info.image_encoder_model_id
-            if isinstance(ip_adapter_info, IPAdapterDiffusersConfig)
-            else "ip_adapter_sd_image_encoder"
-        )
-        image_encoder_model_name = image_encoder_model_id.split("/")[-1].strip()
+        if isinstance(ip_adapter_info, IPAdapterDiffusersConfig):
+            image_encoder_model_id = ip_adapter_info.image_encoder_model_id
+            image_encoder_model_name = image_encoder_model_id.split("/")[-1].strip()
+        else:
+            image_encoder_model_name = CLIP_VISION_MODEL_MAP[self.clip_vision_model]
+
         image_encoder_model = self._get_image_encoder(context, image_encoder_model_name)
 
         return IPAdapterOutput(
@@ -109,19 +114,25 @@ class IPAdapterInvocation(BaseInvocation):
         )
 
     def _get_image_encoder(self, context: InvocationContext, image_encoder_model_name: str) -> AnyModelConfig:
-        found = False
-        while not found:
+        image_encoder_models = context.models.search_by_attrs(
+            name=image_encoder_model_name, base=BaseModelType.Any, type=ModelType.CLIPVision
+        )
+
+        if not len(image_encoder_models) > 0:
+            context.logger.warning(
+                f"The image encoder required by this IP Adapter ({image_encoder_model_name}) is not installed. \
+                    Downloading and installing now. This may take a while."
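The backend hunk above is the core of the feature: diffusers-format IP-Adapters still derive their image encoder from the config's image_encoder_model_id, while checkpoint-format adapters, which were previously pinned to the SD (ViT-H) encoder, now map the user-selected ViT variant through CLIP_VISION_MODEL_MAP, and _get_image_encoder installs the encoder via a heuristic import if it is not already present. Below is a minimal, self-contained sketch of just the selection rule, assuming only plain Python and the names visible in the hunk (no InvokeAI imports, and an optional id string in place of the isinstance check on IPAdapterDiffusersConfig); it is an illustration, not code from the patch.

```python
# Hypothetical helper mirroring the branch added to IPAdapterInvocation.invoke() above.
from typing import Literal, Optional

CLIP_VISION_MODEL_MAP = {"ViT-H": "ip_adapter_sd_image_encoder", "ViT-G": "ip_adapter_sdxl_image_encoder"}


def resolve_image_encoder_name(
    image_encoder_model_id: Optional[str],
    clip_vision_model: Literal["ViT-H", "ViT-G"],
) -> str:
    """Pick the CLIP Vision encoder name to load for an IP-Adapter.

    Diffusers-format adapter configs carry an image_encoder_model_id, so the last
    path segment of that id is used. Checkpoint-format configs have no such id, so
    the user's ViT-H / ViT-G choice is mapped to a known encoder name instead.
    """
    if image_encoder_model_id is not None:
        return image_encoder_model_id.split("/")[-1].strip()
    return CLIP_VISION_MODEL_MAP[clip_vision_model]


# Example: a checkpoint-format SDXL IP-Adapter with ViT-G selected on the node.
assert resolve_image_encoder_name(None, "ViT-G") == "ip_adapter_sdxl_image_encoder"
assert resolve_image_encoder_name("InvokeAI/ip_adapter_sd_image_encoder", "ViT-G") == "ip_adapter_sd_image_encoder"
```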
+            )
+
+            installer = context._services.model_manager.install
+            job = installer.heuristic_import(f"InvokeAI/{image_encoder_model_name}")
+            installer.wait_for_job(job, timeout=600)  # Wait for up to 10 minutes
             image_encoder_models = context.models.search_by_attrs(
                 name=image_encoder_model_name, base=BaseModelType.Any, type=ModelType.CLIPVision
             )
-            found = len(image_encoder_models) > 0
-            if not found:
-                context.logger.warning(
-                    f"The image encoder required by this IP Adapter ({image_encoder_model_name}) is not installed."
-                )
-                context.logger.warning("Downloading and installing now. This may take a while.")
-                installer = context._services.model_manager.install
-                job = installer.heuristic_import(f"InvokeAI/{image_encoder_model_name}")
-                installer.wait_for_job(job, timeout=600)  # wait up to 10 minutes - then raise a TimeoutException
-        assert len(image_encoder_models) == 1
+
+            if len(image_encoder_models) == 0:
+                context.logger.error("Error while fetching CLIP Vision Image Encoder")
+                assert len(image_encoder_models) == 1
+
         return image_encoder_models[0]
diff --git a/invokeai/app/invocations/metadata.py b/invokeai/app/invocations/metadata.py
index 6fc72a1c3f..2da482c833 100644
--- a/invokeai/app/invocations/metadata.py
+++ b/invokeai/app/invocations/metadata.py
@@ -2,16 +2,8 @@ from typing import Any, Literal, Optional, Union
 
 from pydantic import BaseModel, ConfigDict, Field
 
-from invokeai.app.invocations.baseinvocation import (
-    BaseInvocation,
-    BaseInvocationOutput,
-    invocation,
-    invocation_output,
-)
-from invokeai.app.invocations.controlnet_image_processors import (
-    CONTROLNET_MODE_VALUES,
-    CONTROLNET_RESIZE_VALUES,
-)
+from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output
+from invokeai.app.invocations.controlnet_image_processors import CONTROLNET_MODE_VALUES, CONTROLNET_RESIZE_VALUES
 from invokeai.app.invocations.fields import (
     FieldDescriptions,
     ImageField,
@@ -43,6 +35,7 @@ class IPAdapterMetadataField(BaseModel):
 
     image: ImageField = Field(description="The IP-Adapter image prompt.")
     ip_adapter_model: ModelIdentifierField = Field(description="The IP-Adapter model.")
+    clip_vision_model: Literal["ViT-H", "ViT-G"] = Field(description="The CLIP Vision model")
     weight: Union[float, list[float]] = Field(description="The weight given to the IP-Adapter")
     begin_step_percent: float = Field(description="When the IP-Adapter is first applied (% of total steps)")
     end_step_percent: float = Field(description="When the IP-Adapter is last applied (% of total steps)")
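For context, the new clip_vision_model key also shows up in the per-image metadata the frontend records. A hand-written illustration of one serialized entry, using only the field names from IPAdapterMetadataField above; the ip_adapter_model value is deliberately abbreviated rather than copied from the real ModelIdentifierField.

```python
# Illustrative only: an example of the metadata entry shape implied by IPAdapterMetadataField.
example_ip_adapter_metadata = {
    "image": {"image_name": "prompt-image.png"},
    "ip_adapter_model": {"...": "..."},  # ModelIdentifierField, abbreviated on purpose
    "clip_vision_model": "ViT-G",  # new field: "ViT-H" or "ViT-G"
    "weight": 0.75,
    "begin_step_percent": 0.0,
    "end_step_percent": 1.0,
}
```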
diff --git a/invokeai/frontend/web/public/locales/en.json b/invokeai/frontend/web/public/locales/en.json
index d2402c61be..5872d22dfe 100644
--- a/invokeai/frontend/web/public/locales/en.json
+++ b/invokeai/frontend/web/public/locales/en.json
@@ -217,6 +217,7 @@
     "saveControlImage": "Save Control Image",
     "scribble": "scribble",
     "selectModel": "Select a model",
+    "selectCLIPVisionModel": "Select a CLIP Vision model",
     "setControlImageDimensions": "Set Control Image Dimensions To W/H",
     "showAdvanced": "Show Advanced",
     "small": "Small",
diff --git a/invokeai/frontend/web/src/features/controlAdapters/components/parameters/ParamControlAdapterModel.tsx b/invokeai/frontend/web/src/features/controlAdapters/components/parameters/ParamControlAdapterModel.tsx
index 25d327e54e..380a33185d 100644
--- a/invokeai/frontend/web/src/features/controlAdapters/components/parameters/ParamControlAdapterModel.tsx
+++ b/invokeai/frontend/web/src/features/controlAdapters/components/parameters/ParamControlAdapterModel.tsx
@@ -1,12 +1,18 @@
-import { Combobox, FormControl, Tooltip } from '@invoke-ai/ui-library';
+import type { ComboboxOnChange, ComboboxOption } from '@invoke-ai/ui-library';
+import { Combobox, Flex, FormControl, Tooltip } from '@invoke-ai/ui-library';
 import { createMemoizedSelector } from 'app/store/createMemoizedSelector';
 import { useAppDispatch, useAppSelector } from 'app/store/storeHooks';
 import { useGroupedModelCombobox } from 'common/hooks/useGroupedModelCombobox';
+import { useControlAdapterCLIPVisionModel } from 'features/controlAdapters/hooks/useControlAdapterCLIPVisionModel';
 import { useControlAdapterIsEnabled } from 'features/controlAdapters/hooks/useControlAdapterIsEnabled';
 import { useControlAdapterModel } from 'features/controlAdapters/hooks/useControlAdapterModel';
 import { useControlAdapterModels } from 'features/controlAdapters/hooks/useControlAdapterModels';
 import { useControlAdapterType } from 'features/controlAdapters/hooks/useControlAdapterType';
-import { controlAdapterModelChanged } from 'features/controlAdapters/store/controlAdaptersSlice';
+import {
+  controlAdapterCLIPVisionModelChanged,
+  controlAdapterModelChanged,
+} from 'features/controlAdapters/store/controlAdaptersSlice';
+import type { CLIPVisionModel } from 'features/controlAdapters/store/types';
 import { selectGenerationSlice } from 'features/parameters/store/generationSlice';
 import { memo, useCallback, useMemo } from 'react';
 import { useTranslation } from 'react-i18next';
@@ -29,6 +35,7 @@ const ParamControlAdapterModel = ({ id }: ParamControlAdapterModelProps) => {
   const { modelConfig } = useControlAdapterModel(id);
   const dispatch = useAppDispatch();
   const currentBaseModel = useAppSelector((s) => s.generation.model?.base);
+  const currentCLIPVisionModel = useControlAdapterCLIPVisionModel(id);
   const mainModel = useAppSelector(selectMainModel);
   const { t } = useTranslation();
 
@@ -49,6 +56,16 @@ const ParamControlAdapterModel = ({ id }: ParamControlAdapterModelProps) => {
     [dispatch, id]
   );
 
+  const onCLIPVisionModelChange = useCallback<ComboboxOnChange>(
+    (v) => {
+      if (!v?.value) {
+        return;
+      }
+      dispatch(controlAdapterCLIPVisionModelChanged({ id, clipVisionModel: v.value as CLIPVisionModel }));
+    },
+    [dispatch, id]
+  );
+
   const selectedModel = useMemo(
    () => (modelConfig && controlAdapterType ? { ...modelConfig, model_type: controlAdapterType } : null),
    [controlAdapterType, modelConfig]
  );
@@ -71,17 +88,42 @@ const ParamControlAdapterModel = ({ id }: ParamControlAdapterModelProps) => {
     isLoading,
   });
 
+  const clipVisionOptions = useMemo<ComboboxOption[]>(
+    () => [
+      { label: 'ViT-H', value: 'ViT-H' },
+      { label: 'ViT-G', value: 'ViT-G' },
+    ],
+    []
+  );
+
+  const clipVisionModel = useMemo(
+    () => clipVisionOptions.find((o) => o.value === currentCLIPVisionModel),
+    [clipVisionOptions, currentCLIPVisionModel]
+  );
+
   return (
-
-
-
+
+
+
+
+      {modelConfig?.type === 'ip_adapter' && modelConfig.format === 'checkpoint' && (
+
+
+      )}
+
   );
 };
diff --git a/invokeai/frontend/web/src/features/controlAdapters/hooks/useControlAdapterCLIPVisionModel.ts b/invokeai/frontend/web/src/features/controlAdapters/hooks/useControlAdapterCLIPVisionModel.ts
new file mode 100644
index 0000000000..249d2022fe
--- /dev/null
+++ b/invokeai/frontend/web/src/features/controlAdapters/hooks/useControlAdapterCLIPVisionModel.ts
@@ -0,0 +1,24 @@
+import { createMemoizedSelector } from 'app/store/createMemoizedSelector';
+import { useAppSelector } from 'app/store/storeHooks';
+import {
+  selectControlAdapterById,
+  selectControlAdaptersSlice,
+} from 'features/controlAdapters/store/controlAdaptersSlice';
+import { useMemo } from 'react';
+
+export const useControlAdapterCLIPVisionModel = (id: string) => {
+  const selector = useMemo(
+    () =>
+      createMemoizedSelector(selectControlAdaptersSlice, (controlAdapters) => {
+        const cn = selectControlAdapterById(controlAdapters, id);
+        if (cn && cn?.type === 'ip_adapter') {
+          return cn.clipVisionModel;
+        }
+      }),
+    [id]
+  );
+
+  const clipVisionModel = useAppSelector(selector);
+
+  return clipVisionModel;
+};
diff --git a/invokeai/frontend/web/src/features/controlAdapters/store/controlAdaptersSlice.ts b/invokeai/frontend/web/src/features/controlAdapters/store/controlAdaptersSlice.ts
index f4edca41bb..100bb3f6ad 100644
--- a/invokeai/frontend/web/src/features/controlAdapters/store/controlAdaptersSlice.ts
+++ b/invokeai/frontend/web/src/features/controlAdapters/store/controlAdaptersSlice.ts
@@ -14,6 +14,7 @@ import { v4 as uuidv4 } from 'uuid';
 import { controlAdapterImageProcessed } from './actions';
 import { CONTROLNET_PROCESSORS } from './constants';
 import type {
+  CLIPVisionModel,
   ControlAdapterConfig,
   ControlAdapterProcessorType,
   ControlAdaptersState,
@@ -244,6 +245,13 @@ export const controlAdaptersSlice = createSlice({
       }
       caAdapter.updateOne(state, { id, changes: { controlMode } });
     },
+    controlAdapterCLIPVisionModelChanged: (
+      state,
+      action: PayloadAction<{ id: string; clipVisionModel: CLIPVisionModel }>
+    ) => {
+      const { id, clipVisionModel } = action.payload;
+      caAdapter.updateOne(state, { id, changes: { clipVisionModel } });
+    },
     controlAdapterResizeModeChanged: (
       state,
       action: PayloadAction<{
@@ -381,6 +389,7 @@ export const controlAdaptersSlice = createSlice({
   controlAdapterProcessedImageChanged,
   controlAdapterIsEnabledChanged,
   controlAdapterModelChanged,
+  controlAdapterCLIPVisionModelChanged,
   controlAdapterWeightChanged,
   controlAdapterBeginStepPctChanged,
   controlAdapterEndStepPctChanged,
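The new reducer above is a single-field update routed through the slice's entity adapter. Here is a self-contained sketch of the same state transition, assuming stripped-down stand-ins for the store types instead of the real slice, with caAdapter.updateOne replaced by a plain object spread; it is an illustration, not code from the patch.

```ts
// Hypothetical stand-ins for the store types used by controlAdapterCLIPVisionModelChanged.
type CLIPVisionModel = 'ViT-H' | 'ViT-G';

type IPAdapterEntry = { id: string; type: 'ip_adapter'; clipVisionModel: CLIPVisionModel };

type State = { entities: Record<string, IPAdapterEntry | undefined> };

// Look the adapter up by id and overwrite only clipVisionModel, leaving other fields intact.
const clipVisionModelChanged = (state: State, payload: { id: string; clipVisionModel: CLIPVisionModel }): State => {
  const entry = state.entities[payload.id];
  if (!entry) {
    return state;
  }
  return {
    ...state,
    entities: { ...state.entities, [payload.id]: { ...entry, clipVisionModel: payload.clipVisionModel } },
  };
};

// Usage: switching an IP-Adapter from the default ViT-H encoder to ViT-G.
const before: State = { entities: { a: { id: 'a', type: 'ip_adapter', clipVisionModel: 'ViT-H' } } };
const after = clipVisionModelChanged(before, { id: 'a', clipVisionModel: 'ViT-G' });
console.log(after.entities['a']?.clipVisionModel); // 'ViT-G'
```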
diff --git a/invokeai/frontend/web/src/features/controlAdapters/store/types.ts b/invokeai/frontend/web/src/features/controlAdapters/store/types.ts
index 93d4915cdf..329c318759 100644
--- a/invokeai/frontend/web/src/features/controlAdapters/store/types.ts
+++ b/invokeai/frontend/web/src/features/controlAdapters/store/types.ts
@@ -243,12 +243,15 @@ export type T2IAdapterConfig = {
   shouldAutoConfig: boolean;
 };
 
+export type CLIPVisionModel = 'ViT-H' | 'ViT-G';
+
 export type IPAdapterConfig = {
   type: 'ip_adapter';
   id: string;
   isEnabled: boolean;
   controlImage: string | null;
   model: ParameterIPAdapterModel | null;
+  clipVisionModel: CLIPVisionModel;
   weight: number;
   beginStepPct: number;
   endStepPct: number;
diff --git a/invokeai/frontend/web/src/features/controlAdapters/util/buildControlAdapter.ts b/invokeai/frontend/web/src/features/controlAdapters/util/buildControlAdapter.ts
index d4796572d4..dc893ceb1c 100644
--- a/invokeai/frontend/web/src/features/controlAdapters/util/buildControlAdapter.ts
+++ b/invokeai/frontend/web/src/features/controlAdapters/util/buildControlAdapter.ts
@@ -46,6 +46,7 @@ export const initialIPAdapter: Omit<IPAdapterConfig, 'id'> = {
   isEnabled: true,
   controlImage: null,
   model: null,
+  clipVisionModel: 'ViT-H',
   weight: 1,
   beginStepPct: 0,
   endStepPct: 1,
diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/addIPAdapterToLinearGraph.ts b/invokeai/frontend/web/src/features/nodes/util/graph/addIPAdapterToLinearGraph.ts
index 2298e84d43..ad563de468 100644
--- a/invokeai/frontend/web/src/features/nodes/util/graph/addIPAdapterToLinearGraph.ts
+++ b/invokeai/frontend/web/src/features/nodes/util/graph/addIPAdapterToLinearGraph.ts
@@ -48,7 +48,7 @@ export const addIPAdapterToLinearGraph = async (
     if (!ipAdapter.model) {
       return;
     }
-    const { id, weight, model, beginStepPct, endStepPct, controlImage } = ipAdapter;
+    const { id, weight, model, clipVisionModel, beginStepPct, endStepPct, controlImage } = ipAdapter;
 
     assert(controlImage, 'IP Adapter image is required');
 
@@ -58,6 +58,7 @@
       is_intermediate: true,
       weight: weight,
       ip_adapter_model: model,
+      clip_vision_model: clipVisionModel,
       begin_step_percent: beginStepPct,
       end_step_percent: endStepPct,
       image: {
@@ -83,7 +84,7 @@
 };
 
 const buildIPAdapterMetadata = (ipAdapter: IPAdapterConfig): S['IPAdapterMetadataField'] => {
-  const { controlImage, beginStepPct, endStepPct, model, weight } = ipAdapter;
+  const { controlImage, beginStepPct, endStepPct, model, clipVisionModel, weight } = ipAdapter;
 
   assert(model, 'IP Adapter model is required');
 
@@ -99,6 +100,7 @@ const buildIPAdapterMetadata = (ipAdapter: IPAdapterConfig): S['IPAdapterMetadat
   return {
     ip_adapter_model: model,
+    clip_vision_model: clipVisionModel,
     weight,
     begin_step_percent: beginStepPct,
     end_step_percent: endStepPct,
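Taken together, the default in buildControlAdapter.ts and the mapping in addIPAdapterToLinearGraph.ts mean the camelCase clipVisionModel in the store becomes the snake_case clip_vision_model field the ip_adapter invocation expects. A rough, self-contained illustration of that flow follows; field names come from the hunks above, while the model and image fields are stubbed out rather than copied from the real types, so treat it as a sketch rather than the actual builder.

```ts
// Hypothetical sketch, not code from the patch: how a store entry's clipVisionModel
// ends up on the graph node. Unrelated wiring (model, image, graph edges) is omitted.
type CLIPVisionModel = 'ViT-H' | 'ViT-G';

// Mirrors the new default added to initialIPAdapter.
const ipAdapter = {
  type: 'ip_adapter' as const,
  id: 'ip_adapter_1',
  isEnabled: true,
  clipVisionModel: 'ViT-H' as CLIPVisionModel,
  weight: 1,
  beginStepPct: 0,
  endStepPct: 1,
};

// Mirrors the field mapping in addIPAdapterToLinearGraph: camelCase store fields become
// the snake_case keys of the backend invocation; clip_vision_model is the new one.
const ipAdapterNode = {
  id: ipAdapter.id,
  type: 'ip_adapter' as const,
  is_intermediate: true,
  weight: ipAdapter.weight,
  clip_vision_model: ipAdapter.clipVisionModel,
  begin_step_percent: ipAdapter.beginStepPct,
  end_step_percent: ipAdapter.endStepPct,
  // ip_adapter_model and image are omitted in this sketch; the real builder asserts both exist.
};

console.log(ipAdapterNode.clip_vision_model); // 'ViT-H' unless ViT-G was selected in the UI
```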