Add CLIP Vision model to IP-Adapter info and use this to infer which model to use.

2024-08-30 20:32:17 +00:00 · 2023-09-14 11:57:53 -04:00
parent cadc0839a6
commit 388554448a
4 changed files with 84 additions and 62 deletions
--- a/invokeai/app/invocations/ip_adapter.py
+++ b/invokeai/app/invocations/ip_adapter.py
@@ -15,11 +15,7 @@ from invokeai.app.invocations.baseinvocation import (
    invocation_output,
 )
 from invokeai.app.invocations.primitives import ImageField
-from invokeai.backend.model_management.models.base import BaseModelType
-
-IP_ADAPTER_IMAGE_ENCODER_MODELS = Literal[
-    "models/core/ip_adapters/sd-1/image_encoder/", "models/core/ip_adapters/sdxl/image_encoder"
-]
+from invokeai.backend.model_management.models.base import BaseModelType, ModelType


 class IPAdapterModelField(BaseModel):
@@ -27,14 +23,15 @@ class IPAdapterModelField(BaseModel):
    base_model: BaseModelType = Field(description="Base model")


+class CLIPVisionModelField(BaseModel):
+    model_name: str = Field(description="Name of the CLIP Vision image encoder model")
+    base_model: BaseModelType = Field(description="Base model (usually 'Any')")
+
+
 class IPAdapterField(BaseModel):
    image: ImageField = Field(description="The IP-Adapter image prompt.")
-
    ip_adapter_model: IPAdapterModelField = Field(description="The IP-Adapter model to use.")
-
-    # TODO(ryand): Create and use a `CLIPImageEncoderField` instead that is analogous to the `ClipField` used elsewhere.
-    image_encoder_model: str = Field(description="The name of the CLIP image encoder model.")
-
+    image_encoder_model: CLIPVisionModelField = Field(description="The name of the CLIP image encoder model.")
    weight: float = Field(default=1.0, ge=0, description="The weight of the IP-Adapter.")


@@ -55,17 +52,24 @@ class IPAdapterInvocation(BaseInvocation):
        title="IP-Adapter Model",
        input=Input.Direct,
    )
-    image_encoder_model: IP_ADAPTER_IMAGE_ENCODER_MODELS = InputField(
-        default="models/core/ip_adapters/sd-1/image_encoder/", description="The name of the CLIP image encoder model."
-    )
    weight: float = InputField(default=1.0, description="The weight of the IP-Adapter.", ui_type=UIType.Float)

    def invoke(self, context: InvocationContext) -> IPAdapterOutput:
+        # Lookup the CLIP Vision encoder that is intended to be used with the IP-Adapter model.
+        ip_adapter_info = context.services.model_manager.model_info(
+            self.ip_adapter_model.model_name, self.ip_adapter_model.base_model, ModelType.IPAdapter
+        )
+        image_encoder_model_name = ip_adapter_info["image_encoder_model"].split("/")[-1].strip()
+        image_encoder_model = CLIPVisionModelField(
+            model_name=image_encoder_model_name,
+            base_model=BaseModelType.Any,
+        )
+
        return IPAdapterOutput(
            ip_adapter=IPAdapterField(
                image=self.image,
                ip_adapter_model=self.ip_adapter_model,
-                image_encoder_model=self.image_encoder_model,
+                image_encoder_model=image_encoder_model,
                weight=self.weight,
            ),
        )