Mirror of https://github.com/invoke-ai/InvokeAI (synced 2024-08-30 20:32:17 +00:00)
Add a GroundedSamInvocation for image segmentation from a text prompt (Grounding DINO + Segment Anything Model).
This commit is contained in:
parent 2ad13ac7eb
commit ff6398f7d8
invokeai/app/invocations/grounded_sam.py (new file, 197 lines)
@@ -0,0 +1,197 @@
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Optional

import numpy as np
import numpy.typing as npt
import torch
from PIL import Image
from transformers import AutoModelForMaskGeneration, AutoProcessor, pipeline
from transformers.models.sam import SamModel
from transformers.models.sam.processing_sam import SamProcessor
from transformers.pipelines import ZeroShotObjectDetectionPipeline

from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
from invokeai.app.invocations.fields import ImageField, InputField
from invokeai.app.invocations.primitives import ImageOutput
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.grounded_sam.grounding_dino_pipeline import GroundingDinoPipeline
from invokeai.backend.grounded_sam.mask_refinement import mask_to_polygon, polygon_to_mask
from invokeai.backend.grounded_sam.segment_anything_model import SegmentAnythingModel

GROUNDING_DINO_MODEL_ID = "IDEA-Research/grounding-dino-tiny"
SEGMENT_ANYTHING_MODEL_ID = "facebook/sam-vit-base"


@dataclass
class BoundingBox:
    """Bounding box helper class used locally for the Grounding DINO outputs."""

    xmin: int
    ymin: int
    xmax: int
    ymax: int

    def to_box(self) -> list[int]:
        """Convert to the array notation expected by SAM."""
        return [self.xmin, self.ymin, self.xmax, self.ymax]


@dataclass
class DetectionResult:
    """Detection result from Grounding DINO or Grounded SAM."""

    score: float
    label: str
    box: BoundingBox
    mask: Optional[npt.NDArray[Any]] = None

    @classmethod
    def from_dict(cls, detection_dict: dict[str, Any]):
        return cls(
            score=detection_dict["score"],
            label=detection_dict["label"],
            box=BoundingBox(
                xmin=detection_dict["box"]["xmin"],
                ymin=detection_dict["box"]["ymin"],
                xmax=detection_dict["box"]["xmax"],
                ymax=detection_dict["box"]["ymax"],
            ),
        )


@invocation(
    "grounded_segment_anything",
    title="Segment Anything (Text Prompt)",
    tags=["prompt", "segmentation"],
    category="segmentation",
    version="1.0.0",
)
class GroundedSAMInvocation(BaseInvocation):
    """Runs Grounded-SAM, as proposed in https://arxiv.org/pdf/2401.14159.

    More specifically, a Grounding DINO model is run to obtain bounding boxes for a text prompt, then the bounding box
    is passed as a prompt to a Segment Anything model to obtain a segmentation mask.

    Reference:
    - https://huggingface.co/docs/transformers/v4.43.3/en/model_doc/grounding-dino#grounded-sam
    - https://github.com/NielsRogge/Transformers-Tutorials/blob/a39f33ac1557b02ebfb191ea7753e332b5ca933f/Grounding%20DINO/GroundingDINO_with_Segment_Anything.ipynb
    """

    prompt: str = InputField(description="The prompt describing the object to segment.")
    image: ImageField = InputField(description="The image to segment.")
    apply_polygon_refinement: bool = InputField(
        description="Whether to apply polygon refinement to the mask. This will smooth the edges of the mask slightly "
        "and ensure that the mask consists of a single closed polygon.",
        default=False,
    )

    def invoke(self, context: InvocationContext) -> ImageOutput:
        image_pil = context.images.get_pil(self.image.image_name)

        detections = self._detect(context=context, image=image_pil, labels=[self.prompt])
        detections = self._segment(context=context, image=image_pil, detection_results=detections)

        # Extract output mask.
        mask_np = detections[0].mask
        assert mask_np is not None
        # Map [0, 1] to [0, 255].
        mask_np = mask_np * 255
        mask_pil = Image.fromarray(mask_np)

        image_dto = context.images.save(image=mask_pil)
        return ImageOutput.build(image_dto)

    def _to_box_array(self, detection_results: list[DetectionResult]) -> list[list[list[int]]]:
        """Convert a list of DetectionResults to the format expected by the Segment Anything model.

        Args:
            detection_results (list[DetectionResult]): The Grounding DINO detection results.
        """
        boxes = [result.box.to_box() for result in detection_results]
        return [boxes]

    def _detect(
        self,
        context: InvocationContext,
        image: Image.Image,
        labels: list[str],
        threshold: float = 0.3,
    ) -> list[DetectionResult]:
        """Use Grounding DINO to detect bounding boxes for a set of labels in an image."""

        def load_grounding_dino(model_path: Path):
            grounding_dino_pipeline = pipeline(
                model=str(model_path),
                task="zero-shot-object-detection",
                local_files_only=True,
                # TODO(ryand): Setting the torch_dtype here doesn't work. Investigate whether fp16 is supported by the
                # model, and figure out how to make it work in the pipeline.
                # torch_dtype=TorchDevice.choose_torch_dtype(),
            )
            assert isinstance(grounding_dino_pipeline, ZeroShotObjectDetectionPipeline)
            return GroundingDinoPipeline(grounding_dino_pipeline)

        with context.models.load_remote_model(source=GROUNDING_DINO_MODEL_ID, loader=load_grounding_dino) as detector:
            assert isinstance(detector, GroundingDinoPipeline)

            # TODO(ryand): I copied this "."-handling logic from the transformers example code. Test it and see if it
            # actually makes a difference.
            labels = [label if label.endswith(".") else label + "." for label in labels]

            results = detector(image, candidate_labels=labels, threshold=threshold)
            results = [DetectionResult.from_dict(result) for result in results]
            return results

    def _segment(
        self,
        context: InvocationContext,
        image: Image.Image,
        detection_results: list[DetectionResult],
    ) -> list[DetectionResult]:
        """Use Segment Anything (SAM) to generate masks given an image + a set of bounding boxes."""

        def load_sam_model(model_path: Path):
            sam_model = AutoModelForMaskGeneration.from_pretrained(
                model_path,
                local_files_only=True,
                # TODO(ryand): Setting the torch_dtype here doesn't work. Investigate whether fp16 is supported by the
                # model, and figure out how to make it work in the pipeline.
                # torch_dtype=TorchDevice.choose_torch_dtype(),
            )
            assert isinstance(sam_model, SamModel)

            sam_processor = AutoProcessor.from_pretrained(model_path, local_files_only=True)
            assert isinstance(sam_processor, SamProcessor)
            return SegmentAnythingModel(sam_model=sam_model, sam_processor=sam_processor)

        with (
            context.models.load_remote_model(source=SEGMENT_ANYTHING_MODEL_ID, loader=load_sam_model) as sam_pipeline,
        ):
            assert isinstance(sam_pipeline, SegmentAnythingModel)

            boxes = self._to_box_array(detection_results)
            masks = sam_pipeline.segment(image=image, boxes=boxes)
            masks = self._refine_masks(masks)

            for detection_result, mask in zip(detection_results, masks, strict=False):
                detection_result.mask = mask

            return detection_results

    def _refine_masks(self, masks: torch.Tensor) -> list[npt.NDArray[np.uint8]]:
        masks = masks.cpu().float()
        masks = masks.permute(0, 2, 3, 1)
        masks = masks.mean(axis=-1)
        masks = (masks > 0).int()
        masks = masks.numpy().astype(np.uint8)
        masks = list(masks)

        if self.apply_polygon_refinement:
            for idx, mask in enumerate(masks):
                shape = mask.shape
                polygon = mask_to_polygon(mask)
                mask = polygon_to_mask(polygon, shape)
                masks[idx] = mask

        return masks
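For reference, below is a minimal standalone sketch of the same detect-then-segment flow, built on the transformers APIs that the invocation above uses. It pulls both models from the Hugging Face Hub instead of going through InvokeAI's model manager; the input filename, prompt, and threshold are illustrative only.

# Standalone sketch of the Grounded-SAM flow (hedged: Hub download, hypothetical input file).
import numpy as np
from PIL import Image
from transformers import AutoModelForMaskGeneration, AutoProcessor, pipeline

image = Image.open("example.png").convert("RGB")  # hypothetical input image

# 1. Grounding DINO: text prompt -> bounding boxes.
detector = pipeline(task="zero-shot-object-detection", model="IDEA-Research/grounding-dino-tiny")
detections = detector(image, candidate_labels=["a cat."], threshold=0.3)
boxes = [[d["box"]["xmin"], d["box"]["ymin"], d["box"]["xmax"], d["box"]["ymax"]] for d in detections]

# 2. SAM: bounding boxes -> segmentation masks.
sam_model = AutoModelForMaskGeneration.from_pretrained("facebook/sam-vit-base")
sam_processor = AutoProcessor.from_pretrained("facebook/sam-vit-base")
inputs = sam_processor(images=image, input_boxes=[boxes], return_tensors="pt")
outputs = sam_model(**inputs)
masks = sam_processor.post_process_masks(
    masks=outputs.pred_masks,
    original_sizes=inputs.original_sizes,
    reshaped_input_sizes=inputs.reshaped_input_sizes,
)[0]

# Collapse the per-box mask channels and save the first mask as a black-and-white image,
# mirroring the _refine_masks() and invoke() logic above.
mask_np = (masks.float().permute(0, 2, 3, 1).mean(axis=-1) > 0).numpy().astype(np.uint8)
Image.fromarray(mask_np[0] * 255).save("mask.png")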
invokeai/backend/grounded_sam/__init__.py (new file, 0 lines)
invokeai/backend/grounded_sam/grounding_dino_pipeline.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from typing import Optional

import torch
from transformers.pipelines import ZeroShotObjectDetectionPipeline


class GroundingDinoPipeline:
    """A wrapper class for a ZeroShotObjectDetectionPipeline that makes it compatible with the model manager's memory
    management system.
    """

    def __init__(self, pipeline: ZeroShotObjectDetectionPipeline):
        self._pipeline = pipeline

    def __call__(self, *args, **kwargs):
        return self._pipeline(*args, **kwargs)

    def to(self, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None) -> "GroundingDinoPipeline":
        self._pipeline.model.to(device=device, dtype=dtype)
        self._pipeline.device = self._pipeline.model.device
        return self

    def calc_size(self) -> int:
        # HACK(ryand): Fix the circular import issue.
        from invokeai.backend.model_manager.load.model_util import calc_module_size

        return calc_module_size(self._pipeline.model)
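calc_module_size() itself is not part of this diff. As a rough, hypothetical illustration of what such a helper typically computes (total bytes of a module's parameters and buffers), a sketch along these lines captures the idea; the real InvokeAI implementation may differ.

# Hypothetical approximation of a module-size helper, for illustration only.
import torch


def approx_module_size(module: torch.nn.Module) -> int:
    # Sum the in-memory size of all parameters and buffers, in bytes.
    param_bytes = sum(p.numel() * p.element_size() for p in module.parameters())
    buffer_bytes = sum(b.numel() * b.element_size() for b in module.buffers())
    return param_bytes + buffer_bytes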
invokeai/backend/grounded_sam/mask_refinement.py (new file, 50 lines)
@@ -0,0 +1,50 @@
# This file contains utilities for Grounded-SAM mask refinement based on:
# https://github.com/NielsRogge/Transformers-Tutorials/blob/a39f33ac1557b02ebfb191ea7753e332b5ca933f/Grounding%20DINO/GroundingDINO_with_Segment_Anything.ipynb


import cv2
import numpy as np
import numpy.typing as npt


def mask_to_polygon(mask: npt.NDArray[np.uint8]) -> list[tuple[int, int]]:
    """Convert a binary mask to a polygon.

    Returns:
        list[tuple[int, int]]: List of (x, y) coordinates representing the vertices of the polygon.
    """
    # Find contours in the binary mask.
    contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Find the contour with the largest area.
    largest_contour = max(contours, key=cv2.contourArea)

    # Extract the vertices of the contour.
    polygon = largest_contour.reshape(-1, 2).tolist()

    return polygon


def polygon_to_mask(
    polygon: list[tuple[int, int]], image_shape: tuple[int, int], fill_value: int = 1
) -> npt.NDArray[np.uint8]:
    """Convert a polygon to a segmentation mask.

    Args:
        polygon (list): List of (x, y) coordinates representing the vertices of the polygon.
        image_shape (tuple): Shape of the image (height, width) for the mask.
        fill_value (int): Value to fill the polygon with.

    Returns:
        np.ndarray: Segmentation mask with the polygon filled with fill_value.
    """
    # Create an empty mask.
    mask = np.zeros(image_shape, dtype=np.uint8)

    # Convert polygon to an array of points.
    pts = np.array(polygon, dtype=np.int32)

    # Fill the polygon with fill_value.
    cv2.fillPoly(mask, [pts], color=(fill_value,))

    return mask
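A toy round-trip through these helpers (with illustrative values) shows the effect that apply_polygon_refinement has in _refine_masks(): only the largest connected region survives, as a single filled polygon.

# Toy example using the helpers defined above; the blob sizes and positions are arbitrary.
import numpy as np

mask = np.zeros((64, 64), dtype=np.uint8)
mask[8:40, 8:40] = 1    # large blob (kept as the largest contour)
mask[50:54, 50:54] = 1  # small blob (dropped by the largest-contour selection)

polygon = mask_to_polygon(mask)
refined = polygon_to_mask(polygon, mask.shape)
assert refined[20, 20] == 1 and refined[52, 52] == 0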
invokeai/backend/grounded_sam/segment_anything_model.py (new file, 35 lines)
@@ -0,0 +1,35 @@
from typing import Optional

import torch
from PIL import Image
from transformers.models.sam import SamModel
from transformers.models.sam.processing_sam import SamProcessor


class SegmentAnythingModel:
    """A wrapper class for the transformers SAM model and processor that makes it compatible with the model manager."""

    def __init__(self, sam_model: SamModel, sam_processor: SamProcessor):
        self._sam_model = sam_model
        self._sam_processor = sam_processor

    def to(self, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None) -> "SegmentAnythingModel":
        self._sam_model.to(device=device, dtype=dtype)
        return self

    def calc_size(self) -> int:
        # HACK(ryand): Fix the circular import issue.
        from invokeai.backend.model_manager.load.model_util import calc_module_size

        return calc_module_size(self._sam_model)

    def segment(self, image: Image.Image, boxes: list[list[list[int]]]) -> torch.Tensor:
        inputs = self._sam_processor(images=image, input_boxes=boxes, return_tensors="pt").to(self._sam_model.device)
        outputs = self._sam_model(**inputs)
        masks = self._sam_processor.post_process_masks(
            masks=outputs.pred_masks,
            original_sizes=inputs.original_sizes,
            reshaped_input_sizes=inputs.reshaped_input_sizes,
        )[0]

        return masks
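A hedged sketch of calling this wrapper directly, outside the invocation. The box argument is nested three levels deep (images, then boxes per image, then [xmin, ymin, xmax, ymax], matching _to_box_array() above). Loading from the Hub and the box coordinates are assumptions for illustration; the invocation normally loads the model via context.models.load_remote_model().

# Illustrative only: construct the wrapper class defined above with a Hub-downloaded SAM model.
from PIL import Image
from transformers import AutoModelForMaskGeneration, AutoProcessor

sam = SegmentAnythingModel(
    sam_model=AutoModelForMaskGeneration.from_pretrained("facebook/sam-vit-base"),
    sam_processor=AutoProcessor.from_pretrained("facebook/sam-vit-base"),
)
image = Image.open("example.png").convert("RGB")  # hypothetical input image
boxes = [[[100, 120, 350, 400]]]  # one image -> one box -> [xmin, ymin, xmax, ymax]
masks = sam.segment(image=image, boxes=boxes)  # tensor of shape (num_boxes, masks_per_box, height, width)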
@@ -11,6 +11,8 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers.scheduling_utils import SchedulerMixin
 from transformers import CLIPTokenizer

+from invokeai.backend.grounded_sam.grounding_dino_pipeline import GroundingDinoPipeline
+from invokeai.backend.grounded_sam.segment_anything_model import SegmentAnythingModel
 from invokeai.backend.ip_adapter.ip_adapter import IPAdapter
 from invokeai.backend.lora import LoRAModelRaw
 from invokeai.backend.model_manager.config import AnyModel
@@ -34,7 +36,17 @@ def calc_model_size_by_data(logger: logging.Logger, model: AnyModel) -> int:
     elif isinstance(model, CLIPTokenizer):
         # TODO(ryand): Accurately calculate the tokenizer's size. It's small enough that it shouldn't matter for now.
         return 0
-    elif isinstance(model, (TextualInversionModelRaw, IPAdapter, LoRAModelRaw, SpandrelImageToImageModel)):
+    elif isinstance(
+        model,
+        (
+            TextualInversionModelRaw,
+            IPAdapter,
+            LoRAModelRaw,
+            SpandrelImageToImageModel,
+            GroundingDinoPipeline,
+            SegmentAnythingModel,
+        ),
+    ):
         return model.calc_size()
     else:
         # TODO(ryand): Promote this from a log to an exception once we are confident that we are handling all of the