Add a GroundedSamInvocation for image segmentation from a text prompt (Grounding DINO + Segment Anything Model).

Ryan Dick
2024-07-29 13:53:14 -04:00
parent 2ad13ac7eb
commit ff6398f7d8
6 changed files with 322 additions and 1 deletion
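
For orientation, the new invocation chains two models: Grounding DINO turns a text prompt into bounding boxes, and SAM turns those boxes into segmentation masks. Below is a minimal sketch of that flow using the plain transformers APIs that the wrappers in this commit build on; the checkpoint names, prompt, and threshold are illustrative and not taken from this commit.

import torch
from PIL import Image
from transformers import pipeline
from transformers.models.sam import SamModel
from transformers.models.sam.processing_sam import SamProcessor

image = Image.open("example.png").convert("RGB")

# 1) Text prompt -> bounding boxes (zero-shot object detection with Grounding DINO).
detector = pipeline(
    task="zero-shot-object-detection",
    model="IDEA-Research/grounding-dino-tiny",  # illustrative checkpoint
)
detections = detector(image, candidate_labels=["a cat."], threshold=0.3)
boxes = [[d["box"]["xmin"], d["box"]["ymin"], d["box"]["xmax"], d["box"]["ymax"]] for d in detections]

# 2) Bounding boxes -> masks (SAM). input_boxes is nested per image: [[box, box, ...]].
sam_model = SamModel.from_pretrained("facebook/sam-vit-base")  # illustrative checkpoint
sam_processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
inputs = sam_processor(images=image, input_boxes=[boxes], return_tensors="pt").to(sam_model.device)
with torch.no_grad():
    outputs = sam_model(**inputs)
masks = sam_processor.post_process_masks(
    masks=outputs.pred_masks,
    original_sizes=inputs.original_sizes,
    reshaped_input_sizes=inputs.reshaped_input_sizes,
)[0]  # post-processed masks for the single input image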


@@ -0,0 +1,27 @@
from typing import Optional

import torch
from transformers.pipelines import ZeroShotObjectDetectionPipeline


class GroundingDinoPipeline:
    """A wrapper class for a ZeroShotObjectDetectionPipeline that makes it compatible with the model manager's memory
    management system.
    """

    def __init__(self, pipeline: ZeroShotObjectDetectionPipeline):
        self._pipeline = pipeline

    def __call__(self, *args, **kwargs):
        return self._pipeline(*args, **kwargs)

    def to(self, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None) -> "GroundingDinoPipeline":
        self._pipeline.model.to(device=device, dtype=dtype)
        self._pipeline.device = self._pipeline.model.device
        return self

    def calc_size(self) -> int:
        # HACK(ryand): Fix the circular import issue.
        from invokeai.backend.model_manager.load.model_util import calc_module_size

        return calc_module_size(self._pipeline.model)
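
A small usage sketch for this wrapper; the checkpoint name and prompt are illustrative, and in the app the pipeline would normally be loaded and cached through the model manager rather than constructed directly.

import torch
from PIL import Image
from transformers import pipeline

from invokeai.backend.grounded_sam.grounding_dino_pipeline import GroundingDinoPipeline

# Build the underlying transformers pipeline, then wrap it so the model manager can
# move it between devices and account for its memory usage.
raw_pipeline = pipeline(task="zero-shot-object-detection", model="IDEA-Research/grounding-dino-tiny")
detector = GroundingDinoPipeline(raw_pipeline)
if torch.cuda.is_available():
    detector = detector.to(device=torch.device("cuda"))

# __call__ forwards to the wrapped pipeline.
image = Image.open("example.png").convert("RGB")
results = detector(image, candidate_labels=["a dog."], threshold=0.3)
print(detector.calc_size())  # approximate size of the wrapped model in bytes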


@@ -0,0 +1,50 @@
# This file contains utilities for Grounded-SAM mask refinement based on:
# https://github.com/NielsRogge/Transformers-Tutorials/blob/a39f33ac1557b02ebfb191ea7753e332b5ca933f/Grounding%20DINO/GroundingDINO_with_Segment_Anything.ipynb

import cv2
import numpy as np
import numpy.typing as npt


def mask_to_polygon(mask: npt.NDArray[np.uint8]) -> list[tuple[int, int]]:
    """Convert a binary mask to a polygon.

    Returns:
        list[tuple[int, int]]: List of (x, y) coordinates representing the vertices of the largest contour in the mask.
    """
    # Find contours in the binary mask.
    contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Find the contour with the largest area.
    largest_contour = max(contours, key=cv2.contourArea)

    # Extract the vertices of the contour.
    polygon = largest_contour.reshape(-1, 2).tolist()

    return polygon


def polygon_to_mask(
    polygon: list[tuple[int, int]], image_shape: tuple[int, int], fill_value: int = 1
) -> npt.NDArray[np.uint8]:
    """Convert a polygon to a segmentation mask.

    Args:
        polygon (list): List of (x, y) coordinates representing the vertices of the polygon.
        image_shape (tuple): Shape of the image (height, width) for the mask.
        fill_value (int): Value to fill the polygon with.

    Returns:
        np.ndarray: Segmentation mask with the polygon region filled with `fill_value`.
    """
    # Create an empty mask.
    mask = np.zeros(image_shape, dtype=np.uint8)

    # Convert the polygon to an array of points.
    pts = np.array(polygon, dtype=np.int32)

    # Fill the polygon region of the mask with `fill_value`.
    cv2.fillPoly(mask, [pts], color=(fill_value,))

    return mask
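
A short sketch of how these two helpers combine to refine a mask by keeping only its largest contour; the input array here is illustrative.

import numpy as np

# A noisy binary mask: one large blob plus a stray speckle.
mask = np.zeros((64, 64), dtype=np.uint8)
mask[10:40, 10:40] = 255
mask[60, 60] = 255  # speckle that should be discarded

# mask -> largest-contour polygon -> filled mask: everything outside the largest
# connected region is dropped.
polygon = mask_to_polygon(mask)
refined = polygon_to_mask(polygon, image_shape=mask.shape, fill_value=255)

assert refined[20, 20] == 255  # inside the blob
assert refined[60, 60] == 0  # speckle removed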


@@ -0,0 +1,35 @@
from typing import Optional

import torch
from PIL import Image
from transformers.models.sam import SamModel
from transformers.models.sam.processing_sam import SamProcessor


class SegmentAnythingModel:
    """A wrapper class for the transformers SAM model and processor that makes it compatible with the model manager."""

    def __init__(self, sam_model: SamModel, sam_processor: SamProcessor):
        self._sam_model = sam_model
        self._sam_processor = sam_processor

    def to(self, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None) -> "SegmentAnythingModel":
        self._sam_model.to(device=device, dtype=dtype)
        return self

    def calc_size(self) -> int:
        # HACK(ryand): Fix the circular import issue.
        from invokeai.backend.model_manager.load.model_util import calc_module_size

        return calc_module_size(self._sam_model)

    def segment(self, image: Image.Image, boxes: list[list[list[int]]]) -> torch.Tensor:
        inputs = self._sam_processor(images=image, input_boxes=boxes, return_tensors="pt").to(self._sam_model.device)
        outputs = self._sam_model(**inputs)
        masks = self._sam_processor.post_process_masks(
            masks=outputs.pred_masks,
            original_sizes=inputs.original_sizes,
            reshaped_input_sizes=inputs.reshaped_input_sizes,
        )[0]

        return masks
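
A usage sketch for segment(). Note the nesting that the SAM processor expects for input_boxes: one list of boxes per image, each box as [xmin, ymin, xmax, ymax]. The checkpoint name and box coordinates are illustrative.

from PIL import Image
from transformers.models.sam import SamModel
from transformers.models.sam.processing_sam import SamProcessor

from invokeai.backend.grounded_sam.segment_anything_model import SegmentAnythingModel

sam = SegmentAnythingModel(
    sam_model=SamModel.from_pretrained("facebook/sam-vit-base"),  # illustrative checkpoint
    sam_processor=SamProcessor.from_pretrained("facebook/sam-vit-base"),
)

image = Image.open("example.png").convert("RGB")
# One image, two candidate boxes (e.g. from Grounding DINO), each [xmin, ymin, xmax, ymax].
boxes = [[[10, 20, 200, 240], [50, 60, 120, 180]]]

masks = sam.segment(image=image, boxes=boxes)
# `masks` holds the post-processed masks for this image, typically one stack of mask
# proposals per input box.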


@@ -11,6 +11,8 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers.scheduling_utils import SchedulerMixin
 from transformers import CLIPTokenizer
+from invokeai.backend.grounded_sam.grounding_dino_pipeline import GroundingDinoPipeline
+from invokeai.backend.grounded_sam.segment_anything_model import SegmentAnythingModel
 from invokeai.backend.ip_adapter.ip_adapter import IPAdapter
 from invokeai.backend.lora import LoRAModelRaw
 from invokeai.backend.model_manager.config import AnyModel
@@ -34,7 +36,17 @@ def calc_model_size_by_data(logger: logging.Logger, model: AnyModel) -> int:
     elif isinstance(model, CLIPTokenizer):
         # TODO(ryand): Accurately calculate the tokenizer's size. It's small enough that it shouldn't matter for now.
         return 0
-    elif isinstance(model, (TextualInversionModelRaw, IPAdapter, LoRAModelRaw, SpandrelImageToImageModel)):
+    elif isinstance(
+        model,
+        (
+            TextualInversionModelRaw,
+            IPAdapter,
+            LoRAModelRaw,
+            SpandrelImageToImageModel,
+            GroundingDinoPipeline,
+            SegmentAnythingModel,
+        ),
+    ):
         return model.calc_size()
     else:
         # TODO(ryand): Promote this from a log to an exception once we are confident that we are handling all of the