Mirror of https://github.com/invoke-ai/InvokeAI (synced 2024-08-30 20:32:17 +00:00)
(minor) Remove commented code.

commit bcfb43e5f0 (parent a665f20fb5)
@@ -5,25 +5,6 @@ from invokeai.backend.stable_diffusion.diffusion.conditioning_data import (
     TextConditioningRegions,
 )
 
-# Stages:
-# - Convert image masks to spatial masks at all downsampling factors.
-#   - Decision: Max pooling? Nearest? Other?
-#   - Should definitely be shared across all denoising steps - that should be easy.
-# - Convert spatial masks to cross-attention masks.
-#   - This should ideally be shared across all denoising steps, but preparing the masks requires knowing the max_key_seq_len.
-#   - Could create it just-in-time and them cache the result
-# - Convert spatial masks to self-attention masks.
-#   - This should be shared across all denoising steps.
-#   - Shape depends only on spatial resolution and downsampling factors.
-# - Convert cross-attention binary mask to score mask.
-# - Convert self-attention binary mask to score mask.
-#
-# If we wanted a time schedule for level of attenuation, we would apply that in the attention layer.
-
-
-# Pre-compute the spatial masks, because that's easy.
-# Compute the other stuff as it's requested. Add caching if we find that it's slow.
-
 
 class RegionalPromptData:
     def __init__(self, regions: list[TextConditioningRegions], max_downscale_factor: int = 8):
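For reference, a minimal sketch of the first stage the removed planning comment describes (image masks to spatial masks at every downsampling factor), assuming boolean masks of shape (num_prompts, h, w). The helper name build_spatial_masks and its signature are illustrative, not InvokeAI API.

```python
import torch
import torch.nn.functional as F


def build_spatial_masks(masks: torch.Tensor, max_downscale_factor: int = 8) -> dict[int, torch.Tensor]:
    """Return {downscale_factor: masks} for factors 1, 2, 4, ..., max_downscale_factor."""
    # Float conversion is required because max_pool2d does not accept bool tensors.
    masks = masks.to(dtype=torch.float32).unsqueeze(0)  # (1, num_prompts, h, w)
    masks_by_factor: dict[int, torch.Tensor] = {}
    factor = 1
    while factor <= max_downscale_factor:
        masks_by_factor[factor] = masks
        # Max pooling keeps small prompt regions alive at low resolutions,
        # where nearest-neighbor downsampling could drop them entirely.
        masks = F.max_pool2d(masks, kernel_size=2, stride=2)
        factor *= 2
    return masks_by_factor
```

Max pooling matches the choice the surviving code makes below; the removed comment's open question ("Nearest? Other?") would slot in at the pooling call.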
@@ -78,75 +59,6 @@ class RegionalPromptData:
                 batch_sample_masks = F.max_pool2d(batch_sample_masks, kernel_size=2, stride=2)
 
         return batch_sample_masks_by_seq_len
-        # Merge the batch_attn_masks_by_seq_len into a single attn_masks_by_seq_len.
-        # for query_seq_len in batch_sample_masks_by_seq_len[0].keys():
-        #     masks_by_seq_len[query_seq_len] = torch.cat(
-        #         [batch_sample_masks_by_seq_len[i][query_seq_len] for i in range(len(batch_sample_masks_by_seq_len))]
-        #     )
-
-        # return masks_by_seq_len
-
-    # @classmethod
-    # def from_regions(
-    #     cls,
-    #     regions: list[TextConditioningRegions],
-    #     key_seq_len: int,
-    #     max_downscale_factor: int = 8,
-    # ):
-    #     """Construct a `RegionalPromptData` object.
-
-    #     Args:
-    #         regions (list[TextConditioningRegions]): regions[i] contains the prompt regions for the i'th sample in the
-    #             batch.
-    #     """
-    #     attn_masks_by_seq_len = {}
-
-    #     # batch_attn_mask_by_seq_len[b][s] contains the attention mask for the b'th batch sample with a query sequence
-    #     # length of s.
-    #     batch_attn_masks_by_seq_len: list[dict[int, torch.Tensor]] = []
-    #     for batch_sample_regions in regions:
-    #         batch_attn_masks_by_seq_len.append({})
-
-    #         # Convert the bool masks to float masks so that max pooling can be applied.
-    #         batch_masks = batch_sample_regions.masks.to(dtype=torch.float32)
-
-    #         # Downsample the spatial dimensions by factors of 2 until max_downscale_factor is reached.
-    #         downscale_factor = 1
-    #         while downscale_factor <= max_downscale_factor:
-    #             _, num_prompts, h, w = batch_masks.shape
-    #             query_seq_len = h * w
-
-    #             # Flatten the spatial dimensions of the mask by reshaping to (1, num_prompts, query_seq_len, 1).
-    #             batch_query_masks = batch_masks.reshape((1, num_prompts, -1, 1))
-
-    #             # Create a cross-attention mask for each prompt that selects the corresponding embeddings from
-    #             # `encoder_hidden_states`.
-    #             # attn_mask shape: (batch_size, query_seq_len, key_seq_len)
-    #             # TODO(ryand): What device / dtype should this be?
-    #             attn_mask = torch.zeros((1, query_seq_len, key_seq_len))
-
-    #             for prompt_idx, embedding_range in enumerate(batch_sample_regions.ranges):
-    #                 attn_mask[0, :, embedding_range.start : embedding_range.end] = batch_query_masks[
-    #                     :, prompt_idx, :, :
-    #                 ]
-
-    #             batch_attn_masks_by_seq_len[-1][query_seq_len] = attn_mask
-
-    #             downscale_factor *= 2
-    #             if downscale_factor <= max_downscale_factor:
-    #                 # We use max pooling because we downscale to a pretty low resolution, so we don't want small prompt
-    #                 # regions to be lost entirely.
-    #                 # TODO(ryand): In the future, we may want to experiment with other downsampling methods, and could
-    #                 # potentially use a weighted mask rather than a binary mask.
-    #                 batch_masks = F.max_pool2d(batch_masks, kernel_size=2, stride=2)
-
-    #     # Merge the batch_attn_masks_by_seq_len into a single attn_masks_by_seq_len.
-    #     for query_seq_len in batch_attn_masks_by_seq_len[0].keys():
-    #         attn_masks_by_seq_len[query_seq_len] = torch.cat(
-    #             [batch_attn_masks_by_seq_len[i][query_seq_len] for i in range(len(batch_attn_masks_by_seq_len))]
-    #         )
-
-    #     return cls(attn_masks_by_seq_len)
-
 
     def get_cross_attn_mask(self, query_seq_len: int, key_seq_len: int) -> torch.Tensor:
         """Get the cross-attention mask for the given query sequence length.
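The commented-out `from_regions` draft removed here built per-resolution cross-attention masks. As a reference for what that computation does, a self-contained sketch at a single resolution, where `ranges` is a hypothetical list of (start, end) slices into the concatenated text embedding, standing in for TextConditioningRegions.ranges:

```python
import torch


def cross_attn_mask_from_spatial_masks(
    spatial_masks: torch.Tensor,  # (num_prompts, h, w), float 0/1 region masks
    ranges: list[tuple[int, int]],  # per-prompt [start, end) slices of the key sequence
    key_seq_len: int,
) -> torch.Tensor:
    """Build a (query_seq_len, key_seq_len) mask that lets each latent position
    attend only to the text embeddings of the prompts covering it."""
    num_prompts, h, w = spatial_masks.shape
    query_seq_len = h * w
    # One column of mask values per flattened query position, per prompt.
    query_masks = spatial_masks.reshape(num_prompts, query_seq_len, 1)
    attn_mask = torch.zeros((query_seq_len, key_seq_len))
    for prompt_idx, (start, end) in enumerate(ranges):
        # Broadcast the prompt's spatial mask across its embedding range.
        attn_mask[:, start:end] = query_masks[prompt_idx]
    return attn_mask
```

Repeating this at each downsampling factor and concatenating across the batch reproduces the shape of the deleted draft, minus the open device/dtype question it flagged.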
@@ -232,20 +144,6 @@ class RegionalPromptData:
                         * batch_sample_regions.positive_self_attn_mask_scores[prompt_idx]
                     )
 
-            # attn_mask_min = attn_mask[batch_idx].min()
-            # attn_mask_max = attn_mask[batch_idx].max()
-            # attn_mask_range = attn_mask_max - attn_mask_min
-
-            # if abs(attn_mask_range) < 0.0001:
-            #     # All attn_mask value in this batch sample are the same, set the attn_mask to 0.0s (to avoid divide by
-            #     # zero in the normalization).
-            #     attn_mask[batch_idx] = attn_mask[batch_idx] * 0.0
-            # else:
-            #     # Normalize from range [attn_mask_min, attn_mask_max] to [0, self.self_attn_score_range].
-            #     attn_mask[batch_idx] = (
-            #         (attn_mask[batch_idx] - attn_mask_min) / attn_mask_range * self.self_attn_score_range
-            #     )
-
             attn_mask_min = attn_mask[batch_idx].min()
 
             # Adjust so that the minimum value is 0.0 regardless of whether all pixels are covered or not.
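The block removed in this hunk rescaled each sample's mask scores from [min, max] to [0, self_attn_score_range]; the live code keeps only the min-shift visible in the context lines above. For comparison, a sketch of the removed normalization with self_attn_score_range passed as a plain float rather than the instance attribute the comment references:

```python
import torch


def normalize_self_attn_mask(attn_mask: torch.Tensor, self_attn_score_range: float) -> torch.Tensor:
    """Rescale one sample's mask scores from [min, max] to [0, self_attn_score_range]."""
    attn_mask_min = attn_mask.min()
    attn_mask_range = attn_mask.max() - attn_mask_min
    if abs(attn_mask_range) < 1e-4:
        # All values in this sample are equal: return zeros instead of dividing by zero.
        return torch.zeros_like(attn_mask)
    return (attn_mask - attn_mask_min) / attn_mask_range * self_attn_score_range
```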