o
    Wh6d                     @   sd   d dl mZmZmZmZ d dlZd dlmZ d dlmZm	Z	 G dd dej
ZG dd dej
ZdS )	    )ListOptionalTupleTypeN)nn)MLPLayerNorm2dc                       s   e Zd ZdZdejddfdedejdedeej ded	ed
df fddZ	de
jde
jde
jde
jded
ee
je
jf fddZde
jde
jde
jde
jd
ee
je
jf f
ddZ  ZS )MaskDecodera  
    Decoder module for generating masks and their associated quality scores using a transformer architecture.

    This class predicts masks given image and prompt embeddings, utilizing a transformer to process the inputs and
    generate mask predictions along with their quality scores.

    Attributes:
        transformer_dim (int): Channel dimension for the transformer module.
        transformer (nn.Module): Transformer module used for mask prediction.
        num_multimask_outputs (int): Number of masks to predict for disambiguating masks.
        iou_token (nn.Embedding): Embedding for the IoU token.
        num_mask_tokens (int): Number of mask tokens.
        mask_tokens (nn.Embedding): Embedding for the mask tokens.
        output_upscaling (nn.Sequential): Neural network sequence for upscaling the output.
        output_hypernetworks_mlps (nn.ModuleList): Hypernetwork MLPs for generating masks.
        iou_prediction_head (nn.Module): MLP for predicting mask quality.

    Methods:
        forward: Predicts masks given image and prompt embeddings.
        predict_masks: Internal method for mask prediction.

    Examples:
        >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer_module)
        >>> masks, iou_pred = decoder(
        ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, multimask_output=True
        ... )
        >>> print(f"Predicted masks shape: {masks.shape}, IoU predictions shape: {iou_pred.shape}")
          transformer_dimtransformernum_multimask_outputs
activationiou_head_depthiou_head_hidden_dimreturnNc                    s   t     | _|| _|| _td | _|d | _t| j | _	t
tj  d dddt d | tj d  d ddd| | _t fddt| jD | _t || j|| _dS )	a  
        Initialize the MaskDecoder module for generating masks and their associated quality scores.

        Args:
            transformer_dim (int): Channel dimension for the transformer module.
            transformer (nn.Module): Transformer module used for mask prediction.
            num_multimask_outputs (int): Number of masks to predict for disambiguating masks.
            activation (Type[nn.Module]): Type of activation to use when upscaling masks.
            iou_head_depth (int): Depth of the MLP used to predict mask quality.
            iou_head_hidden_dim (int): Hidden dimension of the MLP used to predict mask quality.

        Examples:
            >>> transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=256, nhead=8), num_layers=6)
            >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer)
            >>> print(decoder)
                 kernel_sizestride   c                       g | ]}t    d  dqS r   r
   r   .0_r    [/var/www/vscode/kcb/lib/python3.10/site-packages/ultralytics/models/sam/modules/decoders.py
<listcomp>T       z(MaskDecoder.__init__.<locals>.<listcomp>N)super__init__r   r   r   r   	Embedding	iou_tokennum_mask_tokensmask_tokens
SequentialConvTranspose2dr   output_upscaling
ModuleListrangeoutput_hypernetworks_mlpsr   iou_prediction_head)selfr   r   r   r   r   r   	__class__r    r"   r&   )   s$   


zMaskDecoder.__init__image_embeddingsimage_pesparse_prompt_embeddingsdense_prompt_embeddingsmultimask_outputc           	      C   sb   | j ||||d\}}|rtddntdd}|dd|ddddf }|dd|f }||fS )a  
        Predict masks given image and prompt embeddings.

        Args:
            image_embeddings (torch.Tensor): Embeddings from the image encoder.
            image_pe (torch.Tensor): Positional encoding with the shape of image_embeddings.
            sparse_prompt_embeddings (torch.Tensor): Embeddings of the points and boxes.
            dense_prompt_embeddings (torch.Tensor): Embeddings of the mask inputs.
            multimask_output (bool): Whether to return multiple masks or a single mask.

        Returns:
            masks (torch.Tensor): Batched predicted masks.
            iou_pred (torch.Tensor): Batched predictions of mask quality.

        Examples:
            >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer_module)
            >>> image_emb = torch.rand(1, 256, 64, 64)
            >>> image_pe = torch.rand(1, 256, 64, 64)
            >>> sparse_emb = torch.rand(1, 2, 256)
            >>> dense_emb = torch.rand(1, 256, 64, 64)
            >>> masks, iou_pred = decoder(image_emb, image_pe, sparse_emb, dense_emb, multimask_output=True)
            >>> print(f"Masks shape: {masks.shape}, IoU predictions shape: {iou_pred.shape}")
        )r5   r6   r7   r8   r   Nr   )predict_masksslice)	r2   r5   r6   r7   r8   r9   masksiou_pred
mask_slicer!   r!   r"   forwardY   s   
zMaskDecoder.forwardc                    s\  t jjjjjgdd}|d|jd dd}t j||fdd}t j||jd dd}|| }t j||jd dd}|j\}	}
}}	|||\}}|dddddf }|ddddj
 ddf  |dd|	|
||}|} fddtj
D }t j|dd}|j\}	}
}}|||	|
||  |	d||}|}||fS )	z`Predict masks and quality scores using image and prompt embeddings via transformer architecture.r   dimr   Nr   c                    ,   g | ]}j |  d d |d d f qS Nr0   r   imask_tokens_outr2   r!   r"   r#           z-MaskDecoder.predict_masks.<locals>.<listcomp>)torchcatr(   weightr*   	unsqueezeexpandshaperepeat_interleaver   r)   	transposeviewr-   r/   stackr1   )r2   r5   r6   r7   r8   output_tokenstokenssrcpos_srcbchwhsiou_token_outupscaled_embeddinghyper_in_listhyper_inr<   r=   r!   rH   r"   r:      s(   	 
"
zMaskDecoder.predict_masks)__name__
__module____qualname____doc__r   GELUintModuler   r&   rK   Tensorboolr   r?   r:   __classcell__r!   r!   r3   r"   r	      sZ    !0
.r	   c                       s&  e Zd ZdZdejddddddddddfdedejd	ed
eej dedede	de	de	de	ddf fddZ
	d$dejdejdejdejde	de	deeej  deejejejejf fddZ	d$dejdejdejdejde	deeej  deejejejejf fddZd d! Zd"d# Z  ZS )%SAM2MaskDecodera
  
    Transformer-based decoder for predicting instance segmentation masks from image and prompt embeddings.

    This class extends the functionality of the MaskDecoder, incorporating additional features such as
    high-resolution feature processing, dynamic multimask output, and object score prediction.

    Attributes:
        transformer_dim (int): Channel dimension of the transformer.
        transformer (nn.Module): Transformer used to predict masks.
        num_multimask_outputs (int): Number of masks to predict when disambiguating masks.
        iou_token (nn.Embedding): Embedding for IOU token.
        num_mask_tokens (int): Total number of mask tokens.
        mask_tokens (nn.Embedding): Embedding for mask tokens.
        pred_obj_scores (bool): Whether to predict object scores.
        obj_score_token (nn.Embedding): Embedding for object score token.
        use_multimask_token_for_obj_ptr (bool): Whether to use multimask token for object pointer.
        output_upscaling (nn.Sequential): Upscaling layers for output.
        use_high_res_features (bool): Whether to use high-resolution features.
        conv_s0 (nn.Conv2d): Convolutional layer for high-resolution features (s0).
        conv_s1 (nn.Conv2d): Convolutional layer for high-resolution features (s1).
        output_hypernetworks_mlps (nn.ModuleList): List of MLPs for output hypernetworks.
        iou_prediction_head (MLP): MLP for IOU prediction.
        pred_obj_score_head (nn.Linear | MLP): Linear layer or MLP for object score prediction.
        dynamic_multimask_via_stability (bool): Whether to use dynamic multimask via stability.
        dynamic_multimask_stability_delta (float): Delta value for dynamic multimask stability.
        dynamic_multimask_stability_thresh (float): Threshold for dynamic multimask stability.

    Methods:
        forward: Predicts masks given image and prompt embeddings.
        predict_masks: Predicts instance segmentation masks from image and prompt embeddings.
        _get_stability_scores: Computes mask stability scores based on IoU between thresholds.
        _dynamic_multimask_via_stability: Dynamically selects the most stable mask output.

    Examples:
        >>> image_embeddings = torch.rand(1, 256, 64, 64)
        >>> image_pe = torch.rand(1, 256, 64, 64)
        >>> sparse_prompt_embeddings = torch.rand(1, 2, 256)
        >>> dense_prompt_embeddings = torch.rand(1, 256, 64, 64)
        >>> decoder = SAM2MaskDecoder(256, transformer)
        >>> masks, iou_pred, sam_tokens_out, obj_score_logits = decoder.forward(
        ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, True, False
        ... )
    r
   r   Fg?g\(\?r   r   r   r   r   r   use_high_res_featurespred_obj_scorespred_obj_scores_mlpuse_multimask_token_for_obj_ptrr   Nc                    sZ  t     | _|| _|| _td | _|d | _t| j | _	|| _
| j
r/td | _|| _ttj  d dddt d | tj d  d ddd| | _|| _|rstj  d ddd| _tj  d ddd| _t fddt| jD | _t || j||d| _| j
rt d| _|rt  dd	| _|	| _|
| _|| _d
S )a  
        Initialize the SAM2MaskDecoder module for predicting instance segmentation masks.

        This decoder extends the functionality of MaskDecoder, incorporating additional features such as
        high-resolution feature processing, dynamic multimask output, and object score prediction.

        Args:
            transformer_dim (int): Channel dimension of the transformer.
            transformer (nn.Module): Transformer used to predict masks.
            num_multimask_outputs (int): Number of masks to predict when disambiguating masks.
            activation (Type[nn.Module]): Type of activation to use when upscaling masks.
            iou_head_depth (int): Depth of the MLP used to predict mask quality.
            iou_head_hidden_dim (int): Hidden dimension of the MLP used to predict mask quality.
            use_high_res_features (bool): Whether to use high-resolution features.
            iou_prediction_use_sigmoid (bool): Whether to use sigmoid for IOU prediction.
            dynamic_multimask_via_stability (bool): Whether to use dynamic multimask via stability.
            dynamic_multimask_stability_delta (float): Delta value for dynamic multimask stability.
            dynamic_multimask_stability_thresh (float): Threshold for dynamic multimask stability.
            pred_obj_scores (bool): Whether to predict object scores.
            pred_obj_scores_mlp (bool): Whether to use MLP for object score prediction.
            use_multimask_token_for_obj_ptr (bool): Whether to use multimask token for object pointer.

        Examples:
            >>> transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=256, nhead=8), num_layers=6)
            >>> decoder = SAM2MaskDecoder(transformer_dim=256, transformer=transformer)
            >>> print(decoder)
        r   r   r   r   r   c                    r   r   r   r   r    r!   r"   r#   $  r$   z,SAM2MaskDecoder.__init__.<locals>.<listcomp>)sigmoidr
   N)r%   r&   r   r   r   r   r'   r(   r)   r*   rn   obj_score_tokenrp   r+   r,   r   r-   rm   Conv2dconv_s0conv_s1r.   r/   r0   r   r1   Linearpred_obj_score_headdynamic_multimask_via_stability!dynamic_multimask_stability_delta"dynamic_multimask_stability_thresh)r2   r   r   r   r   r   r   rm   iou_prediction_use_sigmoidrx   ry   rz   rn   ro   rp   r3   r    r"   r&      sN   
,


zSAM2MaskDecoder.__init__r5   r6   r7   r8   r9   repeat_imagehigh_res_featuresc                 C   s   | j ||||||d\}}	}
}|r,|ddddddddf }|	ddddf }	n)| jr;| js;| ||	\}}	n|ddddddddf }|	ddddf }	|re| jre|
ddddf }n
|
ddddf }||	||fS )a  
        Predict masks given image and prompt embeddings.

        Args:
            image_embeddings (torch.Tensor): Embeddings from the image encoder with shape (B, C, H, W).
            image_pe (torch.Tensor): Positional encoding with the shape of image_embeddings (B, C, H, W).
            sparse_prompt_embeddings (torch.Tensor): Embeddings of the points and boxes with shape (B, N, C).
            dense_prompt_embeddings (torch.Tensor): Embeddings of the mask inputs with shape (B, C, H, W).
            multimask_output (bool): Whether to return multiple masks or a single mask.
            repeat_image (bool): Flag to repeat the image embeddings.
            high_res_features (List[torch.Tensor] | None): Optional high-resolution features.

        Returns:
            masks (torch.Tensor): Batched predicted masks with shape (B, N, H, W).
            iou_pred (torch.Tensor): Batched predictions of mask quality with shape (B, N).
            sam_tokens_out (torch.Tensor): Batched SAM token for mask output with shape (B, N, C).
            object_score_logits (torch.Tensor): Batched object score logits with shape (B, 1).

        Examples:
            >>> image_embeddings = torch.rand(1, 256, 64, 64)
            >>> image_pe = torch.rand(1, 256, 64, 64)
            >>> sparse_prompt_embeddings = torch.rand(1, 2, 256)
            >>> dense_prompt_embeddings = torch.rand(1, 256, 64, 64)
            >>> decoder = SAM2MaskDecoder(256, transformer)
            >>> masks, iou_pred, sam_tokens_out, obj_score_logits = decoder.forward(
            ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, True, False
            ... )
        )r5   r6   r7   r8   r|   r}   Nr   r   )r:   rx   training _dynamic_multimask_via_stabilityrp   )r2   r5   r6   r7   r8   r9   r|   r}   r<   r=   rI   object_score_logitssam_tokens_outr!   r!   r"   r?   9  s&   &
  
zSAM2MaskDecoder.forwardc                    s^  d}j rtjjjjjjjgdd}d}ntjjjjjgdd}|d|	ddd}tj||fdd}	|rItj
||	jd dd}
n|jd |	jd ksUJ |}
|
| }
|	ddksfJ dtj
||	jd dd}|
j\}}}}|
||	\}}
|dd|ddf }|dd|d |d j ddf  |
dd||||}
js|
}nj\}}}}}|\}}||||
| }|||| } fdd	tjD }tj|dd}|j\}}}}||||||  |d||}|}j r|dksJ |dddddf }nd
||jd d }|| |fS )zYPredict instance segmentation masks from image and prompt embeddings using a transformer.r   r@   r   rB   z@image_pe should have size 1 in batch dim (from `get_dense_pe()`)Nr   c                    rC   rD   rE   rF   rH   r!   r"   r#     rJ   z1SAM2MaskDecoder.predict_masks.<locals>.<listcomp>g      $@)rn   rK   rL   rr   rM   r(   r*   rN   rO   sizerQ   rP   r   r)   rR   rS   rm   r-   r/   rT   r1   rw   new_ones)r2   r5   r6   r7   r8   r|   r}   srU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   dc1ln1act1dc2act2feat_s0feat_s1r`   ra   r<   r=   r   r!   rH   r"   r:     sV   ("
zSAM2MaskDecoder.predict_masksc                 C   sT   | d}| j}tj||kdd }tj|| kdd }t|dk|| dS )zNCompute mask stability scores based on IoU between upper and lower thresholds.rB   r@   r   g      ?)flattenry   rK   sumfloatwhere)r2   mask_logitsstability_deltaarea_iarea_ur!   r!   r"   _get_stability_scores  s
   
z%SAM2MaskDecoder._get_stability_scoresc                 C   s   |ddddddddf }|ddddf }t j|dd}t j|d|jd}|||f }|d}|||f }|d}|ddddddddf }	|ddddf }
| |	}|| jk}t |d 	|	|	|}t |	|
|
|}||fS )a  
        Dynamically select the most stable mask output based on stability scores and IoU predictions.

        This method is used when outputting a single mask. If the stability score from the current single-mask
        output (based on output token 0) falls below a threshold, it instead selects from multi-mask outputs
        (based on output tokens 1-3) the mask with the highest predicted IoU score. This ensures a valid mask
        for both clicking and tracking scenarios.

        Args:
            all_mask_logits (torch.Tensor): Logits for all predicted masks, shape (B, N, H, W) where B is
                batch size, N is number of masks (typically 4), and H, W are mask dimensions.
            all_iou_scores (torch.Tensor): Predicted IoU scores for all masks, shape (B, N).

        Returns:
            mask_logits_out (torch.Tensor): Selected mask logits, shape (B, 1, H, W).
            iou_scores_out (torch.Tensor): Selected IoU scores, shape (B, 1).

        Examples:
            >>> decoder = SAM2MaskDecoder(...)
            >>> all_mask_logits = torch.rand(2, 4, 256, 256)  # 2 images, 4 masks each
            >>> all_iou_scores = torch.rand(2, 4)
            >>> mask_logits, iou_scores = decoder._dynamic_multimask_via_stability(all_mask_logits, all_iou_scores)
            >>> print(mask_logits.shape, iou_scores.shape)
            torch.Size([2, 1, 256, 256]) torch.Size([2, 1])
        Nr   rB   r@   r   )device).NN)
rK   argmaxaranger   r   rN   r   rz   r   	expand_as)r2   all_mask_logitsall_iou_scoresmultimask_logitsmultimask_iou_scoresbest_scores_inds
batch_indsbest_multimask_logitsbest_multimask_iou_scoressinglemask_logitssinglemask_iou_scoresstability_scores	is_stablemask_logits_outiou_scores_outr!   r!   r"   r     s.    

 

z0SAM2MaskDecoder._dynamic_multimask_via_stabilityrD   )rb   rc   rd   re   r   rf   rg   rh   r   rj   r&   rK   ri   r   r   r   r?   r:   r   r   rk   r!   r!   r3   r"   rl      s    0e	
M
Grl   )typingr   r   r   r   rK   r   ultralytics.nn.modulesr   r   rh   r	   rl   r!   r!   r!   r"   <module>   s    %