from typing import List

import torch
import torch.nn.functional as F
from torch import nn
from torch.nn.init import trunc_normal_

from ultralytics.nn.modules import MLP
from ultralytics.utils import LOGGER

from .blocks import SAM2TwoWayTransformer
from .decoders import MaskDecoder, SAM2MaskDecoder
from .encoders import ImageEncoderViT, PromptEncoder
from .utils import get_1d_sine_pe, select_closest_cond_frames

# A large negative value used as a placeholder score for missing objects
NO_OBJ_SCORE = -1024.0
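
# Illustrative aside (not part of the original module): NO_OBJ_SCORE is substituted for
# mask logits wherever no object is present (see the torch.where(...) calls in SAM2Model
# below). A logit this negative saturates the sigmoid to exactly 0.0 in float32, so
# "no object" regions drop out of any soft mask:
#
#     >>> import torch
#     >>> torch.sigmoid(torch.tensor(NO_OBJ_SCORE))
#     tensor(0.)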
e ddf fddZdd Z  ZS )SAMModela  
    Segment Anything Model (SAM) for object segmentation tasks.

    This class combines image encoders, prompt encoders, and mask decoders to predict object masks from images
    and input prompts.

    Attributes:
        mask_threshold (float): Threshold value for mask prediction.
        image_encoder (ImageEncoderViT): Backbone for encoding images into embeddings.
        prompt_encoder (PromptEncoder): Encoder for various types of input prompts.
        mask_decoder (MaskDecoder): Predicts object masks from image and prompt embeddings.

    Methods:
        __init__: Initializes the SAMModel with encoders, decoder, and normalization parameters.

    Examples:
        >>> image_encoder = ImageEncoderViT(...)
        >>> prompt_encoder = PromptEncoder(...)
        >>> mask_decoder = MaskDecoder(...)
        >>> sam_model = SAMModel(image_encoder, prompt_encoder, mask_decoder)
        >>> # Further usage depends on SAMPredictor class

    Notes:
        All forward() operations are implemented in the SAMPredictor class.
    """

    mask_threshold: float = 0.0

    def __init__(
        self,
        image_encoder: ImageEncoderViT,
        prompt_encoder: PromptEncoder,
        mask_decoder: MaskDecoder,
        pixel_mean: List[float] = (123.675, 116.28, 103.53),
        pixel_std: List[float] = (58.395, 57.12, 57.375),
    ) -> None:
        """
        Initialize the SAMModel class to predict object masks from an image and input prompts.

        Args:
            image_encoder (ImageEncoderViT): The backbone used to encode the image into image embeddings.
            prompt_encoder (PromptEncoder): Encodes various types of input prompts.
            mask_decoder (MaskDecoder): Predicts masks from the image embeddings and encoded prompts.
            pixel_mean (List[float]): Mean values for normalizing pixels in the input image.
            pixel_std (List[float]): Std values for normalizing pixels in the input image.

        Examples:
            >>> image_encoder = ImageEncoderViT(...)
            >>> prompt_encoder = PromptEncoder(...)
            >>> mask_decoder = MaskDecoder(...)
            >>> sam_model = SAMModel(image_encoder, prompt_encoder, mask_decoder)
            >>> # Further usage depends on SAMPredictor class

        Notes:
            All forward() operations moved to SAMPredictor.
        r   r   Fr   N)	super__init__r   r   r   register_buffertorchTensorview)selfr   r   r   r   r   	__class__ V/var/www/vscode/kcb/lib/python3.10/site-packages/ultralytics/models/sam/modules/sam.pyr   9   s   
"zSAMModel.__init__c                 C   sB   t | jdr| j| || j_dd |D | j_|d | j_dS )z
        Set image size to make model compatible with different image sizes.

        Args:
            imgsz (Tuple[int, int]): The size of the input image.
        """
        if hasattr(self.image_encoder, "set_imgsz"):
            self.image_encoder.set_imgsz(imgsz)
        self.prompt_encoder.input_image_size = imgsz
        self.prompt_encoder.image_embedding_size = [x // 16 for x in imgsz]  # fixed ViT patch size of 16
        self.image_encoder.img_size = imgsz[0]


class SAM2Model(torch.nn.Module):
    """
    SAM2Model class for Segment Anything Model 2 with memory-based video object segmentation capabilities.

    This class extends the functionality of SAM to handle video sequences, incorporating memory mechanisms
    for temporal consistency and efficient tracking of objects across frames.

    Attributes:
        mask_threshold (float): Threshold value for mask prediction.
        image_encoder (ImageEncoderViT): Visual encoder for extracting image features.
        memory_attention (nn.Module): Module for attending to memory features.
        memory_encoder (nn.Module): Encoder for generating memory representations.
        num_maskmem (int): Number of accessible memory frames.
        image_size (int): Size of input images.
        backbone_stride (int): Stride of the backbone network output.
        sam_prompt_embed_dim (int): Dimension of SAM prompt embeddings.
        sam_image_embedding_size (int): Size of SAM image embeddings.
        sam_prompt_encoder (PromptEncoder): Encoder for processing input prompts.
        sam_mask_decoder (SAM2MaskDecoder): Decoder for generating object masks.
        obj_ptr_proj (nn.Module): Projection layer for object pointers.
        obj_ptr_tpos_proj (nn.Module): Projection for temporal positional encoding in object pointers.

    Methods:
        forward_image: Processes image batch through encoder to extract multi-level features.
        track_step: Performs a single tracking step, updating object masks and memory features.

    Examples:
        >>> model = SAM2Model(image_encoder, memory_attention, memory_encoder)
        >>> image_batch = torch.rand(1, 3, 512, 512)
        >>> features = model.forward_image(image_batch)
        >>> track_results = model.track_step(0, True, features, None, None, None, {})
    """

    mask_threshold: float = 0.0

    def __init__(
        self,
        image_encoder,
        memory_attention,
        memory_encoder,
        num_maskmem=7,
        image_size=512,
        backbone_stride=16,
        sigmoid_scale_for_mem_enc=1.0,
        sigmoid_bias_for_mem_enc=0.0,
        binarize_mask_from_pts_for_mem_enc=False,
        use_mask_input_as_output_without_sam=False,
        max_cond_frames_in_attn=-1,
        directly_add_no_mem_embed=False,
        use_high_res_features_in_sam=False,
        multimask_output_in_sam=False,
        multimask_min_pt_num=1,
        multimask_max_pt_num=1,
        multimask_output_for_tracking=False,
        use_multimask_token_for_obj_ptr: bool = False,
        iou_prediction_use_sigmoid=False,
        memory_temporal_stride_for_eval=1,
        non_overlap_masks_for_mem_enc=False,
        use_obj_ptrs_in_encoder=False,
        max_obj_ptrs_in_encoder=16,
        add_tpos_enc_to_obj_ptrs=True,
        proj_tpos_enc_in_obj_ptrs=False,
        use_signed_tpos_enc_to_obj_ptrs=False,
        only_obj_ptrs_in_the_past_for_eval=False,
        pred_obj_scores: bool = False,
        pred_obj_scores_mlp: bool = False,
        fixed_no_obj_ptr: bool = False,
        soft_no_obj_ptr: bool = False,
        use_mlp_for_obj_ptr_proj: bool = False,
        no_obj_embed_spatial: bool = False,
        sam_mask_decoder_extra_args=None,
        compile_image_encoder: bool = False,
    ):
        """
        Initialize the SAM2Model for video object segmentation with memory-based tracking.

        Args:
            image_encoder (nn.Module): Visual encoder for extracting image features.
            memory_attention (nn.Module): Module for attending to memory features.
            memory_encoder (nn.Module): Encoder for generating memory representations.
            num_maskmem (int): Number of accessible memory frames. Default is 7 (1 input frame + 6 previous frames).
            image_size (int): Size of input images.
            backbone_stride (int): Stride of the image backbone output.
            sigmoid_scale_for_mem_enc (float): Scale factor for mask sigmoid probability.
            sigmoid_bias_for_mem_enc (float): Bias factor for mask sigmoid probability.
            binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames
                with clicks during evaluation.
            use_mask_input_as_output_without_sam (bool): Whether to directly output the input mask without using SAM
                prompt encoder and mask decoder on frames with mask input.
            max_cond_frames_in_attn (int): Maximum number of conditioning frames to participate in memory attention.
                -1 means no limit.
            directly_add_no_mem_embed (bool): Whether to directly add no-memory embedding to image feature on the
                first frame.
            use_high_res_features_in_sam (bool): Whether to use high-resolution feature maps in the SAM mask decoder.
            multimask_output_in_sam (bool): Whether to output multiple (3) masks for the first click on initial
                conditioning frames.
            multimask_min_pt_num (int): Minimum number of clicks to use multimask output in SAM.
            multimask_max_pt_num (int): Maximum number of clicks to use multimask output in SAM.
            multimask_output_for_tracking (bool): Whether to use multimask output for tracking.
            use_multimask_token_for_obj_ptr (bool): Whether to use multimask tokens for object pointers.
            iou_prediction_use_sigmoid (bool): Whether to use sigmoid to restrict IoU prediction to [0-1].
            memory_temporal_stride_for_eval (int): Memory bank's temporal stride during evaluation.
            non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in
                memory encoder during evaluation.
            use_obj_ptrs_in_encoder (bool): Whether to cross-attend to object pointers from other frames in the encoder.
            max_obj_ptrs_in_encoder (int): Maximum number of object pointers from other frames in encoder
                cross-attention.
            add_tpos_enc_to_obj_ptrs (bool): Whether to add temporal positional encoding to object pointers in
                the encoder.
            proj_tpos_enc_in_obj_ptrs (bool): Whether to add an extra linear projection layer for temporal positional
                encoding in object pointers.
            use_signed_tpos_enc_to_obj_ptrs (bool): Whether to use signed distance (instead of unsigned absolute distance)
                in the temporal positional encoding in the object pointers, only relevant when both
                `use_obj_ptrs_in_encoder=True` and `add_tpos_enc_to_obj_ptrs=True`.
            only_obj_ptrs_in_the_past_for_eval (bool): Whether to only attend to object pointers in the past
                during evaluation.
            pred_obj_scores (bool): Whether to predict if there is an object in the frame.
            pred_obj_scores_mlp (bool): Whether to use an MLP to predict object scores.
            fixed_no_obj_ptr (bool): Whether to have a fixed no-object pointer when there is no object present.
            soft_no_obj_ptr (bool): Whether to mix in no-object pointer softly for easier recovery and error mitigation.
            use_mlp_for_obj_ptr_proj (bool): Whether to use MLP for object pointer projection.
            no_obj_embed_spatial (bool): Whether to add a no-object embedding to spatial memory frames.
            sam_mask_decoder_extra_args (Dict | None): Extra arguments for constructing the SAM mask decoder.
            compile_image_encoder (bool): Whether to compile the image encoder for faster inference.

        Examples:
            >>> image_encoder = ImageEncoderViT(...)
            >>> memory_attention = SAM2TwoWayTransformer(...)
            >>> memory_encoder = nn.Sequential(...)
            >>> model = SAM2Model(image_encoder, memory_attention, memory_encoder)
            >>> image_batch = torch.rand(1, 3, 512, 512)
            >>> features = model.forward_image(image_batch)
            >>> track_results = model.track_step(0, True, features, None, None, None, {})
        """
        super().__init__()

        # Part 1: the image backbone
        self.image_encoder = image_encoder
        # Use level 0, 1, 2 for high-res setting, or just level 2 for the default setting
        self.use_high_res_features_in_sam = use_high_res_features_in_sam
        self.num_feature_levels = 3 if use_high_res_features_in_sam else 1
        self.use_obj_ptrs_in_encoder = use_obj_ptrs_in_encoder
        self.max_obj_ptrs_in_encoder = max_obj_ptrs_in_encoder
        if use_obj_ptrs_in_encoder:
            # A conv layer to downsample the mask prompt to stride 4 (the same stride as
            # low-res SAM mask logits) and to change its scales from 0~1 to SAM logit scale,
            # so that it can be fed into the SAM mask decoder to generate a pointer.
            self.mask_downsample = torch.nn.Conv2d(1, 1, kernel_size=4, stride=4)
        self.add_tpos_enc_to_obj_ptrs = add_tpos_enc_to_obj_ptrs
        if proj_tpos_enc_in_obj_ptrs:
            assert add_tpos_enc_to_obj_ptrs  # these options need to be used together
        self.proj_tpos_enc_in_obj_ptrs = proj_tpos_enc_in_obj_ptrs
        self.use_signed_tpos_enc_to_obj_ptrs = use_signed_tpos_enc_to_obj_ptrs
        self.only_obj_ptrs_in_the_past_for_eval = only_obj_ptrs_in_the_past_for_eval

        # Part 2: memory attention to condition current frame's visual features
        # with memories (and object pointers) from past frames
        self.memory_attention = memory_attention
        self.hidden_dim = memory_attention.d_model

        # Part 3: memory encoder for the previous frame's outputs
        self.memory_encoder = memory_encoder
        self.mem_dim = self.hidden_dim
        if hasattr(self.memory_encoder, "out_proj") and hasattr(self.memory_encoder.out_proj, "weight"):
            # if there is compression of memories along the channel dim
            self.mem_dim = self.memory_encoder.out_proj.weight.shape[0]
        self.num_maskmem = num_maskmem  # Number of memories accessible
        # Temporal encoding of the memories
        self.maskmem_tpos_enc = torch.nn.Parameter(torch.zeros(num_maskmem, 1, 1, self.mem_dim))
        trunc_normal_(self.maskmem_tpos_enc, std=0.02)
        # a single token to indicate no memory embedding from previous frames
        self.no_mem_embed = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim))
        self.no_mem_pos_enc = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim))
        trunc_normal_(self.no_mem_embed, std=0.02)
        trunc_normal_(self.no_mem_pos_enc, std=0.02)
        self.directly_add_no_mem_embed = directly_add_no_mem_embed
        # Apply sigmoid to the output raw mask logits (to turn them from range (-inf, +inf)
        # to range (0, 1)) before feeding them into the memory encoder
        self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc
        self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc
        self.binarize_mask_from_pts_for_mem_enc = binarize_mask_from_pts_for_mem_enc
        self.non_overlap_masks_for_mem_enc = non_overlap_masks_for_mem_enc
        self.memory_temporal_stride_for_eval = memory_temporal_stride_for_eval
        # On frames with mask input, whether to directly output the input mask without
        # using a SAM prompt encoder + mask decoder
        self.use_mask_input_as_output_without_sam = use_mask_input_as_output_without_sam
        self.multimask_output_in_sam = multimask_output_in_sam
        self.multimask_min_pt_num = multimask_min_pt_num
        self.multimask_max_pt_num = multimask_max_pt_num
        self.multimask_output_for_tracking = multimask_output_for_tracking
        self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr
        self.iou_prediction_use_sigmoid = iou_prediction_use_sigmoid

        # Part 4: SAM-style prompt encoder (for both mask and point inputs)
        # and SAM-style mask decoder for the final mask output
        self.image_size = image_size
        self.backbone_stride = backbone_stride
        self.sam_mask_decoder_extra_args = sam_mask_decoder_extra_args
        self.pred_obj_scores = pred_obj_scores
        self.pred_obj_scores_mlp = pred_obj_scores_mlp
        self.fixed_no_obj_ptr = fixed_no_obj_ptr
        self.soft_no_obj_ptr = soft_no_obj_ptr
        if self.fixed_no_obj_ptr:
            assert self.pred_obj_scores
            assert self.use_obj_ptrs_in_encoder
        if self.pred_obj_scores and self.use_obj_ptrs_in_encoder:
            self.no_obj_ptr = torch.nn.Parameter(torch.zeros(1, self.hidden_dim))
            trunc_normal_(self.no_obj_ptr, std=0.02)
        self.use_mlp_for_obj_ptr_proj = use_mlp_for_obj_ptr_proj
        self.no_obj_embed_spatial = None
        if no_obj_embed_spatial:
            self.no_obj_embed_spatial = torch.nn.Parameter(torch.zeros(1, self.mem_dim))
            trunc_normal_(self.no_obj_embed_spatial, std=0.02)

        self._build_sam_heads()
        self.max_cond_frames_in_attn = max_cond_frames_in_attn

        # Model compilation
        if compile_image_encoder:
            # Compile the forward function (not the full module) to allow loading checkpoints.
            LOGGER.info("Image encoder compilation is enabled. First forward pass will be slow.")
            self.image_encoder.forward = torch.compile(
                self.image_encoder.forward,
                mode="max-autotune",
                fullgraph=True,
                dynamic=False,
            )

    @property
    def device(self):
        """Return the device on which the model's parameters are stored."""
        return next(self.parameters()).device

    def forward(self, *args, **kwargs):
        """Process image and prompt inputs to generate object masks and scores in video sequences."""
        raise NotImplementedError(
            "Please use the corresponding methods in SAM2VideoPredictor for inference. "
            "See notebooks/video_predictor_example.ipynb for an example."
        )

    def _build_sam_heads(self):
        """Build SAM-style prompt encoder and mask decoder for image segmentation tasks."""
        self.sam_prompt_embed_dim = self.hidden_dim
        self.sam_image_embedding_size = self.image_size // self.backbone_stride

        # Build PromptEncoder and MaskDecoder from SAM (hyperparameters like `mask_in_chans=16` follow SAM)
        self.sam_prompt_encoder = PromptEncoder(
            embed_dim=self.sam_prompt_embed_dim,
            image_embedding_size=(self.sam_image_embedding_size, self.sam_image_embedding_size),
            input_image_size=(self.image_size, self.image_size),
            mask_in_chans=16,
        )
        self.sam_mask_decoder = SAM2MaskDecoder(
            num_multimask_outputs=3,
            transformer=SAM2TwoWayTransformer(
                depth=2,
                embedding_dim=self.sam_prompt_embed_dim,
                mlp_dim=2048,
                num_heads=8,
            ),
            transformer_dim=self.sam_prompt_embed_dim,
            iou_head_depth=3,
            iou_head_hidden_dim=256,
            use_high_res_features=self.use_high_res_features_in_sam,
            iou_prediction_use_sigmoid=self.iou_prediction_use_sigmoid,
            pred_obj_scores=self.pred_obj_scores,
            pred_obj_scores_mlp=self.pred_obj_scores_mlp,
            use_multimask_token_for_obj_ptr=self.use_multimask_token_for_obj_ptr,
            **(self.sam_mask_decoder_extra_args or {}),
        )
        if self.use_obj_ptrs_in_encoder:
            # a linear projection on SAM output tokens to turn them into object pointers
            self.obj_ptr_proj = torch.nn.Linear(self.hidden_dim, self.hidden_dim)
            if self.use_mlp_for_obj_ptr_proj:
                self.obj_ptr_proj = MLP(self.hidden_dim, self.hidden_dim, self.hidden_dim, 3)
        else:
            self.obj_ptr_proj = torch.nn.Identity()
        if self.proj_tpos_enc_in_obj_ptrs:
            # a linear projection on temporal positional encoding in object pointers to
            # avoid potential interference with spatial positional encoding
            self.obj_ptr_tpos_proj = torch.nn.Linear(self.hidden_dim, self.mem_dim)
        else:
            self.obj_ptr_tpos_proj = torch.nn.Identity()

    def _forward_sam_heads(
        self,
        backbone_features,
        point_inputs=None,
        mask_inputs=None,
        high_res_features=None,
        multimask_output=False,
    ):
        """
        Forward pass through SAM prompt encoders and mask heads.

        This method processes image features and optional point/mask inputs to generate object masks and scores.

        Args:
            backbone_features (torch.Tensor): Image features with shape (B, C, H, W).
            point_inputs (Dict[str, torch.Tensor] | None): Dictionary containing point prompts.
                'point_coords': Tensor of shape (B, P, 2) with float32 dtype, containing absolute
                    pixel-unit coordinates in (x, y) format for P input points.
                'point_labels': Tensor of shape (B, P) with int32 dtype, where 1 means positive clicks,
                    0 means negative clicks, and -1 means padding.
            mask_inputs (torch.Tensor | None): Mask of shape (B, 1, H*16, W*16), float or bool, with the
                same spatial size as the image.
            high_res_features (List[torch.Tensor] | None): List of two feature maps with shapes
                (B, C, 4*H, 4*W) and (B, C, 2*H, 2*W) respectively, used as high-resolution feature maps
                for SAM decoder.
            multimask_output (bool): If True, output 3 candidate masks and their IoU estimates; if False,
                output only 1 mask and its IoU estimate.

        Returns:
            (Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]):
                low_res_multimasks: Tensor of shape (B, M, H*4, W*4) with SAM output mask logits.
                high_res_multimasks: Tensor of shape (B, M, H*16, W*16) with upsampled mask logits.
                ious: Tensor of shape (B, M) with estimated IoU for each output mask.
                low_res_masks: Tensor of shape (B, 1, H*4, W*4) with the best low-resolution mask.
                high_res_masks: Tensor of shape (B, 1, H*16, W*16) with the best high-resolution mask.
                obj_ptr: Tensor of shape (B, C) with object pointer vector for the output mask.
                object_score_logits: Tensor of shape (B) with object score logits.
                Where M is 3 if multimask_output=True, and 1 if multimask_output=False.

        Examples:
            >>> backbone_features = torch.rand(1, 256, 32, 32)
            >>> point_inputs = {"point_coords": torch.rand(1, 2, 2), "point_labels": torch.tensor([[1, 0]])}
            >>> mask_inputs = torch.rand(1, 1, 512, 512)
            >>> results = model._forward_sam_heads(backbone_features, point_inputs, mask_inputs)
            >>> (
            ...     low_res_multimasks,
            ...     high_res_multimasks,
            ...     ious,
            ...     low_res_masks,
            ...     high_res_masks,
            ...     obj_ptr,
            ...     object_score_logits,
            ... ) = results
        """
        B = backbone_features.size(0)
        device = backbone_features.device
        assert backbone_features.size(1) == self.sam_prompt_embed_dim
        assert backbone_features.size(2) == self.sam_image_embedding_size
        assert backbone_features.size(3) == self.sam_image_embedding_size

        # a) Handle point prompts
        if point_inputs is not None:
            sam_point_coords = point_inputs["point_coords"]
            sam_point_labels = point_inputs["point_labels"]
            assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B
        else:
            # If no points are provided, pad with an empty point (with label -1)
            sam_point_coords = torch.zeros(B, 1, 2, device=device)
            sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device)

        # b) Handle mask prompts
        if mask_inputs is not None:
            # If mask_inputs is provided, downsize it into low-res mask input if needed
            # and feed it as a dense mask prompt into the SAM mask encoder
            assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (B, 1)
            if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size:
                sam_mask_prompt = F.interpolate(
                    mask_inputs.float(),
                    size=self.sam_prompt_encoder.mask_input_size,
                    align_corners=False,
                    mode="bilinear",
                    antialias=True,  # use antialias for downsampling
                )
            else:
                sam_mask_prompt = mask_inputs
        else:
            # Otherwise, simply feed None (and SAM's prompt encoder will add a learned
            # `no_mask_embed` to indicate no mask input in this case).
            sam_mask_prompt = None

        sparse_embeddings, dense_embeddings = self.sam_prompt_encoder(
            points=(sam_point_coords, sam_point_labels),
            boxes=None,
            masks=sam_mask_prompt,
        )
        low_res_multimasks, ious, sam_output_tokens, object_score_logits = self.sam_mask_decoder(
            image_embeddings=backbone_features,
            image_pe=self.sam_prompt_encoder.get_dense_pe(),
            sparse_prompt_embeddings=sparse_embeddings,
            dense_prompt_embeddings=dense_embeddings,
            multimask_output=multimask_output,
            repeat_image=False,  # the image is already batched
            high_res_features=high_res_features,
        )
        if self.pred_obj_scores:
            is_obj_appearing = object_score_logits > 0
            # Spatial memory mask is a *hard* choice between obj and no-obj,
            # consistent with the actual mask prediction
            low_res_multimasks = torch.where(is_obj_appearing[:, None, None], low_res_multimasks, NO_OBJ_SCORE)

        # convert masks from possibly bfloat16 (or float16) to float32
        # (older PyTorch versions before 2.1 don't support `interpolate` on bf16)
        low_res_multimasks = low_res_multimasks.float()
        high_res_multimasks = F.interpolate(
            low_res_multimasks,
            size=(self.image_size, self.image_size),
            mode="bilinear",
            align_corners=False,
        )
        sam_output_token = sam_output_tokens[:, 0]
        if multimask_output:
            # take the best mask prediction (with the highest IoU estimation)
            best_iou_inds = torch.argmax(ious, dim=-1)
            batch_inds = torch.arange(B, device=device)
            low_res_masks = low_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
            high_res_masks = high_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
            if sam_output_tokens.size(1) > 1:  # [B, num_tokens, C] shape
                sam_output_token = sam_output_tokens[batch_inds, best_iou_inds]
        else:
            low_res_masks, high_res_masks = low_res_multimasks, high_res_multimasks

        # Extract object pointer from the SAM output token (with occlusion handling)
        obj_ptr = self.obj_ptr_proj(sam_output_token)
        if self.pred_obj_scores:
            # Allow *soft* no-obj pointer, unlike for masks
            if self.soft_no_obj_ptr:
                lambda_is_obj_appearing = object_score_logits.sigmoid()
            else:
                lambda_is_obj_appearing = is_obj_appearing.float()

            if self.fixed_no_obj_ptr:
                obj_ptr = lambda_is_obj_appearing * obj_ptr
            obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr

        return (
            low_res_multimasks,
            high_res_multimasks,
            ious,
            low_res_masks,
            high_res_masks,
            obj_ptr,
            object_score_logits,
        )

    def _use_mask_as_output(self, backbone_features, high_res_features, mask_inputs):
        """Process mask inputs directly as output, bypassing SAM encoder/decoder."""
        # Use -10/+10 as logits for neg/pos pixels (very close to 0/1 in prob after sigmoid)
        out_scale, out_bias = 20.0, -10.0  # sigmoid(-10.0)=4.5398e-05
        mask_inputs_float = mask_inputs.float()
        high_res_masks = mask_inputs_float * out_scale + out_bias
        low_res_masks = F.interpolate(
            high_res_masks,
            size=(high_res_masks.size(-2) // 4, high_res_masks.size(-1) // 4),
            align_corners=False,
            mode="bilinear",
            antialias=True,  # use antialias for downsampling
        )
        # a dummy IoU prediction of all 1's under mask input
        ious = mask_inputs.new_ones(mask_inputs.size(0), 1).float()
        if not self.use_obj_ptrs_in_encoder:
            # all zeros as a dummy object pointer (of shape [B, C])
            obj_ptr = torch.zeros(mask_inputs.size(0), self.hidden_dim, device=mask_inputs.device)
        else:
            # produce an object pointer using the SAM decoder from the mask input
            _, _, _, _, _, obj_ptr, _ = self._forward_sam_heads(
                backbone_features=backbone_features,
                mask_inputs=self.mask_downsample(mask_inputs_float),
                high_res_features=high_res_features,
            )
        # In this method, we are treating mask_input as output, e.g. in demo usage where the
        # user draws the mask; whether the object appears is predicted from the mask itself.
        is_obj_appearing = torch.any(mask_inputs.flatten(1).float() > 0.0, dim=1)
        is_obj_appearing = is_obj_appearing[..., None]
        lambda_is_obj_appearing = is_obj_appearing.float()
        object_score_logits = out_scale * lambda_is_obj_appearing + out_bias
        if self.pred_obj_scores:
            if self.fixed_no_obj_ptr:
                obj_ptr = lambda_is_obj_appearing * obj_ptr
            obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr

        return (
            low_res_masks,
            high_res_masks,
            ious,
            low_res_masks,
            high_res_masks,
            obj_ptr,
            object_score_logits,
        )

    def forward_image(self, img_batch: torch.Tensor):
        """Process image batch through encoder to extract multi-level features for SAM model."""
        backbone_out = self.image_encoder(img_batch)
        if self.use_high_res_features_in_sam:
            # precompute projected level 0 and level 1 features in SAM decoder
            # to avoid running it again on every SAM click
            backbone_out["backbone_fpn"][0] = self.sam_mask_decoder.conv_s0(backbone_out["backbone_fpn"][0])
            backbone_out["backbone_fpn"][1] = self.sam_mask_decoder.conv_s1(backbone_out["backbone_fpn"][1])
        return backbone_out

    def _prepare_backbone_features(self, backbone_out):
        """Prepare and flatten visual features from the image backbone output for further processing."""
        assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
        assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels

        feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels :]
        vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels :]

        feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds]
        # flatten NxCxHxW to HWxNxC
        vision_feats = [x.flatten(2).permute(2, 0, 1) for x in feature_maps]
        vision_pos_embeds = [x.flatten(2).permute(2, 0, 1) for x in vision_pos_embeds]

        return backbone_out, vision_feats, vision_pos_embeds, feat_sizes

    def _prepare_memory_conditioned_features(
        self,
        frame_idx,
        is_init_cond_frame,
        current_vision_feats,
        current_vision_pos_embeds,
        feat_sizes,
        output_dict,
        num_frames,
        track_in_reverse=False,  # tracking in reverse time order (for demo usage)
    ):
        """Prepare memory-conditioned features by fusing current frame's visual features with previous memories."""
        B = current_vision_feats[-1].size(1)  # batch size on this frame
        C = self.hidden_dim
        H, W = feat_sizes[-1]  # top-level (lowest-resolution) feature size
        device = current_vision_feats[-1].device
        # The case of `self.num_maskmem == 0` below is primarily used for reproducing SAM on images;
        # in this case, we skip the fusion with any memory.
        if self.num_maskmem == 0:
            return current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W)
        num_obj_ptr_tokens = 0
        tpos_sign_mul = -1 if track_in_reverse else 1
        # Step 1: condition the visual features of the current frame on previous memories
        if not is_init_cond_frame:
            # Retrieve the memories encoded with the maskmem backbone
            to_cat_memory, to_cat_memory_pos_embed = [], []
            # Add conditioning frame's output first (all cond frames have t_pos=0 when
            # getting the temporal positional embedding below)
            assert len(output_dict["cond_frame_outputs"]) > 0
            # Select a maximum number of temporally closest cond frames for cross-attention
            cond_outputs = output_dict["cond_frame_outputs"]
            selected_cond_outputs, unselected_cond_outputs = select_closest_cond_frames(
                frame_idx, cond_outputs, self.max_cond_frames_in_attn
            )
            t_pos_and_prevs = [(0, out) for out in selected_cond_outputs.values()]
            # Add the last (self.num_maskmem - 1) frames before the current frame as
            # non-conditioning memory; the earliest one has t_pos=1 and the latest one has
            # t_pos=self.num_maskmem-1. Memory frames may also be taken non-consecutively
            # (with stride>1), in which case we take (self.num_maskmem - 2) frames among
            # every stride-th frame plus the last frame.
            stride = 1 if self.training else self.memory_temporal_stride_for_eval
            for t_pos in range(1, self.num_maskmem):
                t_rel = self.num_maskmem - t_pos  # how many frames before the current frame
                if t_rel == 1:
                    # for t_rel == 1, we take the last frame (regardless of the stride)
                    prev_frame_idx = frame_idx + t_rel if track_in_reverse else frame_idx - t_rel
                elif not track_in_reverse:
                    # first find the nearest frame among every stride-th frame before this frame
                    # (for stride=1, this is frame_idx - 2), then seek further among them
                    prev_frame_idx = ((frame_idx - 2) // stride) * stride
                    prev_frame_idx = prev_frame_idx - (t_rel - 2) * stride
                else:
                    # first find the nearest frame among every stride-th frame after this frame
                    # (for stride=1, this is frame_idx + 2), then seek further among them
                    prev_frame_idx = -(-(frame_idx + 2) // stride) * stride
                    prev_frame_idx = prev_frame_idx + (t_rel - 2) * stride
                out = output_dict["non_cond_frame_outputs"].get(prev_frame_idx, None)
                if out is None:
                    # If an unselected conditioning frame is among the last (self.num_maskmem - 1)
                    # frames, we still attend to it as if it were a non-conditioning frame.
                    out = unselected_cond_outputs.get(prev_frame_idx, None)
                t_pos_and_prevs.append((t_pos, out))

            for t_pos, prev in t_pos_and_prevs:
                if prev is None:
                    continue  # skip padding frames
                # "maskmem_features" might have been offloaded to CPU in demo use cases,
                # so we load it back to the inference device (no-op if already there).
                feats = prev["maskmem_features"].to(device=device, non_blocking=True)
                to_cat_memory.append(feats.flatten(2).permute(2, 0, 1))
                # Spatial positional encoding (it might have been offloaded to CPU in eval)
                maskmem_enc = prev["maskmem_pos_enc"][-1].to(device=device)
                maskmem_enc = maskmem_enc.flatten(2).permute(2, 0, 1)
                # Temporal positional encoding
                maskmem_enc = maskmem_enc + self.maskmem_tpos_enc[self.num_maskmem - t_pos - 1]
                to_cat_memory_pos_embed.append(maskmem_enc)

            # Construct the list of past object pointers
            if self.use_obj_ptrs_in_encoder:
                max_obj_ptrs_in_encoder = min(num_frames, self.max_obj_ptrs_in_encoder)
                # First add those object pointers from selected conditioning frames
                # (optionally, only include object pointers in the past during evaluation)
                if not self.training and self.only_obj_ptrs_in_the_past_for_eval:
                    ptr_cond_outputs = {
                        t: out
                        for t, out in selected_cond_outputs.items()
                        if (t >= frame_idx if track_in_reverse else t <= frame_idx)
                    }
                else:
                    ptr_cond_outputs = selected_cond_outputs
                pos_and_ptrs = [
                    # Temporal pos encoding contains how far away each pointer is from the current frame
                    (
                        (frame_idx - t) * tpos_sign_mul if self.use_signed_tpos_enc_to_obj_ptrs else abs(frame_idx - t),
                        out["obj_ptr"],
                    )
                    for t, out in ptr_cond_outputs.items()
                ]
                # Add up to (max_obj_ptrs_in_encoder - 1) non-conditioning frames before the current frame
                for t_diff in range(1, max_obj_ptrs_in_encoder):
                    t = frame_idx + t_diff if track_in_reverse else frame_idx - t_diff
                    if t < 0 or (num_frames is not None and t >= num_frames):
                        break
                    out = output_dict["non_cond_frame_outputs"].get(t, unselected_cond_outputs.get(t, None))
                    if out is not None:
                        pos_and_ptrs.append((t_diff, out["obj_ptr"]))
                # If we have at least one object pointer, add them to cross-attention
                if pos_and_ptrs:
                    pos_list, ptrs_list = zip(*pos_and_ptrs)
                    # stack object pointers along dim=0 into [ptr_seq_len, B, C] shape
                    obj_ptrs = torch.stack(ptrs_list, dim=0)
                    # a temporal positional embedding based on how far each object pointer is
                    # from the current frame (sine embedding normalized by the max pointer num)
                    if self.add_tpos_enc_to_obj_ptrs:
                        t_diff_max = max_obj_ptrs_in_encoder - 1
                        tpos_dim = C if self.proj_tpos_enc_in_obj_ptrs else self.mem_dim
                        obj_pos = torch.tensor(pos_list, device=device)
                        obj_pos = get_1d_sine_pe(obj_pos / t_diff_max, dim=tpos_dim)
                        obj_pos = self.obj_ptr_tpos_proj(obj_pos)
                        obj_pos = obj_pos.unsqueeze(1).expand(-1, B, self.mem_dim)
                    else:
                        obj_pos = obj_ptrs.new_zeros(len(pos_list), B, self.mem_dim)
                    if self.mem_dim < C:
                        # split a pointer into (C // self.mem_dim) tokens for self.mem_dim < C
                        obj_ptrs = obj_ptrs.reshape(-1, B, C // self.mem_dim, self.mem_dim)
                        obj_ptrs = obj_ptrs.permute(0, 2, 1, 3).flatten(0, 1)
                        obj_pos = obj_pos.repeat_interleave(C // self.mem_dim, dim=0)
                    to_cat_memory.append(obj_ptrs)
                    to_cat_memory_pos_embed.append(obj_pos)
                    num_obj_ptr_tokens = obj_ptrs.shape[0]
                else:
                    num_obj_ptr_tokens = 0
        else:
            # for initial conditioning frames, encode them without using any previous memory
            if self.directly_add_no_mem_embed:
                # directly add no-mem embedding (instead of using the transformer encoder)
                pix_feat_with_mem = current_vision_feats[-1] + self.no_mem_embed
                return pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)

            # Use a dummy token on the first frame (to avoid an empty memory input to the transformer encoder)
            to_cat_memory = [self.no_mem_embed.expand(1, B, self.mem_dim)]
            to_cat_memory_pos_embed = [self.no_mem_pos_enc.expand(1, B, self.mem_dim)]

        # Step 2: concatenate the memories and forward through the transformer encoder
        memory = torch.cat(to_cat_memory, dim=0)
        memory_pos_embed = torch.cat(to_cat_memory_pos_embed, dim=0)

        pix_feat_with_mem = self.memory_attention(
            curr=current_vision_feats,
            curr_pos=current_vision_pos_embeds,
            memory=memory,
            memory_pos=memory_pos_embed,
            num_obj_ptr_tokens=num_obj_ptr_tokens,
        )
        # reshape the output (HW)BC => BCHW
        return pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)

    def _encode_new_memory(
        self,
        current_vision_feats,
        feat_sizes,
        pred_masks_high_res,
        object_score_logits,
        is_mask_from_pts,
    ):
        """Encode frame features and masks into a new memory representation for video segmentation."""
        B = current_vision_feats[-1].size(1)  # batch size on this frame
        C = self.hidden_dim
        H, W = feat_sizes[-1]  # top-level (lowest-resolution) feature size
        # top-level feature, (HW)BC => BCHW
        pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W)
        if self.non_overlap_masks_for_mem_enc and not self.training:
            # optionally, apply non-overlapping constraints to the masks (it's applied in the
            # batch dimension and should only be used during eval, where all the objects
            # come from the same video under batch size 1)
            pred_masks_high_res = self._apply_non_overlapping_constraints(pred_masks_high_res)
        # scale the raw mask logits with a temperature before applying sigmoid
        binarize = self.binarize_mask_from_pts_for_mem_enc and is_mask_from_pts
        if binarize and not self.training:
            mask_for_mem = (pred_masks_high_res > 0).float()
        else:
            # apply sigmoid on the raw mask logits to turn them into range (0, 1)
            mask_for_mem = torch.sigmoid(pred_masks_high_res)
        # apply scale and bias terms to the sigmoid probabilities
        if self.sigmoid_scale_for_mem_enc != 1.0:
            mask_for_mem = mask_for_mem * self.sigmoid_scale_for_mem_enc
        if self.sigmoid_bias_for_mem_enc != 0.0:
            mask_for_mem = mask_for_mem + self.sigmoid_bias_for_mem_enc
        maskmem_out = self.memory_encoder(pix_feat, mask_for_mem, skip_mask_sigmoid=True)  # sigmoid already applied
        maskmem_features = maskmem_out["vision_features"]
        maskmem_pos_enc = maskmem_out["vision_pos_enc"]
        # add a no-object embedding to the spatial memory to indicate that the frame
        # is predicted to be occluded (i.e. no object is appearing in the frame)
        if self.no_obj_embed_spatial is not None:
            is_obj_appearing = (object_score_logits > 0).float()
            maskmem_features += (1 - is_obj_appearing[..., None, None]) * self.no_obj_embed_spatial[
                ..., None, None
            ].expand(*maskmem_features.shape)

        return maskmem_features, maskmem_pos_enc

    def _track_step(
        self,
        frame_idx,
        is_init_cond_frame,
        current_vision_feats,
        current_vision_pos_embeds,
        feat_sizes,
        point_inputs,
        mask_inputs,
        output_dict,
        num_frames,
        track_in_reverse,
        prev_sam_mask_logits,
    ):
        """Perform a single tracking step, updating object masks and memory features based on current frame inputs."""
        current_out = {"point_inputs": point_inputs, "mask_inputs": mask_inputs}
        # High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW
        if len(current_vision_feats) > 1:
            high_res_features = [
                x.permute(1, 2, 0).view(x.size(1), x.size(2), *s)
                for x, s in zip(current_vision_feats[:-1], feat_sizes[:-1])
            ]
        else:
            high_res_features = None
        if mask_inputs is not None and self.use_mask_input_as_output_without_sam:
            # When use_mask_input_as_output_without_sam=True, we directly output the mask input
            # (see it as a GT mask) without using a SAM prompt encoder + mask decoder.
            pix_feat = current_vision_feats[-1].permute(1, 2, 0)
            pix_feat = pix_feat.view(-1, self.hidden_dim, *feat_sizes[-1])
            sam_outputs = self._use_mask_as_output(pix_feat, high_res_features, mask_inputs)
        else:
            # fuse the visual feature with previous memory features in the memory bank
            pix_feat = self._prepare_memory_conditioned_features(
                frame_idx=frame_idx,
                is_init_cond_frame=is_init_cond_frame,
                current_vision_feats=current_vision_feats[-1:],
                current_vision_pos_embeds=current_vision_pos_embeds[-1:],
                feat_sizes=feat_sizes[-1:],
                output_dict=output_dict,
                num_frames=num_frames,
                track_in_reverse=track_in_reverse,
            )
            # apply SAM-style segmentation head; here we might feed previously predicted
            # low-res SAM mask logits into the SAM mask decoder, e.g. in demo usage where
            # such logits come from earlier interaction instead of correction sampling
            # (in this case, any `mask_inputs` shouldn't reach here, as they are sent
            # to _use_mask_as_output instead)
            if prev_sam_mask_logits is not None:
                assert point_inputs is not None and mask_inputs is None
                mask_inputs = prev_sam_mask_logits
            multimask_output = self._use_multimask(is_init_cond_frame, point_inputs)
            sam_outputs = self._forward_sam_heads(
                backbone_features=pix_feat,
                point_inputs=point_inputs,
                mask_inputs=mask_inputs,
                high_res_features=high_res_features,
                multimask_output=multimask_output,
            )
        return current_out, sam_outputs, high_res_features, pix_feat

    def _encode_memory_in_output(
        self,
        current_vision_feats,
        feat_sizes,
        point_inputs,
        run_mem_encoder,
        high_res_masks,
        object_score_logits,
        current_out,
    ):
        """Run memory encoder on predicted mask to encode it into a new memory feature for future frames."""
        if run_mem_encoder and self.num_maskmem > 0:
            high_res_masks_for_mem_enc = high_res_masks
            maskmem_features, maskmem_pos_enc = self._encode_new_memory(
                current_vision_feats=current_vision_feats,
                feat_sizes=feat_sizes,
                pred_masks_high_res=high_res_masks_for_mem_enc,
                object_score_logits=object_score_logits,
                is_mask_from_pts=point_inputs is not None,
            )
            current_out["maskmem_features"] = maskmem_features
            current_out["maskmem_pos_enc"] = maskmem_pos_enc
        else:
            current_out["maskmem_features"] = None
            current_out["maskmem_pos_enc"] = None

    def track_step(
        self,
        frame_idx,
        is_init_cond_frame,
        current_vision_feats,
        current_vision_pos_embeds,
        feat_sizes,
        point_inputs,
        mask_inputs,
        output_dict,
        num_frames,
        track_in_reverse=False,  # tracking in reverse time order (for demo usage)
        # Whether to run the memory encoder on the predicted masks; sometimes we might want
        # to skip the memory encoder with `run_mem_encoder=False`, e.g. in demo usage we might
        # call `track_step` multiple times per user click and only encode the memory when the
        # user finalizes their clicks (and in ablation settings like SAM training on static
        # images, the memory encoder is not needed at all).
        run_mem_encoder=True,
        # The previously predicted SAM mask logits (which can be fed together with new clicks in demo usage).
        prev_sam_mask_logits=None,
    ):
        """Perform a single tracking step, updating object masks and memory features based on current frame inputs."""
        current_out, sam_outputs, _, _ = self._track_step(
            frame_idx,
            is_init_cond_frame,
            current_vision_feats,
            current_vision_pos_embeds,
            feat_sizes,
            point_inputs,
            mask_inputs,
            output_dict,
            num_frames,
            track_in_reverse,
            prev_sam_mask_logits,
        )
        _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = sam_outputs

        current_out["pred_masks"] = low_res_masks
        current_out["pred_masks_high_res"] = high_res_masks
        current_out["obj_ptr"] = obj_ptr
        if not self.training:
            # Only add this in inference (to avoid unused params in activation checkpointing;
            # it's mainly used in the demo to encode spatial memories with consolidated masks)
            current_out["object_score_logits"] = object_score_logits

        # Run the memory encoder on the predicted mask to encode it into a new memory
        # feature (for use in future frames)
        self._encode_memory_in_output(
            current_vision_feats,
            feat_sizes,
            point_inputs,
            run_mem_encoder,
            high_res_masks,
            object_score_logits,
            current_out,
        )
        return current_out

    def _use_multimask(self, is_init_cond_frame, point_inputs):
        """Determine whether to use multiple mask outputs in the SAM head based on configuration and inputs."""
        num_pts = 0 if point_inputs is None else point_inputs["point_labels"].size(1)
        return (
            self.multimask_output_in_sam
            and (is_init_cond_frame or self.multimask_output_for_tracking)
            and (self.multimask_min_pt_num <= num_pts <= self.multimask_max_pt_num)
        )

    @staticmethod
    def _apply_non_overlapping_constraints(pred_masks):
        """Apply non-overlapping constraints to masks, keeping the highest scoring object per location."""
        batch_size = pred_masks.size(0)
        if batch_size == 1:
            return pred_masks

        device = pred_masks.device
        # "max_obj_inds": object index of the object with the highest score at each location
        max_obj_inds = torch.argmax(pred_masks, dim=0, keepdim=True)
        # "batch_obj_inds": object index of each object slice (along dim 0) in `pred_masks`
        batch_obj_inds = torch.arange(batch_size, device=device)[:, None, None, None]
        keep = max_obj_inds == batch_obj_inds
        # suppress overlapping regions' scores below -10.0 so that the foreground regions
        # don't overlap (here sigmoid(-10.0)=4.5398e-05)
        pred_masks = torch.where(keep, pred_masks, torch.clamp(pred_masks, max=-10.0))
        return pred_masks

    def set_binarize(self, binarize=False):
        """Set binarize for VideoPredictor."""
        self.binarize_mask_from_pts_for_mem_enc = binarize

    def set_imgsz(self, imgsz):
        """Set image size to make model compatible with different image sizes."""
        self.image_size = imgsz[0]
        self.sam_prompt_encoder.input_image_size = imgsz
        self.sam_prompt_encoder.image_embedding_size = [x // 16 for x in imgsz]  # fixed ViT patch size of 16
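
# Standalone sketch (illustrative; mirrors SAM2Model._apply_non_overlapping_constraints above).
# With per-object mask logits stacked along dim 0, only the highest-scoring object may stay
# positive at each location; every other object is clamped down to at most -10.0 there:
#
#     >>> import torch
#     >>> masks = torch.tensor([[[[5.0, -2.0]]], [[[3.0, 4.0]]]])  # 2 objects, logits of shape (2, 1, 1, 2)
#     >>> winner = torch.argmax(masks, dim=0, keepdim=True)
#     >>> keep = winner == torch.arange(2)[:, None, None, None]
#     >>> torch.where(keep, masks, masks.clamp(max=-10.0))[:, 0, 0]
#     tensor([[  5., -10.],
#             [-10.,   4.]])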