"""Transformer modules."""

import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import constant_, xavier_uniform_

from .conv import Conv
from .utils import _get_clones, inverse_sigmoid, multi_scale_deformable_attn_pytorch

__all__ = (
    "TransformerEncoderLayer",
    "TransformerLayer",
    "TransformerBlock",
    "MLPBlock",
    "LayerNorm2d",
    "AIFI",
    "DeformableTransformerDecoder",
    "DeformableTransformerDecoderLayer",
    "MSDeformAttn",
    "MLP",
)


class TransformerEncoderLayer(nn.Module):
    """
    Defines a single layer of the transformer encoder.

    Attributes:
        ma (nn.MultiheadAttention): Multi-head attention module.
        fc1 (nn.Linear): First linear layer in the feedforward network.
        fc2 (nn.Linear): Second linear layer in the feedforward network.
        norm1 (nn.LayerNorm): Layer normalization after attention.
        norm2 (nn.LayerNorm): Layer normalization after feedforward network.
        dropout (nn.Dropout): Dropout layer for the feedforward network.
        dropout1 (nn.Dropout): Dropout layer after attention.
        dropout2 (nn.Dropout): Dropout layer after feedforward network.
        act (nn.Module): Activation function.
        normalize_before (bool): Whether to apply normalization before attention and feedforward.
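
    Examples:
        Minimal usage sketch; the input shape below is assumed for illustration:
        >>> layer = TransformerEncoderLayer(c1=256, cm=1024, num_heads=8)
        >>> src = torch.rand(2, 100, 256)  # [batch, sequence, embedding]
        >>> out = layer(src)  # same shape as the input, [2, 100, 256]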
    """

    def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
        """
        Initialize the TransformerEncoderLayer with specified parameters.

        Args:
            c1 (int): Input dimension.
            cm (int): Hidden dimension in the feedforward network.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout probability.
            act (nn.Module): Activation function.
            normalize_before (bool): Whether to apply normalization before attention and feedforward.
        """
        super().__init__()
        from ...utils.torch_utils import TORCH_1_9

        if not TORCH_1_9:
            raise ModuleNotFoundError(
                "TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True)."
            )
        self.ma = nn.MultiheadAttention(c1, num_heads, dropout=dropout, batch_first=True)
        # Implementation of the feedforward model
        self.fc1 = nn.Linear(c1, cm)
        self.fc2 = nn.Linear(cm, c1)

        self.norm1 = nn.LayerNorm(c1)
        self.norm2 = nn.LayerNorm(c1)
        self.dropout = nn.Dropout(dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.act = act
        self.normalize_before = normalize_before

    @staticmethod
    def with_pos_embed(tensor, pos=None):
        """Add position embeddings to the tensor if provided."""
        return tensor if pos is None else tensor + pos

    def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """
        Perform forward pass with post-normalization.

        Args:
            src (torch.Tensor): Input tensor.
            src_mask (torch.Tensor, optional): Mask for the src sequence.
            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
            pos (torch.Tensor, optional): Positional encoding.

        Returns:
            (torch.Tensor): Output tensor after attention and feedforward.
        """
        q = k = self.with_pos_embed(src, pos)
        src2 = self.ma(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.fc2(self.dropout(self.act(self.fc1(src))))
        src = src + self.dropout2(src2)
        return self.norm2(src)

    def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """
        Perform forward pass with pre-normalization.

        Args:
            src (torch.Tensor): Input tensor.
            src_mask (torch.Tensor, optional): Mask for the src sequence.
            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
            pos (torch.Tensor, optional): Positional encoding.

        Returns:
            (torch.Tensor): Output tensor after attention and feedforward.
        """
        src2 = self.norm1(src)
        q = k = self.with_pos_embed(src2, pos)
        src2 = self.ma(q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src2 = self.norm2(src)
        src2 = self.fc2(self.dropout(self.act(self.fc1(src2))))
        return src + self.dropout2(src2)

    def forward(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """
        Forward propagates the input through the encoder module.

        Args:
            src (torch.Tensor): Input tensor.
            src_mask (torch.Tensor, optional): Mask for the src sequence.
            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
            pos (torch.Tensor, optional): Positional encoding.

        Returns:
            (torch.Tensor): Output tensor after transformer encoder layer.
        """
        if self.normalize_before:
            return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
        return self.forward_post(src, src_mask, src_key_padding_mask, pos)


class AIFI(TransformerEncoderLayer):
    """
    Defines the AIFI transformer layer.

    This class extends TransformerEncoderLayer to work with 2D data by adding positional embeddings.
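
    Examples:
        Illustrative call on a CNN feature map; c1 must be divisible by 4 for the sin-cos embedding,
        and the shape below is assumed for the sketch:
        >>> m = AIFI(c1=256)
        >>> out = m(torch.rand(1, 256, 20, 20))  # output keeps shape [1, 256, 20, 20]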
    """

    def __init__(self, c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False):
        """
        Initialize the AIFI instance with specified parameters.

        Args:
            c1 (int): Input dimension.
            cm (int): Hidden dimension in the feedforward network.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout probability.
            act (nn.Module): Activation function.
            normalize_before (bool): Whether to apply normalization before attention and feedforward.
        N)r   r   )r,   r-   r.   r/   r   r*   r+   r0   r2   r3   r   –   s   zAIFI.__init__c                    sp   |j dd… \}}}|  |||¡}tƒ j| d¡ ddd¡|j|j|jdd}| ddd¡ 	d|||g¡ 
¡ S )zæ
        Forward pass for the AIFI transformer layer.

        Args:
            x (torch.Tensor): Input tensor with shape [B, C, H, W].

        Returns:
            (torch.Tensor): Output tensor with shape [B, C, H, W].
        """
        c, h, w = x.shape[1:]
        pos_embed = self.build_2d_sincos_position_embedding(w, h, c)
        # Flatten [B, C, H, W] to [B, HxW, C] for the transformer, then restore the original shape
        x = super().forward(x.flatten(2).permute(0, 2, 1), pos=pos_embed.to(device=x.device, dtype=x.dtype))
        return x.permute(0, 2, 1).view([-1, c, h, w]).contiguous()

    @staticmethod
    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
        """
        Build 2D sine-cosine position embedding.

        Args:
            w (int): Width of the feature map.
            h (int): Height of the feature map.
            embed_dim (int): Embedding dimension.
            temperature (float): Temperature for the sine/cosine functions.

        Returns:
            (torch.Tensor): Position embedding with shape [1, embed_dim, h*w].
        """
        assert embed_dim % 4 == 0, "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
        grid_w = torch.arange(w, dtype=torch.float32)
        grid_h = torch.arange(h, dtype=torch.float32)
        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
        pos_dim = embed_dim // 4
        omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
        omega = 1.0 / (temperature**omega)

        out_w = grid_w.flatten()[..., None] @ omega[None]
        out_h = grid_h.flatten()[..., None] @ omega[None]

        return torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], 1)[None]


class TransformerLayer(nn.Module):
    """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""

    def __init__(self, c, num_heads):
        """
        Initialize a self-attention mechanism using linear transformations and multi-head attention.

        Args:
            c (int): Input and output channel dimension.
            num_heads (int): Number of attention heads.
        """
        super().__init__()
        self.q = nn.Linear(c, c, bias=False)
        self.k = nn.Linear(c, c, bias=False)
        self.v = nn.Linear(c, c, bias=False)
        self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
        self.fc1 = nn.Linear(c, c, bias=False)
        self.fc2 = nn.Linear(c, c, bias=False)

    def forward(self, x):
        """
        Apply a transformer block to the input x and return the output.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after transformer layer.
        """
        x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x
        return self.fc2(self.fc1(x)) + x


class TransformerBlock(nn.Module):
    """
    Vision Transformer https://arxiv.org/abs/2010.11929.

    Attributes:
        conv (Conv, optional): Convolution layer if input and output channels differ.
        linear (nn.Linear): Learnable position embedding.
        tr (nn.Sequential): Sequential container of transformer layers.
        c2 (int): Output channel dimension.
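
    Examples:
        Hypothetical usage on a 4D feature map (channel sizes assumed):
        >>> block = TransformerBlock(c1=64, c2=128, num_heads=4, num_layers=2)
        >>> out = block(torch.rand(1, 64, 32, 32))  # [1, 128, 32, 32]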
    """

    def __init__(self, c1, c2, num_heads, num_layers):
        """
        Initialize a Transformer module with position embedding and specified number of heads and layers.

        Args:
            c1 (int): Input channel dimension.
            c2 (int): Output channel dimension.
            num_heads (int): Number of attention heads.
            num_layers (int): Number of transformer layers.
        """
        super().__init__()
        self.conv = None
        if c1 != c2:
            self.conv = Conv(c1, c2)
        self.linear = nn.Linear(c2, c2)  # learnable position embedding
        self.tr = nn.Sequential(*(TransformerLayer(c2, num_heads) for _ in range(num_layers)))
        self.c2 = c2

    def forward(self, x):
        """
        Forward propagates the input through the bottleneck module.

        Args:
            x (torch.Tensor): Input tensor with shape [b, c1, w, h].

        Returns:
            (torch.Tensor): Output tensor with shape [b, c2, w, h].
        """
        if self.conv is not None:
            x = self.conv(x)
        b, _, w, h = x.shape
        p = x.flatten(2).permute(2, 0, 1)
        return self.tr(p + self.linear(p)).permute(1, 2, 0).reshape(b, self.c2, w, h)


class MLPBlock(nn.Module):
    """Implements a single block of a multi-layer perceptron."""

    def __init__(self, embedding_dim, mlp_dim, act=nn.GELU):
        """
        Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.

        Args:
            embedding_dim (int): Input and output dimension.
            mlp_dim (int): Hidden dimension.
            act (nn.Module): Activation function.
        N)r   r   r   r!   Úlin1Úlin2r*   )r,   Úembedding_dimÚmlp_dimr*   r0   r2   r3   r   "  s   
	zMLPBlock.__init__r[   Úreturnc                 C   s   |   |  |  |¡¡¡S )z¸
        Forward pass for the MLPBlock.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after MLP block.
        """
        return self.lin2(self.act(self.lin1(x)))


class MLP(nn.Module):
    """
    Implements a simple multi-layer perceptron (also called FFN).

    Attributes:
        num_layers (int): Number of layers in the MLP.
        layers (nn.ModuleList): List of linear layers.
        sigmoid (bool): Whether to apply sigmoid to the output.
        act (nn.Module): Activation function.
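
    Examples:
        Sketch of a 3-layer box-regression head (dimensions assumed):
        >>> mlp = MLP(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
        >>> out = mlp(torch.rand(2, 300, 256))  # [2, 300, 4]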
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act=nn.ReLU, sigmoid=False):
        """
        Initialize the MLP with specified input, hidden, output dimensions and number of layers.

        Args:
            input_dim (int): Input dimension.
            hidden_dim (int): Hidden dimension.
            output_dim (int): Output dimension.
            num_layers (int): Number of layers.
            act (nn.Module): Activation function.
            sigmoid (bool): Whether to apply sigmoid to the output.
        """
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
        self.sigmoid = sigmoid
        self.act = act()

    def forward(self, x):
        """
        Forward pass for the entire MLP.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after MLP.
        """
        for i, layer in enumerate(self.layers):
            x = getattr(self, "act", nn.ReLU())(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x.sigmoid() if getattr(self, "sigmoid", False) else x


class LayerNorm2d(nn.Module):
    """
    2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.

    Original implementations in
    https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
    and
    https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py.

    Attributes:
        weight (nn.Parameter): Learnable scale parameter.
        bias (nn.Parameter): Learnable bias parameter.
        eps (float): Small constant for numerical stability.
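
    Examples:
        Channel-wise normalization of an NCHW tensor (shape assumed):
        >>> norm = LayerNorm2d(num_channels=64)
        >>> out = norm(torch.rand(1, 64, 32, 32))  # same shape as the input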
    """

    def __init__(self, num_channels, eps=1e-6):
        """
        Initialize LayerNorm2d with the given parameters.

        Args:
            num_channels (int): Number of channels in the input.
            eps (float): Small constant for numerical stability.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x):
        """
        Perform forward pass for 2D layer normalization.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Normalized output tensor.
        """
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        return self.weight[:, None, None] * x + self.bias[:, None, None]


class MSDeformAttn(nn.Module):
    """
    Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.

    https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py

    Attributes:
        im2col_step (int): Step size for im2col operations.
        d_model (int): Model dimension.
        n_levels (int): Number of feature levels.
        n_heads (int): Number of attention heads.
        n_points (int): Number of sampling points per attention head per feature level.
        sampling_offsets (nn.Linear): Linear layer for generating sampling offsets.
        attention_weights (nn.Linear): Linear layer for generating attention weights.
        value_proj (nn.Linear): Linear layer for projecting values.
        output_proj (nn.Linear): Linear layer for projecting output.
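
    Examples:
        Single-level sketch with assumed shapes (query length 100, one 8x8 feature level):
        >>> attn = MSDeformAttn(d_model=256, n_levels=1, n_heads=8, n_points=4)
        >>> query = torch.rand(2, 100, 256)
        >>> refer_bbox = torch.rand(2, 100, 1, 2)  # normalized (x, y) per level
        >>> value = torch.rand(2, 64, 256)  # 8 * 8 = 64 flattened positions
        >>> out = attn(query, refer_bbox, value, value_shapes=[(8, 8)])  # [2, 100, 256]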
    """

    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
        """
        Initialize MSDeformAttn with the given parameters.

        Args:
            d_model (int): Model dimension.
            n_levels (int): Number of feature levels.
            n_heads (int): Number of attention heads.
            n_points (int): Number of sampling points per attention head per feature level.
        """
        super().__init__()
        if d_model % n_heads != 0:
            raise ValueError(f"d_model must be divisible by n_heads, but got {d_model} and {n_heads}")
        _d_per_head = d_model // n_heads
        # Better to set _d_per_head to a power of 2, which is more efficient in a CUDA implementation
        assert _d_per_head * n_heads == d_model, "`d_model` must be divisible by `n_heads`"

        self.im2col_step = 64

        self.d_model = d_model
        self.n_levels = n_levels
        self.n_heads = n_heads
        self.n_points = n_points

        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
        self.value_proj = nn.Linear(d_model, d_model)
        self.output_proj = nn.Linear(d_model, d_model)

        self._reset_parameters()

    def _reset_parameters(self):
        """Reset module parameters."""
        constant_(self.sampling_offsets.weight.data, 0.0)
        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (
            (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
            .view(self.n_heads, 1, 1, 2)
            .repeat(1, self.n_levels, self.n_points, 1)
        )
        for i in range(self.n_points):
            grid_init[:, :, i, :] *= i + 1
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        constant_(self.attention_weights.weight.data, 0.0)
        constant_(self.attention_weights.bias.data, 0.0)
        xavier_uniform_(self.value_proj.weight.data)
        constant_(self.value_proj.bias.data, 0.0)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.0)

    def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
        """
        Perform forward pass for multiscale deformable attention.

        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py

        Args:
            query (torch.Tensor): Tensor with shape [bs, query_length, C].
            refer_bbox (torch.Tensor): Tensor with shape [bs, query_length, n_levels, 2], range in [0, 1],
                top-left (0,0), bottom-right (1, 1), including padding area.
            value (torch.Tensor): Tensor with shape [bs, value_length, C].
            value_shapes (list): List with shape [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})].
            value_mask (torch.Tensor, optional): Tensor with shape [bs, value_length], True for non-padding elements,
                False for padding elements.

        Returns:
            (torch.Tensor): Output tensor with shape [bs, Length_{query}, C].
        """
        bs, len_q = query.shape[:2]
        len_v = value.shape[1]
        assert sum(s[0] * s[1] for s in value_shapes) == len_v

        value = self.value_proj(value)
        if value_mask is not None:
            value = value.masked_fill(value_mask[..., None], float(0))
        value = value.view(bs, len_v, self.n_heads, self.d_model // self.n_heads)
        sampling_offsets = self.sampling_offsets(query).view(bs, len_q, self.n_heads, self.n_levels, self.n_points, 2)
        attention_weights = self.attention_weights(query).view(bs, len_q, self.n_heads, self.n_levels * self.n_points)
        attention_weights = F.softmax(attention_weights, -1).view(bs, len_q, self.n_heads, self.n_levels, self.n_points)
        # Sampling locations have shape [bs, len_q, n_heads, n_levels, n_points, 2]
        num_points = refer_bbox.shape[-1]
        if num_points == 2:
            offset_normalizer = torch.as_tensor(value_shapes, dtype=query.dtype, device=query.device).flip(-1)
            add = sampling_offsets / offset_normalizer[None, None, None, :, None, :]
            sampling_locations = refer_bbox[:, :, None, :, None, :] + add
        elif num_points == 4:
            add = sampling_offsets / self.n_points * refer_bbox[:, :, None, :, None, 2:] * 0.5
            sampling_locations = refer_bbox[:, :, None, :, None, :2] + add
        else:
            raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {num_points}.")
        output = multi_scale_deformable_attn_pytorch(value, value_shapes, sampling_locations, attention_weights)
        return self.output_proj(output)


class DeformableTransformerDecoderLayer(nn.Module):
    """
    Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
    https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py

    Attributes:
        self_attn (nn.MultiheadAttention): Self-attention module.
        dropout1 (nn.Dropout): Dropout after self-attention.
        norm1 (nn.LayerNorm): Layer normalization after self-attention.
        cross_attn (MSDeformAttn): Cross-attention module.
        dropout2 (nn.Dropout): Dropout after cross-attention.
        norm2 (nn.LayerNorm): Layer normalization after cross-attention.
        linear1 (nn.Linear): First linear layer in the feedforward network.
        act (nn.Module): Activation function.
        dropout3 (nn.Dropout): Dropout in the feedforward network.
        linear2 (nn.Linear): Second linear layer in the feedforward network.
        dropout4 (nn.Dropout): Dropout after the feedforward network.
        norm3 (nn.LayerNorm): Layer normalization after the feedforward network.
    """

    def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0.0, act=nn.ReLU(), n_levels=4, n_points=4):
        """
        Initialize the DeformableTransformerDecoderLayer with the given parameters.

        Args:
            d_model (int): Model dimension.
            n_heads (int): Number of attention heads.
            d_ffn (int): Dimension of the feedforward network.
            dropout (float): Dropout probability.
            act (nn.Module): Activation function.
            n_levels (int): Number of feature levels.
            n_points (int): Number of sampling points.
        """
        super().__init__()

        # Self attention
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # Cross attention
        self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

        # FFN
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.act = act
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(d_model)

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Add positional embeddings to the input tensor, if provided."""
        return tensor if pos is None else tensor + pos

    def forward_ffn(self, tgt):
        """
        Perform forward pass through the Feed-Forward Network part of the layer.

        Args:
            tgt (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after FFN.
        """
        tgt2 = self.linear2(self.dropout3(self.act(self.linear1(tgt))))
        tgt = tgt + self.dropout4(tgt2)
        return self.norm3(tgt)

    def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
        """
        Perform the forward pass through the entire decoder layer.

        Args:
            embed (torch.Tensor): Input embeddings.
            refer_bbox (torch.Tensor): Reference bounding boxes.
            feats (torch.Tensor): Feature maps.
            shapes (list): Feature shapes.
            padding_mask (torch.Tensor, optional): Padding mask.
            attn_mask (torch.Tensor, optional): Attention mask.
            query_pos (torch.Tensor, optional): Query position embeddings.

        Returns:
            (torch.Tensor): Output tensor after decoder layer.
        """
        # Self attention
        q = k = self.with_pos_embed(embed, query_pos)
        tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1), attn_mask=attn_mask)[
            0
        ].transpose(0, 1)
        embed = embed + self.dropout1(tgt)
        embed = self.norm1(embed)

        # Cross attention
        tgt = self.cross_attn(
            self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes, padding_mask
        )
        embed = embed + self.dropout2(tgt)
        embed = self.norm2(embed)

        # FFN
        return self.forward_ffn(embed)


class DeformableTransformerDecoder(nn.Module):
    """
    Implementation of Deformable Transformer Decoder based on PaddleDetection.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py

    Attributes:
        layers (nn.ModuleList): List of decoder layers.
        num_layers (int): Number of decoder layers.
        hidden_dim (int): Hidden dimension.
        eval_idx (int): Index of the layer to use during evaluation.
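
    Examples:
        Construction sketch only; the bbox/score prediction heads and inputs are wired up by the
        calling detection head, so the names below are illustrative:
        >>> layer = DeformableTransformerDecoderLayer(d_model=256, n_heads=8)
        >>> decoder = DeformableTransformerDecoder(hidden_dim=256, decoder_layer=layer, num_layers=6)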
    """

    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
        """
        Initialize the DeformableTransformerDecoder with the given parameters.

        Args:
            hidden_dim (int): Hidden dimension.
            decoder_layer (nn.Module): Decoder layer module.
            num_layers (int): Number of decoder layers.
            eval_idx (int): Index of the layer to use during evaluation.
        """
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx

    def forward(
        self,
        embed,  # decoder embeddings
        refer_bbox,  # anchor
        feats,  # image features
        shapes,  # feature shapes
        bbox_head,
        score_head,
        pos_mlp,
        attn_mask=None,
        padding_mask=None,
    ):
        """
        Perform the forward pass through the entire decoder.

        Args:
            embed (torch.Tensor): Decoder embeddings.
            refer_bbox (torch.Tensor): Reference bounding boxes.
            feats (torch.Tensor): Image features.
            shapes (list): Feature shapes.
            bbox_head (nn.Module): Bounding box prediction head.
            score_head (nn.Module): Score prediction head.
            pos_mlp (nn.Module): Position MLP.
            attn_mask (torch.Tensor, optional): Attention mask.
            padding_mask (torch.Tensor, optional): Padding mask.

        Returns:
            dec_bboxes (torch.Tensor): Decoded bounding boxes.
            dec_cls (torch.Tensor): Decoded classification scores.
        """
        output = embed
        dec_bboxes = []
        dec_cls = []
        last_refined_bbox = None
        refer_bbox = refer_bbox.sigmoid()
        for i, layer in enumerate(self.layers):
            output = layer(output, refer_bbox, feats, shapes, padding_mask, attn_mask, pos_mlp(refer_bbox))

            bbox = bbox_head[i](output)
            refined_bbox = torch.sigmoid(bbox + inverse_sigmoid(refer_bbox))

            if self.training:
                dec_cls.append(score_head[i](output))
                if i == 0:
                    dec_bboxes.append(refined_bbox)
                else:
                    dec_bboxes.append(torch.sigmoid(bbox + inverse_sigmoid(last_refined_bbox)))
            elif i == self.eval_idx:
                dec_cls.append(score_head[i](output))
                dec_bboxes.append(refined_bbox)
                break

            last_refined_bbox = refined_bbox
            refer_bbox = refined_bbox.detach() if self.training else refined_bbox

        return torch.stack(dec_bboxes), torch.stack(dec_cls)