import copy
from typing import Optional

import torch
from torch import Tensor, nn

from .blocks import RoPEAttention


class MemoryAttentionLayer(nn.Module):
    """
    Implements a memory attention layer with self-attention and cross-attention mechanisms for neural networks.

    This class combines self-attention, cross-attention, and feedforward components to process input tensors and
    generate memory-based attention outputs.

    Attributes:
        d_model (int): Dimensionality of the model.
        dim_feedforward (int): Dimensionality of the feedforward network.
        dropout_value (float): Dropout rate for regularization.
        self_attn (RoPEAttention): Self-attention mechanism using RoPE (Rotary Position Embedding).
        cross_attn_image (RoPEAttention): Cross-attention mechanism for image processing.
        linear1 (nn.Linear): First linear layer of the feedforward network.
        linear2 (nn.Linear): Second linear layer of the feedforward network.
        norm1 (nn.LayerNorm): Layer normalization for self-attention output.
        norm2 (nn.LayerNorm): Layer normalization for cross-attention output.
        norm3 (nn.LayerNorm): Layer normalization for feedforward network output.
        dropout1 (nn.Dropout): Dropout layer after self-attention.
        dropout2 (nn.Dropout): Dropout layer after cross-attention.
        dropout3 (nn.Dropout): Dropout layer after feedforward network.
        activation (nn.ReLU): Activation function for the feedforward network.
        pos_enc_at_attn (bool): Flag to add positional encoding at attention.
        pos_enc_at_cross_attn_queries (bool): Flag to add positional encoding to cross-attention queries.
        pos_enc_at_cross_attn_keys (bool): Flag to add positional encoding to cross-attention keys.

    Methods:
        forward: Performs the full memory attention operation on input tensors.
        _forward_sa: Performs self-attention on input tensor.
        _forward_ca: Performs cross-attention between target and memory tensors.

    Examples:
        >>> layer = MemoryAttentionLayer(d_model=256, dim_feedforward=2048, dropout=0.1)
        >>> tgt = torch.randn(1, 100, 256)
        >>> memory = torch.randn(1, 100, 64)
        >>> pos = torch.randn(1, 100, 64)  # must match memory channels (cross-attention kv_in_dim=64)
        >>> query_pos = torch.randn(1, 100, 256)
        >>> output = layer(tgt, memory, pos, query_pos)
        >>> print(output.shape)
        torch.Size([1, 100, 256])
    """

    def __init__(
        self,
        d_model: int = 256,
        dim_feedforward: int = 2048,
        dropout: float = 0.1,
        pos_enc_at_attn: bool = False,
        pos_enc_at_cross_attn_keys: bool = True,
        pos_enc_at_cross_attn_queries: bool = False,
    ):
        """
        Initialize a memory attention layer with self-attention, cross-attention, and feedforward components.

        Args:
            d_model (int): Dimensionality of the model.
            dim_feedforward (int): Dimensionality of the feedforward network.
            dropout (float): Dropout rate for regularization.
            pos_enc_at_attn (bool): Whether to add positional encoding at attention.
            pos_enc_at_cross_attn_keys (bool): Whether to add positional encoding to cross-attention keys.
            pos_enc_at_cross_attn_queries (bool): Whether to add positional encoding to cross-attention queries.
        """
        super().__init__()
        self.d_model = d_model
        self.dim_feedforward = dim_feedforward
        self.dropout_value = dropout
        self.self_attn = RoPEAttention(embedding_dim=256, num_heads=1, downsample_rate=1)
        self.cross_attn_image = RoPEAttention(
            rope_k_repeat=True,
            embedding_dim=256,
            num_heads=1,
            downsample_rate=1,
            kv_in_dim=64,
        )

        # Feedforward (MLP) branch
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = nn.ReLU()

        # Where to add positional encoding
        self.pos_enc_at_attn = pos_enc_at_attn
        self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries
        self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys

    def _forward_sa(self, tgt: Tensor, query_pos: Optional[Tensor]) -> Tensor:
        """Perform self-attention on input tensor using positional encoding and RoPE attention mechanism."""
        tgt2 = self.norm1(tgt)
        q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
        tgt2 = self.self_attn(q, k, v=tgt2)
        tgt = tgt + self.dropout1(tgt2)
        return tgt

    def _forward_ca(
        self,
        tgt: Tensor,
        memory: Tensor,
        query_pos: Optional[Tensor],
        pos: Optional[Tensor],
        num_k_exclude_rope: int = 0,
    ) -> Tensor:
        """Perform cross-attention between target and memory tensors using RoPEAttention mechanism."""
        kwds = {}
        if num_k_exclude_rope > 0:
            assert isinstance(self.cross_attn_image, RoPEAttention)
            kwds = {"num_k_exclude_rope": num_k_exclude_rope}

        # Cross-attention between current queries and memory features
        tgt2 = self.norm2(tgt)
        tgt2 = self.cross_attn_image(
            q=tgt2 + query_pos if self.pos_enc_at_cross_attn_queries else tgt2,
            k=memory + pos if self.pos_enc_at_cross_attn_keys else memory,
            v=memory,
            **kwds,
        )
        tgt = tgt + self.dropout2(tgt2)
        return tgt

    def forward(
        self,
        tgt: Tensor,
        memory: Tensor,
        pos: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None,
        num_k_exclude_rope: int = 0,
    ) -> torch.Tensor:
        """Process input tensors through self-attention, cross-attention, and feedforward network layers."""
        tgt = self._forward_sa(tgt, query_pos)
        tgt = self._forward_ca(tgt, memory, query_pos, pos, num_k_exclude_rope)

        # MLP block with residual connection
        tgt2 = self.norm3(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout3(tgt2)
        return tgt


class MemoryAttention(nn.Module):
    """
    Memory attention module for processing sequential data with self and cross-attention mechanisms.

    This class implements a multi-layer attention mechanism that combines self-attention and cross-attention
    for processing sequential data, particularly useful in transformer-like architectures.

    Attributes:
        d_model (int): The dimension of the model's hidden state.
        layers (nn.ModuleList): A list of MemoryAttentionLayer modules.
        num_layers (int): The number of attention layers.
        norm (nn.LayerNorm): Layer normalization applied to the output.
        pos_enc_at_input (bool): Whether to apply positional encoding at the input.
        batch_first (bool): Whether the input tensors are in batch-first format.

    Methods:
        forward: Processes input tensors through the attention layers.

    Examples:
        >>> d_model = 256
        >>> layer = MemoryAttentionLayer(d_model)
        >>> attention = MemoryAttention(d_model, pos_enc_at_input=True, layer=layer, num_layers=3)
        >>> curr = torch.randn(16, 32, d_model)  # (seq_len, batch_size, d_model); seq_len is a perfect square for RoPE
        >>> memory = torch.randn(32, 32, 64)  # (mem_len, batch_size, kv_in_dim); mem_len is a multiple of seq_len
        >>> curr_pos = torch.randn(16, 32, d_model)
        >>> memory_pos = torch.randn(32, 32, 64)
        >>> output = attention(curr, memory, curr_pos, memory_pos)
        >>> print(output.shape)
        torch.Size([16, 32, 256])
    """

    def __init__(
        self,
        d_model: int,
        pos_enc_at_input: bool,
        layer: nn.Module,
        num_layers: int,
        batch_first: bool = True,
    ):
        """
        Initialize MemoryAttention with specified layers and normalization for sequential data processing.

        This class implements a multi-layer attention mechanism that combines self-attention and cross-attention
        for processing sequential data, particularly useful in transformer-like architectures.

        Args:
            d_model (int): The dimension of the model's hidden state.
            pos_enc_at_input (bool): Whether to apply positional encoding at the input.
            layer (nn.Module): The attention layer to be used in the module.
            num_layers (int): The number of attention layers.
            batch_first (bool): Whether the input tensors are in batch-first format.

        Examples:
            >>> d_model = 256
            >>> layer = MemoryAttentionLayer(d_model)
            >>> attention = MemoryAttention(d_model, pos_enc_at_input=True, layer=layer, num_layers=3)
            >>> curr = torch.randn(16, 32, d_model)  # (seq_len, batch_size, d_model); seq_len is a perfect square for RoPE
            >>> memory = torch.randn(32, 32, 64)  # (mem_len, batch_size, kv_in_dim); mem_len is a multiple of seq_len
            >>> curr_pos = torch.randn(16, 32, d_model)
            >>> memory_pos = torch.randn(32, 32, 64)
            >>> output = attention(curr, memory, curr_pos, memory_pos)
            >>> print(output.shape)
            torch.Size([16, 32, 256])
        """
        super().__init__()
        self.d_model = d_model
        self.layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_layers)])
        self.num_layers = num_layers
        self.norm = nn.LayerNorm(d_model)
        self.pos_enc_at_input = pos_enc_at_input
        self.batch_first = batch_first

    def forward(
        self,
        curr: torch.Tensor,  # self-attention inputs
        memory: torch.Tensor,  # cross-attention inputs
        curr_pos: Optional[Tensor] = None,  # positional encoding for self-attention inputs
        memory_pos: Optional[Tensor] = None,  # positional encoding for cross-attention inputs
        num_obj_ptr_tokens: int = 0,  # number of object pointer tokens
    ) -> torch.Tensor:
        """
        Process inputs through attention layers, applying self and cross-attention with positional encoding.

        Args:
            curr (torch.Tensor): Self-attention input tensor, representing the current state.
            memory (torch.Tensor): Cross-attention input tensor, representing memory information.
            curr_pos (Optional[Tensor]): Positional encoding for self-attention inputs.
            memory_pos (Optional[Tensor]): Positional encoding for cross-attention inputs.
            num_obj_ptr_tokens (int): Number of object pointer tokens to exclude from rotary position embedding.

        Returns:
            (torch.Tensor): Processed output tensor after applying attention layers and normalization.

        Examples:
            >>> d_model = 256
            >>> layer = MemoryAttentionLayer(d_model)
            >>> attention = MemoryAttention(d_model, pos_enc_at_input=True, layer=layer, num_layers=3)
            >>> curr = torch.randn(16, 32, d_model)  # (seq_len, batch_size, d_model); seq_len is a perfect square for RoPE
            >>> memory = torch.randn(32, 32, 64)  # (mem_len, batch_size, kv_in_dim); mem_len is a multiple of seq_len
            >>> curr_pos = torch.randn(16, 32, d_model)
            >>> memory_pos = torch.randn(32, 32, 64)
            >>> output = attention(curr, memory, curr_pos, memory_pos)
            >>> print(output.shape)
            torch.Size([16, 32, 256])
        """
        if isinstance(curr, list):
            # Allow single-element lists for backward compatibility
            assert isinstance(curr_pos, list)
            assert len(curr) == len(curr_pos) == 1
            curr, curr_pos = curr[0], curr_pos[0]

        assert curr.shape[1] == memory.shape[1], "Batch size must be the same for curr and memory"

        output = curr
        if self.pos_enc_at_input and curr_pos is not None:
            output = output + 0.1 * curr_pos

        if self.batch_first:
            # Convert to batch-first layout for the attention layers
            output = output.transpose(0, 1)
            curr_pos = curr_pos.transpose(0, 1)
            memory = memory.transpose(0, 1)
            memory_pos = memory_pos.transpose(0, 1)

        for layer in self.layers:
            kwds = {}
            if isinstance(layer.cross_attn_image, RoPEAttention):
                kwds = {"num_k_exclude_rope": num_obj_ptr_tokens}

            output = layer(tgt=output, memory=memory, pos=memory_pos, query_pos=curr_pos, **kwds)

        normed_output = self.norm(output)

        if self.batch_first:
            # Convert back to sequence-first layout
            normed_output = normed_output.transpose(0, 1)
            curr_pos = curr_pos.transpose(0, 1)

        return normed_output