import math
from typing import Tuple, Type

import torch
from torch import Tensor, nn

from ultralytics.nn.modules import MLPBlock


class TwoWayTransformer(nn.Module):
    """
    A Two-Way Transformer module for simultaneous attention to image and query points.

    This class implements a specialized transformer decoder that attends to an input image using queries with
    supplied positional embeddings. It's useful for tasks like object detection, image segmentation, and point
    cloud processing.

    Attributes:
        depth (int): Number of layers in the transformer.
        embedding_dim (int): Channel dimension for input embeddings.
        num_heads (int): Number of heads for multihead attention.
        mlp_dim (int): Internal channel dimension for the MLP block.
        layers (nn.ModuleList): List of TwoWayAttentionBlock layers composing the transformer.
        final_attn_token_to_image (Attention): Final attention layer from queries to image.
        norm_final_attn (nn.LayerNorm): Layer normalization applied to final queries.

    Methods:
        forward: Processes image and point embeddings through the transformer.

    Examples:
        >>> transformer = TwoWayTransformer(depth=6, embedding_dim=256, num_heads=8, mlp_dim=2048)
        >>> image_embedding = torch.randn(1, 256, 32, 32)
        >>> image_pe = torch.randn(1, 256, 32, 32)
        >>> point_embedding = torch.randn(1, 100, 256)
        >>> output_queries, output_image = transformer(image_embedding, image_pe, point_embedding)
        >>> print(output_queries.shape, output_image.shape)
    """

    def __init__(
        self,
        depth: int,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
    ) -> None:
        """
        Initialize a Two-Way Transformer for simultaneous attention to image and query points.

        Args:
            depth (int): Number of layers in the transformer.
            embedding_dim (int): Channel dimension for input embeddings.
            num_heads (int): Number of heads for multihead attention. Must divide embedding_dim.
            mlp_dim (int): Internal channel dimension for the MLP block.
            activation (Type[nn.Module]): Activation function to use in the MLP block.
            attention_downsample_rate (int): Downsampling rate for attention mechanism.
        """
        super().__init__()
        self.depth = depth
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.mlp_dim = mlp_dim
        self.layers = nn.ModuleList()

        for i in range(depth):
            self.layers.append(
                TwoWayAttentionBlock(
                    embedding_dim=embedding_dim,
                    num_heads=num_heads,
                    mlp_dim=mlp_dim,
                    activation=activation,
                    attention_downsample_rate=attention_downsample_rate,
                    skip_first_layer_pe=(i == 0),
                )
            )

        self.final_attn_token_to_image = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate)
        self.norm_final_attn = nn.LayerNorm(embedding_dim)

    def forward(self, image_embedding: Tensor, image_pe: Tensor, point_embedding: Tensor) -> Tuple[Tensor, Tensor]:
        """
        Process image and point embeddings through the Two-Way Transformer.

        Args:
            image_embedding (Tensor): Image to attend to, with shape (B, embedding_dim, H, W).
            image_pe (Tensor): Positional encoding to add to the image, with same shape as image_embedding.
            point_embedding (Tensor): Embedding to add to query points, with shape (B, N_points, embedding_dim).

        Returns:
            queries (Tensor): Processed point embeddings with shape (B, N_points, embedding_dim).
            keys (Tensor): Processed image embeddings with shape (B, H*W, embedding_dim).
        """
        # Flatten image embeddings: BxCxHxW -> BxHWxC == B x N_image_tokens x C
        image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
        image_pe = image_pe.flatten(2).permute(0, 2, 1)

        # Prepare queries (point tokens) and keys (image tokens)
        queries = point_embedding
        keys = image_embedding

        # Apply the two-way attention blocks
        for layer in self.layers:
            queries, keys = layer(queries=queries, keys=keys, query_pe=point_embedding, key_pe=image_pe)

        # Apply the final attention layer from the points to the image
        q = queries + point_embedding
        k = keys + image_pe
        attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm_final_attn(queries)

        return queries, keys


class TwoWayAttentionBlock(nn.Module):
    """
    A two-way attention block for simultaneous attention to image and query points.

    This class implements a specialized transformer block with four main layers: self-attention on sparse inputs,
    cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention of dense
    inputs to sparse inputs.

    Attributes:
        self_attn (Attention): Self-attention layer for queries.
        norm1 (nn.LayerNorm): Layer normalization after self-attention.
        cross_attn_token_to_image (Attention): Cross-attention layer from queries to keys.
        norm2 (nn.LayerNorm): Layer normalization after token-to-image attention.
        mlp (MLPBlock): MLP block for transforming query embeddings.
        norm3 (nn.LayerNorm): Layer normalization after MLP block.
        norm4 (nn.LayerNorm): Layer normalization after image-to-token attention.
        cross_attn_image_to_token (Attention): Cross-attention layer from keys to queries.
        skip_first_layer_pe (bool): Whether to skip positional encoding in the first layer.

    Methods:
        forward: Applies self-attention and cross-attention to queries and keys.

    Examples:
        >>> embedding_dim, num_heads = 256, 8
        >>> block = TwoWayAttentionBlock(embedding_dim, num_heads)
        >>> queries = torch.randn(1, 100, embedding_dim)
        >>> keys = torch.randn(1, 1000, embedding_dim)
        >>> query_pe = torch.randn(1, 100, embedding_dim)
        >>> key_pe = torch.randn(1, 1000, embedding_dim)
        >>> processed_queries, processed_keys = block(queries, keys, query_pe, key_pe)
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int = 2048,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
        skip_first_layer_pe: bool = False,
    ) -> None:
        """
        Initialize a TwoWayAttentionBlock for simultaneous attention to image and query points.

        This block implements a specialized transformer layer with four main components: self-attention on sparse
        inputs, cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention
        of dense inputs to sparse inputs.

        Args:
            embedding_dim (int): Channel dimension of the embeddings.
            num_heads (int): Number of attention heads in the attention layers.
            mlp_dim (int): Hidden dimension of the MLP block.
            activation (Type[nn.Module]): Activation function for the MLP block.
            attention_downsample_rate (int): Downsampling rate for the attention mechanism.
            skip_first_layer_pe (bool): Whether to skip positional encoding in the first layer.
        """
        super().__init__()
        self.self_attn = Attention(embedding_dim, num_heads)
        self.norm1 = nn.LayerNorm(embedding_dim)

        self.cross_attn_token_to_image = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate)
        self.norm2 = nn.LayerNorm(embedding_dim)

        self.mlp = MLPBlock(embedding_dim, mlp_dim, activation)
        self.norm3 = nn.LayerNorm(embedding_dim)

        self.norm4 = nn.LayerNorm(embedding_dim)
        self.cross_attn_image_to_token = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate)

        self.skip_first_layer_pe = skip_first_layer_pe

    def forward(self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor) -> Tuple[Tensor, Tensor]:
        """
        Apply two-way attention to process query and key embeddings in a transformer block.

        Args:
            queries (Tensor): Query embeddings with shape (B, N_queries, embedding_dim).
            keys (Tensor): Key embeddings with shape (B, N_keys, embedding_dim).
            query_pe (Tensor): Positional encodings for queries with same shape as queries.
            key_pe (Tensor): Positional encodings for keys with same shape as keys.

        Returns:
            queries (Tensor): Processed query embeddings with shape (B, N_queries, embedding_dim).
            keys (Tensor): Processed key embeddings with shape (B, N_keys, embedding_dim).
        """
        # Self-attention block on the sparse (query) tokens
        if self.skip_first_layer_pe:
            queries = self.self_attn(q=queries, k=queries, v=queries)
        else:
            q = queries + query_pe
            attn_out = self.self_attn(q=q, k=q, v=queries)
            queries = queries + attn_out
        queries = self.norm1(queries)

        # Cross-attention block, tokens attending to the image embedding
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm2(queries)

        # MLP block on the query tokens
        mlp_out = self.mlp(queries)
        queries = queries + mlp_out
        queries = self.norm3(queries)

        # Cross-attention block, image embedding attending to the tokens
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
        keys = keys + attn_out
        keys = self.norm4(keys)

        return queries, keys


class Attention(nn.Module):
    """
    An attention layer with downscaling capability for embedding size after projection.

    This class implements a multi-head attention mechanism with the option to downsample the internal
    dimension of queries, keys, and values.

    Attributes:
        embedding_dim (int): Dimensionality of input embeddings.
        kv_in_dim (int): Dimensionality of key and value inputs.
        internal_dim (int): Internal dimension after downsampling.
        num_heads (int): Number of attention heads.
        q_proj (nn.Linear): Linear projection for queries.
        k_proj (nn.Linear): Linear projection for keys.
        v_proj (nn.Linear): Linear projection for values.
        out_proj (nn.Linear): Linear projection for output.

    Methods:
        _separate_heads: Separates input tensor into attention heads.
        _recombine_heads: Recombines separated attention heads.
        forward: Computes attention output for given query, key, and value tensors.

    Examples:
        >>> attn = Attention(embedding_dim=256, num_heads=8, downsample_rate=2)
        >>> q = torch.randn(1, 100, 256)
        >>> k = v = torch.randn(1, 50, 256)
        >>> output = attn(q, k, v)
        >>> print(output.shape)
        torch.Size([1, 100, 256])
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        downsample_rate: int = 1,
        kv_in_dim: int = None,
    ) -> None:
        """
        Initialize the Attention module with specified dimensions and settings.

        Args:
            embedding_dim (int): Dimensionality of input embeddings.
            num_heads (int): Number of attention heads.
            downsample_rate (int): Factor by which internal dimensions are downsampled.
            kv_in_dim (int | None): Dimensionality of key and value inputs. If None, uses embedding_dim.

        Raises:
            AssertionError: If num_heads does not evenly divide the internal dim (embedding_dim / downsample_rate).
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.kv_in_dim = kv_in_dim if kv_in_dim is not None else embedding_dim
        self.internal_dim = embedding_dim // downsample_rate
        self.num_heads = num_heads
        assert self.internal_dim % num_heads == 0, "num_heads must divide embedding_dim."

        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.k_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
        self.v_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)

    @staticmethod
    def _separate_heads(x: Tensor, num_heads: int) -> Tensor:
        """Separate the input tensor into the specified number of attention heads."""
        b, n, c = x.shape
        x = x.reshape(b, n, num_heads, c // num_heads)
        return x.transpose(1, 2)  # B x N_heads x N_tokens x C_per_head

    @staticmethod
    def _recombine_heads(x: Tensor) -> Tensor:
        """Recombine separated attention heads into a single tensor."""
        b, n_heads, n_tokens, c_per_head = x.shape
        x = x.transpose(1, 2)
        return x.reshape(b, n_tokens, n_heads * c_per_head)  # B x N_tokens x C

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
        """
        Apply multi-head attention to query, key, and value tensors with optional downsampling.

        Args:
            q (Tensor): Query tensor with shape (B, N_q, embedding_dim).
            k (Tensor): Key tensor with shape (B, N_k, embedding_dim).
            v (Tensor): Value tensor with shape (B, N_k, embedding_dim).

        Returns:
            (Tensor): Output tensor after attention with shape (B, N_q, embedding_dim).
        """
        # Input projections
        q = self.q_proj(q)
        k = self.k_proj(k)
        v = self.v_proj(v)

        # Separate into heads
        q = self._separate_heads(q, self.num_heads)
        k = self._separate_heads(k, self.num_heads)
        v = self._separate_heads(v, self.num_heads)

        # Scaled dot-product attention
        _, _, _, c_per_head = q.shape
        attn = q @ k.permute(0, 1, 3, 2)  # B x N_heads x N_tokens x N_tokens
        attn = attn / math.sqrt(c_per_head)
        attn = torch.softmax(attn, dim=-1)

        # Get output
        out = attn @ v
        out = self._recombine_heads(out)
        return self.out_proj(out)