"""Block modules."""

import torch
import torch.nn as nn
import torch.nn.functional as F

from ultralytics.utils.torch_utils import fuse_conv_and_bn

from .conv import Conv, DWConv, GhostConv, LightConv, RepConv, autopad
from .transformer import TransformerBlock

__all__ = (
    "DFL",
    "HGBlock",
    "HGStem",
    "SPP",
    "SPPF",
    "C1",
    "C2",
    "C3",
    "C2f",
    "C2fAttn",
    "ImagePoolingAttn",
    "ContrastiveHead",
    "BNContrastiveHead",
    "C3x",
    "C3TR",
    "C3Ghost",
    "GhostBottleneck",
    "Bottleneck",
    "BottleneckCSP",
    "Proto",
    "RepC3",
    "ResNetLayer",
    "RepNCSPELAN4",
    "ELAN1",
    "ADown",
    "AConv",
    "SPPELAN",
    "CBFuse",
    "CBLinear",
    "C3k2",
    "C2fPSA",
    "C2PSA",
    "RepVGGDW",
    "CIB",
    "C2fCIB",
    "Attention",
    "PSA",
    "SCDown",
    "TorchVision",
)


class DFL(nn.Module):
    """
    Integral module of Distribution Focal Loss (DFL).

    Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
    """

    def __init__(self, c1=16):
        """Initialize a convolutional layer with a given number of input channels."""
        super().__init__()
        self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)
        x = torch.arange(c1, dtype=torch.float)
        self.conv.weight.data[:] = nn.Parameter(x.view(1, c1, 1, 1))
        self.c1 = c1

    def forward(self, x):
        """Apply the DFL module to input tensor and return transformed output."""
        b, _, a = x.shape  # batch, channels, anchors
        return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a)
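
# Illustrative sketch (assumes the default c1=16 and a hypothetical 8400-anchor
# input; not part of the upstream file): DFL treats each box side as a discrete
# distribution over c1 bins, and the frozen arange-weighted conv computes the
# expectation of the softmax.
#   >>> dfl = DFL(16)
#   >>> dfl(torch.randn(2, 64, 8400)).shape  # 4 sides x 16 bins per anchor
#   torch.Size([2, 4, 8400])
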
class Proto(nn.Module):
    """YOLOv8 mask Proto module for segmentation models."""

    def __init__(self, c1, c_=256, c2=32):
        """
        Initialize the YOLOv8 mask Proto module with specified number of protos and masks.

        Args:
            c1 (int): Input channels.
            c_ (int): Intermediate channels.
            c2 (int): Output channels (number of protos).
        """
        super().__init__()
        self.cv1 = Conv(c1, c_, k=3)
        self.upsample = nn.ConvTranspose2d(c_, c_, 2, 2, 0, bias=True)
        self.cv2 = Conv(c_, c_, k=3)
        self.cv3 = Conv(c_, c2)

    def forward(self, x):
        """Perform a forward pass through layers using an upsampled input image."""
        return self.cv3(self.cv2(self.upsample(self.cv1(x))))
class HGStem(nn.Module):
    """
    StemBlock of PPHGNetV2 with 5 convolutions and one maxpool2d.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
    """

    def __init__(self, c1, cm, c2):
        """
        Initialize the StemBlock of PPHGNetV2.

        Args:
            c1 (int): Input channels.
            cm (int): Middle channels.
            c2 (int): Output channels.
        """
        super().__init__()
        self.stem1 = Conv(c1, cm, 3, 2, act=nn.ReLU())
        self.stem2a = Conv(cm, cm // 2, 2, 1, 0, act=nn.ReLU())
        self.stem2b = Conv(cm // 2, cm, 2, 1, 0, act=nn.ReLU())
        self.stem3 = Conv(cm * 2, cm, 3, 2, act=nn.ReLU())
        self.stem4 = Conv(cm, c2, 1, 1, act=nn.ReLU())
        self.pool = nn.MaxPool2d(kernel_size=2, stride=1, padding=0, ceil_mode=True)

    def forward(self, x):
        """Forward pass of a PPHGNetV2 backbone layer."""
        x = self.stem1(x)
        x = F.pad(x, [0, 1, 0, 1])
        x2 = self.stem2a(x)
        x2 = F.pad(x2, [0, 1, 0, 1])
        x2 = self.stem2b(x2)
        x1 = self.pool(x)
        x = torch.cat([x1, x2], dim=1)
        x = self.stem3(x)
        x = self.stem4(x)
        return x
class HGBlock(nn.Module):
    """
    HG_Block of PPHGNetV2 with 2 convolutions and LightConv.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
    """

    def __init__(self, c1, cm, c2, k=3, n=6, lightconv=False, shortcut=False, act=nn.ReLU()):
        """
        Initialize HGBlock with specified parameters.

        Args:
            c1 (int): Input channels.
            cm (int): Middle channels.
            c2 (int): Output channels.
            k (int): Kernel size.
            n (int): Number of LightConv or Conv blocks.
            lightconv (bool): Whether to use LightConv.
            shortcut (bool): Whether to use shortcut connection.
            act (nn.Module): Activation function.
        """
        super().__init__()
        block = LightConv if lightconv else Conv
        self.m = nn.ModuleList(block(c1 if i == 0 else cm, cm, k=k, act=act) for i in range(n))
        self.sc = Conv(c1 + n * cm, c2 // 2, 1, 1, act=act)  # squeeze conv
        self.ec = Conv(c2 // 2, c2, 1, 1, act=act)  # excitation conv
        self.add = shortcut and c1 == c2

    def forward(self, x):
        """Forward pass of a PPHGNetV2 backbone layer."""
        y = [x]
        y.extend(m(y[-1]) for m in self.m)
        y = self.ec(self.sc(torch.cat(y, 1)))
        return y + x if self.add else y
class SPP(nn.Module):
    """Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729."""

    def __init__(self, c1, c2, k=(5, 9, 13)):
        """
        Initialize the SPP layer with input/output channels and pooling kernel sizes.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (Tuple[int, int, int]): Kernel sizes for max pooling.
        """
        super().__init__()
        c_ = c1 // 2  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
        self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])

    def forward(self, x):
        """Forward pass of the SPP layer, performing spatial pyramid pooling."""
        x = self.cv1(x)
        return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))
class SPPF(nn.Module):
    """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher."""

    def __init__(self, c1, c2, k=5):
        """
        Initialize the SPPF layer with given input/output channels and kernel size.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (int): Kernel size.

        Notes:
            This module is equivalent to SPP(k=(5, 9, 13)).
        """
        super().__init__()
        c_ = c1 // 2  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_ * 4, c2, 1, 1)
        self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)

    def forward(self, x):
        """Apply sequential pooling operations to input and return concatenated feature maps."""
        y = [self.cv1(x)]
        y.extend(self.m(y[-1]) for _ in range(3))
        return self.cv2(torch.cat(y, 1))
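
# Illustrative sketch (assumed sizes, not from the upstream file): SPPF(k=5)
# matches SPP(k=(5, 9, 13)) because two and three stacked 5x5 stride-1 max
# pools have effective receptive fields of 9 and 13.
#   >>> m = SPPF(64, 128, k=5)
#   >>> m(torch.randn(1, 64, 32, 32)).shape
#   torch.Size([1, 128, 32, 32])
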
class C1(nn.Module):
    """CSP Bottleneck with 1 convolution."""

    def __init__(self, c1, c2, n=1):
        """
        Initialize the CSP Bottleneck with 1 convolution.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of convolutions.
        """
        super().__init__()
        self.cv1 = Conv(c1, c2, 1, 1)
        self.m = nn.Sequential(*(Conv(c2, c2, 3) for _ in range(n)))

    def forward(self, x):
        """Apply convolution and residual connection to input tensor."""
        y = self.cv1(x)
        return self.m(y) + y
class C2(nn.Module):
    """CSP Bottleneck with 2 convolutions."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """
        Initialize a CSP Bottleneck with 2 convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        """
        super().__init__()
        self.c = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
        self.cv2 = Conv(2 * self.c, c2, 1)
        self.m = nn.Sequential(*(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)))

    def forward(self, x):
        """Forward pass through the CSP bottleneck with 2 convolutions."""
        a, b = self.cv1(x).chunk(2, 1)
        return self.cv2(torch.cat((self.m(a), b), 1))
class C2f(nn.Module):
    """Faster Implementation of CSP Bottleneck with 2 convolutions."""

    def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
        """
        Initialize a CSP bottleneck with 2 convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        """
        super().__init__()
        self.c = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
        self.cv2 = Conv((2 + n) * self.c, c2, 1)
        self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))

    def forward(self, x):
        """Forward pass through C2f layer."""
        y = list(self.cv1(x).chunk(2, 1))
        y.extend(m(y[-1]) for m in self.m)
        return self.cv2(torch.cat(y, 1))

    def forward_split(self, x):
        """Forward pass using split() instead of chunk()."""
        y = self.cv1(x).split((self.c, self.c), 1)
        y = [y[0], y[1]]
        y.extend(m(y[-1]) for m in self.m)
        return self.cv2(torch.cat(y, 1))
class C3(nn.Module):
    """CSP Bottleneck with 3 convolutions."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """
        Initialize the CSP Bottleneck with 3 convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.cv3 = Conv(2 * c_, c2, 1)
        self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=((1, 1), (3, 3)), e=1.0) for _ in range(n)))

    def forward(self, x):
        """Forward pass through the CSP bottleneck with 3 convolutions."""
        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))
class C3x(C3):
    """C3 module with cross-convolutions."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """
        Initialize C3 module with cross-convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        """
        super().__init__(c1, c2, n, shortcut, g, e)
        self.c_ = int(c2 * e)
        self.m = nn.Sequential(*(Bottleneck(self.c_, self.c_, shortcut, g, k=((1, 3), (3, 1)), e=1) for _ in range(n)))
class RepC3(nn.Module):
    """Rep C3."""

    def __init__(self, c1, c2, n=3, e=1.0):
        """
        Initialize CSP Bottleneck with a single convolution.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of RepConv blocks.
            e (float): Expansion ratio.
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.m = nn.Sequential(*[RepConv(c_, c_) for _ in range(n)])
        self.cv3 = Conv(c_, c2, 1, 1) if c_ != c2 else nn.Identity()

    def forward(self, x):
        """Forward pass of RepC3 module."""
        return self.cv3(self.m(self.cv1(x)) + self.cv2(x))
class C3TR(C3):
    """C3 module with TransformerBlock()."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """
        Initialize C3 module with TransformerBlock.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Transformer blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        """
        super().__init__(c1, c2, n, shortcut, g, e)
        c_ = int(c2 * e)
        self.m = TransformerBlock(c_, c_, 4, n)
class C3Ghost(C3):
    """C3 module with GhostBottleneck()."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """
        Initialize C3 module with GhostBottleneck.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Ghost bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        """
        super().__init__(c1, c2, n, shortcut, g, e)
        c_ = int(c2 * e)  # hidden channels
        self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n)))
class GhostBottleneck(nn.Module):
    """Ghost Bottleneck https://github.com/huawei-noah/Efficient-AI-Backbones."""

    def __init__(self, c1, c2, k=3, s=1):
        """
        Initialize Ghost Bottleneck module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (int): Kernel size.
            s (int): Stride.
        """
        super().__init__()
        c_ = c2 // 2
        self.conv = nn.Sequential(
            GhostConv(c1, c_, 1, 1),  # pw
            DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(),  # dw
            GhostConv(c_, c2, 1, 1, act=False),  # pw-linear
        )
        self.shortcut = (
            nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1, act=False)) if s == 2 else nn.Identity()
        )

    def forward(self, x):
        """Apply skip connection and concatenation to input tensor."""
        return self.conv(x) + self.shortcut(x)
class Bottleneck(nn.Module):
    """Standard bottleneck."""

    def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
        """
        Initialize a standard bottleneck module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            shortcut (bool): Whether to use shortcut connection.
            g (int): Groups for convolutions.
            k (Tuple[int, int]): Kernel sizes for convolutions.
            e (float): Expansion ratio.
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, k[0], 1)
        self.cv2 = Conv(c_, c2, k[1], 1, g=g)
        self.add = shortcut and c1 == c2

    def forward(self, x):
        """Apply bottleneck with optional shortcut connection."""
        return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
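
# Illustrative note (assumed sizes, not from the upstream file): the residual
# add is only enabled when shortcut=True and c1 == c2, so a channel change
# silently disables it.
#   >>> Bottleneck(64, 64).add, Bottleneck(64, 128).add
#   (True, False)
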
class BottleneckCSP(nn.Module):
    """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """
        Initialize CSP Bottleneck.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
        self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
        self.cv4 = Conv(2 * c_, c2, 1, 1)
        self.bn = nn.BatchNorm2d(2 * c_)  # applied to cat(cv2, cv3)
        self.act = nn.SiLU()
        self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))

    def forward(self, x):
        """Apply CSP bottleneck with 3 convolutions."""
        y1 = self.cv3(self.m(self.cv1(x)))
        y2 = self.cv2(x)
        return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1))))
class ResNetBlock(nn.Module):
    """ResNet block with standard convolution layers."""

    def __init__(self, c1, c2, s=1, e=4):
        """
        Initialize ResNet block.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            s (int): Stride.
            e (int): Expansion ratio.
        """
        super().__init__()
        c3 = e * c2
        self.cv1 = Conv(c1, c2, k=1, s=1, act=True)
        self.cv2 = Conv(c2, c2, k=3, s=s, p=1, act=True)
        self.cv3 = Conv(c2, c3, k=1, act=False)
        self.shortcut = nn.Sequential(Conv(c1, c3, k=1, s=s, act=False)) if s != 1 or c1 != c3 else nn.Identity()

    def forward(self, x):
        """Forward pass through the ResNet block."""
        return F.relu(self.cv3(self.cv2(self.cv1(x))) + self.shortcut(x))
class ResNetLayer(nn.Module):
    """ResNet layer with multiple ResNet blocks."""

    def __init__(self, c1, c2, s=1, is_first=False, n=1, e=4):
        """
        Initialize ResNet layer.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            s (int): Stride.
            is_first (bool): Whether this is the first layer.
            n (int): Number of ResNet blocks.
            e (int): Expansion ratio.
        """
        super().__init__()
        self.is_first = is_first

        if self.is_first:
            self.layer = nn.Sequential(
                Conv(c1, c2, k=7, s=2, p=3, act=True), nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
            )
        else:
            blocks = [ResNetBlock(c1, c2, s, e=e)]
            blocks.extend([ResNetBlock(e * c2, c2, 1, e=e) for _ in range(n - 1)])
            self.layer = nn.Sequential(*blocks)

    def forward(self, x):
        """Forward pass through the ResNet layer."""
        return self.layer(x)
class MaxSigmoidAttnBlock(nn.Module):
    """Max Sigmoid attention block."""

    def __init__(self, c1, c2, nh=1, ec=128, gc=512, scale=False):
        """
        Initialize MaxSigmoidAttnBlock.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            nh (int): Number of heads.
            ec (int): Embedding channels.
            gc (int): Guide channels.
            scale (bool): Whether to use learnable scale parameter.
        """
        super().__init__()
        self.nh = nh
        self.hc = c2 // nh
        self.ec = Conv(c1, ec, k=1, act=False) if c1 != ec else None
        self.gl = nn.Linear(gc, ec)
        self.bias = nn.Parameter(torch.zeros(nh))
        self.proj_conv = Conv(c1, c2, k=3, s=1, act=False)
        self.scale = nn.Parameter(torch.ones(1, nh, 1, 1)) if scale else 1.0

    def forward(self, x, guide):
        """
        Forward pass of MaxSigmoidAttnBlock.

        Args:
            x (torch.Tensor): Input tensor.
            guide (torch.Tensor): Guide tensor.

        Returns:
            (torch.Tensor): Output tensor after attention.
        """
        bs, _, h, w = x.shape

        guide = self.gl(guide)
        guide = guide.view(bs, guide.shape[1], self.nh, self.hc)
        embed = self.ec(x) if self.ec is not None else x
        embed = embed.view(bs, self.nh, self.hc, h, w)

        aw = torch.einsum("bmchw,bnmc->bmhwn", embed, guide)
        aw = aw.max(dim=-1)[0]
        aw = aw / (self.hc**0.5)
        aw = aw + self.bias[None, :, None, None]
        aw = aw.sigmoid() * self.scale

        x = self.proj_conv(x)
        x = x.view(bs, self.nh, -1, h, w)
        x = x * aw.unsqueeze(2)
        return x.view(bs, -1, h, w)
class C2fAttn(nn.Module):
    """C2f module with an additional attn module."""

    def __init__(self, c1, c2, n=1, ec=128, nh=1, gc=512, shortcut=False, g=1, e=0.5):
        """
        Initialize C2f module with attention mechanism.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            ec (int): Embedding channels for attention.
            nh (int): Number of heads for attention.
            gc (int): Guide channels for attention.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        """
        super().__init__()
        self.c = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
        self.cv2 = Conv((3 + n) * self.c, c2, 1)
        self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))
        self.attn = MaxSigmoidAttnBlock(self.c, self.c, gc=gc, ec=ec, nh=nh)

    def forward(self, x, guide):
        """
        Forward pass through C2f layer with attention.

        Args:
            x (torch.Tensor): Input tensor.
            guide (torch.Tensor): Guide tensor for attention.

        Returns:
            (torch.Tensor): Output tensor after processing.
        """
        y = list(self.cv1(x).chunk(2, 1))
        y.extend(m(y[-1]) for m in self.m)
        y.append(self.attn(y[-1], guide))
        return self.cv2(torch.cat(y, 1))

    def forward_split(self, x, guide):
        """
        Forward pass using split() instead of chunk().

        Args:
            x (torch.Tensor): Input tensor.
            guide (torch.Tensor): Guide tensor for attention.

        Returns:
            (torch.Tensor): Output tensor after processing.
        """
        y = list(self.cv1(x).split((self.c, self.c), 1))
        y.extend(m(y[-1]) for m in self.m)
        y.append(self.attn(y[-1], guide))
        return self.cv2(torch.cat(y, 1))
class ImagePoolingAttn(nn.Module):
    """ImagePoolingAttn: Enhance the text embeddings with image-aware information."""

    def __init__(self, ec=256, ch=(), ct=512, nh=8, k=3, scale=False):
        """
        Initialize ImagePoolingAttn module.

        Args:
            ec (int): Embedding channels.
            ch (tuple): Channel dimensions for feature maps.
            ct (int): Channel dimension for text embeddings.
            nh (int): Number of attention heads.
            k (int): Kernel size for pooling.
            scale (bool): Whether to use learnable scale parameter.
        """
        super().__init__()

        nf = len(ch)
        self.query = nn.Sequential(nn.LayerNorm(ct), nn.Linear(ct, ec))
        self.key = nn.Sequential(nn.LayerNorm(ec), nn.Linear(ec, ec))
        self.value = nn.Sequential(nn.LayerNorm(ec), nn.Linear(ec, ec))
        self.proj = nn.Linear(ec, ct)
        self.scale = nn.Parameter(torch.tensor([0.0]), requires_grad=True) if scale else 1.0
        self.projections = nn.ModuleList([nn.Conv2d(in_channels, ec, kernel_size=1) for in_channels in ch])
        self.im_pools = nn.ModuleList([nn.AdaptiveMaxPool2d((k, k)) for _ in range(nf)])
        self.ec = ec
        self.nh = nh
        self.nf = nf
        self.hc = ec // nh
        self.k = k

    def forward(self, x, text):
        """
        Forward pass of ImagePoolingAttn.

        Args:
            x (List[torch.Tensor]): List of input feature maps.
            text (torch.Tensor): Text embeddings.

        Returns:
            (torch.Tensor): Enhanced text embeddings.
        """
        bs = x[0].shape[0]
        assert len(x) == self.nf
        num_patches = self.k**2
        x = [pool(proj(x)).view(bs, -1, num_patches) for (x, proj, pool) in zip(x, self.projections, self.im_pools)]
        x = torch.cat(x, dim=-1).transpose(1, 2)
        q = self.query(text)
        k = self.key(x)
        v = self.value(x)

        q = q.reshape(bs, -1, self.nh, self.hc)
        k = k.reshape(bs, -1, self.nh, self.hc)
        v = v.reshape(bs, -1, self.nh, self.hc)

        aw = torch.einsum("bnmc,bkmc->bmnk", q, k)
        aw = aw / (self.hc**0.5)
        aw = F.softmax(aw, dim=-1)

        x = torch.einsum("bmnk,bkmc->bnmc", aw, v)
        x = self.proj(x.reshape(bs, -1, self.ec))
        return x * self.scale + text


class ContrastiveHead(nn.Module):
    """Implements contrastive learning head for region-text similarity in vision-language models."""

    def __init__(self):
        """Initialize ContrastiveHead with region-text similarity parameters."""
        super().__init__()
        # NOTE: use -10.0 to keep the init cls loss consistency with other losses
        self.bias = nn.Parameter(torch.tensor([-10.0]))
        self.logit_scale = nn.Parameter(torch.ones([]) * torch.tensor(1 / 0.07).log())

    def forward(self, x, w):
        """
        Forward function of contrastive learning.

        Args:
            x (torch.Tensor): Image features.
            w (torch.Tensor): Text features.

        Returns:
            (torch.Tensor): Similarity scores.
        """
        x = F.normalize(x, dim=1, p=2)
        w = F.normalize(w, dim=-1, p=2)
        x = torch.einsum("bchw,bkc->bkhw", x, w)
        return x * self.logit_scale.exp() + self.bias
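
# Illustrative sketch (assumed sizes, not from the upstream file): region
# embeddings (B, C, H, W) are scored against K text embeddings (B, K, C); the
# learnable logit_scale starts at log(1/0.07), the CLIP temperature.
#   >>> head = ContrastiveHead()
#   >>> head(torch.randn(2, 256, 20, 20), torch.randn(2, 5, 256)).shape
#   torch.Size([2, 5, 20, 20])
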
class BNContrastiveHead(nn.Module):
    """
    Batch Norm Contrastive Head using batch norm instead of l2-normalization.

    Args:
        embed_dims (int): Embed dimensions of text and image features.
    """

    def __init__(self, embed_dims: int):
        """
        Initialize BNContrastiveHead.

        Args:
            embed_dims (int): Embedding dimensions for features.
        """
        super().__init__()
        self.norm = nn.BatchNorm2d(embed_dims)
        # NOTE: use -10.0 to keep the init cls loss consistency with other losses
        self.bias = nn.Parameter(torch.tensor([-10.0]))
        self.logit_scale = nn.Parameter(-1.0 * torch.ones([]))

    def fuse(self):
        """Fuse the batch normalization layer in the BNContrastiveHead module."""
        del self.norm
        del self.bias
        del self.logit_scale
        self.forward = self.forward_fuse

    def forward_fuse(self, x, w):
        """
        Passes input out unchanged.

        TODO: Update or remove?
        """
        return x

    def forward(self, x, w):
        """
        Forward function of contrastive learning with batch normalization.

        Args:
            x (torch.Tensor): Image features.
            w (torch.Tensor): Text features.

        Returns:
            (torch.Tensor): Similarity scores.
        """
        x = self.norm(x)
        w = F.normalize(w, dim=-1, p=2)
        x = torch.einsum("bchw,bkc->bkhw", x, w)
        return x * self.logit_scale.exp() + self.bias
class RepBottleneck(Bottleneck):
    """Rep bottleneck."""

    def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
        """
        Initialize RepBottleneck.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            shortcut (bool): Whether to use shortcut connection.
            g (int): Groups for convolutions.
            k (Tuple[int, int]): Kernel sizes for convolutions.
            e (float): Expansion ratio.
        """
        super().__init__(c1, c2, shortcut, g, k, e)
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = RepConv(c1, c_, k[0], 1)
class RepCSP(C3):
    """Repeatable Cross Stage Partial Network (RepCSP) module for efficient feature extraction."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """
        Initialize RepCSP layer.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of RepBottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        """
        super().__init__(c1, c2, n, shortcut, g, e)
        c_ = int(c2 * e)  # hidden channels
        self.m = nn.Sequential(*(RepBottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
class RepNCSPELAN4(nn.Module):
    """CSP-ELAN."""

    def __init__(self, c1, c2, c3, c4, n=1):
        """
        Initialize CSP-ELAN layer.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            c3 (int): Intermediate channels.
            c4 (int): Intermediate channels for RepCSP.
            n (int): Number of RepCSP blocks.
        """
        super().__init__()
        self.c = c3 // 2
        self.cv1 = Conv(c1, c3, 1, 1)
        self.cv2 = nn.Sequential(RepCSP(c3 // 2, c4, n), Conv(c4, c4, 3, 1))
        self.cv3 = nn.Sequential(RepCSP(c4, c4, n), Conv(c4, c4, 3, 1))
        self.cv4 = Conv(c3 + (2 * c4), c2, 1, 1)

    def forward(self, x):
        """Forward pass through RepNCSPELAN4 layer."""
        y = list(self.cv1(x).chunk(2, 1))
        y.extend((m(y[-1])) for m in [self.cv2, self.cv3])
        return self.cv4(torch.cat(y, 1))

    def forward_split(self, x):
        """Forward pass using split() instead of chunk()."""
        y = list(self.cv1(x).split((self.c, self.c), 1))
        y.extend(m(y[-1]) for m in [self.cv2, self.cv3])
        return self.cv4(torch.cat(y, 1))
class ELAN1(RepNCSPELAN4):
    """ELAN1 module with 4 convolutions."""

    def __init__(self, c1, c2, c3, c4):
        """
        Initialize ELAN1 layer.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            c3 (int): Intermediate channels.
            c4 (int): Intermediate channels for convolutions.
        """
        super().__init__(c1, c2, c3, c4)
        self.c = c3 // 2
        self.cv1 = Conv(c1, c3, 1, 1)
        self.cv2 = Conv(c3 // 2, c4, 3, 1)
        self.cv3 = Conv(c4, c4, 3, 1)
        self.cv4 = Conv(c3 + (2 * c4), c2, 1, 1)
class AConv(nn.Module):
    """AConv."""

    def __init__(self, c1, c2):
        """
        Initialize AConv module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
        """
        super().__init__()
        self.cv1 = Conv(c1, c2, 3, 2, 1)

    def forward(self, x):
        """Forward pass through AConv layer."""
        x = torch.nn.functional.avg_pool2d(x, 2, 1, 0, False, True)
        return self.cv1(x)
class ADown(nn.Module):
    """ADown."""

    def __init__(self, c1, c2):
        """
        Initialize ADown module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
        """
        super().__init__()
        self.c = c2 // 2
        self.cv1 = Conv(c1 // 2, self.c, 3, 2, 1)
        self.cv2 = Conv(c1 // 2, self.c, 1, 1, 0)

    def forward(self, x):
        """Forward pass through ADown layer."""
        x = torch.nn.functional.avg_pool2d(x, 2, 1, 0, False, True)
        x1, x2 = x.chunk(2, 1)
        x1 = self.cv1(x1)
        x2 = torch.nn.functional.max_pool2d(x2, 3, 2, 1)
        x2 = self.cv2(x2)
        return torch.cat((x1, x2), 1)
class SPPELAN(nn.Module):
    """SPP-ELAN."""

    def __init__(self, c1, c2, c3, k=5):
        """
        Initialize SPP-ELAN block.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            c3 (int): Intermediate channels.
            k (int): Kernel size for max pooling.
        """
        super().__init__()
        self.c = c3
        self.cv1 = Conv(c1, c3, 1, 1)
        self.cv2 = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
        self.cv3 = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
        self.cv4 = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
        self.cv5 = Conv(4 * c3, c2, 1, 1)

    def forward(self, x):
        """Forward pass through SPPELAN layer."""
        y = [self.cv1(x)]
        y.extend(m(y[-1]) for m in [self.cv2, self.cv3, self.cv4])
        return self.cv5(torch.cat(y, 1))
class CBLinear(nn.Module):
    """CBLinear."""

    def __init__(self, c1, c2s, k=1, s=1, p=None, g=1):
        """
        Initialize CBLinear module.

        Args:
            c1 (int): Input channels.
            c2s (List[int]): List of output channel sizes.
            k (int): Kernel size.
            s (int): Stride.
            p (int | None): Padding.
            g (int): Groups.
        """
        super().__init__()
        self.c2s = c2s
        self.conv = nn.Conv2d(c1, sum(c2s), k, s, autopad(k, p), groups=g, bias=True)

    def forward(self, x):
        """Forward pass through CBLinear layer."""
        return self.conv(x).split(self.c2s, dim=1)
class CBFuse(nn.Module):
    """CBFuse."""

    def __init__(self, idx):
        """
        Initialize CBFuse module.

        Args:
            idx (List[int]): Indices for feature selection.
        """
        super().__init__()
        self.idx = idx

    def forward(self, xs):
        """
        Forward pass through CBFuse layer.

        Args:
            xs (List[torch.Tensor]): List of input tensors.

        Returns:
            (torch.Tensor): Fused output tensor.
        """
        target_size = xs[-1].shape[2:]
        res = [F.interpolate(x[self.idx[i]], size=target_size, mode="nearest") for i, x in enumerate(xs[:-1])]
        return torch.sum(torch.stack(res + xs[-1:]), dim=0)
class C3f(nn.Module):
    """Faster Implementation of CSP Bottleneck with 2 convolutions."""

    def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
        """
        Initialize CSP bottleneck layer with two convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.cv3 = Conv((2 + n) * c_, c2, 1)
        self.m = nn.ModuleList(Bottleneck(c_, c_, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))

    def forward(self, x):
        """Forward pass through C3f layer."""
        y = [self.cv2(x), self.cv1(x)]
        y.extend(m(y[-1]) for m in self.m)
        return self.cv3(torch.cat(y, 1))
class C3k2(C2f):
    """Faster Implementation of CSP Bottleneck with 2 convolutions."""

    def __init__(self, c1, c2, n=1, c3k=False, e=0.5, g=1, shortcut=True):
        """
        Initialize C3k2 module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of blocks.
            c3k (bool): Whether to use C3k blocks.
            e (float): Expansion ratio.
            g (int): Groups for convolutions.
            shortcut (bool): Whether to use shortcut connections.
        """
        super().__init__(c1, c2, n, shortcut, g, e)
        self.m = nn.ModuleList(
            C3k(self.c, self.c, 2, shortcut, g) if c3k else Bottleneck(self.c, self.c, shortcut, g) for _ in range(n)
        )
class C3k(C3):
    """C3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, k=3):
        """
        Initialize C3k module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
            k (int): Kernel size.
        """
        super().__init__(c1, c2, n, shortcut, g, e)
        c_ = int(c2 * e)  # hidden channels
        self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=(k, k), e=1.0) for _ in range(n)))
class RepVGGDW(torch.nn.Module):
    """RepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture."""

    def __init__(self, ed) -> None:
        """
        Initialize RepVGGDW module.

        Args:
            ed (int): Input and output channels.
        """
        super().__init__()
        self.conv = Conv(ed, ed, 7, 1, 3, g=ed, act=False)
        self.conv1 = Conv(ed, ed, 3, 1, 1, g=ed, act=False)
        self.dim = ed
        self.act = nn.SiLU()

    def forward(self, x):
        """
        Perform a forward pass of the RepVGGDW block.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after applying the depth wise separable convolution.
        """
        return self.act(self.conv(x) + self.conv1(x))

    def forward_fuse(self, x):
        """
        Perform a forward pass of the RepVGGDW block without fusing the convolutions.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after applying the depth wise separable convolution.
        """
        return self.act(self.conv(x))

    @torch.no_grad()
    def fuse(self):
        """
        Fuse the convolutional layers in the RepVGGDW block.

        This method fuses the convolutional layers and updates the weights and biases accordingly.
        """
        conv = fuse_conv_and_bn(self.conv.conv, self.conv.bn)
        conv1 = fuse_conv_and_bn(self.conv1.conv, self.conv1.bn)

        conv_w = conv.weight
        conv_b = conv.bias
        conv1_w = conv1.weight
        conv1_b = conv1.bias

        conv1_w = torch.nn.functional.pad(conv1_w, [2, 2, 2, 2])

        final_conv_w = conv_w + conv1_w
        final_conv_b = conv_b + conv1_b

        conv.weight.data.copy_(final_conv_w)
        conv.bias.data.copy_(final_conv_b)

        self.conv = conv
        del self.conv1
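
# Illustrative re-parameterization check (assumed sizes, not from the upstream
# file): after fuse(), the single padded 7x7 depth-wise conv should reproduce
# the two-branch output up to numerical tolerance.
#   >>> m = RepVGGDW(32).eval()
#   >>> x = torch.randn(1, 32, 16, 16)
#   >>> y = m(x)
#   >>> m.fuse()
#   >>> torch.allclose(y, m.forward_fuse(x), atol=1e-5)
#   True
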
class CIB(nn.Module):
    """
    Conditional Identity Block (CIB) module.

    Args:
        c1 (int): Number of input channels.
        c2 (int): Number of output channels.
        shortcut (bool, optional): Whether to add a shortcut connection. Defaults to True.
        e (float, optional): Scaling factor for the hidden channels. Defaults to 0.5.
        lk (bool, optional): Whether to use RepVGGDW for the third convolutional layer. Defaults to False.
    """

    def __init__(self, c1, c2, shortcut=True, e=0.5, lk=False):
        """
        Initialize the CIB module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            shortcut (bool): Whether to use shortcut connection.
            e (float): Expansion ratio.
            lk (bool): Whether to use RepVGGDW.
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = nn.Sequential(
            Conv(c1, c1, 3, g=c1),
            Conv(c1, 2 * c_, 1),
            RepVGGDW(2 * c_) if lk else Conv(2 * c_, 2 * c_, 3, g=2 * c_),
            Conv(2 * c_, c2, 1),
            Conv(c2, c2, 3, g=c2),
        )
        self.add = shortcut and c1 == c2

    def forward(self, x):
        """
        Forward pass of the CIB module.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor.
        """
        return x + self.cv1(x) if self.add else self.cv1(x)
class C2fCIB(C2f):
    """
    C2fCIB class represents a convolutional block with C2f and CIB modules.

    Args:
        c1 (int): Number of input channels.
        c2 (int): Number of output channels.
        n (int, optional): Number of CIB modules to stack. Defaults to 1.
        shortcut (bool, optional): Whether to use shortcut connection. Defaults to False.
        lk (bool, optional): Whether to use local key connection. Defaults to False.
        g (int, optional): Number of groups for grouped convolution. Defaults to 1.
        e (float, optional): Expansion ratio for CIB modules. Defaults to 0.5.
    """

    def __init__(self, c1, c2, n=1, shortcut=False, lk=False, g=1, e=0.5):
        """
        Initialize C2fCIB module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of CIB modules.
            shortcut (bool): Whether to use shortcut connection.
            lk (bool): Whether to use local key connection.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        """
        super().__init__(c1, c2, n, shortcut, g, e)
        self.m = nn.ModuleList(CIB(self.c, self.c, shortcut, e=1.0, lk=lk) for _ in range(n))
class Attention(nn.Module):
    """
    Attention module that performs self-attention on the input tensor.

    Args:
        dim (int): The input tensor dimension.
        num_heads (int): The number of attention heads.
        attn_ratio (float): The ratio of the attention key dimension to the head dimension.

    Attributes:
        num_heads (int): The number of attention heads.
        head_dim (int): The dimension of each attention head.
        key_dim (int): The dimension of the attention key.
        scale (float): The scaling factor for the attention scores.
        qkv (Conv): Convolutional layer for computing the query, key, and value.
        proj (Conv): Convolutional layer for projecting the attended values.
        pe (Conv): Convolutional layer for positional encoding.
    """

    def __init__(self, dim, num_heads=8, attn_ratio=0.5):
        """
        Initialize multi-head attention module.

        Args:
            dim (int): Input dimension.
            num_heads (int): Number of attention heads.
            attn_ratio (float): Attention ratio for key dimension.
        """
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.key_dim = int(self.head_dim * attn_ratio)
        self.scale = self.key_dim**-0.5
        nh_kd = self.key_dim * num_heads
        h = dim + nh_kd * 2
        self.qkv = Conv(dim, h, 1, act=False)
        self.proj = Conv(dim, dim, 1, act=False)
        self.pe = Conv(dim, dim, 3, 1, g=dim, act=False)

    def forward(self, x):
        """
        Forward pass of the Attention module.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            (torch.Tensor): The output tensor after self-attention.
        """
        B, C, H, W = x.shape
        N = H * W
        qkv = self.qkv(x)
        q, k, v = qkv.view(B, self.num_heads, self.key_dim * 2 + self.head_dim, N).split(
            [self.key_dim, self.key_dim, self.head_dim], dim=2
        )

        attn = (q.transpose(-2, -1) @ k) * self.scale
        attn = attn.softmax(dim=-1)
        x = (v @ attn.transpose(-2, -1)).view(B, C, H, W) + self.pe(v.reshape(B, C, H, W))
        x = self.proj(x)
        return x
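
# Illustrative shape walk-through (assumed dim=256 and 8 heads, not from the
# upstream file): qkv packs 2*key_dim + head_dim channels per head, so with
# attn_ratio=0.5 the query/key get 16 dims per head and the value keeps 32.
#   >>> attn = Attention(256, num_heads=8, attn_ratio=0.5)
#   >>> attn(torch.randn(1, 256, 20, 20)).shape
#   torch.Size([1, 256, 20, 20])
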
class PSABlock(nn.Module):
    """
    PSABlock class implementing a Position-Sensitive Attention block for neural networks.

    This class encapsulates the functionality for applying multi-head attention and feed-forward neural network layers
    with optional shortcut connections.

    Attributes:
        attn (Attention): Multi-head attention module.
        ffn (nn.Sequential): Feed-forward neural network module.
        add (bool): Flag indicating whether to add shortcut connections.

    Methods:
        forward: Performs a forward pass through the PSABlock, applying attention and feed-forward layers.

    Examples:
        Create a PSABlock and perform a forward pass
        >>> psablock = PSABlock(c=128, attn_ratio=0.5, num_heads=4, shortcut=True)
        >>> input_tensor = torch.randn(1, 128, 32, 32)
        >>> output_tensor = psablock(input_tensor)
    """

    def __init__(self, c, attn_ratio=0.5, num_heads=4, shortcut=True) -> None:
        """
        Initialize the PSABlock.

        Args:
            c (int): Input and output channels.
            attn_ratio (float): Attention ratio for key dimension.
            num_heads (int): Number of attention heads.
            shortcut (bool): Whether to use shortcut connections.
        """
        super().__init__()
        self.attn = Attention(c, attn_ratio=attn_ratio, num_heads=num_heads)
        self.ffn = nn.Sequential(Conv(c, c * 2, 1), Conv(c * 2, c, 1, act=False))
        self.add = shortcut

    def forward(self, x):
        """
        Execute a forward pass through PSABlock.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after attention and feed-forward processing.
        """
        x = x + self.attn(x) if self.add else self.attn(x)
        x = x + self.ffn(x) if self.add else self.ffn(x)
        return x
class PSA(nn.Module):
    """
    PSA class for implementing Position-Sensitive Attention in neural networks.

    This class encapsulates the functionality for applying position-sensitive attention and feed-forward networks to
    input tensors, enhancing feature extraction and processing capabilities.

    Attributes:
        c (int): Number of hidden channels after applying the initial convolution.
        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
        attn (Attention): Attention module for position-sensitive attention.
        ffn (nn.Sequential): Feed-forward network for further processing.

    Methods:
        forward: Applies position-sensitive attention and feed-forward network to the input tensor.

    Examples:
        Create a PSA module and apply it to an input tensor
        >>> psa = PSA(c1=128, c2=128, e=0.5)
        >>> input_tensor = torch.randn(1, 128, 64, 64)
        >>> output_tensor = psa.forward(input_tensor)
    """

    def __init__(self, c1, c2, e=0.5):
        """
        Initialize PSA module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            e (float): Expansion ratio.
        """
        super().__init__()
        assert c1 == c2
        self.c = int(c1 * e)
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
        self.cv2 = Conv(2 * self.c, c1, 1)

        self.attn = Attention(self.c, attn_ratio=0.5, num_heads=self.c // 64)
        self.ffn = nn.Sequential(Conv(self.c, self.c * 2, 1), Conv(self.c * 2, self.c, 1, act=False))

    def forward(self, x):
        """
        Execute forward pass in PSA module.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after attention and feed-forward processing.
        """
        a, b = self.cv1(x).split((self.c, self.c), dim=1)
        b = b + self.attn(b)
        b = b + self.ffn(b)
        return self.cv2(torch.cat((a, b), 1))
class C2PSA(nn.Module):
    """
    C2PSA module with attention mechanism for enhanced feature extraction and processing.

    This module implements a convolutional block with attention mechanisms to enhance feature extraction and processing
    capabilities. It includes a series of PSABlock modules for self-attention and feed-forward operations.

    Attributes:
        c (int): Number of hidden channels.
        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
        m (nn.Sequential): Sequential container of PSABlock modules for attention and feed-forward operations.

    Methods:
        forward: Performs a forward pass through the C2PSA module, applying attention and feed-forward operations.

    Notes:
        This module essentially is the same as PSA module, but refactored to allow stacking more PSABlock modules.

    Examples:
        >>> c2psa = C2PSA(c1=256, c2=256, n=3, e=0.5)
        >>> input_tensor = torch.randn(1, 256, 64, 64)
        >>> output_tensor = c2psa(input_tensor)
    """

    def __init__(self, c1, c2, n=1, e=0.5):
        """
        Initialize C2PSA module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of PSABlock modules.
            e (float): Expansion ratio.
        """
        super().__init__()
        assert c1 == c2
        self.c = int(c1 * e)
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
        self.cv2 = Conv(2 * self.c, c1, 1)

        self.m = nn.Sequential(*(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n)))

    def forward(self, x):
        """
        Process the input tensor through a series of PSA blocks.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after processing.
        """
        a, b = self.cv1(x).split((self.c, self.c), dim=1)
        b = self.m(b)
        return self.cv2(torch.cat((a, b), 1))
class C2fPSA(C2f):
    """
    C2fPSA module with enhanced feature extraction using PSA blocks.

    This class extends the C2f module by incorporating PSA blocks for improved attention mechanisms and feature extraction.

    Attributes:
        c (int): Number of hidden channels.
        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
        m (nn.ModuleList): List of PSA blocks for feature extraction.

    Methods:
        forward: Performs a forward pass through the C2fPSA module.
        forward_split: Performs a forward pass using split() instead of chunk().

    Examples:
        >>> import torch
        >>> from ultralytics.models.common import C2fPSA
        >>> model = C2fPSA(c1=64, c2=64, n=3, e=0.5)
        >>> x = torch.randn(1, 64, 128, 128)
        >>> output = model(x)
        >>> print(output.shape)
    """

    def __init__(self, c1, c2, n=1, e=0.5):
        """
        Initialize C2fPSA module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of PSABlock modules.
            e (float): Expansion ratio.
        """
        assert c1 == c2
        super().__init__(c1, c2, n=n, e=e)
        self.m = nn.ModuleList(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n))
    SCDown module for downsampling with separable convolutions.

    This module performs downsampling using a combination of pointwise and depthwise convolutions, which helps in
    efficiently reducing the spatial dimensions of the input tensor while maintaining the channel information.

    Attributes:
        cv1 (Conv): Pointwise convolution layer that reduces the number of channels.
        cv2 (Conv): Depthwise convolution layer that performs spatial downsampling.

    Methods:
        forward: Applies the SCDown module to the input tensor.

    Examples:
        >>> import torch
        >>> from ultralytics import SCDown
        >>> model = SCDown(c1=64, c2=128, k=3, s=2)
        >>> x = torch.randn(1, 64, 128, 128)
        >>> y = model(x)
        >>> print(y.shape)
        torch.Size([1, 128, 64, 64])
    """

    def __init__(self, c1, c2, k, s):
        """
        Initialize SCDown module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (int): Kernel size.
            s (int): Stride.
        r   F)r_   r   r   rk   N)r7   r8   r   r`   rc   )rE   rD   rf   r_   r   rG   rI   rJ   r8     s   

zSCDown.__init__c                 C   rN  )z
        Apply convolution and downsampling to the input tensor.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Downsampled output tensor.
        """
        return self.cv2(self.cv1(x))
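

# A rough sketch (the helper name and the 64->128 sizes are illustrative, not
# part of the module's API): SCDown swaps a dense strided 3x3 convolution for a
# 1x1 pointwise step plus a depthwise strided 3x3, shrinking the weight count
# from roughly c1*c2*9 to about c1*c2 + c2*9.
def _scdown_param_count():
    dense = nn.Conv2d(64, 128, 3, 2, 1)
    separable = SCDown(64, 128, k=3, s=2)
    print(sum(p.numel() for p in dense.parameters()))  # 73856
    print(sum(p.numel() for p in separable.parameters()))  # 9856 incl. BatchNorm, ~7x fewer
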

class TorchVision(nn.Module):
    """
    TorchVision module to allow loading any torchvision model.

    This class provides a way to load a model from the torchvision library, optionally load pre-trained weights, and customize the model by truncating or unwrapping layers.

    Attributes:
        m (nn.Module): The loaded torchvision model, possibly truncated and unwrapped.

    Args:
        model (str): Name of the torchvision model to load.
        weights (str, optional): Pre-trained weights to load. Default is "DEFAULT".
        unwrap (bool, optional): If True, unwraps the model to a sequential containing all but the last `truncate` layers. Default is True.
        truncate (int, optional): Number of layers to truncate from the end if `unwrap` is True. Default is 2.
        split (bool, optional): Returns output from intermediate child modules as list. Default is False.
    """

    def __init__(self, model, weights="DEFAULT", unwrap=True, truncate=2, split=False):
        """
        Load the model and weights from torchvision.

        Args:
            model (str): Name of the torchvision model to load.
            weights (str): Pre-trained weights to load.
            unwrap (bool): Whether to unwrap the model.
            truncate (int): Number of layers to truncate.
            split (bool): Whether to split the output.
        r   N	get_model)weights)
pretrainedr   F)torchvisionr7   r8   hasattrmodelsrs  r   __dict__boolr   children
isinstancer9   r   r   r   headheads)rE   modelrt  unwraptruncater   rv  layersrG   rI   rJ   r8   G  s   
 
zTorchVision.__init__c                    s8   | j r|g   fdd| jD   S | |  S )z
        Forward pass through the model.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor | List[torch.Tensor]): Output tensor or list of tensors.
        c                 3   r   r   rI   r   r   rI   rJ   r   o  r   z&TorchVision.forward.<locals>.<genexpr>)r   r   r   rg   rI   r   rJ   rS   c  s   

zTorchVision.forward)rr  TrL   FrT   rI   rI   rG   rJ   r1   6  s    r1   c                       r2   )AAttna  

class AAttn(nn.Module):
    """
    Area-attention module for YOLO models, providing efficient attention mechanisms.

    This module implements an area-based attention mechanism that processes input features in a spatially-aware manner,
    making it particularly effective for object detection tasks.

    Attributes:
        area (int): Number of areas the feature map is divided into.
        num_heads (int): Number of heads into which the attention mechanism is divided.
        head_dim (int): Dimension of each attention head.
        qkv (Conv): Convolution layer for computing query, key and value tensors.
        proj (Conv): Projection convolution layer.
        pe (Conv): Position encoding convolution layer.

    Methods:
        forward: Applies area-attention to input tensor.

    Examples:
        >>> attn = AAttn(dim=256, num_heads=8, area=4)
        >>> x = torch.randn(1, 256, 32, 32)
        >>> output = attn(x)
        >>> print(output.shape)
        torch.Size([1, 256, 32, 32])
    r   c              	      sr   t    || _|| _||  | _}|| j }t||d ddd| _t||ddd| _t||ddd|dd| _dS )a5  
        Initialize an Area-attention module for YOLO models.

        Args:
            dim (int): Number of hidden channels.
            num_heads (int): Number of heads into which the attention mechanism is divided.
            area (int): Number of areas the feature map is divided into; default is 1.
        """
        super().__init__()
        self.area = area
        self.num_heads = num_heads
        self.head_dim = head_dim = dim // num_heads
        all_head_dim = head_dim * self.num_heads

        self.qkv = Conv(dim, all_head_dim * 3, 1, act=False)
        self.proj = Conv(all_head_dim, dim, 1, act=False)
        self.pe = Conv(all_head_dim, dim, 7, 1, 3, g=dim, act=False)

    def forward(self, x):
        """
        Process the input tensor through the area-attention.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after area-attention.
        rL   r   r]   r   rz   rd  r   r\  )rM   r`  flattenrN   r  r  rA   r]  r^  permuter   rO   
contiguousra  r  )rE   rF   re  rf  rg  rh  ri  r`  rQ   r  r_   r  r  rI   rI   rJ   rS     s0   


  
zAAttn.forwardr   rT   rI   rI   rG   rJ   r  u  s    r  c                       s2   e Zd ZdZd
 fdd	Zdd Zdd	 Z  ZS )ABlocka  
    Area-attention block module for efficient feature extraction in YOLO models.

    This module implements an area-attention mechanism combined with a feed-forward network for processing feature maps.
    It uses a novel area-based attention approach that is more efficient than traditional self-attention while
    maintaining effectiveness.

    Attributes:
        attn (AAttn): Area-attention module for processing spatial features.
        mlp (nn.Sequential): Multi-layer perceptron for feature transformation.

    Methods:
        _init_weights: Initializes module weights using truncated normal distribution.
        forward: Applies area-attention and feed-forward processing to input tensor.

    Examples:
        >>> block = ABlock(dim=256, num_heads=8, mlp_ratio=1.2, area=1)
        >>> x = torch.randn(1, 256, 32, 32)
        >>> output = block(x)
        >>> print(output.shape)
        torch.Size([1, 256, 32, 32])
    """

    def __init__(self, dim, num_heads, mlp_ratio=1.2, area=1):
        """
        Initialize an Area-attention block module.

        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of heads into which the attention mechanism is divided.
            mlp_ratio (float): Expansion ratio for MLP hidden dimension.
            area (int): Number of areas the feature map is divided into.
        """
        super().__init__()
        self.attn = AAttn(dim, num_heads=num_heads, area=area)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = nn.Sequential(Conv(dim, mlp_hidden_dim, 1), Conv(mlp_hidden_dim, dim, 1, act=False))

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """
        Initialize weights using a truncated normal distribution.

        Args:
            m (nn.Module): Module to initialize.
        g{Gz?)stdNr   )r|  r9   r:   inittrunc_normal_rB   r5   	constant_rE   r   rI   rI   rJ   r    s   
zABlock._init_weightsc                 C   s   ||  | }|| | S )z
        Forward pass through ABlock.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after area-attention and feed-forward processing.
        """
        x = x + self.attn(x)
        return x + self.mlp(x)


class A2C2f(nn.Module):
    """
    Area-Attention C2f module for enhanced feature extraction with area-based attention mechanisms.

    This module extends the C2f architecture by incorporating area-attention and ABlock layers for improved feature
    processing. It supports both area-attention and standard convolution modes.

    Attributes:
        cv1 (Conv): Initial 1x1 convolution layer that reduces input channels to hidden channels.
        cv2 (Conv): Final 1x1 convolution layer that processes concatenated features.
        gamma (nn.Parameter | None): Learnable parameter for residual scaling when using area attention.
        m (nn.ModuleList): List of either ABlock or C3k modules for feature processing.

    Methods:
        forward: Processes input through area-attention or standard convolution pathway.

    Examples:
        >>> m = A2C2f(512, 512, n=1, a2=True, area=1)
        >>> x = torch.randn(1, 512, 32, 32)
        >>> output = m(x)
        >>> print(output.shape)
        torch.Size([1, 512, 32, 32])
    r   TF       @r   c                    s   t    t|| d dksJ dt|dd| _td|  |d| _ r8|r8tjdt	| ddnd| _
t fd	d
t|D | _dS )a  
        Initialize Area-Attention C2f module.

        Args:
            c1 (int): Number of input channels.
            c2 (int): Number of output channels.
            n (int): Number of ABlock or C3k modules to stack.
            a2 (bool): Whether to use area attention blocks. If False, uses C3k blocks instead.
            area (int): Number of areas the feature map is divided.
            residual (bool): Whether to use residual connections with learnable gamma parameter.
            mlp_ratio (float): Expansion ratio for MLP hidden dimension.
            e (float): Channel expansion ratio for hidden channels.
            g (int): Number of groups for grouped convolutions.
            shortcut (bool): Whether to use shortcut connections in C3k blocks.
        r\   r   z(Dimension of ABlock be a multiple of 32.r   g{Gz?Tr	  Nc                 3   sD    | ]} rt jfd dtdD  ntdV  qdS )c                 3   s"    | ]}t d   V  qdS )r\   N)r  r   )r  re   r  rI   rJ   r   @  r   z+A2C2f.__init__.<locals>.<genexpr>.<genexpr>rL   N)r9   r   r   rE  r   a2r  re   r   r  r   rI   rJ   r   ?  s    "
z!A2C2f.__init__.<locals>.<genexpr>)r7   r8   r   r   r`   rc   r9   r@   r=   r   gammar   r   r   )rE   rD   rf   r   r  r  residualr  r   r   r   rG   r  rJ   r8   '  s   
&zA2C2f.__init__c                    sf   |  |g   fdd| jD  | t d | jdur1|| jdt| jdd   S  S )z
        Forward pass through A2C2f layer.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after processing.
        c                 3   r   r   rI   r   r   rI   rJ   r   Q  r   z A2C2f.forward.<locals>.<genexpr>r   Nr   )	r`   r   r   rc   r=   r~   r  rA   r   rg   rI   r   rJ   rS   F  s   

 zA2C2f.forward)r   Tr   Fr  r   r   TrT   rI   rI   rG   rJ   r    s    r  c                       s,   e Zd ZdZd	d
 fddZdd Z  ZS )	SwiGLUFFNz@SwiGLU Feed-Forward Network for transformer-based architectures.rK   rJ  Nc                    s6   t    t||| | _t|| d || _dS )zSInitialize SwiGLU FFN with input dimension, output dimension, and expansion factor.rL   N)r7   r8   r9   r   w12w3)rE   r   r   r   rG   rI   rJ   r8   [  s   
zSwiGLUFFN.__init__c                 C   s4   |  |}|jddd\}}t|| }| |S )z.Apply SwiGLU transformation to input features.rL   r   rz   )r  r   r|   silur  )rE   rF   x12r   r   hiddenrI   rI   rJ   rS   a  s   

zSwiGLUFFN.forward)rK   rW  rT   rI   rI   rG   rJ   r  X  s    r  c                       s*   e Zd ZdZd fddZdd Z  ZS )	Residualz7Residual connection wrapper for neural network modules.rJ  Nc                    s8   t    || _tj| jjj tj| jjj dS )z3Initialize residual module with the wrapped module.N)	r7   r8   r   r9   r  zeros_r  r5   rB   r  rG   rI   rJ   r8   l  s   
zResidual.__init__c                 C   s   ||  | S )z,Apply residual connection to input features.r   rg   rI   rI   rJ   rS   u  s   zResidual.forwardrW  rT   rI   rI   rG   rJ   r  i  s    	r  c                       ri   )SAVPEzESpatial-Aware Visual Prompt Embedding module for feature enhancement.c              	      s   t    t fddt|D | _t fddt|D | _d| _td  |d| _	tjd  | jddd| _
tjd| jddd| _ttd| j | jdtj| j| jddd| _d	S )
zVInitialize SAVPE module with channels, intermediate channels, and embedding dimension.c                 3   sN    | ]"\}}t t| d t  d |dv rt j|d dnt  V  qdS )r]      r   rL   rL   scale_factorNr9   r   r   Upsampler   r=  r   rI   rJ   r     s    2
z!SAVPE.__init__.<locals>.<genexpr>c                 3   sD    | ]\}}t t| d |dv rt j|d dnt  V  qdS )r   r  rL   r  Nr  r=  r  rI   rJ   r     s
    .
r3   r]   r   )rn   rL   N)r7   r8   r9   r   r@  r`   rc   r   r:   rd   r   r4  r   r   cv6)rE   r  r   r  rG   r  rJ   r8   }  s   


4zSAVPE.__init__c                    s   fddt |D } tj|dd} fddt |D } tj|dd}|j\}}}}|jd }|||d}||d j||	d|ddd||  j||}|||d|||| d||} 
tj| |fdd}||| jd}|||dd}|| t|t|jj  }	tj|	dtjd|	j}	|	dd	|| j| j ddd }
tj|
dd	||ddd
dS )zJProcess input features and visual prompts to generate enhanced embeddings.c                       g | ]\}} j | |qS rI   )rc   r   r   xir"  rI   rJ   r         z!SAVPE.forward.<locals>.<listcomp>r   rz   c                    r  rI   )r`   r  r"  rI   rJ   r     r  r   )r{   r6   rd  rL   r#  )r@  r   r=   r~   rd   rM   rA   r  r   expandr  r4  logical_notfinfor6   minr|   rO   r?   torN   r%  )rE   rF   vpr   re  rf  rg  rh  Qscore
aggregatedrI   r"  rJ   rS     s    
4" ,"zSAVPE.forwardrT   rI   rI   rG   rJ   r  z  s    r  )HrX   r=   torch.nnr9   torch.nn.functionalr1  r|   ultralytics.utils.torch_utilsr   r<   r   r   r   r   r   r	   transformerr
   __all__Moduler   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r   r   r   r   r   r-  r.  r!   r"   r$   r#   r%   r'   r&   rD  r(   rE  r+   r,   r-   r.   rj  r/   r*   r)   r0   r1   r  r  r  r  r  r  rI   rI   rI   rJ   <module>   sx    +&$#  6:A7"C0?5::(3?VDI