from functools import partial
from typing import Any, Callable, List, Optional, Tuple

import torch
import torch.nn.functional as F
from torch import nn, Tensor

from ...transforms._presets import VideoClassification
from ...utils import _log_api_usage_once
from .._api import register_model, Weights, WeightsEnum
from .._meta import _KINETICS400_CATEGORIES
from .._utils import _ovewrite_named_param, handle_legacy_interface
from ..swin_transformer import PatchMerging, SwinTransformerBlock

__all__ = [
    "SwinTransformer3d",
    "Swin3D_T_Weights",
    "Swin3D_S_Weights",
    "Swin3D_B_Weights",
    "swin3d_t",
    "swin3d_s",
    "swin3d_b",
]


def _get_window_and_shift_size(
    shift_size: List[int], size_dhw: List[int], window_size: List[int]
) -> Tuple[List[int], List[int]]:
    for i in range(3):
        if size_dhw[i] <= window_size[i]:
            # In this case, window_size adapts to the input size, and no shift is needed
            window_size[i] = size_dhw[i]
            shift_size[i] = 0

    return window_size, shift_size


torch.fx.wrap("_get_window_and_shift_size")


def _get_relative_position_bias(
    relative_position_bias_table: torch.Tensor, relative_position_index: torch.Tensor, window_size: List[int]
) -> Tensor:
    # In the 3d case, window_vol = window_size[0] * window_size[1] * window_size[2]
    window_vol = window_size[0] * window_size[1] * window_size[2]
    relative_position_bias = relative_position_bias_table[
        relative_position_index[:window_vol, :window_vol].flatten()  # type: ignore[index]
    ]
    relative_position_bias = relative_position_bias.view(window_vol, window_vol, -1)
    relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous().unsqueeze(0)
    return relative_position_bias


torch.fx.wrap("_get_relative_position_bias")


def _compute_pad_size_3d(size_dhw: Tuple[int, int, int], patch_size: Tuple[int, int, int]) -> Tuple[int, int, int]:
    pad_size = [(patch_size[i] - size_dhw[i] % patch_size[i]) % patch_size[i] for i in range(3)]
    return pad_size[0], pad_size[1], pad_size[2]


torch.fx.wrap("_compute_pad_size_3d")


def _compute_attention_mask_3d(
    x: Tensor,
    size_dhw: Tuple[int, int, int],
    window_size: Tuple[int, int, int],
    shift_size: Tuple[int, int, int],
) -> Tensor:
    # generate attention mask
    attn_mask = x.new_zeros(*size_dhw)
    num_windows = (size_dhw[0] // window_size[0]) * (size_dhw[1] // window_size[1]) * (size_dhw[2] // window_size[2])
    slices = [
        (
            (0, -window_size[i]),
            (-window_size[i], -shift_size[i]),
            (-shift_size[i], None),
        )
        for i in range(3)
    ]
    count = 0
    for d in slices[0]:
        for h in slices[1]:
            for w in slices[2]:
                attn_mask[d[0] : d[1], h[0] : h[1], w[0] : w[1]] = count
                count += 1

    # Partition window on attn_mask
    attn_mask = attn_mask.view(
        size_dhw[0] // window_size[0],
        window_size[0],
        size_dhw[1] // window_size[1],
        window_size[1],
        size_dhw[2] // window_size[2],
        window_size[2],
    )
    attn_mask = attn_mask.permute(0, 2, 4, 1, 3, 5).reshape(
        num_windows, window_size[0] * window_size[1] * window_size[2]
    )
    attn_mask = attn_mask.unsqueeze(1) - attn_mask.unsqueeze(2)
    attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
    return attn_mask


torch.fx.wrap("_compute_attention_mask_3d")


def shifted_window_attention_3d(
    input: Tensor,
    qkv_weight: Tensor,
    proj_weight: Tensor,
    relative_position_bias: Tensor,
    window_size: List[int],
    num_heads: int,
    shift_size: List[int],
    attention_dropout: float = 0.0,
    dropout: float = 0.0,
    qkv_bias: Optional[Tensor] = None,
    proj_bias: Optional[Tensor] = None,
    training: bool = True,
) -> Tensor:
    """
    Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both shifted and non-shifted windows.
    Args:
        input (Tensor[B, T, H, W, C]): The input tensor, 5-dimensions.
        qkv_weight (Tensor[in_dim, out_dim]): The weight tensor of query, key, value.
        proj_weight (Tensor[out_dim, out_dim]): The weight tensor of projection.
        relative_position_bias (Tensor): The learned relative position bias added to attention.
        window_size (List[int]): 3-dimensional window size (T, H, W).
        num_heads (int): Number of attention heads.
        shift_size (List[int]): Shift size for shifted window attention (T, H, W).
        attention_dropout (float): Dropout ratio of attention weight. Default: 0.0.
        dropout (float): Dropout ratio of output. Default: 0.0.
        qkv_bias (Tensor[out_dim], optional): The bias tensor of query, key, value. Default: None.
        proj_bias (Tensor[out_dim], optional): The bias tensor of projection. Default: None.
        training (bool, optional): Training flag used by the dropout parameters. Default: True.
    Returns:
        Tensor[B, T, H, W, C]: The output tensor after shifted window attention.
    """
    b, t, h, w, c = input.shape
    # pad feature maps to multiples of window size
    pad_size = _compute_pad_size_3d((t, h, w), (window_size[0], window_size[1], window_size[2]))
    x = F.pad(input, (0, 0, 0, pad_size[2], 0, pad_size[1], 0, pad_size[0]))
    _, tp, hp, wp, _ = x.shape
    padded_size = (tp, hp, wp)

    # cyclic shift
    if sum(shift_size) > 0:
        x = torch.roll(x, shifts=(-shift_size[0], -shift_size[1], -shift_size[2]), dims=(1, 2, 3))

    # partition windows
    num_windows = (
        (padded_size[0] // window_size[0]) * (padded_size[1] // window_size[1]) * (padded_size[2] // window_size[2])
    )
    x = x.view(
        b,
        padded_size[0] // window_size[0],
        window_size[0],
        padded_size[1] // window_size[1],
        window_size[1],
        padded_size[2] // window_size[2],
        window_size[2],
        c,
    )
    x = x.permute(0, 1, 3, 5, 2, 4, 6, 7).reshape(
        b * num_windows, window_size[0] * window_size[1] * window_size[2], c
    )  # B*nW, Wd*Wh*Ww, C

    # multi-head attention
    qkv = F.linear(x, qkv_weight, qkv_bias)
    qkv = qkv.reshape(x.size(0), x.size(1), 3, num_heads, c // num_heads).permute(2, 0, 3, 1, 4)
    q, k, v = qkv[0], qkv[1], qkv[2]
    q = q * (c // num_heads) ** -0.5
    attn = q.matmul(k.transpose(-2, -1))
    # add relative position bias
    attn = attn + relative_position_bias

    if sum(shift_size) > 0:
        # generate attention mask to handle shifted windows with varying size
        attn_mask = _compute_attention_mask_3d(
            x,
            (padded_size[0], padded_size[1], padded_size[2]),
            (window_size[0], window_size[1], window_size[2]),
            (shift_size[0], shift_size[1], shift_size[2]),
        )
        attn = attn.view(x.size(0) // num_windows, num_windows, num_heads, x.size(1), x.size(1))
        attn = attn + attn_mask.unsqueeze(1).unsqueeze(0)
        attn = attn.view(-1, num_heads, x.size(1), x.size(1))
    attn = F.softmax(attn, dim=-1)
    attn = F.dropout(attn, p=attention_dropout, training=training)

    x = attn.matmul(v).transpose(1, 2).reshape(x.size(0), x.size(1), c)
    x = F.linear(x, proj_weight, proj_bias)
    x = F.dropout(x, p=dropout, training=training)

    # reverse windows
    x = x.view(
        b,
        padded_size[0] // window_size[0],
        padded_size[1] // window_size[1],
        padded_size[2] // window_size[2],
        window_size[0],
        window_size[1],
        window_size[2],
        c,
    )
    x = x.permute(0, 1, 4, 2, 5, 3, 6, 7).reshape(b, tp, hp, wp, c)

    # reverse cyclic shift
    if sum(shift_size) > 0:
        x = torch.roll(x, shifts=(shift_size[0], shift_size[1], shift_size[2]), dims=(1, 2, 3))

    # unpad features
    x = x[:, :t, :h, :w, :].contiguous()
    return x


torch.fx.wrap("shifted_window_attention_3d")


class ShiftedWindowAttention3d(nn.Module):
    """
    See :func:`shifted_window_attention_3d`.
    """

    def __init__(
        self,
        dim: int,
        window_size: List[int],
        shift_size: List[int],
        num_heads: int,
        qkv_bias: bool = True,
        proj_bias: bool = True,
        attention_dropout: float = 0.0,
        dropout: float = 0.0,
    ) -> None:
        super().__init__()
        if len(window_size) != 3 or len(shift_size) != 3:
            raise ValueError("window_size and shift_size must be of length 3")

        self.window_size = window_size
        self.shift_size = shift_size
        self.num_heads = num_heads
        self.attention_dropout = attention_dropout
        self.dropout = dropout

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)

        self.define_relative_position_bias_table()
        self.define_relative_position_index()

    def define_relative_position_bias_table(self) -> None:
        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(
                (2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1) * (2 * self.window_size[2] - 1),
                self.num_heads,
            )
        )  # 2*Wd-1 * 2*Wh-1 * 2*Ww-1, nH
        nn.init.trunc_normal_(self.relative_position_bias_table, std=0.02)

    def define_relative_position_index(self) -> None:
        # get pair-wise relative position index for each token inside the window
        coords_dhw = [torch.arange(self.window_size[i]) for i in range(3)]
        coords = torch.stack(
            torch.meshgrid(coords_dhw[0], coords_dhw[1], coords_dhw[2], indexing="ij")
        )  # 3, Wd, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 3, Wd*Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 3, Wd*Wh*Ww, Wd*Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wd*Wh*Ww, Wd*Wh*Ww, 3
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 2] += self.window_size[2] - 1
        relative_coords[:, :, 0] *= (2 * self.window_size[1] - 1) * (2 * self.window_size[2] - 1)
        relative_coords[:, :, 1] *= 2 * self.window_size[2] - 1
        relative_position_index = relative_coords.sum(-1)  # Wd*Wh*Ww, Wd*Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

    def get_relative_position_bias(self, window_size: List[int]) -> torch.Tensor:
        return _get_relative_position_bias(
            self.relative_position_bias_table, self.relative_position_index, window_size  # type: ignore[arg-type]
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): Tensor with layout of [B, T, H, W, C]
        Returns:
            Tensor with same layout as input, i.e. [B, T, H, W, C]
        """
        _, t, h, w, _ = x.shape
        size_dhw = [t, h, w]
        window_size, shift_size = self.window_size.copy(), self.shift_size.copy()
        # Handle the case where the window size is larger than the input tensor
        window_size, shift_size = _get_window_and_shift_size(shift_size, size_dhw, window_size)

        relative_position_bias = self.get_relative_position_bias(window_size)

        return shifted_window_attention_3d(
            x,
            self.qkv.weight,
            self.proj.weight,
            relative_position_bias,
            window_size,
            self.num_heads,
            shift_size=shift_size,
            attention_dropout=self.attention_dropout,
            dropout=self.dropout,
            qkv_bias=self.qkv.bias,
            proj_bias=self.proj.bias,
            training=self.training,
        )


class PatchEmbed3d(nn.Module):
    """Video to Patch Embedding.

    Args:
        patch_size (List[int]): Patch token size.
        in_channels (int): Number of input channels. Default: 3
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    """

    def __init__(
        self,
        patch_size: List[int],
        in_channels: int = 3,
        embed_dim: int = 96,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        super().__init__()
        _log_api_usage_once(self)
        self.tuple_patch_size = (patch_size[0], patch_size[1], patch_size[2])

        self.proj = nn.Conv3d(
            in_channels,
            embed_dim,
            kernel_size=self.tuple_patch_size,
            stride=self.tuple_patch_size,
        )
        if norm_layer is not None:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        """Forward function."""
        # padding
        _, _, t, h, w = x.size()
        pad_size = _compute_pad_size_3d((t, h, w), self.tuple_patch_size)
        x = F.pad(x, (0, pad_size[2], 0, pad_size[1], 0, pad_size[0]))
        x = self.proj(x)  # B C T Wh Ww
        x = x.permute(0, 2, 3, 4, 1)  # B T Wh Ww C
        if self.norm is not None:
            x = self.norm(x)
        return x


class SwinTransformer3d(nn.Module):
    """
    Implements 3D Swin Transformer from the `"Video Swin Transformer" <https://arxiv.org/abs/2106.13230>`_ paper.
    Args:
        patch_size (List[int]): Patch size.
        embed_dim (int): Patch embedding dimension.
        depths (List[int]): Depth of each Swin Transformer layer.
        num_heads (List[int]): Number of attention heads in different layers.
        window_size (List[int]): Window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
        dropout (float): Dropout rate. Default: 0.0.
        attention_dropout (float): Attention dropout rate. Default: 0.0.
        stochastic_depth_prob (float): Stochastic depth rate. Default: 0.1.
        num_classes (int): Number of classes for classification head. Default: 400.
        norm_layer (nn.Module, optional): Normalization layer. Default: None.
        block (nn.Module, optional): SwinTransformer Block. Default: None.
        downsample_layer (nn.Module): Downsample layer (patch merging). Default: PatchMerging.
        patch_embed (nn.Module, optional): Patch Embedding layer. Default: None.
    """

    def __init__(
        self,
        patch_size: List[int],
        embed_dim: int,
        depths: List[int],
        num_heads: List[int],
        window_size: List[int],
        mlp_ratio: float = 4.0,
        dropout: float = 0.0,
        attention_dropout: float = 0.0,
        stochastic_depth_prob: float = 0.1,
        num_classes: int = 400,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        block: Optional[Callable[..., nn.Module]] = None,
        downsample_layer: Callable[..., nn.Module] = PatchMerging,
        patch_embed: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        super().__init__()
        _log_api_usage_once(self)
        self.num_classes = num_classes

        if block is None:
            block = partial(SwinTransformerBlock, attn_layer=ShiftedWindowAttention3d)

        if norm_layer is None:
            norm_layer = partial(nn.LayerNorm, eps=1e-5)

        if patch_embed is None:
            patch_embed = PatchEmbed3d

        # split the video into non-overlapping patches
        self.patch_embed = patch_embed(patch_size=patch_size, embed_dim=embed_dim, norm_layer=norm_layer)
        self.pos_drop = nn.Dropout(p=dropout)

        layers: List[nn.Module] = []
        total_stage_blocks = sum(depths)
        stage_block_id = 0
        # build SwinTransformer blocks
        for i_stage in range(len(depths)):
            stage: List[nn.Module] = []
            dim = embed_dim * 2**i_stage
            for i_layer in range(depths[i_stage]):
                # adjust stochastic depth probability based on the depth of the stage block
                sd_prob = stochastic_depth_prob * float(stage_block_id) / (total_stage_blocks - 1)
                stage.append(
                    block(
                        dim,
                        num_heads[i_stage],
                        window_size=window_size,
                        shift_size=[0 if i_layer % 2 == 0 else w // 2 for w in window_size],
                        mlp_ratio=mlp_ratio,
                        dropout=dropout,
                        attention_dropout=attention_dropout,
                        stochastic_depth_prob=sd_prob,
                        norm_layer=norm_layer,
                        attn_layer=ShiftedWindowAttention3d,
                    )
                )
                stage_block_id += 1
            layers.append(nn.Sequential(*stage))
            # add patch merging layer
            if i_stage < (len(depths) - 1):
                layers.append(downsample_layer(dim, norm_layer))
        self.features = nn.Sequential(*layers)

        self.num_features = embed_dim * 2 ** (len(depths) - 1)
        self.norm = norm_layer(self.num_features)
        self.avgpool = nn.AdaptiveAvgPool3d(1)
        self.head = nn.Linear(self.num_features, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.trunc_normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, x: Tensor) -> Tensor:
        # x: B C T H W
        x = self.patch_embed(x)  # B _T _H _W C
        x = self.pos_drop(x)
        x = self.features(x)  # B _T _H _W C
        x = self.norm(x)
        x = x.permute(0, 4, 1, 2, 3)  # B C _T _H _W
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.head(x)
        return x


def _swin_transformer3d(
    patch_size: List[int],
    embed_dim: int,
    depths: List[int],
    num_heads: List[int],
    window_size: List[int],
    stochastic_depth_prob: float,
    weights: Optional[WeightsEnum],
    progress: bool,
    **kwargs: Any,
) -> SwinTransformer3d:
    if weights is not None:
        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))

    model = SwinTransformer3d(
        patch_size=patch_size,
        embed_dim=embed_dim,
        depths=depths,
        num_heads=num_heads,
        window_size=window_size,
        stochastic_depth_prob=stochastic_depth_prob,
        **kwargs,
    )

    if weights is not None:
        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))

    return model


_COMMON_META = {
    "categories": _KINETICS400_CATEGORIES,
    "min_size": (1, 1),
    "min_temporal_size": 1,
}


class Swin3D_T_Weights(WeightsEnum):
    KINETICS400_V1 = Weights(
        url="https://download.pytorch.org/models/swin3d_t-7615ae03.pth",
        transforms=partial(
            VideoClassification,
            crop_size=(224, 224),
            resize_size=(256,),
            mean=(0.4850, 0.4560, 0.4060),
            std=(0.2290, 0.2240, 0.2250),
        ),
        meta={
            **_COMMON_META,
            "recipe": "https://github.com/SwinTransformer/Video-Swin-Transformer#kinetics-400",
            "_docs": (
                "The weights were ported from the paper. The accuracies are estimated on video-level "
                "with parameters `frame_rate=15`, `clips_per_video=12`, and `clip_len=32`"
            ),
            "num_params": 28158070,
            "_metrics": {
                "Kinetics-400": {
                    "acc@1": 77.715,
                    "acc@5": 93.519,
                }
            },
            "_ops": 43.882,
            "_file_size": 121.543,
        },
    )
    DEFAULT = KINETICS400_V1


class Swin3D_S_Weights(WeightsEnum):
    KINETICS400_V1 = Weights(
        url="https://download.pytorch.org/models/swin3d_s-da41c237.pth",
        transforms=partial(
            VideoClassification,
            crop_size=(224, 224),
            resize_size=(256,),
            mean=(0.4850, 0.4560, 0.4060),
            std=(0.2290, 0.2240, 0.2250),
        ),
        meta={
            **_COMMON_META,
            "recipe": "https://github.com/SwinTransformer/Video-Swin-Transformer#kinetics-400",
            "_docs": (
                "The weights were ported from the paper. The accuracies are estimated on video-level "
                "with parameters `frame_rate=15`, `clips_per_video=12`, and `clip_len=32`"
            ),
            "num_params": 49816678,
            "_metrics": {
                "Kinetics-400": {
                    "acc@1": 79.521,
                    "acc@5": 94.158,
                }
            },
            "_ops": 82.841,
            "_file_size": 218.288,
        },
    )
    DEFAULT = KINETICS400_V1


class Swin3D_B_Weights(WeightsEnum):
    KINETICS400_V1 = Weights(
        url="https://download.pytorch.org/models/swin3d_b_1k-24f7c7c6.pth",
        transforms=partial(
            VideoClassification,
            crop_size=(224, 224),
            resize_size=(256,),
            mean=(0.4850, 0.4560, 0.4060),
            std=(0.2290, 0.2240, 0.2250),
        ),
        meta={
            **_COMMON_META,
            "recipe": "https://github.com/SwinTransformer/Video-Swin-Transformer#kinetics-400",
            "_docs": (
                "The weights were ported from the paper. The accuracies are estimated on video-level "
                "with parameters `frame_rate=15`, `clips_per_video=12`, and `clip_len=32`"
            ),
            "num_params": 88048984,
            "_metrics": {
                "Kinetics-400": {
                    "acc@1": 79.427,
                    "acc@5": 94.386,
                }
            },
            "_ops": 140.667,
            "_file_size": 364.134,
        },
    )
    KINETICS400_IMAGENET22K_V1 = Weights(
        url="https://download.pytorch.org/models/swin3d_b_22k-7c6ae6fa.pth",
        transforms=partial(
            VideoClassification,
            crop_size=(224, 224),
            resize_size=(256,),
            mean=(0.4850, 0.4560, 0.4060),
            std=(0.2290, 0.2240, 0.2250),
        ),
        meta={
            **_COMMON_META,
            "recipe": "https://github.com/SwinTransformer/Video-Swin-Transformer#kinetics-400",
            "_docs": (
                "The weights were ported from the paper. The accuracies are estimated on video-level "
                "with parameters `frame_rate=15`, `clips_per_video=12`, and `clip_len=32`"
            ),
            "num_params": 88048984,
            "_metrics": {
                "Kinetics-400": {
                    "acc@1": 81.643,
                    "acc@5": 95.574,
                }
            },
            "_ops": 140.667,
            "_file_size": 364.134,
        },
    )
    DEFAULT = KINETICS400_V1


@register_model()
@handle_legacy_interface(weights=("pretrained", Swin3D_T_Weights.KINETICS400_V1))
def swin3d_t(*, weights: Optional[Swin3D_T_Weights] = None, progress: bool = True, **kwargs: Any) -> SwinTransformer3d:
    """
    Constructs a swin_tiny architecture from
    `Video Swin Transformer <https://arxiv.org/abs/2106.13230>`_.

    Args:
        weights (:class:`~torchvision.models.video.Swin3D_T_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.Swin3D_T_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.swin_transformer.SwinTransformer3d``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/swin_transformer.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.Swin3D_T_Weights
        :members:
    """
    weights = Swin3D_T_Weights.verify(weights)

    return _swin_transformer3d(
        patch_size=[2, 4, 4],
        embed_dim=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=[8, 7, 7],
        stochastic_depth_prob=0.1,
        weights=weights,
        progress=progress,
        **kwargs,
    )


@register_model()
@handle_legacy_interface(weights=("pretrained", Swin3D_S_Weights.KINETICS400_V1))
def swin3d_s(*, weights: Optional[Swin3D_S_Weights] = None, progress: bool = True, **kwargs: Any) -> SwinTransformer3d:
    """
    Constructs a swin_small architecture from
    `Video Swin Transformer <https://arxiv.org/abs/2106.13230>`_.

    Args:
        weights (:class:`~torchvision.models.video.Swin3D_S_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.Swin3D_S_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.swin_transformer.SwinTransformer3d``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/swin_transformer.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.Swin3D_S_Weights
        :members:
    """
    weights = Swin3D_S_Weights.verify(weights)

    return _swin_transformer3d(
        patch_size=[2, 4, 4],
        embed_dim=96,
        depths=[2, 2, 18, 2],
        num_heads=[3, 6, 12, 24],
        window_size=[8, 7, 7],
        stochastic_depth_prob=0.1,
        weights=weights,
        progress=progress,
        **kwargs,
    )


@register_model()
@handle_legacy_interface(weights=("pretrained", Swin3D_B_Weights.KINETICS400_V1))
def swin3d_b(*, weights: Optional[Swin3D_B_Weights] = None, progress: bool = True, **kwargs: Any) -> SwinTransformer3d:
    """
    Constructs a swin_base architecture from
    `Video Swin Transformer <https://arxiv.org/abs/2106.13230>`_.

    Args:
        weights (:class:`~torchvision.models.video.Swin3D_B_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.Swin3D_B_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.swin_transformer.SwinTransformer3d``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/swin_transformer.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.Swin3D_B_Weights
        :members:
    """
    weights = Swin3D_B_Weights.verify(weights)

    return _swin_transformer3d(
        patch_size=[2, 4, 4],
        embed_dim=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=[8, 7, 7],
        stochastic_depth_prob=0.1,
        weights=weights,
        progress=progress,
        **kwargs,
    )