o
    Vh                     @   s  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZmZ d dlZd dlZd dlmZ ddlmZmZ ddlmZ ddlmZ d	d
lmZmZmZ d	dlmZ d	dlmZm Z  g dZ!eG dd dZ"dee# de#fddZ$dej%de#de#deej%e#f fddZ&dej%de#de#de#dej%f
ddZ'ej()d ej()d G dd dej*Z+dej%de#dej%fd d!Z,d"ej%d#ej%d$ee#e#e#f d%ee#e#e#f d&ej%d'ej%d(ej%dej%fd)d*Z-dej%d+ej%d,e.fd-d.Z/ej()d* ej()d. G d/d0 d0ej*Z0G d1d2 d2ej*Z1G d3d4 d4ej*Z2G d5d6 d6ej*Z3d7e	e" d8e4d9e
e d:e.d;ede3fd<d=Z5G d>d? d?eZ6G d@dA dAeZ7e e dBe6j8fdCddDdEd9e
e6 d:e.d;ede3fdFdGZ9e e dBe7j8fdCddDdEd9e
e7 d:e.d;ede3fdHdIZ:dS )J    N)	dataclass)partial)AnyCallableDictListOptionalSequenceTuple   )MLPStochasticDepth)VideoClassification)_log_api_usage_once   )register_modelWeightsWeightsEnum)_KINETICS400_CATEGORIES)_ovewrite_named_paramhandle_legacy_interface)MViTMViT_V1_B_Weights	mvit_v1_bMViT_V2_S_Weights	mvit_v2_sc                   @   sV   e Zd ZU eed< eed< eed< ee ed< ee ed< ee ed< ee ed< dS )	MSBlockConfig	num_headsinput_channelsoutput_channelskernel_q	kernel_kvstride_q	stride_kvN)__name__
__module____qualname__int__annotations__r    r)   r)   Q/var/www/vscode/kcb/lib/python3.10/site-packages/torchvision/models/video/mvit.pyr      s   
 r   sreturnc                 C   s   d}| D ]}||9 }q|S N   r)   )r+   productvr)   r)   r*   _prod&   s   
r1   x
target_dim
expand_dimc                 C   sF   |   }||d kr| |} | |fS ||krtd| j | |fS )Nr.   zUnsupported input dimension )dim	unsqueeze
ValueErrorshaper2   r3   r4   
tensor_dimr)   r)   r*   
_unsqueeze-   s   
r;   r:   c                 C   s   ||d kr|  |} | S r-   )squeezer9   r)   r)   r*   _squeeze6   s   
r=   c                       s|   e Zd Z		ddejdeej deej deddf
 fdd	Zd
ej	de
eeef de
ej	e
eeef f fddZ  ZS )PoolNFpoolnorm
activationnorm_before_poolr,   c                    sV   t    || _g }|d ur|| |d ur|| |r#tj| nd | _|| _d S )N)super__init__r?   appendnn
Sequentialnorm_actrB   )selfr?   r@   rA   rB   layers	__class__r)   r*   rD   A   s   



zPool.__init__r2   thwc                 C   s   t |dd\}}tj|ddd\}}|dd}|jd d \}}}||| |f|  }| jr<| jd ur<| |}| 	|}|jdd  \}}	}
||||ddd}tj
||fdd}| jsm| jd urm| |}t|dd|}|||	|
ffS )	N   r.   r.   r   )indicesr5   r   r5   )r;   torchtensor_split	transposer8   reshape
contiguousrB   rH   r?   catr=   )rI   r2   rM   r:   class_tokenBNCTHWr)   r)   r*   forwardR   s   


zPool.forward)NF)r$   r%   r&   rF   Moduler   boolrD   rS   Tensorr
   r'   r`   __classcell__r)   r)   rK   r*   r>   @   s    >r>   	embeddingdc                 C   s@   | j d |kr	| S tjj| ddd|dddddS )Nr   r.   linear)sizemode)r8   rF   
functionalinterpolatepermuter6   r<   )re   rf   r)   r)   r*   _interpolatel   s   rm   attnqq_thwk_thw	rel_pos_h	rel_pos_w	rel_pos_tc           %      C   s6  |\}}}	|\}
}}t dt|| d }t dt|	| d }t dt||
 d }t|| d}t|| d}t|d d d f | t|d d d f d|  |  }t||	 d}t|	| d}t|	d d d f | t|d d d f d|  |  }t|
| d}t||
 d}t|d d d f | t|
d d d f d|
  |  }t||}t||}t||}||  }||  }||  }|j\}}}}|d d d d dd f |||||	|} td| |}!td| |}"| 	dddddd	||| | |	 |} t
| |dddd}#|#||||	||
	dddddd	}#|!d d d d d d d d d d d d d d f |"d d d d d d d d d d d d d d f  |#d d d d d d d d d d d d d d f  |||| |	 |
| | }$| d d d d dd dd f  |$7  < | S )
Nr   r.         ?zbythwc,hkc->bythwkzbythwc,wkc->bythwkr   r   rN      )r'   maxrS   arangerm   longr8   rV   einsumrl   matmulrU   view)%rn   ro   rp   rq   rr   rs   rt   q_tq_hq_wk_tk_hk_wdhdwdt	q_h_ratio	k_h_ratiodist_h	q_w_ratio	k_w_ratiodist_w	q_t_ratio	k_t_ratiodist_tRhRwRtrZ   n_head_r5   r_qrel_h_qrel_w_qrel_q_trel_posr)   r)   r*   _add_rel_pos{   sH   


<<<


**$...(r   shortcutresidual_with_cls_embedc              	   C   sZ   |r	|  | | S | d d d d dd d d f  |d d d d dd d d f 7  < | S r-   )add_)r2   r   r   r)   r)   r*   _add_shortcut   s
   
Dr   c                       s   e Zd Zdejfdee dedededee dee dee d	ee d
edededede	dej
f ddf fddZdejdeeeef deejeeeef f fddZ  ZS )MultiscaleAttention        
input_size	embed_dim
output_dimr   r    r!   r"   r#   residual_poolr   rel_pos_embeddropout
norm_layer.r,   Nc              
      sp  t    || _|| _|| _|| | _dt| j | _|	| _	|
| _
t|d| | _t||g}|dkr@|tj|dd tj| | _d | _t|dksUt|dkrrdd |D }ttj| j| j|||| jd	d
|| j| _d | _d | _t|dkst|dkrdd |D }ttj| j| j|||| jd	d
|| j| _ttj| j| j|||| jd	d
|| j| _d | _d | _d | _|r6t|dd  }t|dkr||d  n|}t|dkr||d  n|}dt|| d }d|d  d }tt|| j| _tt|| j| _tt|| j| _tj j!| jdd tj j!| jdd tj j!| jdd d S d S )Nru   r   r   Tinplacer.   c                 S      g | ]}t |d  qS r   r'   ).0ro   r)   r)   r*   
<listcomp>       z0MultiscaleAttention.__init__.<locals>.<listcomp>F)stridepaddinggroupsbiasc                 S   r   r   r   )r   kvr)   r)   r*   r      r   r   r   {Gz?std)"rC   rD   r   r   r   head_dimmathsqrtscalerr   r   rF   LinearqkvrE   DropoutrG   projectpool_qr1   r>   Conv3dpool_kpool_vrr   rs   rt   rw   len	ParameterrS   zerosinittrunc_normal_)rI   r   r   r   r   r    r!   r"   r#   r   r   r   r   r   rJ   	padding_q
padding_kvrh   q_sizekv_sizespatial_dimtemporal_dimrK   r)   r*   rD      s   

			zMultiscaleAttention.__init__r2   rM   c                 C   s:  |j \}}}| |||d| j| jddjdd\}}}| jd ur-| ||\}}	n|}	| jd ur<| ||d }| j	d urI| 	||\}}t
| j| |dd}
| jd urr| jd urr| jd urrt|
|||	| j| j| j}
|
jdd}
t
|
|}| jrt||| j |dd|d| j}| |}||fS )Nr   r.   r   rR   r   rQ   )r8   r   rV   r   r   rU   unbindr   r   r   rS   r{   r   rr   rs   rt   r   softmaxr   r   r   r   r   )rI   r2   rM   rZ   r[   r\   ro   kr0   rq   rn   r)   r)   r*   r`      s6   2


	
zMultiscaleAttention.forward)r$   r%   r&   rF   	LayerNormr   r'   rb   floatr   ra   rD   rS   rc   r
   r`   rd   r)   r)   rK   r*   r      sB    	
>\r   c                       s   e Zd Zddejfdee dededededede	d	e	d
e
dejf ddf fddZdejdeeeef deejeeeef f fddZ  ZS )MultiscaleBlockr   r   cnfr   r   r   proj_after_attnr   stochastic_depth_probr   .r,   Nc
                    s  t    || _d | _t|jdkr.dd |jD }
dd |
D }ttj|
|j|dd | _|r3|j	n|j
}|	|j
| _|	|| _t| jtj| _t||j
||j|j|j|j|j|||||	d| _t|d| |j	gtj|d d| _t|d	| _d | _|j
|j	krt|j
|j	| _d S d S )
Nr.   c                 S   s    g | ]}|d kr|d  n|qS rO   r)   )r   r+   r)   r)   r*   r   U  s     z,MultiscaleBlock.__init__.<locals>.<listcomp>c                 S   r   r   r   )r   r   r)   r)   r*   r   V  r   )r   r   )	r    r!   r"   r#   r   r   r   r   r   rN   )activation_layerr   r   row)rC   rD   r   	pool_skipr1   r"   r>   rF   	MaxPool3dr   r   norm1norm2
isinstanceBatchNorm1dneeds_transposalr   r   r    r!   r#   rn   r   GELUmlpr   stochastic_depthr   r   )rI   r   r   r   r   r   r   r   r   r   kernel_skippadding_skipattn_dimrK   r)   r*   rD   D  sP   

zMultiscaleBlock.__init__r2   rM   c           	      C   s   | j r| |ddddn| |}| ||\}}| jd u s%| js'|n| |}| jd u r3|n| ||d }|| | }| j rR| |ddddn| |}| jd u s_| jra|n| |}|| | 	| |fS )Nr.   r   r   )
r   r   rU   rn   r   r   r   r   r   r   )	rI   r2   rM   x_norm1x_attnthw_newx_skipx_norm2x_projr)   r)   r*   r`   ~  s   **zMultiscaleBlock.forward)r$   r%   r&   rF   r   r   r'   r   rb   r   r   ra   rD   rS   rc   r
   r`   rd   r)   r)   rK   r*   r   C  s4    		
>:r   c                
       sP   e Zd Zdedeeef dededdf
 fddZd	ejdejfd
dZ	  Z
S )PositionalEncoding
embed_sizespatial_sizetemporal_sizer   r,   Nc                    s   t    || _|| _tt|| _d | _	d | _
d | _|sGtt| jd | jd  || _	tt| j|| _
tt|| _d S d S )Nr   r.   )rC   rD   r   r   rF   r   rS   r   rY   spatial_postemporal_pos	class_pos)rI   r   r   r   r   rK   r)   r*   rD     s   
$zPositionalEncoding.__init__r2   c                 C   s   | j |ddd}tj||fdd}| jd ur\| jd ur\| jd ur\| jj	\}}tj
| j|dd}|| jd| jddd| tj| jd|fddd}|| |S )Nr   rQ   r.   rR   )rY   expandrh   r6   rS   rX   r   r   r   r8   repeat_interleaver   r   rV   )rI   r2   rY   hw_sizer   pos_embeddingr)   r)   r*   r`     s   & 
zPositionalEncoding.forward)r$   r%   r&   r'   r
   rb   rD   rS   rc   r`   rd   r)   r)   rK   r*   r     s    *r   c                $       s   e Zd Z									ddeeef d	ed
ee dedededededededede	e
dejf  de	e
dejf  deeeef deeeef deeeef ddf" fddZdejdejfddZ  ZS ) r         ?r     Nr      r   r   rN   rN   r.   r   r   r   r   block_settingr   r   r   r   r   attention_dropoutr   num_classesblock.r   patch_embed_kernelpatch_embed_stridepatch_embed_paddingr,   c                    s  t    t|  t|}|dkrtd|du rt}|du r&ttjdd}tj	d|d j
|||d| _dd	 t|f| | jjD }t|d j
|d
 |d f|d |d| _t | _t|D ]/\}}|
| |d  }| j||||||||	||d	 t|jdkrdd	 t||jD }q`||d j| _ttj|ddt|d j|| _|  D ][}t|tjrtjj|jdd t|tjr|j durtj!|j d qt|tjr|jdurtj!|jd |j durtj!|j d qt|tr
|" D ]
}tjj|dd qqdS )a  
        MViT main class.

        Args:
            spatial_size (tuple of ints): The spacial size of the input as ``(H, W)``.
            temporal_size (int): The temporal size ``T`` of the input.
            block_setting (sequence of MSBlockConfig): The Network structure.
            residual_pool (bool): If True, use MViTv2 pooling residual connection.
            residual_with_cls_embed (bool): If True, the addition on the residual connection will include
                the class embedding.
            rel_pos_embed (bool): If True, use MViTv2's relative positional embeddings.
            proj_after_attn (bool): If True, apply the projection after the attention.
            dropout (float): Dropout rate. Default: 0.0.
            attention_dropout (float): Attention dropout rate. Default: 0.0.
            stochastic_depth_prob: (float): Stochastic depth rate. Default: 0.0.
            num_classes (int): The number of classes.
            block (callable, optional): Module specifying the layer which consists of the attention and mlp.
            norm_layer (callable, optional): Module specifying the normalization layer to use.
            patch_embed_kernel (tuple of ints): The kernel of the convolution that patchifies the input.
            patch_embed_stride (tuple of ints): The stride of the convolution that patchifies the input.
            patch_embed_padding (tuple of ints): The padding of the convolution that patchifies the input.
        r   z+The configuration parameter can't be empty.Ngư>)epsr   )in_channelsout_channelskernel_sizer   r   c                 S      g | ]\}}|| qS r)   r)   r   rh   r   r)   r)   r*   r     r   z!MViT.__init__.<locals>.<listcomp>r.   r   )r   r   r   r   ru   )	r   r   r   r   r   r   r   r   r   c                 S   r  r)   r)   r	  r)   r)   r*   r   
  r   rQ   Tr   r   r   r   )#rC   rD   r   r   r7   r   r   rF   r   r   r   	conv_projzipr   r   pos_encoding
ModuleListblocks	enumeraterE   r"   r   r@   rG   r   r   headmodulesr   r   r   weightr   	constant_
parameters)rI   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  total_stage_blocksr   stage_block_idr   sd_probmweightsrK   r)   r*   rD     s   
)


zMViT.__init__r2   c                 C   s   t |ddd }| |}|ddd}| |}| jjf| jj }| jD ]	}|||\}}q'| |}|d d df }| 	|}|S )Nrv   r   r   r.   )
r;   r
  flattenrU   r  r   r   r  r@   r  )rI   r2   rM   r   r)   r)   r*   r`   !  s   




zMViT.forward)	r   r   r   r   NNr   r   r   )r$   r%   r&   r
   r'   r	   r   rb   r   r   r   rF   ra   rD   rS   rc   r`   rd   r)   r)   rK   r*   r     s\    

	
xr   r   r   r  progresskwargsc                 K   s   |d ur1t |dt|jd  |jd d |jd d ksJ t |d|jd  t |d|jd  |dd	}|dd
}td||| |dd|dd|dd|dd|d|}|d urk||j|dd |S )Nr   
categoriesmin_sizer   r.   r   r   min_temporal_size   r!     r   Fr   Tr   r   )r   r   r   r   r   r   r   r   )r  
check_hashr)   )r   r   metapopr   load_state_dictget_state_dict)r   r   r  r  r  r   r   modelr)   r)   r*   _mvit8  s,    



	r)  c                   @   J   e Zd Zedeedddddddedd	d
ddddiddd	dZeZdS )r   z:https://download.pytorch.org/models/mvit_v1_b-dbeb1030.pthr      ?r.  r.  ?r0  r0  	crop_sizeresize_sizemeanr   r"  zShttps://github.com/facebookresearch/pytorchvideo/blob/main/docs/source/model_zoo.mdThe weights were ported from the paper. The accuracies are estimated on video-level with parameters `frame_rate=7.5`, `clips_per_video=5`, and `clip_len=16`ip.Kinetics-400gJ+S@gh|?eW@zacc@1zacc@5guVQ@g rxa@	r  r  r  recipe_docs
num_params_metrics_ops
_file_sizeurl
transformsr$  N	r$   r%   r&   r   r   r   r   KINETICS400_V1DEFAULTr)   r)   r)   r*   r   Y  4    r   c                   @   r*  )r   z:https://download.pytorch.org/models/mvit_v2_s-ae3be167.pthr   r+  r-  r/  r1  r"  zChttps://github.com/facebookresearch/SlowFast/blob/main/MODEL_ZOO.mdr5  ir6  g r0T@g(\W@r7  guVP@g?5^I|`@r8  r?  NrB  r)   r)   r)   r*   r   z  rE  r   
pretrained)r  T)r  r  c                 K   s  t | } g dg dg dg g dg g dg g g g g g g g g g g dg gg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dgg g dg g dg g g g g g g g g g g dg gg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dgd	}g }tt|d
 D ],}|t|d
 | |d | |d | |d | |d | |d | |d | d	 qtddd|dd|dd| |d|S )a  
    Constructs a base MViTV1 architecture from
    `Multiscale Vision Transformers <https://arxiv.org/abs/2104.11227>`__.

    .. betastatus:: video module

    Args:
        weights (:class:`~torchvision.models.video.MViT_V1_B_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.MViT_V1_B_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.MViT``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/mvit.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.MViT_V1_B_Weights
        :members:
    r.   r   r   rN   rN   rN   rN   rN   rN   rN   rN   rN   rN   rN      rH  `      rK    rL  rL  rL  rL  rL  rL  rL  rL  rL  rL     rM  )rK  rK  rL  rL  rL  rL  rL  rL  rL  rL  rL  rL  rL  rM  rM  rM  r   r   r   r.   r   r   r.   rH  rH  r.   rN   rN   r.   r.   r.   r   r   r   r    r!   r"   r#   r   r   r   r    r!   r"   r#   r   r"  Fr   皙?)r   r   r   r   r   r   r  r  Nr)   )r   verifyranger   rE   r   r)  r%  r  r  r  configr   ir)   r)   r*   r     s   
..,







	r   c                 K   sD  t | } g dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dgg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dgg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dgg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dgd	}g }tt|d
 D ],}|t|d
 | |d | |d | |d | |d | |d | |d | d	 qtddd|dddd|dd| |d
|S )aC  Constructs a small MViTV2 architecture from
    `Multiscale Vision Transformers <https://arxiv.org/abs/2104.11227>`__ and
    `MViTv2: Improved Multiscale Vision Transformers for Classification
    and Detection <https://arxiv.org/abs/2112.01526>`__.

    .. betastatus:: video module

    Args:
        weights (:class:`~torchvision.models.video.MViT_V2_S_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.MViT_V2_S_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.MViT``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/mvit.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.MViT_V2_S_Weights
            :members:
    rG  )rJ  rJ  rK  rK  rL  rL  rL  rL  rL  rL  rL  rL  rL  rL  rL  rM  rI  rN  rR  rO  rP  rQ  rS  r   r   r   r    r!   r"   r#   r   r"  TFr   rT  )
r   r   r   r   r   r   r   r   r  r  Nr)   )r   rU  rV  r   rE   r   r)  r%  rW  r)   r)   r*   r     s   
N







r   );r   dataclassesr   	functoolsr   typingr   r   r   r   r   r	   r
   rS   torch.fxtorch.nnrF   opsr   r   transforms._presetsr   utilsr   _apir   r   r   _metar   _utilsr   r   __all__r   r'   r1   rc   r;   r=   fxwrapra   r>   rm   r   rb   r   r   r   r   r   r   r)  r   r   rC  r   r   r)   r)   r)   r*   <module>   s    $	
&"	,
< H 
!!!*`.