"""This file is allowed to initialize CUDA context when imported."""

import functools
import torch
import torch.cuda
from torch.testing._internal.common_utils import LazyVal, TEST_NUMBA, TEST_WITH_ROCM, TEST_CUDA, IS_WINDOWS
import inspect
import contextlib
import os
import unittest


CUDA_ALREADY_INITIALIZED_ON_IMPORT = torch.cuda.is_initialized()

TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2
CUDA_DEVICE = torch.device("cuda:0") if TEST_CUDA else None
# On ROCm builds TEST_CUDNN effectively reports MIOpen availability.
if TEST_WITH_ROCM:
    TEST_CUDNN = LazyVal(lambda: TEST_CUDA)
else:
    TEST_CUDNN = LazyVal(lambda: TEST_CUDA and torch.backends.cudnn.is_acceptable(torch.tensor(1., device=CUDA_DEVICE)))

TEST_CUDNN_VERSION = LazyVal(lambda: torch.backends.cudnn.version() if TEST_CUDNN else 0)

SM53OrLater = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability() >= (5, 3))
SM60OrLater = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability() >= (6, 0))
SM70OrLater = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability() >= (7, 0))
SM75OrLater = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability() >= (7, 5))
SM80OrLater = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 0))
SM89OrLater = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9))
SM90OrLater = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0))
SM100OrLater = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability() >= (10, 0))

IS_THOR = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability()[0] == 10
                  and torch.cuda.get_device_capability()[1] > 0)
IS_JETSON = LazyVal(lambda: torch.cuda.is_available()
                    and (torch.cuda.get_device_capability() in [(7, 2), (8, 7)] or IS_THOR))
IS_SM89 = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability() == (8, 9))


def CDNA2OrLater():
    if TEST_WITH_ROCM:
        gcn_arch_name = torch.cuda.get_device_properties('cuda').gcnArchName
        return any(arch in gcn_arch_name for arch in {"gfx90a", "gfx942"})
    return False


def evaluate_gfx_arch_exact(matching_arch):
    if not torch.cuda.is_available():
        return False
    gcn_arch_name = torch.cuda.get_device_properties('cuda').gcnArchName
    arch = os.environ.get('PYTORCH_DEBUG_FLASH_ATTENTION_GCN_ARCH_OVERRIDE', gcn_arch_name)
    return arch == matching_arch


GFX90A_Exact = LazyVal(lambda: evaluate_gfx_arch_exact('gfx90a:sramecc+:xnack-'))
GFX942_Exact = LazyVal(lambda: evaluate_gfx_arch_exact('gfx942:sramecc+:xnack-'))


def evaluate_platform_supports_flash_attention():
    if TEST_WITH_ROCM:
        return evaluate_gfx_arch_exact('gfx90a:sramecc+:xnack-') or evaluate_gfx_arch_exact('gfx942:sramecc+:xnack-')
    if TEST_CUDA:
        return not IS_WINDOWS and SM80OrLater
    return False


def evaluate_platform_supports_efficient_attention():
    if TEST_WITH_ROCM:
        return evaluate_gfx_arch_exact('gfx90a:sramecc+:xnack-') or evaluate_gfx_arch_exact('gfx942:sramecc+:xnack-')
    if TEST_CUDA:
        return True
    return False


def evaluate_platform_supports_cudnn_attention():
    return (not TEST_WITH_ROCM) and SM80OrLater and (TEST_CUDNN_VERSION >= 90000)


PLATFORM_SUPPORTS_FLASH_ATTENTION: bool = LazyVal(lambda: evaluate_platform_supports_flash_attention())
PLATFORM_SUPPORTS_MEM_EFF_ATTENTION: bool = LazyVal(lambda: evaluate_platform_supports_efficient_attention())
PLATFORM_SUPPORTS_CUDNN_ATTENTION: bool = LazyVal(lambda: evaluate_platform_supports_cudnn_attention())
PLATFORM_SUPPORTS_FUSED_ATTENTION: bool = LazyVal(
    lambda: PLATFORM_SUPPORTS_FLASH_ATTENTION or PLATFORM_SUPPORTS_CUDNN_ATTENTION or PLATFORM_SUPPORTS_MEM_EFF_ATTENTION)

PLATFORM_SUPPORTS_FUSED_SDPA: bool = TEST_CUDA and not TEST_WITH_ROCM

PLATFORM_SUPPORTS_BF16: bool = LazyVal(lambda: TEST_CUDA and SM80OrLater)


def evaluate_platform_supports_fp8():
    if torch.cuda.is_available():
        if torch.version.hip:
            ROCM_VERSION = tuple(int(v) for v in torch.version.hip.split('.')[:2])
            archs = ['gfx94']
            if ROCM_VERSION >= (6, 3):
                archs.extend(['gfx120'])
            if ROCM_VERSION >= (6, 5):
                archs.append('gfx95')
            for arch in archs:
                if arch in torch.cuda.get_device_properties(0).gcnArchName:
                    return True
        else:
            return SM90OrLater or torch.cuda.get_device_capability() == (8, 9)
    return False


PLATFORM_SUPPORTS_FP8: bool = LazyVal(lambda: evaluate_platform_supports_fp8())

PLATFORM_SUPPORTS_MX_GEMM: bool = LazyVal(lambda: TEST_CUDA and SM100OrLater)

if TEST_NUMBA:
    try:
        import numba.cuda
        TEST_NUMBA_CUDA = numba.cuda.is_available()
    except Exception:
        TEST_NUMBA_CUDA = False
        TEST_NUMBA = False
else:
    TEST_NUMBA_CUDA = False

__cuda_ctx_rng_initialized = False


# After this call, the CUDA context and RNG state have been initialized on every visible device.
def initialize_cuda_context_rng():
    global __cuda_ctx_rng_initialized
    assert TEST_CUDA, 'CUDA must be available when calling initialize_cuda_context_rng'
    if not __cuda_ctx_rng_initialized:
        for i in range(torch.cuda.device_count()):
            torch.randn(1, device=f"cuda:{i}")
        __cuda_ctx_rng_initialized = True


@contextlib.contextmanager
def tf32_off():
    old_allow_tf32_matmul = torch.backends.cuda.matmul.allow_tf32
    try:
        torch.backends.cuda.matmul.allow_tf32 = False
        with torch.backends.cudnn.flags(enabled=None, benchmark=None, deterministic=None, allow_tf32=False):
            yield
    finally:
        torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32_matmul


@contextlib.contextmanager
def tf32_on(self, tf32_precision=1e-5):
    old_allow_tf32_matmul = torch.backends.cuda.matmul.allow_tf32
    old_precision = self.precision
    try:
        torch.backends.cuda.matmul.allow_tf32 = True
        self.precision = tf32_precision
        with torch.backends.cudnn.flags(enabled=None, benchmark=None, deterministic=None, allow_tf32=True):
            yield
    finally:
        torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32_matmul
        self.precision = old_precision


@contextlib.contextmanager
def tf32_enabled():
    """
    Context manager to temporarily enable TF32 for CUDA operations.
    Restores the previous TF32 state after exiting the context.
    """
    old_allow_tf32_matmul = torch.backends.cuda.matmul.allow_tf32
    try:
        torch.backends.cuda.matmul.allow_tf32 = True
        with torch.backends.cudnn.flags(enabled=None, benchmark=None, deterministic=None, allow_tf32=True):
            yield
    finally:
        torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32_matmul
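

# Illustrative usage sketch (not part of the original module): how the TF32
# context managers above are typically combined.  `_tf32_demo_matmul` is a
# hypothetical helper used only as an example; it assumes a CUDA device is
# available when it is called.  `tf32_on` additionally rewrites a test case's
# `precision` attribute, which is why this sketch uses `tf32_enabled` instead.
def _tf32_demo_matmul():
    a = torch.randn(64, 64, device="cuda")
    b = torch.randn(64, 64, device="cuda")
    with tf32_off():
        # TF32 is disabled for matmul and cuDNN inside this block.
        exact = a @ b
    with tf32_enabled():
        # Same matmul with TF32 allowed (Ampere or newer); may differ slightly from `exact`.
        approx = a @ b
    return exact, approx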


def tf32_on_and_off(tf32_precision=1e-5):
    # Decorator that reruns a test with TF32 disabled and then with TF32 enabled
    # (using the reduced `tf32_precision` tolerance) when the test targets a CUDA
    # device with a TF32-affected dtype; otherwise the test runs once, unchanged.
    def with_tf32_disabled(self, function_call):
        with tf32_off():
            function_call()

    def with_tf32_enabled(self, function_call):
        with tf32_on(self, tf32_precision):
            function_call()

    def wrapper(f):
        params = inspect.signature(f).parameters
        arg_names = tuple(params.keys())

        @functools.wraps(f)
        def wrapped(*args, **kwargs):
            for k, v in zip(arg_names, args):
                kwargs[k] = v
            cond = torch.cuda.is_tf32_supported()
            if 'device' in kwargs:
                cond = cond and (torch.device(kwargs['device']).type == 'cuda')
            if 'dtype' in kwargs:
                cond = cond and (kwargs['dtype'] in {torch.float32, torch.complex64})
            if cond:
                with_tf32_disabled(kwargs['self'], lambda: f(**kwargs))
                with_tf32_enabled(kwargs['self'], lambda: f(**kwargs))
            else:
                f(**kwargs)

        return wrapped
    return wrapper


def with_tf32_off(f):
    # Runs the wrapped test entirely with TF32 disabled.
    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        with tf32_off():
            return f(*args, **kwargs)

    return wrapped


def _get_magma_version():
    if 'Magma' not in torch.__config__.show():
        return (0, 0)
    position = torch.__config__.show().find('Magma ')
    version_str = torch.__config__.show()[position + len('Magma '):].split('\n')[0]
    return tuple(int(x) for x in version_str.split("."))


def _get_torch_cuda_version():
    if torch.version.cuda is None:
        return (0, 0)
    cuda_version = str(torch.version.cuda)
    return tuple(int(x) for x in cuda_version.split("."))


def _get_torch_rocm_version():
    if not TEST_WITH_ROCM:
        return (0, 0)
    rocm_version = str(torch.version.hip)
    rocm_version = rocm_version.split("-")[0]  # ignore git sha
    return tuple(int(x) for x in rocm_version.split("."))


def _check_cusparse_generic_available():
    return not TEST_WITH_ROCM


def _check_hipsparse_generic_available():
    if not TEST_WITH_ROCM:
        return False
    if not torch.version.hip:
        return False
    rocm_version = str(torch.version.hip)
    rocm_version = rocm_version.split("-")[0]  # ignore git sha
    rocm_version_tuple = tuple(int(x) for x in rocm_version.split("."))
    return not (rocm_version_tuple is None or rocm_version_tuple < (5, 1))


TEST_CUSPARSE_GENERIC = _check_cusparse_generic_available()
TEST_HIPSPARSE_GENERIC = _check_hipsparse_generic_available()


def _create_scaling_models_optimizers(device="cuda", optimizer_ctor=torch.optim.SGD, optimizer_kwargs=None):
    # Create a control model/optimizer pair and an identically initialized pair that
    # will be stepped through a gradient scaler, so the two can be compared afterwards.
    mod_control = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)).to(device=device)
    mod_scaling = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)).to(device=device)
    with torch.no_grad():
        for c, s in zip(mod_control.parameters(), mod_scaling.parameters()):
            s.copy_(c)

    kwargs = {"lr": 1.0}
    if optimizer_kwargs is not None:
        kwargs.update(optimizer_kwargs)
    opt_control = optimizer_ctor(mod_control.parameters(), **kwargs)
    opt_scaling = optimizer_ctor(mod_scaling.parameters(), **kwargs)

    return mod_control, mod_scaling, opt_control, opt_scaling


def _create_scaling_case(device="cuda", dtype=torch.float, optimizer_ctor=torch.optim.SGD, optimizer_kwargs=None):
    data = [(torch.randn((8, 8), dtype=dtype, device=device), torch.randn((8, 8), dtype=dtype, device=device)),
            (torch.randn((8, 8), dtype=dtype, device=device), torch.randn((8, 8), dtype=dtype, device=device)),
            (torch.randn((8, 8), dtype=dtype, device=device), torch.randn((8, 8), dtype=dtype, device=device)),
            (torch.randn((8, 8), dtype=dtype, device=device), torch.randn((8, 8), dtype=dtype, device=device))]

    loss_fn = torch.nn.MSELoss().to(device)
    skip_iter = 2

    return _create_scaling_models_optimizers(device, optimizer_ctor, optimizer_kwargs) + (data, loss_fn, skip_iter)


def xfailIfSM89(func):
    return func if not IS_SM89 else unittest.expectedFailure(func)


if not CUDA_ALREADY_INITIALIZED_ON_IMPORT:
    # Importing this module must not itself initialize the CUDA context.
    assert not torch.cuda.is_initialized()