o
    0h>C                  	   @   s  d dl Z d dlmZmZ d dlmZ e edZe edZe rd dl	Z	d dl	m
Z e	j		dTd	d
Ze	j		dTddZe	j				dUddZe	j				dUddZe	je	jddiddde	jddiddde	jddiddde	jddidddgg de	j		dTddZe	je	jddidddgg de	j		dTddZe	je	jdddddde	jdddddde	jdddddde	jddddddgg de	j	 		!	dVd"d#Zd$d% Ze	je	jddiddde	jddidddgg d&d'd(eid)e	j		dTd*d+Ze	j		dTd,d-Ze	j		dTd.d/Ze	j	 		!	dVd0d1Ze	j		dTd2d3Ze	j		dTd4d5Ze	jd6d7 Ze	j			8	dWd9d:Ze	j	;		<	dXd=d>Ze	j	?		@	dYdAdBZ e	j	?		@	dYdCdDZ!e	jdej"fdEdFZ#e	jdej"fdGdHZ$d dIl%m&Z&m'Z' e	j		dTdJdKZ(e	j		dTdLdMZ)e	j		dTdNdOZ*e	j		dTdPdQZ+e	j		dTdRdSZ,dS dS )Z    N)HAS_CUDAHAS_GPU)
has_tritonzrequires cudazrequires gpu)language
BLOCK_SIZEtl.constexprc                 C   p   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
|	|
 }t j|| ||d d S Nr   axismasktl
program_idarangeloadstorein_ptr0in_ptr1out_ptr
n_elementsr   pidblock_startoffsetsr   xyoutput r   X/var/www/vscode/kcb/lib/python3.10/site-packages/torch/testing/_internal/triton_utils.py
add_kernel      r!   c                 C   sp   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
|	|
 }t j|| ||d d S r	   r   r   r   r   r    
sub_kernel"   r"   r#   ARGS_PASSEDc                 C   s~   t jdd}|| }|t d| }||k }	t j| | |	d}
|dkr1t j|| |	d}|
| }n|
}t j|| ||	d d S Nr   r
   r   twor   )r   r   r   r   r$   r   r   r   r   r   r   r   r   r   r   r    add_kernel_with_optional_param3   s   	
r'   c                 C   s   t jdd}|| }|t d| }	|	|k }
t j| |	|  |
d}|dkr3t j||	 |
d}|| }n|}t j||	|  ||
d d S r%   r   )r   r   r   r   strider$   r   r   r   r   r   r   r   r   r   r   r    -add_kernel_with_none_param_and_equal_to_1_argH   s   

r)            )
num_stages	num_warps   @   )configskeyc                 C   r   r	   r   r   r   r   r    add_kernel_autotuned^   s   r3         c                 C   sp   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
|	|
 }t j|| ||d d S r	   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    &add_kernel_autotuned_weird_param_orderx   s   r6   )BLOCK_SIZE_XBLOCK_SIZE_Yr7   r8   c                 C   s   t d| }|t d|d d d f  }||k }	t d| }
|
t d|d d d f  }||k }|}|}t | |||   |	|@ }t | |||   |	|@ }|| }t ||||   ||	|@  d S )Nr      r   )r   r   r   
x_elements
y_elementsr7   r8   xoffsetxindexxmaskyoffsetyindexymaskx1y0tmp0tmp1tmp2r   r   r    add_kernel_2d_autotuned   s   "rG   c                 O   s   | S )Nr   )r1   ___r   r   r    _dummy_early_config_prune   s   rJ   
      early_config_prune)r1   r2   warmuprepprune_configs_byc                 C   r   r	   r   r   r   r   r    *add_kernel_autotuned_with_unsupported_args   s   rQ   c                 C   st   t jdd}|| }|t d| }||k }	t j| | |	d}
t j|| |	d}|
| | }t j|| ||	d d S r	   r   )r   r   r   r   scaling_factorr   r   r   r   r   r   r   r   r   r   r    add_kernel_with_scaling   s   	rS   c           	      C   s\   t jdd}|| }t | |g|gt j}t ||g|gt j}|| }t |||g d S )Nr   r
   r   r   _experimental_descriptor_loadfloat32_experimental_descriptor_store)	in_desc_ptr0in_desc_ptr1out_desc_ptrr   r   offsetabr   r   r   r    add_kernel_with_tma_1d   s(   r^   c                 C   sz   t jdd}t jdd}|| }|| }t | ||g||gt j}	t |||g||gt j}
|	|
 }t ||||g d S )Nr   r
   r9   rT   )rX   rY   rZ   r7   r8   pid_xpid_yoffset_xoffset_yr   r   r   r   r   r    add_kernel_with_tma_2d  s,   rc   c           
      C   s^   t jdd}|| }|t d| }||k }t j| | |d}d| }	t j|| |	|d d S Nr   r
   r   r5   r   )
r   r   r   r   r   r   r   r   r   r   r   r   r    mul2_kernel*  s   re   c           	      C   s^   t jdd}|| }|t d| }||k }t j| | |d}d| }t j| | ||d d S rd   r   )	ptrr   r   r   r   r   r   r   r   r   r   r    mul2_inplace_kernel9  s   rg   c                 C   s   t | dk| dS )Nr   )r   where)r   r   r   r    	zero_negsG  s   ri   
ACTIVATIONc           
      C   s   t jdd}|| }|t d| }||k }|dkr"t| ||d n|dkr/t| | |||d t j| | |d}	t j|| |	|d d S )Nr   r
   rg   )r   r!   r   )r   r   r   rg   r!   r   r   )
r   r   r   r   rj   r   r   r   r   r   r   r   r    indirection_kernelK  s   rk   X_BLOCK_SIZEY_BLOCK_SIZEc                 C   s   t jdd}t jdd}|| }|| }	|t d| }
|	t d| }|d d d f | |
d d d f  }|d d d f | |
d d d f  }t | | }t || |d  d S )Nr   r
   r9   g       @r   )in_ptrr   in_y_strideout_y_striderl   rm   xidyidx_starty_start	x_offsets	y_offsetssrc_offsetsdst_offsetssrcr   r   r    double_strided_kernel^  s   	$$rz   nBLOCKc           	      C   x   t | t d| }t |t d| }t |g|t j}t jdd|||gt jddd}t |t d| | d S )Nr   shf.l.wrap.b32 $0, $1, $2, $3;
=r,r, r, rTr9   dtypeis_purepackr   r   r   fullint32inline_asm_elementwiser   	XYZr{   r|   r   r   szr   r   r    inline_asm_kernel_is_pure_truer     r   c           	      C   r}   )Nr   r~   r   Fr9   r   r   r   r   r   r    inline_asm_kernel_is_pure_false  r   r   c           
   	   C   s   t jdd}|| }t jt j| |gdg|g|gdgddgd}t jt j||gdg|g|gdgddgd}|| }	t jt j||gdg|g|gdgd|	dgd d S Nr   r
   r9   )baseshapestridesr   block_shapeorder)boundary_checkr   r   r   make_block_ptrr   )
x_ptry_ptr
output_ptrr   r   r   r   r   r   r   r   r   r    add_kernel_with_block_ptr  sJ   
r   c              	   C   s   t jdd}|| }t jt j| |dgddg|dg|dgddgddgd}|}t jt j||dgddg|dg|dgddgd|dgd d S r   r   )r   r   r   r   r   r   r   r   r   r   r    kernel_with_block_ptr_2d  s4   
r   )r   r   c                 C   sj   t jdd}|| }|t d| }||k }t| | |d}	t|| |d}
|	|
 }t|| ||d d S r	   r   r   r   r   r    add_kernel_with_import  s   r   c                 C   s   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
t ddkr4|	|
 }n|	|
 }t j|| ||d d S r	   r   r   r   r   r    cond_op_kernel  s   
r   c                 C   r   r	   )r   r   r   r   
atomic_addr   r   r   r    atomic_add_kernel  r"   r   c                 C   s   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
tdD ]}|	|
 }t j|| ||d q,d}|dkr[|d8 }|	|
 }t j|| ||d |dksCd S d S )Nr   r
   r   r5   r9   )r   r   r   r   ranger   )r   r   r   r   r   r   r   r   r   r   r   ir   r   r   r    add_4_times_kernel  s   r   c                 C   sp   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
|	|
 }t j|| ||d d S r	   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    add_kernel_out_of_order_fn23  r"   r   )r   r   )r$   r   r   r   )r7   r   r8   r   )r   r   rj   r   )rl   r   rm   r   )r{   r   r|   r   )-unittest&torch.testing._internal.inductor_utilsr   r   torch.utils._tritonr   
skipUnlessrequires_cudarequires_gputritonr   r   jitr!   r#   r'   r)   autotuneConfigr3   r6   rG   rJ   rQ   rS   r^   rc   re   rg   ri   rk   rz   r   r   	constexprr   r   triton.languager   r   r   r   r   r   r   r   r   r   r    <module>   sN  	
!
-!    