o
    Ih                 
   @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZmZ d dlmZ d dlmZmZmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlm  mZ d dlm Z  d dl!m"Z"m#Z# d d	l$m%Z% d d
l&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z- ddl.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8 ddl9m:Z: ddl;m<Z<m=Z=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZD ddlEmFZFmGZGmHZHmIZI ddlJmKZKmLZL ddlMmNZNmOZOmPZPmQZQ ddlmRZRmSZSmTZTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_ ddl`maZbmcZcmdZdmeZe ddlfmgZg ddlhmiZi ddljmkZkmlZlmmZmmnZnmoZompZpmqZqmrZrmsZsmtZtmuZumvZvmwZwmxZxmyZy dd lzm{Z{m|Z|m}Z}m~Z~mZmZ dd!lmZmZmZmZmZ dd"lmZ erd d#lmZ d d$lmZ d d%lmZ dd&l7mZ dd'lmZ ed(ZeeZejed)Zejed*Zejed+Ze: Z9G d,d- d-Zeddxd0d1Zeddxd2d3ZG d4d5 d5ZejG d6d7 d7ZejG d8d9 d9Zdyd>d?ZG d@dA dAetZe jZdzdDdEZd{dGdHZdzdIdJZd|dKdLZdzdMdNZd}dPdQZd~dTdUZG dVdW dWeoZddYdZZddd^d_ZG d`da daesZedb G dcdd ddeZG dedf dfZejG dgdh dhZG didj djZejG dkdl dlZG dmdn dneneeeeeef f f ZG dodp dpee ZG dqdr dreZddvdwZdS )    )annotationsN)IterableSequence)	lru_cache)AnyCallablecastOptionalTYPE_CHECKINGUnion)
PRECEDENCE)get_interface_for_device)identitypreserve_rng_state)is_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)has_triton_package   )free_symbol_is_type
prefix_strsymbol_is_typeSymT)ValueRanges   )configirmetrics)AsyncCompile)	code_hashget_pathPyCodeCache)DefaultHandler)triton_heuristics)benchmarker)AutotuneHintDevicePropertiesTRITON_MAX_BLOCKTRITON_MAX_RSPLIT)get_max_y_gridnext_power_of_2)BaseSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)cache_on_selfDelayReplaceLineget_bounds_index_exprget_fused_kernel_nameget_kernel_metadatais_welford_reductionPlaceholderprefix_is_reduction	sympy_dotsympy_product
sympy_substriton_typetriton_version_uses_attrs_dictupcast_compute_type)_opsReductionType	StoreModeV)"get_kernel_category_by_source_code   )BlockPatternMatcher)ArgNameBackendFeatureConstexprArgCSECSEVariableDeferredLineIndentedBufferInplacedBufferOpOverridesPythonPrinter
RemovedArgSizeArg	TensorArgWorkspaceArgWorkspaceZeroMode)constant_reprIterationRangesIterationRangesEntryIterationRangesRoot
SIMDKernelSIMDScheduling)	config_ofequal_1_arg_indicesnon_constexpr_signatureshould_unwrap_unspec_argsignature_to_meta)SymbolicCallArg)
ModuleType)TypeVarDtypePropagationOpsHandler)IRNode)SIMDKernelFeatures_T
perf_hintsschedulefusionc                   @  s8   e Zd ZU dZi Zded< i Zded< edddZdS )OpDtypeSupportz
    Some Triton ops such as libdevice and tl.math only support float32 and float64.
    This class records which dtypes are supported by specific IR ops.
    z"dict[str, OrderedSet[torch.dtype]]supported_dtypeszdict[str, bool]convert_outputsfuncCallable[..., str]convert_outputboolreturnNonec                 C  s*   |j }ttjtjg| j|< || j|< d S N)__name__r   torchfloat32float64rl   rm   )clsrn   rp   op_name r{   R/var/www/vscode/kcb/lib/python3.10/site-packages/torch/_inductor/codegen/triton.pyregister_upcast   s   zOpDtypeSupport.register_upcastN)rn   ro   rp   rq   rr   rs   )	ru   
__module____qualname____doc__rl   __annotations__rm   classmethodr}   r{   r{   r{   r|   rk   v   s   
 rk   rr   strc                  C  s(   t  sdS ddl} t| jjdrdS dS )zd
    import AttrsDescriptor if the triton version is new enough to have this
    class defined.
     r   NAttrsDescriptorz4from triton.compiler.compiler import AttrsDescriptor)r   triton.compiler.compilerhasattrcompiler)tritonr{   r{   r|   gen_attr_descriptor_import   s   r   c                  C  s6   t  } | d t  }r| | | d |  S )NzD
        import triton
        import triton.language as tl
        a  
        from torch._inductor.runtime import triton_helpers, triton_heuristics
        from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
        from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
        )rL   splicer   	writelinegetvalue)imports	attr_descr{   r{   r|   gen_common_triton_imports   s   

r   c                   @  sp   e Zd ZdZeejejgZeej	ej
ejgeZdd eD Zdd eD Zedd	d
ZedddZdS )TritonSymbolszU
    Stores sympy.Symbol instances and constants associated with triton codegen.
    c                 C  s(   i | ]}|t jt|  d dddqS )offsetTintegernonnegative)sympySymbolr   .0symtr{   r{   r|   
<dictcomp>   s    zTritonSymbols.<dictcomp>c                 C  s,   i | ]}|t jt|   d dddqS )BLOCKTr   positive)r   r   r   upperr   r{   r{   r|   r      s    treerV   rr   sympy.Symbolc                 C     | j |j S rt   )block_sizesr   ry   r   r{   r{   r|   get_block_size      zTritonSymbols.get_block_sizec                 C  r   rt   )block_offsetsr   r   r{   r{   r|   get_block_offset   r   zTritonSymbols.get_block_offsetN)r   rV   rr   r   )ru   r~   r   r   r   r   R0_INDEXR1_INDEXreduction_typesXBLOCKYBLOCKZBLOCKblock_typesr   r   r   r   r   r{   r{   r{   r|   r      s    r   c                   @  sv   e Zd ZU ded< ded< ded< ded< d	ed
< dddZdddZdddZdddZdddZe	dddZ
dS )IndexingOptionsr   	index_strOrderedSet[str]	mask_varsOptional[str]
expand_strrq   _has_rindex
sympy.Exprindexrr   c                 C  
   t | jS rt   )rq   r   selfr{   r{   r|   has_mask      
zIndexingOptions.has_maskc                 C  s   t | jtjS rt   )r   r   r   TMPr   r{   r{   r|   has_indirect      zIndexingOptions.has_indirectc                 C     | j S rt   )r   r   r{   r{   r|   
has_rindex   s   zIndexingOptions.has_rindexc                 C     t dd | jD S )Nc                 s      | ]
}t |d V  qdS )tmpNr   
startswithr   maskr{   r{   r|   	<genexpr>       z.IndexingOptions.has_tmpmask.<locals>.<genexpr>anyr   r   r{   r{   r|   has_tmpmask      zIndexingOptions.has_tmpmaskc                 C  r   )Nc                 s  r   )rNr   r   r{   r{   r|   r      r   z,IndexingOptions.has_rmask.<locals>.<genexpr>r   r   r{   r{   r|   	has_rmask   r   zIndexingOptions.has_rmaskc                 C  s   | j rdtt| j S dS )N & rs   )r   joinmapr   r   r{   r{   r|   mask_str   s   zIndexingOptions.mask_strNrr   rq   rr   r   )ru   r~   r   r   r   r   r   r   r   propertyr   r{   r{   r{   r|   r      s   
 




r   c                   @  s  e Zd ZU ded< ded< ded< ded< d	ed
< ded< d	ed< dZded< edCddZedCddZedCddZedCddZ	dDd d!Z
edEd&d'ZdFd,d-ZdGdHd0d1ZdId3d4ZdJd5d6ZdKd7d8ZdLd9d:ZdLd;d<ZdLd=d>ZdLd?d@ZdLdAdBZdS )MBlockPtrOptionsBlockParametersparamsr   constant_offset	list[int]orderr   r   Sequence[sympy.Expr]broadcast_shapez
list[bool]broadcasting_dimsfinal_shapeNzOptional[list[int]]_boundary_checkrr   list[sympy.Expr]c                 C     | j jS rt   )r   shaper   r{   r{   r|   r         zBlockPtrOptions.shapec                 C  r   rt   )r   block_shaper   r{   r{   r|   r      r   zBlockPtrOptions.block_shapec                 C  r   rt   )r   stridesr   r{   r{   r|   r      r   zBlockPtrOptions.stridesc                 C  r   rt   )r   offsetsr   r{   r{   r|   r      r   zBlockPtrOptions.offsetsvaluer   initial_shapeallow_implicitrq   c                   s   dd t | j| jD }t|||}tjj t| jo0t|t|kp0t fddt ||D }|r5|rCd| dtj	
| j d}t|| j|}|S )z
        Generate a broadcast and a reshape for the block pointer.
        This restores stride-0 dimensions which were removed from the block pointer.
        c                 S  s    g | ]\}}|rt jjn|qS r{   )r   SOne)r   dimis_broadcastingr{   r{   r|   
<listcomp>  s    zABlockPtrOptions.codegen_broadcast_and_reshape.<locals>.<listcomp>c                 3  s.    | ]\}}  |d p  || V  qdS rD   Nstatically_known_equals)r   pre_dimpost_dimsizevarsr{   r|   r     s    

z@BlockPtrOptions.codegen_broadcast_and_reshape.<locals>.<genexpr>tl.broadcast_to(, ))zipr   r   triton_reshaperB   graphr   r   lenkernelindex_to_str)r   r   r   r   r   pre_broadcast_shaperequire_broadcastr{   r   r|   codegen_broadcast_and_reshape  s"   
z-BlockPtrOptions.codegen_broadcast_and_reshaperange_treeslist[IterationRangesRoot]get_max_blockCallable[[str], int]c              	     s|  t jjdfdd}|| j| _|| j| _fdd| jD }fd	d| jD }t|r3d
|d< dd t| j|D }dd t||D   fddtdi fddt	
|  D } dd |D }	t jjrz|d jdksuJ |	d t jj}
t jjst| jtt jj|
 krt jj r|	tjjg|
 7 }	t| t jj|tttt| j||	||d}|| |S )z,Helper to create a  BlockPtrOptions instanceexprsIterable[sympy.Expr]rr   r   c                   s    fdd| D S )Nc                   s   g | ]}  |qS r{   )lookup_precomputed_sizer   exprr   r{   r|   r   >      z?BlockPtrOptions.create.<locals>.lookup_size.<locals>.<listcomp>r{   )r
  r   r{   r|   lookup_size=     z+BlockPtrOptions.create.<locals>.lookup_sizec                      g | ]}  |d qS )r   r   )r   strider   r{   r|   r   F      z*BlockPtrOptions.create.<locals>.<listcomp>c                   r  )rD   r   )r   r   r   r{   r|   r   L  r  Fc                 S     g | ]\}}|s|qS r{   r{   )r   r   is_singletonr{   r{   r|   r   V      c                 S     g | ]}t |qS r{   )r   )r   dimsr{   r{   r|   r   ]      c                   s   dd t |  D S )z@Removes any broadcasting or singleton dims from a given sequencec                 S  r  r{   r{   )r   itemis_removabler{   r{   r|   r   a  r  z?BlockPtrOptions.create.<locals>.remove_dims.<locals>.<listcomp>)r   )it)removable_dimsr{   r|   remove_dims_  s   z+BlockPtrOptions.create.<locals>.remove_dimsc                      i | ]	\}}| |qS r{   r{   r   keyval)r   r{   r|   r   i      z*BlockPtrOptions.create.<locals>.<dictcomp>c                 S  s   g | ]}t |qS r{   )r   r   r   r   r{   r{   r|   r   m  r  r   x)r   r   r   r   r   r   r   N)r
  r  rr   r   r{   )rB   r   r   r   r   r   allr   r   dataclassesasdictitemsr  no_x_dimprefixpopnum_reduction_dimsinside_reductionr   numelsfeaturesis_reductionr   r   r   r   r  listreversedrangecompute_boundary_check)r   r   r  r   r  r  r   singleton_dimsr   r   reduction_ndimresultr{   )r  r   r   r|   create0  sT   


	


	zBlockPtrOptions.creater  replacementr   r   c                 C  s   t j| }t|||iS )zN
        Replaces instances of {symt}_offset with the new expression.
        )r   r   r;   )r   r  r<  r   roffsetr{   r{   r|   replace_offset  s   
zBlockPtrOptions.replace_offsetTnamec                   s   dfdd t jj}g j}|s fdd|D }jdkr+| d	|j d
n|d|j d|j d|j d|j d|| g}dd	| d
S )a  
        Codegen a call to tl.make_block_ptr()

        Args:
            name: variable name for pointer
            roffset: should rn_offset be included in offsets=..., for use with tl.advance()

        Returns:
            "tl.make_block_ptr(...)"
        r  r   rr   c                   s$   t jD ]} | td|} q| S Nr   )r   r   r>  r   Integer)r  r   r   r{   r|   remove_roffsets  s   
z/BlockPtrOptions.format.<locals>.remove_roffsetsc                      g | ]} |qS r{   r{   r   r   )rB  r{   r|   r     r  z*BlockPtrOptions.format.<locals>.<listcomp>r    + (r   zshape=zstrides=zblock_shape=zorder=zoffsets=ztl.make_block_ptr(r   N)r  r   rr   r   )
rB   r  r  r   r   r   r   r   r   r   )r   r?  r=  fr   argsr{   )rB  r   r|   format  s   

zBlockPtrOptions.formatrs   c                   sF   t jjfddtj D   fddttjD _	dS )z6List of indices to pass to tl.load(boundary_check=...)c                   s   i | ]\}}| t | qS r{   r   )r   r   
block_size)r  r{   r|   r     s    z:BlockPtrOptions.compute_boundary_check.<locals>.<dictcomp>c                   sz   g | ]9} j| tjjs;j| j| s;j| tj|  s;t	j
jr9j| tjtj ks|qS r{   )r   r   r   r   Zerostatically_known_multiple_ofr   r   r;   rB   r  r,  r   r   r   r   r   idx)block_to_maxr   r   r{   r|   r     s"    z:BlockPtrOptions.compute_boundary_check.<locals>.<listcomp>N)
rB   r   r   r   r   r+  r6  r   r   r   )r   r  r{   )rO  r  r   r   r|   r7    s   
z&BlockPtrOptions.compute_boundary_checkc                 C  s   | j d usJ | j S rt   )r   r   r{   r{   r|   boundary_check  s   zBlockPtrOptions.boundary_checkc                   s&   t j   fddjD }|S )av  
        Codegen string to pass to tl.advance(name, ...).

        Advance is the difference between offsets in each loop iteration.
        To compute it, we replace rN_offset with multiples of RN_BLOCK.
        Since we expect rN_offset to vary in range(0, rN_numel, RN_BLOCK), the first
        iteration has rN_offset=0, while the second has rN_offset=RN_BLOCK.
        c                   s,   g | ]} |  |tjj qS r{   )r>  r   r   rK  rD  rblockr   r   r{   r|   r     s    z3BlockPtrOptions.advance_roffset.<locals>.<listcomp>)r   r   r   )r   r   advancer{   rQ  r|   advance_roffset  s
   
	zBlockPtrOptions.advance_roffsetc                 C     dS NFr{   r   r{   r{   r|   r        zBlockPtrOptions.has_indirectc                 C  r   )Nc                 s  s    | ]	}t |tjV  qd S rt   )r   r   r   r  r{   r{   r|   r     s
    

z-BlockPtrOptions.has_rindex.<locals>.<genexpr>)r   r   r   r{   r{   r|   r     s   zBlockPtrOptions.has_rindexc                 C  s   |   S rt   )r   r   r{   r{   r|   r        zBlockPtrOptions.has_rmaskc                 C  rU  rV  r{   r   r{   r{   r|   r     rW  zBlockPtrOptions.has_tmpmaskc                 C  s   t |  S rt   )rq   rP  r   r{   r{   r|   r        zBlockPtrOptions.has_maskrr   r   )
r   r   r   r   r   r   r   rq   rr   r   )r   r   r   r   r  r  r   r   r  r	  rr   r   )r  r   r<  r   r   r   rr   r   T)r?  r   rr   r   )r  r	  rr   rs   )rr   r   )r   r   rr   r   r   )ru   r~   r   r   r   r   r   r   r   r   r  staticmethodr;  r>  rH  r7  rP  rT  r   r   r   r   r   r{   r{   r{   r|   r      s<   
 
-
V	
#





r   r   	old_shaper   	new_shapec                 C  s   t |tr
t |tsJ dd |D }dd |D }||kr | S dd |D |kr5d|  dd| dS d	}g }|D ]#}|t|k rS||| krS|d
 |d7 }q;|dksYJ |d q;|t|ksgJ |  dd| dS )z7Workaround https://github.com/openai/triton/issues/2836c                 S     g | ]}t j|qS r{   rB   r  r  r   r   r{   r{   r|   r          z"triton_reshape.<locals>.<listcomp>c                 S  r_  r{   r`  ra  r{   r{   r|   r     rb  c                 S  s   g | ]}|d kr|qS )1r{   )r   sr{   r{   r|   r     rb  ztl.reshape(z, [r   z])r   :rD   rc  rs   [])
isinstancer4  r   r   append)r   r]  r^  old_shape_strnew_shape_strrN  expandsizer{   r{   r|   r     s"   

r   c                   @  s(  e Zd Zd?ddZd?ddZd?d	d
Zd?ddZd?ddZd?ddZd?ddZ	d?ddZ
d?ddZd?ddZd?ddZd?ddZeZd?ddZd@d d!Zd?d"d#Zd?d$d%Zd?d&d'Zd?d(d)Zd?d*d+Zd?d,d-Zd?d.d/Zd?d0d1Zd?d2d3Zd?d4d5Zd?d6d7Zd?d8d9Zd?d:d;Zd?d<d=Zd>S )ATritonPrinterr  r   rr   r   c                 C  4   t |jdks	J d| |jd  dtjj dS )NrD   libdevice.trunc(r   ).to(r   r   rG  _printrB   r  index_dtyper   r  r{   r{   r|   _print_TruncToInt      zTritonPrinter._print_TruncToIntc                 C  s*   t  rtjjr| }|S d| d}|S )Nztl.full([], z, tl.float64))r   	is_fbcoderv   versionhip)r   r  retr{   r{   r|   _print_Float   s
   zTritonPrinter._print_Floatc                 C  s6   t |jdks	J | |jd td d }| dS )NrD   r   Atom      ?z.to(tl.float64))r   rG  parenthesizer   )r   r  rd  r{   r{   r|   _print_ToFloat'  s   
zTritonPrinter._print_ToFloatc                 C  sT   |j \}}|jr|jr| |j dtd d S | |}| |}d| d| dS )N % r}  r~  z!triton_helpers.remainder_integer(r   r   )rG  is_nonnegative	stringifyr   rs  r   r  quotdivquot_sdiv_sr{   r{   r|   _print_PythonMod,  s   


zTritonPrinter._print_PythonModc                 C  s^   |j sJ |j\}}|jr|jr| |jdtd d S | |}| |}d| d| dS )N // r}  r~  z!triton_helpers.div_floor_integer(z,  r   )
is_integerrG  r  r  r   rs  r  r{   r{   r|   _print_FloorDiv4  s   



zTritonPrinter._print_FloorDivc                 C  s   |  |jdtd d S )N / r}  r~  )r  rG  r   ru  r{   r{   r|   _print_IntTrueDiv?  s   zTritonPrinter._print_IntTrueDivc                 C  ro  NrD   libdevice.floor(r   rq  r   rr  ru  r{   r{   r|   _print_floorD  rw  zTritonPrinter._print_floorc                 C  ro  r  rr  ru  r{   r{   r|   _print_FloorToIntJ  rw  zTritonPrinter._print_FloorToIntc                 C  ro  NrD   libdevice.ceil(r   rq  r   rr  ru  r{   r{   r|   _print_ceilingP     "zTritonPrinter._print_ceilingc                 C  ro  r  rr  ru  r{   r{   r|   _print_CeilToIntT  r  zTritonPrinter._print_CeilToIntc                 C  s   d|  | dS )Nzlibdevice.sqrt(().to(tl.float32)))rs  ru  r{   r{   r|   _helper_sqrtX  r  zTritonPrinter._helper_sqrtc                 C  s*   d|  |jd  d|  |jd  dS )Nlibdevice.pow(r   r   rD   r   )rs  rG  ru  r{   r{   r|   _print_FloatPow[  s   (zTritonPrinter._print_FloatPowc                 C  sH   |  |jd }|  |jd }|  |jd }d| d| d| dS )Nr   rD   r   	tl.where(r   r   )doprintrG  )r   r  cpqr{   r{   r|   _print_Whereb  s   zTritonPrinter._print_Wherecmpc                 C  s   t |jdkr| |jd S t |jd }t|}| ||jd|  }| ||j|d  }tdd ||fD \}}|dv sKJ d| d	d
| d| d| d| d| d| d| d| dS )zK
        Helper for max/min code genereration.
        cmp: > or <
        rD   r   r   Nc                 s  s    | ]	}d | dV  qdS )(r   Nr{   r   r'  r{   r{   r|   r   w      z6TritonPrinter._print_min_max_helper.<locals>.<genexpr>)><zUnexpected comparator: ''r  z * ( z= z) + )))r   rG  rs  typetuple)r   r  r  midry   abr{   r{   r|   _print_min_max_helperh  s   6z#TritonPrinter._print_min_max_helperc                 C     |  |dS )Nr  r  ru  r{   r{   r|   
_print_Min{  rY  zTritonPrinter._print_Minc                 C  r  )Nr  r  ru  r{   r{   r|   
_print_Max~  rY  zTritonPrinter._print_Maxc                 C  *   t |jdks	J d| |jd  dS )NrD   tl_math.abs(r   r   r   rG  rs  ru  r{   r{   r|   
_print_Abs     zTritonPrinter._print_Absc                 C  r  )NrD   zlibdevice.cos((r   r  r  ru  r{   r{   r|   _print_OpaqueUnaryFn_cos  r  z&TritonPrinter._print_OpaqueUnaryFn_cosc                 C  r  )NrD   zlibdevice.cosh((r   r  r  ru  r{   r{   r|   _print_OpaqueUnaryFn_cosh  r  z'TritonPrinter._print_OpaqueUnaryFn_coshc                 C  r  )NrD   zlibdevice.acos((r   r  r  ru  r{   r{   r|   _print_OpaqueUnaryFn_acos  r  z'TritonPrinter._print_OpaqueUnaryFn_acosc                 C  r  )NrD   zlibdevice.sin((r   r  r  ru  r{   r{   r|   _print_OpaqueUnaryFn_sin  r  z&TritonPrinter._print_OpaqueUnaryFn_sinc                 C  r  )NrD   zlibdevice.sinh((r   r  r  ru  r{   r{   r|   _print_OpaqueUnaryFn_sinh  r  z'TritonPrinter._print_OpaqueUnaryFn_sinhc                 C  r  )NrD   zlibdevice.asin((r   r  r  ru  r{   r{   r|   _print_OpaqueUnaryFn_asin  r  z'TritonPrinter._print_OpaqueUnaryFn_asinc                 C  r  )NrD   zlibdevice.tan((r   r  r  ru  r{   r{   r|   _print_OpaqueUnaryFn_tan  r  z&TritonPrinter._print_OpaqueUnaryFn_tanc                 C  r  )NrD   zlibdevice.tanh((r   r  r  ru  r{   r{   r|   _print_OpaqueUnaryFn_tanh  r  z'TritonPrinter._print_OpaqueUnaryFn_tanhc                 C  r  )NrD   zlibdevice.atan((r   r  r  ru  r{   r{   r|   _print_OpaqueUnaryFn_atan  r  z'TritonPrinter._print_OpaqueUnaryFn_atanc                 C  ro  )NrD   zlibdevice.llrint(r   rq  r   rr  ru  r{   r{   r|   _print_RoundToInt  rw  zTritonPrinter._print_RoundToIntc                 C  sf   t |jdks	J |j\}}|jr|dk sJ td| d| |td }d| d| d|  S )	Nr   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .Mulzlibdevice.nearbyint(1e * z) * 1e)r   rG  r  
ValueErrorr  r   )r   r  numberndigits
number_strr{   r{   r|   _print_RoundDecimal  s   

z!TritonPrinter._print_RoundDecimalN)r  r   rr   r   )r  r   r  r   rr   r   ) ru   r~   r   rv  r|  r  r  r  r  r  r  r  r  r  r  _print_PowByNaturalr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r{   r{   r{   r|   rn    s<    


























rn  dtypetorch.dtypec                 C     t t| S )zCConvert torch.dtype to triton type and upcast [b]float16 to float32)r<   r>   r  r{   r{   r|   triton_compute_type  r   r  intc                 C  s$   t | } t| dd}|r|d S dS )z'Number of bits of triton_compute_type()itemsizeN   r  )r>   getattr)r  r  r{   r{   r|   _get_primitive_bitwidth  s
   r  c                 C  s   | t jkrt j} t| S )z@Convert torch.dtype to triton type, with fix for storing tl.bool)rv   rq   int8r<   r  r{   r{   r|   triton_store_type  s   
r  c                 C  s&   t | r| jr| jdkrtjS t| S )z0Implicit upcasts used for Triton reduction types   )r   	is_signedr  rv   int32r>   r  r{   r{   r|   upcast_acc_dtype  s   r  c                 C  r  )z:Convert torch.dtype to triton type, with reduction upcasts)r  r  r  r{   r{   r|   triton_acc_type  r   r  rq   c                 C  s   | j dko| jS )Nr   )r  is_floating_pointr  r{   r{   r|   low_precision_fp     r  varUnion[CSEVariable, Any]c                 C  s,   t | tsdS | j}t |tjrt|S dS rV  )rh  rJ   r  rv   r  )r  r  r{   r{   r|   low_precision_fp_var  s   
r  c                      s&   e Zd Zd fddZd	d
 Z  ZS )TritonCSEVariableboundsValueRanges[Any]r  r  rr   rs   c                   s0   t  ||| tt  | _|d usJ dd S )Nz!TritonCSEVariable must have dtype)super__init__r   r   r   )r   r?  r  r  	__class__r{   r|   r    s   zTritonCSEVariable.__init__c                 C  sf   |D ].}t |tr| j|j qt |tjr0tjD ]}t||r/| jt	|  dg  nqqd S )Nr   )
rh  r  r   updater   r   r   r   r   r   )r   r?  rG  kwargsargr   r{   r{   r|   update_on_args  s   


z TritonCSEVariable.update_on_args)r  r  r  r  rr   rs   )ru   r~   r   r  r  __classcell__r{   r{   r  r|   r    s    r  rd   c                  C  s   ddl m}  |  S )Nr   rc   )!torch._inductor.dtype_propagationrd   rc   r{   r{   r|   get_dtype_handler  s   r  Trp   Callable[[_T], _T]c                   s.   ddddfddd fd
d}|S )z
    Codegen helper to upcast arguments to float32, depending on the config and dtype.
    This decorates tl.math/libdevice codegen functions.
    rr   rq   c                 S  s&   t jj ot| to| jtjtjfv S rt   )	r   r   codegen_upcast_to_fp32rh  rJ   r  rv   float16bfloat16r  r{   r{   r|   needs_upcast  s
   
z*maybe_upcast_float32.<locals>.needs_upcastr   c                   s    | rdnd}|  | S )N.to(tl.float32)r   r{   )r  upcast_stringr  r{   r|   maybe_upcast_arg  s   z.maybe_upcast_float32.<locals>.maybe_upcast_argrn   Callable[..., Any]c                   s$   t   d fdd}|S )Nrr   r   c            	        s   fdd| D }fdd|  D }|i |} o-tfddt| | D }|s2d ntt j| i |}|tj	d fv}|rS|d urSdt
| dnd	}| | S )
Nc                   rC  r{   r{   )r   r  r  r{   r|   r      r  zLmaybe_upcast_float32.<locals>.decorator.<locals>.wrapped.<locals>.<listcomp>c                   r!  r{   r{   r"  r  r{   r|   r   !  r%  zLmaybe_upcast_float32.<locals>.decorator.<locals>.wrapped.<locals>.<dictcomp>c                 3  s    | ]} |V  qd S rt   r{   r   r  r  r{   r|   r   %      
zKmaybe_upcast_float32.<locals>.decorator.<locals>.wrapped.<locals>.<genexpr>.to(r   r   )r+  r   	itertoolschainvaluesr  r  ru   rv   rw   r<   )	rG  r  upcast_argsupcast_kwargsr:  any_needs_upcastresult_dtypeneeds_downcastdowncast_string)rp   rn   r  r  r{   r|   wrapped  s$   z8maybe_upcast_float32.<locals>.decorator.<locals>.wrappedr   )rk   r}   )rn   r
  rp   r  r  )rn   r|   	decorator  s   z'maybe_upcast_float32.<locals>.decoratorNr   r   )rn   r  rr   r  r{   )rp   r  r{   r  r|   maybe_upcast_float32	  s   
r  c                   @  s  e Zd ZdZeejZe		dddd	Z	edd
dZ
edd Zedd Zee dd Zedd Zedd Zee dd Zee dd Zee dd Zee dd Zee dd Zee d d! Zee d"d# Zed$d% Zed&d' Zed(d) Zed*d+ Zedejdd,d-d.d/Zee d0d1 Z ee d2d3 Z!ee d4d5 Z"ee d6d7 Z#ed8d9 Z$ed:d; Z%ee d<d= Z&ee d>d? Z'ee d@dA Z(ee dBdC Z)ee dDdE Z*ee dFdG Z+ee dHdI Z,ee dJdK Z-ee dLdM Z.ee dNdO Z/ee dPdQ Z0ee dRdS Z1ee dTdU Z2ee dVdW Z3ee dXdY Z4ee dZd[ Z5ee d\d] Zee d^d_ Z6ed`da Z7edbdc Z8eddde Z9edfdg Z:edhdi Z;edjdk Z<edldm Z=edndo Z>edpdq Z?edrds Z@edtdu ZAedvdw ZBedxdy ZCedzd{ ZDee d|d} ZEee d~d ZFee dd ZGee dd ZHee dd ZIedd ZJee dd ZKee dd ZLee dd ZMee dd ZNeedddd ZOeedddd ZPee dd ZQee dd ZRedd ZSedd ZTee dd ZUedd ZVee dd ZWdS )TritonOverrideszMap element-wise ops to TritonNTr  r  	src_dtypeOptional[torch.dtype]c                 C  sz   ddd}|d urt |||tjjtj_|tjkr d|  d	S |tjkr*|  d
S |r1t|}nt|}|  d| dS )Nr  r  	dst_dtyperr   r  c                 S  sl   | |krdS t jt jf}| |v r||v r| |krJ d| t jks&|t jkr(dS | t jks2|t jkr4dS dS )Nr   zCConversions between float8_e5m2 and float8_e4m3fn is not supported!r  r   )rv   float8_e4m3fnfloat8_e5m2)r  r  
fp8_dtypesr{   r{   r|   _get_min_elements_per_threadF  s   
z>TritonOverrides.to_dtype.<locals>._get_min_elements_per_threadr  z != 0)z.to(tl.int8).to(tl.uint8)r   r   )r  r  r  r  rr   r  )	maxrB   r  min_elem_per_threadrv   rq   uint8r  r  )r'  r  r  use_compute_typesr  	out_dtyper{   r{   r|   to_dtype?  s   




zTritonOverrides.to_dtypec           
      C  s   t |}|tjtjfv rNtjjrNt|dd }|  d| d}|tjtjfv r7t|dd }d| }| d| d}|tjtjfv rL| dS |S t	|}t	|}||kr\d	nd
}	|  d| d|	 dS )Nr  r  z.to(tl.r   ztl.r   z, bitcast=True)r  TrueFalsez
, bitcast=)
r  rv   r  r  r   r   r  r   splitr  )
r'  r  r  triton_dtypetriton_src_dtypecast_xtriton_type_namesrc_dtype_bitwidthtarget_dtype_bitwidthbitcastr{   r{   r|   to_dtype_bitcastx  s"   

z TritonOverrides.to_dtype_bitcastc                 C  sD   t j|}t|| }t|}|dkr|S d| d| d| dS )Nz
tl.float32tl.full(r   r   )rv   _prims_commondtype_to_typerU   r  )r   r  r   type_
triton_valr<   r{   r{   r|   _shaped_constant  s   z TritonOverrides._shaped_constantc                 C  s   | j ||g dS )Nr   )r,  )ry   r   r  r{   r{   r|   constant  s   zTritonOverrides.constantc                 C     d|  dS )Nr  r   r{   r'  r{   r{   r|   abs     zTritonOverrides.absc                 C  X   d|  d| d}t | st |r*t | |}|tjtjfv r*| dt| d}|S )Nr  r  r   r   )r  r  truedivrv   r  rw   r<   r'  youtr  r{   r{   r|   r4    s   zTritonOverrides.truedivc                 C  r3  )Nr  r  r   r   )r  r  modrv   r  rw   r<   r5  r{   r{   r|   r8    s   zTritonOverrides.modc                 C  r/  )Nzlibdevice.abs(r   r{   r0  r{   r{   r|   libdevice_abs  r2  zTritonOverrides.libdevice_absc                 C  s&   t jrd|  dtj dS d|  dS )z
        When use_fast_math, use the ftz (flushing to zero) variant
        of exponent computation.

        Check https://github.com/triton-lang/triton/issues/5735 for
        more details.
        libdevice.exp2(r  r   ztl_math.exp()r   use_fast_mathr  _LOG_2_Er0  r{   r{   r|   exp  s   
zTritonOverrides.expc                 C  r/  )Nzlibdevice.exp(r   r{   r0  r{   r{   r|   libdevice_exp  r2  zTritonOverrides.libdevice_expc                 C  r/  )Nr:  r   r{   r0  r{   r{   r|   exp2  r2  zTritonOverrides.exp2c                 C  r/  )Nzlibdevice.expm1(r   r{   r0  r{   r{   r|   expm1  r2  zTritonOverrides.expm1c                 C  r/  Nzlibdevice.sqrt(r   r{   r0  r{   r{   r|   sqrt  r2  zTritonOverrides.sqrtc                 C  r/  rA  r{   r0  r{   r{   r|   libdevice_sqrt  r2  zTritonOverrides.libdevice_sqrtc                 C  sl   t jj}|dkr
dS |dkrd|  d|  dS |dkr |  dS |d u r/ttd	tj| S td
|)Ncompile_errorzcompile error!runtime_errorz"triton_helpers.device_assert_then(z == 0, "injected assert fail", r   accuracyz + 1r   z:unrecognized config triton.inject_relu_bug_TESTING_ONLY = )	r   r   inject_relu_bug_TESTING_ONLYopsmaximumr.  rv   r  AssertionError)r'  bugr{   r{   r|   relu  s   
zTritonOverrides.reluc                 C     d|  d| dS )Nztriton_helpers.minimum(r   r   r{   r  r  r{   r{   r|   minimum     zTritonOverrides.minimumc                 C  rM  )Nztriton_helpers.maximum(r   r   r{   rN  r{   r{   r|   rI    rP  zTritonOverrides.maximumc                 C  s   d|  d| d| dS )Nr  r   r   r{   )r  r  r  r{   r{   r|   where  s   zTritonOverrides.whererD   )constraintsr  is_purepackc                 G  sh   t |}ddd |D }|d u rddgdd |D  }d|  d| d| d	| d
| d| dS )Nr   c                 S  r  r{   r   r   ir{   r{   r|   r     r  z:TritonOverrides.inline_asm_elementwise.<locals>.<listcomp>z=rc                 S  s   g | ]}d qS )r   r{   r   _r{   r{   r|   r     s    ztl.inline_asm_elementwise('z', 'z', [z	], dtype=z
, is_pure=z, pack=r   )r  r   )asmrR  r  rS  rT  inputsr<   
input_refsr{   r{   r|   inline_asm_elementwise  s
   *z&TritonOverrides.inline_asm_elementwisec                 C  r/  )Nztl_math.cos(r   r{   r0  r{   r{   r|   cos  r2  zTritonOverrides.cosc                 C  r/  )Nzlibdevice.cos(r   r{   r0  r{   r{   r|   libdevice_cos  r2  zTritonOverrides.libdevice_cosc                 C  r/  )Nztl_math.sin(r   r{   r0  r{   r{   r|   sin  r2  zTritonOverrides.sinc                 C  r/  )Nzlibdevice.sin(r   r{   r0  r{   r{   r|   libdevice_sin$  r2  zTritonOverrides.libdevice_sinc                 C     t d)Nz/ops.index_expr not implemented outside a kernelNotImplementedError)ry   r  r  r{   r{   r|   
index_expr)  r   zTritonOverrides.index_exprc                 C  rb  )Nz+ops.masked not implemented outside a kernelrc  )r   bodyotherr{   r{   r|   masked-  r   zTritonOverrides.maskedc                 C  r/  )Nzlibdevice.lgamma(r   r{   r0  r{   r{   r|   lgamma1  r2  zTritonOverrides.lgammac                 C  r/  )Nzlibdevice.erf(r   r{   r0  r{   r{   r|   erf6  r2  zTritonOverrides.erfc                 C  r/  )Nzlibdevice.cosh(r   r{   r0  r{   r{   r|   cosh;  r2  zTritonOverrides.coshc                 C  r/  )Nzlibdevice.sinh(r   r{   r0  r{   r{   r|   sinh@  r2  zTritonOverrides.sinhc                 C  r/  )Nzlibdevice.acos(r   r{   r0  r{   r{   r|   acosE  r2  zTritonOverrides.acosc                 C  r/  )Nzlibdevice.acosh(r   r{   r0  r{   r{   r|   acoshJ  r2  zTritonOverrides.acoshc                 C  r/  )Nzlibdevice.asin(r   r{   r0  r{   r{   r|   asinO  r2  zTritonOverrides.asinc                 C  r/  )Nzlibdevice.asinh(r   r{   r0  r{   r{   r|   asinhT  r2  zTritonOverrides.asinhc                 C  rM  )Nzlibdevice.atan2(r   r   r{   r'  r6  r{   r{   r|   atan2Y     zTritonOverrides.atan2c                 C  r/  )Nzlibdevice.atan(r   r{   r0  r{   r{   r|   atan^  r2  zTritonOverrides.atanc                 C  r/  )Nzlibdevice.atanh(r   r{   r0  r{   r{   r|   atanhc  r2  zTritonOverrides.atanhc                 C  rM  )Nzlibdevice.copysign(r   r   r{   rq  r{   r{   r|   copysignh  rs  zTritonOverrides.copysignc                 C  r/  )Nzlibdevice.erfc(r   r{   r0  r{   r{   r|   erfcm  r2  zTritonOverrides.erfcc                 C  r/  )Nzlibdevice.erfinv(r   r{   r0  r{   r{   r|   erfinvr  r2  zTritonOverrides.erfinvc                 C  rM  )Nzlibdevice.hypot(r   r   r{   rq  r{   r{   r|   hypotw  rs  zTritonOverrides.hypotc                 C  r/  )Nzlibdevice.log10(r   r{   r0  r{   r{   r|   log10|  r2  zTritonOverrides.log10c                 C  r/  )Nzlibdevice.log2(r   r{   r0  r{   r{   r|   log2  r2  zTritonOverrides.log2c                 C  rM  )Nzlibdevice.nextafter(r   r   r{   rq  r{   r{   r|   	nextafter  rs  zTritonOverrides.nextafterc                 C     |  d| S Nr   r{   rN  r{   r{   r|   logical_and     zTritonOverrides.logical_andc                 C  s
   |  dS )Nz == 0r{   r  r{   r{   r|   logical_not     
zTritonOverrides.logical_notc                 C  r}  Nz | r{   rN  r{   r{   r|   
logical_or  r  zTritonOverrides.logical_orc                 C  rM  )Nr   ^ r   r{   rN  r{   r{   r|   logical_xor  rP  zTritonOverrides.logical_xorc                 C  r}  r~  r{   rN  r{   r{   r|   bitwise_and  r  zTritonOverrides.bitwise_andc                 C  s
   d|  S )N~r{   r  r{   r{   r|   bitwise_not  r  zTritonOverrides.bitwise_notc                 C  r}  r  r{   rN  r{   r{   r|   
bitwise_or  r  zTritonOverrides.bitwise_orc                 C  r}  )Nr  r{   rN  r{   r{   r|   bitwise_xor  r  zTritonOverrides.bitwise_xorc                 C  r}  )Nz << r{   rN  r{   r{   r|   bitwise_left_shift  r  z"TritonOverrides.bitwise_left_shiftc                 C  r}  )Nz >> r{   rN  r{   r{   r|   bitwise_right_shift  r  z#TritonOverrides.bitwise_right_shiftc                 C     d| d}d|  d| dS )Nr  ).to(tl.uint32)ztl.rand(r   r   r{   seedr   r{   r{   r|   rand     zTritonOverrides.randc                 C  r  )Nr  r  z	tl.randn(r   r   r{   r  r{   r{   r|   randn  r  zTritonOverrides.randnc              	   C  s*   d| d}d|  d| d| d| d	S )Nr  r  ztriton_helpers.randint64(r   r   r{   )r  r   lowhighr{   r{   r|   	randint64  s   zTritonOverrides.randint64c                 C  rb  )Nz.ops.load_seed not implemented outside a kernelrc  )r?  r   r{   r{   r|   	load_seed  r   zTritonOverrides.load_seedc                 C  r/  )Nzlibdevice.rsqrt(r   r{   r0  r{   r{   r|   rsqrt  r2  zTritonOverrides.rsqrtc                 C  r/  )Nzlibdevice.log1p(r   r{   r0  r{   r{   r|   log1p  r2  zTritonOverrides.log1pc                 C  r/  )Nzlibdevice.tan(r   r{   r0  r{   r{   r|   tan  r2  zTritonOverrides.tanc                 C  r/  )Nzlibdevice.tanh(r   r{   r0  r{   r{   r|   tanh  r2  zTritonOverrides.tanhc                 C  r/  )Nztl.sigmoid(r   r{   r0  r{   r{   r|   sigmoid  r2  zTritonOverrides.sigmoidc                 C  s   d|  d|  d|  dS )Nz(libdevice.signbit(z) != 0) if (z).dtype is tl.float32 else z < 0r{   r0  r{   r{   r|   signbit  s   zTritonOverrides.signbitc                 C  rM  )Nzlibdevice.fmod(r   r   r{   rN  r{   r{   r|   fmod  rs  zTritonOverrides.fmodc                 C  rM  )Nr  r   r   r{   rN  r{   r{   r|   pow  rs  zTritonOverrides.powc                 C  r/  )Nztl_math.log(r   r{   r0  r{   r{   r|   log  r2  zTritonOverrides.logc                 C  r/  )Nzlibdevice.log(r   r{   r0  r{   r{   r|   libdevice_log  r2  zTritonOverrides.libdevice_logF)rp   c                 C  r/  )Nzlibdevice.isinf().to(tl.int1)r{   r0  r{   r{   r|   isinf  r2  zTritonOverrides.isinfc                 C  r/  )Nzlibdevice.isnan(r  r{   r0  r{   r{   r|   isnan  r2  zTritonOverrides.isnanc                 C  r/  )Nzlibdevice.nearbyint(r   r{   r0  r{   r{   r|   round  r2  zTritonOverrides.roundc                 C  r/  )Nr  r   r{   r0  r{   r{   r|   floor	  r2  zTritonOverrides.floorc                 C  sF   |  d| }|  d| }d|  d| d| d| d| d| d	S )
Nr  r  z
tl.where((z
 < 0) != (z < 0), tl.where(z != 0, z - 1, ), r   r{   )r  r  r  remr{   r{   r|   floordiv  s   *zTritonOverrides.floordivc                 C  sV   t dtj}t t || tj}t t | |tj}t ||}| d|  dS )Nr   r   .dtype))rH  r.  rv   r  r  ltr  sub)r'  zleftrightr  r{   r{   r|   sign  s
   zTritonOverrides.signc                 C  r/  )Nrp  r   r{   r0  r{   r{   r|   trunc  r2  zTritonOverrides.truncc                 C  r}  )Nr  r{   rN  r{   r{   r|   truncdiv$  s   zTritonOverrides.truncdivc                 C  r/  )Nr  r   r{   r0  r{   r{   r|   ceil*  r2  zTritonOverrides.ceil)NT)r  r  r  r  )r  r  r  r  )Xru   r~   r   r   mathr{  er<  r\  r  r&  r,  r   r.  r  r1  r4  r8  r9  r=  r>  r?  r@  rB  rC  rL  rO  rI  rQ  rv   rw   r]  r^  r_  r`  ra  re  rh  ri  rj  rk  rl  rm  rn  ro  rp  rr  rt  ru  rv  rw  rx  ry  rz  r|  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r{   r{   r{   r|   r  :  s   8


	





	



















r  r   c                   @  sL   e Zd ZdZedd Zedd Zedd Zedd	 Z	ed
d Z
dS )TritonKernelOverridesa   Map element-wise ops to Triton within a TritonKernel

    Unlike TritonOverrides, these assume the code is going to be inserted into
    the body of the main triton kernel and so it may use indexing and mask
    variables which are assumed to already be defined in the current scope.
    c                 C  s$   t j }dg| }| j|||dS )NrD   r-  )rB   r  triton_tensor_ndimr,  )ry   r   r  ndimr   r{   r{   r|   r.  ;  s   

zTritonKernelOverrides.constantc                 C  s6  t jj|dd}t|tsJ t jjdkrtjntj}|tjtjfvr%|n|}t	j
j}zdt	j
_t jjjt jj|jt||d}W |t	j
_n|t	j
_w |tjtjfvrft jjjt jj| ||t|d}n/|}|jD ]}t|tjrt|t jjj|j j}qk||krt jjjt jj| |||d}|j|_|S )NF	block_ptrtl.int32r  r  r  )rB   r  indexingrh  r   rt  rv   r  int64r   test_configsruntime_triton_dtype_assertcsegeneratecomputer   r3   r  r>   free_symbolsr   r   r   promote_typesvarname_mapr?  r  r   )ry   r  r  r  rt  origr  	index_varr{   r{   r|   re  D  sF   



z TritonKernelOverrides.index_exprc              	   C  s<  | d urt jjd urtjjjtjj|  dt jd} |j	j
dd}|s'J dd}|D ]}|jD ]}|jdks>t|jd rBd	} nq0q+|rHd n|}tjj| |d
}| }	W d    n1 s`w   Y  |r|	jjrot|}tjjjtjjd|	 dt| d|	 dt||	jd}t||	|}
n|	}
|
j| |
S )N.to(tl.int1)r  output)opz)graph for body does not contain an outputFloadrD   Tr   r'  z.shape, r   r  r  )rv   ry  rz  rB   r  r  r  r  rq   r   
find_nodesrG  targetr^   
mask_loadsr  is_boolrU   r   wrapr  rH  rQ  r   discard)r   rf  rg  nodes
need_wherenoder  r   new_maskr:  r{  r{   r{   r|   rh  w  sB   
zTritonKernelOverrides.maskedc                 C  s,   t jj| }d| dt jjd| dS )Ntl.load( + load_seed_offsetr   )rB   r  rG  inputseed_offset)r?  r   r  r{   r{   r|   r    s   zTritonKernelOverrides.load_seedc                 C  s   d|  d}t jj| }r|S t jjj| jd}t jjjtjd}t jj	| d| d|  d t jj
|||f ||fS )Nzfrexp(r   r  r   z = triton_helpers.frexp()rB   r  r  try_getnewvarr  rv   r  r  r   put)r'  	cache_keycse_valmantissaexponentr{   r{   r|   frexp  s   zTritonKernelOverrides.frexpN)ru   r~   r   r   r   r.  re  r\  rh  r  r  r{   r{   r{   r|   r  3  s    

2
,
r  c                   @  sL   e Zd ZU dZded< ded< ddd	Zd
ddddZdd Zdd ZdS )HelperFunctionsz#An ordered set of helper functions.zdict[str, str]_templates_seen	list[str]finalized_helpersrr   rs   c                 C  s   i | _ g | _d S rt   )r  r  r   r{   r{   r|   r    s   
zHelperFunctions.__init___triton_helper_fn	base_nametemplate_coder   c                C  sL   | j |}|dur|S | t| j }|| j |< | j|j|d |S )a9  This accepts a function definition with the function name
        left as a format specifier e.g.

            @triton.jit
            def {name}(arg0, arg1):
                return arg0 + arg1

        We add the templated code to the function set and return the name
        assigned to that function.

        N)r?  )r  getr   r  ri  rH  )r   r  r  existing_namer?  r{   r{   r|   add  s   
zHelperFunctions.addc                 C  r   rt   )iterr  r   r{   r{   r|   __iter__  r   zHelperFunctions.__iter__c                 C  
   | j | S rt   )r  )r   rN  r{   r{   r|   __getitem__  r   zHelperFunctions.__getitem__Nrr   rs   )r  r   rr   r   )	ru   r~   r   r   r   r  r  r  r  r{   r{   r{   r|   r    s   
 
r  c                   @  sl   e Zd ZU dZejedZded< ejedZ	ded< ejedZ
ded< ejedZded< dd
dZdS )r   zM
    Class representing ND block dimensions, for block pointer analysis.
    )default_factoryr   r   r   r   r   rg  rr   c                   s@   t | }tdd | |fD \ |di  fdd D S )z0
        Concatenates block parameters.
        c                 s  s    | ]}t |V  qd S rt   )r)  r*  r  r{   r{   r|   r         z*BlockParameters.__add__.<locals>.<genexpr>c                   s   i | ]}| | |  qS r{   r{   )r   r#  rN  r{   r|   r         z+BlockParameters.__add__.<locals>.<dictcomp>Nr{   )r  r  )r   rg  ry   r{   rN  r|   __add__  s   zBlockParameters.__add__N)rg  r   rr   r   )ru   r~   r   r   r)  fieldr4  r   r   r   r   r   r  r{   r{   r{   r|   r     s   
 r   c                   @  s2   e Zd ZdZdd ZdddZdd	 Zd
d ZdS )"CooperativeReductionWorkspaceCachez
    The scratch space used for cooperative reductions can be reused
    after two reduction loops.  This keeps track of what can be reused.
    c                 C  s0   || _ g | _g | _ttj| _d| _d| _d S r@  )	rG  current_loop
prior_loopcollectionsdefaultdictdequeready_for_reuse
loop_countstore_count)r   rG  r{   r{   r|   r    s   
z+CooperativeReductionWorkspaceCache.__init__nbytesr   c                 C  sD   | j |}|r| S | j|d\}}| j|||f ||fS rV  )r  r  popleftrG  	workspacer  ri  )r   r  cachedws_name	ws_offsetr{   r{   r|   allocate  s   z+CooperativeReductionWorkspaceCache.allocatec                 C  sF   | j D ]\}}}| j| ||f q| j| _ g | _|  jd7  _d S NrD   )r  r  ri  r  r  )r   r  r	  r
  r{   r{   r|   on_loop_end  s
   z.CooperativeReductionWorkspaceCache.on_loop_endc                 C  s   | j }|  j d7  _ |S r  )r  )r   priorr{   r{   r|   increment_store_count  s   z8CooperativeReductionWorkspaceCache.increment_store_countN)r  r   )ru   r~   r   r   r  r  r  r  r{   r{   r{   r|   r    s    
r  c                   @  s&   e Zd ZU ded< dd Zdd ZdS )FixedTritonConfigzdict[str, int]r   c                 C  r  rt   r   r   r  r{   r{   r|   r    r   zFixedTritonConfig.__getitem__c                 C  s
   || j v S rt   r  r  r{   r{   r|   __contains__   r   zFixedTritonConfig.__contains__N)ru   r~   r   r   r  r  r{   r{   r{   r|   r    s   
 r  c                   @  s   e Zd ZdZd	ddZdS )
	TritonCSEz
    Subclasses CSE to apply the current load mask to the cache key to avoid CSEing
    variables across separate masked blocks.
    r  r   rr   Union[str, tuple[str, str]]c                 C  s   t jj }r||jfS |S rt   )rB   r  
_load_maskr?  )r   r  r   r{   r{   r|   augment_key*  s   
zTritonCSE.augment_keyN)r  r   rr   r  )ru   r~   r   r   r  r{   r{   r{   r|   r  $  s    r  c                      s  e Zd ZU eZded< eZded< dZ			ddχ fddZ	dddZ
dddZdd Zdd Zdd Zdd Zdd d!Zd"d# Zedd$d%Zdd&dd&d'dd*d+Z	,ddd2d3Zdd4d5Zdd:d;Zd<d= Zdd>d?Z	dddDdEZdFdG Z		dddRdSZddTdUZddVdWZdd\d]Zdd^d_Z dd`daZ!dbdc Z"ddde Z#dfdg Z$dhdi Z%djdk Z&ddldmZ'ddpdqZ(ddudvZ)ddydzZ*d{d| Z+dd~dZ,dd Z-dd Z.dd Z/e0dd Z1dddZ2e0dd Z3e0dd Z4dd Z5dddZ6dd Z7ddddZ8dddZ9dddZ:dddZ;dddZ<dddZ=dddZ>dddZ?dddZ@dddZAdddZBdddZCeDdddZEdddZFdddZGeDdddńZHdddȄZIdddʄZJddd̈́ZK  ZLS )TritonKernelr  helper_functionszCallable[[sympy.Expr], str]kexprTr   Ntilingdict[str, sympy.Expr]fixed_configOptional[FixedTritonConfig]rr   rs   c                   s   || _ || _t j|fi | t| j| j| _t | _	t | _
tt  | _|| _t | _tttf  | _t | _tt| _t | _tt  | _d | _| jrW|  | j! | j"r^| #  | $  | j"rk| %  d S d S rt   )&optimize_maskr  r  r  r  newvar_prefixsuffixr  rL   post_loop_combinepost_loop_storer   r   outside_loop_varsr  r  countblock_ptr_iddictr   block_ptr_to_bufferr  r  r  r   pointer_advancementsCounter_load_countsr'   autotune_hintstriton_metar0  codegen_reduction_numelsrf  cooperative_reductioninit_cooperative_reductioncodegen_range_treeinit_cooperative_reduction_mask)r   r  r  r  r  r  r  r{   r|   r  7  s0   

zTritonKernel.__init__r  r  r   c                 C  s   t |S rt   )r<   )r   r  r{   r{   r|   dtype_to_str^  rX  zTritonKernel.dtype_to_strrq   c                 C  s   | j o	tj| jS rt   )r0  rB   choices should_use_cooperative_reductionr2  r   r{   r{   r|   r5  a  s   z-TritonKernel.should_use_cooperative_reductionc                   s    j sJ  jD ]}|jdur| jd7  _q jd } jr't| jd } j| _t	 j _
 jd t fdd jD rN jd dS dS )	z/One time setup code for cooperative reductions.NrD   r'  r   a              RSPLIT_NEXT_POWER_OF_2: tl.constexpr = triton_helpers.constexpr_next_power_of_2(RSPLIT)
            RSPLIT_IS_POWER_OF_2: tl.constexpr = RSPLIT == RSPLIT_NEXT_POWER_OF_2
            HAS_RSPLIT: tl.constexpr = RSPLIT > 1
            rsplit_id = tl.program_id(0)
            num_rblocks = (rnumel + RBLOCK - 1) // RBLOCK
            rsplit_chunk = (num_rblocks + RSPLIT - 1) // RSPLIT * RBLOCK
            rsplit_start = rsplit_chunk * rsplit_id
            rsplit_end = rsplit_chunk * (rsplit_id + 1)
            c                 3  s"    | ]}|j r | V  qd S rt   )r3  _has_constant_maskr&  r   r{   r|   r     s    

z:TritonKernel.init_cooperative_reduction.<locals>.<genexpr>z>rsplit_end = tl.where(rsplit_end < rnumel, rsplit_end, rnumel))r/  r  grid_dimr1  r  r   rG  
semaphoressemaphores_namer  %cooperative_reduction_workspace_cacherf  r   r   r   )r   r   	sem_countr{   r   r|   r0  f  s,   



z'TritonKernel.init_cooperative_reductionc                 C  sX   d}| j s
| d}| jd|  |  r| jd d S | j r$J | jd d S )Nz$tl.arange(0, RSPLIT_NEXT_POWER_OF_2)z	[None, :]zrsplit_arange = z                if RSPLIT_IS_POWER_OF_2:
                    rsplit_mask: tl.constexpr = None
                else:
                    rsplit_mask = rsplit_arange < RSPLIT
                zSrsplit_mask = xmask if RSPLIT_IS_POWER_OF_2 else ((rsplit_arange < RSPLIT) & xmask))r,  rf  r   _has_constant_xmaskr   )r   rsplit_aranger{   r{   r|   r2    s   

	z,TritonKernel.init_cooperative_reduction_maskc                 C  s   | j D ]}|js| || j q| jr"| j|j d| |  q| jrStdd | j D rK| j	dddd}| 
|}| jd| |  d S | | j d S d S )Nzbase = c                 s  s    | ]}|j V  qd S rt   is_loopr&  r{   r{   r|   r     s    z2TritonKernel.codegen_range_tree.<locals>.<genexpr>baseTr   zrbase = )r  r?  iteration_ranges_codegen_headerrf  r0  r   r-  iteration_ranges_ranges_coder   _get_reduction_symbols_flatten_reduction_indicesr   r  codegen_reduction_indices)r   r   rn_basesrbaser{   r{   r|   r1    s"   

zTritonKernel.codegen_range_treec                 C  rU  )z
        Indicate whether we need provide numel as arguments for the generated
        kernel calls in the benchmark.

        Should be true for pointwise/reduction kernels but false for triton
        matmul kernels.
        Tr{   r   r{   r{   r|   need_numel_args  s   zTritonKernel.need_numel_argsc                 C  s   | j otj| j| jS rt   )r0  rB   r4  should_use_persistent_reductionr2  r/  r   r{   r{   r|   rI    s   z,TritonKernel.should_use_persistent_reductionc                 C  s@   | j rt| j| jd kr| jr| jd dkS tj| jS dS )NrD   r   F)	persistent_reductionr   r1  r/  r  rB   r4  want_no_x_dimr2  r   r{   r{   r|   rK    s   zTritonKernel.want_no_x_dimc                 C  rU  )Nztl.device_assertr{   r   r{   r{   r|   assert_function  s   zTritonKernel.assert_functionF)
copy_shapedense_indexingoverride_maskr  r   r   c             	     s       j}d}tt  |D ]]ttjsJ |p"ttj	}|r&qtt
jr:jjj }|j qtt
jt
jt
jt
jt
jt
jfrLqfddtjD }	t|	dksdJ dj |	d  d qtjjpy|pyjduo} dk}
d	}d}tt  } D ]}||j rd	}nd}||j! d q|rj"rtjj#r|sjst| dkr$ s|rj%d
krd#ddd#fddd$fddd% fdd}| }|dur|S d}& }t tj'r2|r| dn( }d| d| d}j)r* stdgnt jr*j t+||| S |
rP|sP|r@| dn( }d | d| d!}|n|sa|rad | d| d"}||rit|gjrsj , t+||| S )&zO
        Compute the index and mask to pass to tl.load() or tl.store()
        Fc                   s   g | ]}t  |rt| qS r{   )r   r   r   r  r{   r|   r         z)TritonKernel.indexing.<locals>.<listcomp>rD   zAmbiguous type: r   r   NTr  r   r   
range_treerX   rr   Optional[BlockParameters]c                 S  sB   t | | }|du rdS t|jgt|g|gt|gdS )z
                Matches expressions of the form:
                    idx = s * xindex

                This implies stride (s,), and shape (XBLOCK,).
                Nr   r   r   r   )rE   match_affine_block_exprsymbolr   numelr   r   r   )r   rQ  r  r{   r{   r|   match_affine_block%  s   	

z1TritonKernel.indexing.<locals>.match_affine_blockc              
     s      tjdtjtj gdd\}}tdtj| 	t
 || 	t || }t|  j|}|du r;dS |\}}}t|}	tjjjtfdd|	D r]dS tt|	d gfd	d
t|	dd |dd D  }
 fdd
|D }t||
||dS )a  
                Matches higher-dimensional blocks coming from FloorDiv and ModularIndexing.

                Example expression to match:
                   sN * ((rindex//(d1 * ... * d(N-1))))
                       + s1 * ModularIndexing(rindex, 1, d1)
                       + ...
                       + s(N-1) * ModularIndexing(rindex, d1 * ... * d(N-2), d(N-1))

                This iterates over a block of shape (dN, ..., d1) and stride
                (sN, ..., s1). (d1,...,d(N-1)) and (s1,...,sN) are
                wildcards that we match.

                Note that dN does not appear in the expression, but we solve for it
                using range tree numels and the other dims.
                zdenom modulo)exclude)ry   r   Nc                 3  s*    | ]} |  o| V  qd S rt   )rL  statically_known_power_of_2)r   rV  )	max_blockr   r{   r|   r   z  s    

zETritonKernel.indexing.<locals>.match_mod_div_block.<locals>.<genexpr>r   c                   s"   g | ]\}}t t ||qS r{   )r   Minr   )r   rV  r   )linear_block_sizer{   r|   r     s    zFTritonKernel.indexing.<locals>.match_mod_div_block.<locals>.<listcomp>rD   c                   s    g | ]}t | tiqS r{   )r;   r   r   r  )r  rQ  r{   r|   r     s    rS  )rU  r   symbols	functoolspartialWildr  r   range_tree_nodesr%  r   r   rE   match_mod_div_block_exprrV  get_slice_numelsrB   r   r   rZ  r-  r   r   r   r   r   r   )r   rQ  denommodulonum_dimsmatch_resultr  r   block_index_exprsslice_numelsr   r   r   )r  r\  rZ  rQ  r   r|   match_mod_div_block;  sZ   
	



z2TritonKernel.indexing.<locals>.match_mod_div_blockr  c                   s,    fD ]}|| |}|dur|  S qdS )ze
                Match a block indexing subexpression involving a single range tree.
                Nr{   )r  rQ  
match_funcmatch)rW  rj  r{   r|   match_block_pointer_subexpr  s   
z:TritonKernel.indexing.<locals>.match_block_pointer_subexprOptional[BlockPtrOptions]c                    s   t dd j D  jdd}  fdd| D }tdd | D }t }t| |D ]!\}}t||j	d	kr> d S ||}|d u rJ d S ||7 }q- t
| } tj||| jd
S )Nc                 S  s   i | ]\}}||j qS r{   r  )r   vtr{   r{   r|   r     rb  zFTritonKernel.indexing.<locals>.match_block_pointer.<locals>.<dictcomp>T)reorderc                   s   g | ]
}t  | qS r{   )rE   get_subexpr_involving_symbolrU  r&  index_relative_to_xyr_indexr{   r|   r     s    zFTritonKernel.indexing.<locals>.match_block_pointer.<locals>.<listcomp>c                 s      | ]}|  V  qd S rt   )rU  r&  r{   r{   r|   r         zETritonKernel.indexing.<locals>.match_block_pointer.<locals>.<genexpr>rD   )r   r   r  r   r  )r;   ra  r+  active_range_treesr   r   r   r   intersectionr  sumfilter_masksr   r;  rZ  )r  index_subexprsrange_symbolsblock_paramsr   subexprr   r   )r   r   rm  r   rt  r|   match_block_pointer  s2   



z2TritonKernel.indexing.<locals>.match_block_pointerz.shaper'  r   z, tl.int32)xmaskr   r   .shape))r   r   rQ  rX   rr   rR  )r  r   rQ  rX   rr   rR  )rr   rn  )-prepare_indexingr  r   r   rh  r   r   r   r   r   r   r   r  r  r?  r  r   UNBACKED_INTSIZEPRECOMPUTED_SIZEINDEXFLOATUNBACKED_FLOATr   r   r  r   r   rN  r  rx  ry  var_listr-  allow_block_ptruse_block_ptris_indirect_indexingrt  r  rA  dense_size_strr  r<  r   r{  )r   r   rM  rN  rO  r  
index_varsr   cse_varprefix_matches
need_dense
have_densehave_loop_varsdense_mask_varsr   r  optionsr   r   r{   )r   r   rW  rm  rj  r   r  r|   r    s   





b-


zTritonKernel.indexingr   r?  r  r  r   tuple[str, str]c           
   
   C  s   |  }|s	d}n|r|dksJ d|d}nd|}| jrt| jd jrt| rtdt| j }| jt	|| d|j
|dd	  || j|< tjD ]"}||}td
d |D r^qM| j| }	||	vskJ d||	|< qM||fS |
|}||fS )Nr   , other=0.0, boundary_check=z, padding_option='zero'r  r   = F)r=  c                 s  s&    | ]}t jj|td V  qdS r   N)rB   r   r   r   r   rA  rD  r{   r{   r|   r     s
    
z1TritonKernel.codegen_block_ptr.<locals>.<genexpr>z@duplicate advancement for pointer '{block_ptr}' at type '{symt}')rP  r0  r  r?  r   nextr&  rf  r   rK   rH  r(  r   r   rT  r(  r)  )
r   r?  r  r  rg  checkr  r   advance_offsetsadvancementsr{   r{   r|   codegen_block_ptr  sD   








zTritonKernel.codegen_block_ptrc                 C  sF   | ||j|jd}| dttj| d}d| d| | dS )NFr   r   	tl.store(r   )r  r   r   r  rB   r   	get_dtype)r   r?  r  r  r   rg  r{   r{   r|   codegen_block_ptr_store_line,  s
   z)TritonKernel.codegen_block_ptr_store_liner  rm  lowerr   c                 C  s   |s|sd S t |tjsJ | j|dd}t |tsJ |j}| r&|jnd }|r1t| 	|nd }| 
||r:dnd ||}	| |}
| jj|
|	dtjd d S )NFr  0)
assignmentr  )rh  r   Exprr  r   r   r   r   texprrename_indexingindirect_assertget_load_bufferr  r  rv   r  )r   r  rm  r  r   r  r   r   size_strlinebufferr{   r{   r|   check_bounds6  s   
zTritonKernel.check_boundsc                 C  s<   |  s| r| jS | jr| jd jr| s| jS | jS )Nr  )	r   r   r  r0  r  r?  r   rf  loads)r   r  r{   r{   r|   r  P  s   
zTritonKernel.get_load_bufferc              
     s  | j }| j  d7  < t}| ||}| j|dd}| | }tdd | 	|
 D }| |r?d}	n(|sDd}	n#| jre| jd jre fdd	}
  d
}	ttd|
}nd}	|skr~| r~| jr{dt| j }nd}nd}	 d}tjjr| j }| dk}	 | | o| j o| o|}d}|rd}d }tj}tr|}|tjtj fv rtj!}nvt"|t#r| $|||\}}d| | |	 | d}|%||j&|j'd}n't"|t(j)rd| d| d}|j*}nd| d|j+ d|j, |	 | | d
}|tjtj fv r$tjj-r$|d7 }tj!}|tj.kr8tj/j0d u r8|d7 }tj.}| 1|}| j2j3||||d}|j4dkrV  d8  < t"|t5s^J |j6|_6|rd| d| d}| j2j3|||d}|j6r|j7rd}n|tj.krd}nd}| jrt| jn|}d|j, d| d| d}| j2j3|||d}| jr|8 ss| j9:| |S )NrD   Tr  c                 s  s    | ]}|d kV  qdS r   r{   rV  r{   r{   r|   r   t  r  z$TritonKernel.load.<locals>.<genexpr>z, eviction_policy='evict_last'r  c                     s     krs
rdS dS )N
evict_lastevict_firstr{   r{   expected_countr   indirect_indexingload_countsr?  r{   r|   decide_later}  s   z'TritonKernel.load.<locals>.decide_laterz, eviction_policy='<EP>'z<EP>r   z, other=r  z, cache_modifier='.cg'r  r   rE  r  r  r  r  r  r   r   z0.0r  r  r  );rG  r  r+  r   r  r  r   r   r   get_strides_of_loadr  is_broadcastedr0  r  r?  r^  r_  r2   r   _load_otherrU   r   r   skip_l1_cacher2  buffer_read_countsrB   r   r  r^   rv   r  r  rw   rh  r   r  r  r   r   r   rA  r   r   r   r  rq   ry  rz  r  r  r  	use_countr  r   r  r   r$  r  )r   r?  r   r  	make_lineoriginal_indexr  r   is_coalescedepr  rg  has_read_depsr  r  cachemodappend_broadcastr  r  r  load_buffer
result_varzero	other_valr{   r  r|   r  _  s   



(
zTritonKernel.loadr   rJ   moderA   c              	   C  sB  | j |}|}| j|d|d u d}|| j jv }| |}	|r*|	r*| jt|d t|t	rB| 
|||\}
}| |||
||}n3|d u rXd| d|j d| d|j d	}n|d	krnd
| d|j d| d|j d	}ntd| t }| js| jr|| || j | jt|| | js| j| |  d S )NT)rN  r  ztl.debug_barrier()r  rE  r  r   r   
atomic_addztl.atomic_add(z, sem='relaxed')zstore mode=)rG  r  r  inplace_buffersr  storesr   rK   rh  r   r  r  r   r   rd  
contextlib	ExitStackr0  r/  enter_contextguard_cooperative_storer$  r  close)r   r?  r   r   r  r  r  r  
is_inplacer  r  rg  r  
exit_stackr{   r{   r|   store  s0   


$$zTritonKernel.storec                 C  s*   | j  }|t|d| d | S )z
        For cooperative reductions only one thread block should write out the result.
        We rotate which thread block does each write for better parallelism
        zif rsplit_id == (z % RSPLIT):)r:  r  r   rK   indent)r   r?  r  rN  r{   r{   r|   r  	  s   
z$TritonKernel.guard_cooperative_storer  
boundaries.tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]boundary_indicesindexing_dtyper  sorter Optional[tuple[str, sympy.Expr]]sorter_indicesOptional[CSEVariable]c                 C  s   | j tj | j|d }| |d }	| |d }
| |d }|r.| j|d nd}|r9| |d nd}|tjkrCd}n|tj	krKd}nt
d| jj| jd	| d
| d
|	 d
|
 d
| d
| d
| d
| d
| d
| d
| d|d}|S )z3
        See [Note: Inductor bucketize op]
        r   rD   r   r   rs   r  ztl.int64z5Bucketize only supports indexing with int32 and int64z'triton_helpers.bucketize_binary_search(r   z, )r  )r,  r  r'   ONE_ELEMENT_PER_THREADrG  r  r  rv   r  r  rd  r  r  r  )r   r  r  r  r  r  r  r  boundaries_ptrboundary_sizeboundaries_underlying_numelboundary_stride
sorter_ptrsorter_strider  r:  r{   r{   r|   	bucketize	  sP   

zTritonKernel.bucketizec                 C  sP   |   }|dkrd| dS | j}dg||  dg|  }| dd| dS )	NrD   z!triton_helpers.promote_to_tensor(r   re  rs   rf  r   rg  )r  r/  r   )r   r   ndimsnreducesizesr{   r{   r|   reduction_resizeL	  s   zTritonKernel.reduction_resizec                 C  sT   | j dkr|S |  | j  }|  }|d| dg }t| jj|t||||dS )zC
        Reshape to RBLOCK, collapsing all reduction dims.
        rD   NRBLOCKr  )r/  r  dense_size_listr   r  r  r   )r   r  r   r  target_ndimr   target_shaper{   r{   r|   reduction_collapse_dimsU	  s   
z$TritonKernel.reduction_collapse_dimsr  reduction_typer@   +Union[CSEVariable, tuple[CSEVariable, ...]]c           .        s	  dKdd}dd t |D }t ||}tdd	 |D r,t|tj}ttjjs1J td
d	 j	D }
| t|}jrM|j j	d jd } fdd|} j dLfdddMfdd}	fdd}
||f}|jjv rjj| S t|}t|}jj|d}tdd	 |D |_d|fdd	jrtj|}t|}dN	fdd  d!krnt|t r fd"dt!||D }n ||}d#v r#t"jj#j$d$| d%| d&t%j&j'd'krtj(ntj)d}d(d)d# |
j$||| nd*kr@j*r8+||	|}n̈,|}nĈd+krit|t-sMJ |\}}}t fd,d	.j$|||D }nd!krv/|}nt|t0s~J jj#j$j$t"|d |j1d}nrjj2d-| |d}tj3|}t|}t|t sĈj45| d.  d/| d/| d0 d#v r3d-| d1}j67 }j45| d.  d/t8|j9 d/:| d0 d(d)d# j$;d2| d3| d4 d5| d/| d/| d/| d6| d7	| d8| d9| d7	| d8| d9 |
j<||| nt=rC+||	|}nd!krd-| d:}d-| d;}j45| d.  d<| d0 j45| d=  d/| d0 j$;d>| d3| d?| d/| d/| d/t>j? d@ j$;d>| d7	| d8| d>| d7	| d8| d>	 |}jjd}@j<||||}n?tA|}|||}j$5| d7	||  |tjBkr| dA}tC}|	j<t"||| n|	j<t"|t"|d  j*rtj3|}tDE }j<jFfD ]} | 5dB |G| H  qd#v rbj<5| dCI| dD  J| dE||}!j67 }J||t8|j9}"|
jF||!|" nt=rd*ksnJ |\}#}$}%J|#t||d }&J|$t||dF }'J|%t||dG }(KjF|#|$|%|&|'|(	 n?d!kr|\}}J|t||d })J|t||dF }*@jF|||)|* nJ|t||}+|	jFt"||+d  |L  |jj|< t|t rNtMdHd	 |D sJ jNO| dIv rtP|dFksJ tP|| }tP|tP|ks#J t!||D ]#\},}-|-d us3J |,j1|-krJj<5|, d7|, dJtC|- d0 q(|S t|tQsVJ jNR| |j1|d kr|d d usmJ j<5| d7| dJtC|d  d0 |S )ONr   rJ   rr   c                 S  s$   | j tjtjfv rt| tjS | S rt   )r  rv   r  r  rH  r  rw   r  r{   r{   r|   maybe_upcastm	  s   z,TritonKernel.reduction.<locals>.maybe_upcastc                 S  s   g | ]}|j qS r{   r  )r   r$  r{   r{   r|   r   {	      z*TritonKernel.reduction.<locals>.<listcomp>c                 s  s     | ]}|t jt jfv V  qd S rt   )rv   r  r  r  r{   r{   r|   r   }	  s    z)TritonKernel.reduction.<locals>.<genexpr>c                 s      | ]	}|j  d V  qdS r   Nr-  r&  r{   r{   r|   r   	  r  r  r   c                   s$   j jjd|  d  d| jdS )Nr   r   r   r  r  r  r  r  )rp  )r  r   r{   r|   <lambda>	  s
    z(TritonKernel.reduction.<locals>.<lambda>r   result_typer   c              
     s   dv }|rdnd} | |}dv r'| d d| d  d}n| d d	| d  d}|d
urD| d| d}|S )zK
            Helper to generate a reduction call, e.g. tl.sum.
            )r   r  minprodtriton_helperstl)r  r  r  z2(r   r   r  Nr   )r  r  )r  r   r  
use_helpermodule)r   r  r  r   r{   r|   final_reduction	  s   z/TritonKernel.reduction.<locals>.final_reductionr  rs   c                   s$    | ||}|  | d|  dS )zU
            Generate a reduction and assign it to an existing variable.
            r  N)r   )r  r  r   r  )r  r{   r|   final_reduction_define	  s   	z6TritonKernel.reduction.<locals>.final_reduction_definec                   sh    | |} | |}| d| d| d d| d| d  d| d| d d	 d S )
N                z_val, z_idx = triton_helpers.z_with_index(r   )
                r  _idx
                )r  r   r  )r  r  r   r   )r   r  root_opr   r{   r|   final_argreduce	  s*   z/TritonKernel.reduction.<locals>.final_argreducer  c                 s  s     | ]}t |d  s|V  qdS r  )r8   r  r{   r{   r|   r   	  s    
r   c                   s    s| S t  | |S rt   )r  rQ  )tvalfval)condr{   r|   
where_cond	  s   z*TritonKernel.reduction.<locals>.where_condc                   s    j j j| || jdS )Nr  r  )r   default)r   r
  r{   r|   _mask_value	  s   z+TritonKernel.reduction.<locals>._mask_valueonline_softmax_reducec                   s   g | ]	\}} ||qS r{   r{   )r   rp  d)r  r{   r|   r   	  r%  )argmaxargminr   zindex, r  r  r  r  welford_reducewelford_combinec                 3  s$    | ]}j jj| d V  qdS )r  N)r  r  r  r   r   r  r   r{   r|   r   
  s
    
rY   = tl.full(r   r   _indexr  _next, z_next = triton_helpers.z%imum_with_index(
                    z(index
                )
                r  _nextr  _max_sumz, float('-inf'),  = tl.zeros(z
                    zG_next = triton_helpers.online_softmax_combine(
                        z+
                    )
                    z.to(tl.int8)zif HAS_RSPLIT:z_bval = _val_bvalrD   r   c                 s  s    | ]}t |tV  qd S rt   )rh  r  r  r{   r{   r|   r   
  r  )r  r  r   )r   rJ   rr   rJ   )r   r   r  r   rr   r   )r  r   r   r   r  r   rr   rs   )rr   rJ   )Spytreetree_leavestree_mapr   rv   r  rw   r0  r   r  r{  sortedr  ri  r-  r  _map_tuple_or_scalarr  r/  r  reduction_cacher  r  r  r   r   rJ  r   	Reductiondefault_valuerU   rh  r  r   r   r  r  rB   r  rt  r  r  r/  r  welford_reduce_fallbackr   _welford prepare_softmax_twopass_fallbackrJ   r  namedvardefault_accumulatorrf  r   r2  select_index_dtypeiinfor  r3  r   r"  r6   r   r;  %online_softmax_reduce_final_reductionget_reduction_combine_fnrq   r  r  r  r#  r  r  r  *codegen_cooperative_reduction_peer_combinewelford_reduce_final_reductionr  r(  r$  r  r   r  r  ).r   r  r  r  r   r  original_dtypesmasksreduction_range_prefixr   r  r  acc_typetorch_acc_typer  r  masked_valueaccumulator_indexmeanm2weightaccumulatorrt  accumulator_maxaccumulator_sum
result_max
result_sum
combine_fnupdatedaccumulator_casted_strr  r  bufpeer_valpeer_idxresult_mean	result_m2result_weight	peer_meanpeer_m2peer_weightpeer_maxpeer_sumpeersr  
orig_dtyper{   )
r  r	  r  r   r  r  r  r  r   r
  r|   	reductionf	  s0  


	






	








	


		











zTritonKernel.reductionc                   s    || } || } fddtdD \}}|d| d| d| d| d| dtj d| d|  d| d|  d ||fS )	Nc                      g | ]}t jj d qS r  r   r  r  rX  r  r{   r|   r   
  r  z7TritonKernel._online_softmax_reduce.<locals>.<listcomp>r   
            r   z9 = triton_helpers.online_softmax_reduce(
                )
            r  )r  r6  r   r   r;  r  )r   r  r<  r=  r   r  r>  r?  r{   r  r|   _online_softmax_reduce
  s6   

	z#TritonKernel._online_softmax_reducec           
   	     s    fdd|||fD \}}}d| d| d| d| d	}fddt dD } d| d	|  tfd
d|D }	|	S )z;
        Helper to codegen triton_helpers.welford.
        c                 3  s    | ]
}  |V  qd S rt   )r  r  r  r  r   r{   r|   r     s
    
z(TritonKernel._welford.<locals>.<genexpr>ztriton_helpers.welford(r   r   c                   rQ  rR  rS  rX  r  r{   r|   r     r  z)TritonKernel._welford.<locals>.<listcomp>r   r  c                 3  s    | ]}  |V  qd S rt   )r  r  r   r{   r|   r   	  r  )r6  r   r   r  )
r   r  r8  r9  r:  r   r  welfordwelford_resultsresult_valuesr{   rW  r|   r'  
  s   zTritonKernel._welfordc                 C  s  |   | j }| d}| d}	| d}
| j| d|   d| d | j|	 d|   d| d | j|
 d|   d| d |dkru|\}}}| jd| d	|	 d	|
 d
| d|	 d|
 d| d| d| d n"|dks{J | jd| d	|	 d	|
 d| d| d|	 d|
 d | jd| d|| d| d|	 d||	 d|	 d|
 d||
 d|
 d |}| jj|d}| jj|d}| 	| j
|||||	|
||	S )z%Helper to codegen a welford reduction_mean_m2_weightr  r   r   r  r  r  z<_next = triton_helpers.welford_combine(
                    z,
                    z#
                )
                r  z;_next = triton_helpers.welford_reduce(
                    z1, roffset == 0
                )
                z            r  r  rT  r  )r  r/  rf  r   r  r  r   r  r  r0  r"  )r   r  r  r   r
  r4  r  r   r;  accumulator_m2accumulator_weightr8  r9  r:  rF  rG  rH  r{   r{   r|   r    s   



	zTritonKernel.welford_reducec
                 C  sP   |  ||||||	}
|||g}t||
D ]\}}|| d|  q|||fS )z0Helper to codegen call to triton_helpers.welfordr  )r'  r   r   )r   r  rF  rG  rH  r8  r9  r:  r   r  r  result_exprsresult_exprr   r{   r{   r|   r0  G  s
   

z+TritonKernel.welford_reduce_final_reductionc                 C  sJ   |  |||||}||g}	t|	|D ]\}
}||
 d|  q||fS Nr  )rV  r   r   )r   r  r>  r?  rL  rM  r   r  r  r`  ra  r   r{   r{   r|   r-  [  s
   z2TritonKernel.online_softmax_reduce_final_reductionc                 C  s   | j r| j d S tS )NRSPLIT)r  r*   r   r{   r{   r|   
max_rsplite  s   
zTritonKernel.max_rsplitc           	      C  s   | j d }|  sdnd}||j |   }| j|\}}| jjd| d| d| | dt	| d| d	| d
| ddd | j
| d| dt| d | dS )a	  
        Generate code to save a [XBLOCK, RSPLIT] temporary workspace, where each thread block writes a different
        column.  After the barrier, every thread block loads the completed value so that it can compute the final
        value independently.
        r'  zxindex < xnumelNr  z_ws = (r  z).to(tl.pointer_type(z))
                tl.store(z%_ws + (xindex * RSPLIT + rsplit_id), r   rU  Tstripz_peers = tl.load(z_ws + (xindex * RSPLIT + rsplit_arange), rsplit_mask, eviction_policy='evict_first', other=triton_helpers.if_mask(rsplit_mask, r  _peers)r1  r<  r  rd  r:  r  r"  r   r  r<   r#  r   rU   )	r   r  r  default_valxnumelr   r  r	  r
  r{   r{   r|   r/  j  s8   

z7TritonKernel.codegen_cooperative_reduction_peer_combinec                 C  s   | j sJ d| _ | j|dd}d| _ | j|}t }| jr)|| || j	 t
|trG| j	t|| |||||d|  nt
|tsNJ | j	t|d| d|j d| d|j d		 |  d S )
NFTr  r  r  rE  r  r   r   )r0  r  rG  r  r  r  r/  r  r  r#  rh  r   r   rK   r  rH  rP  r   r   r   r  )r   r?  r   r   r  r  r  r{   r{   r|   store_reduction  s>   

 zTritonKernel.store_reductiondtypestuple[torch.dtype, ...]c           	   	     s*  t  d t   fddtdD }ddd tj|D }d| d	 t d
ddl	m
} | G  fdddt} 4 t|  || }ddd |D }d|  W d    n1 sww   Y  W d    n1 sw   Y  | jj dS )Nz@triton.jitc                   s*   g | ] t  fd dtD qS )c                 3  s.    | ]} j d  d| | dV  qdS )r  rY  r  N)r)  r   n)r  rk  rW  r{   r|   r     s   , z7TritonKernel._lift_helper.<locals>.<listcomp>.<genexpr>)r  r6  )r   )r  rk  num_args)rW  r|   r     s    z-TritonKernel._lift_helper.<locals>.<listcomp>r   r   c                 s      | ]}t |V  qd S rt   rU  r  r{   r{   r|   r     rw  z,TritonKernel._lift_helper.<locals>.<genexpr>zdef {name}():r  r   rc   c                      s"   e Zd Zd fd	d
ZdS )z+TritonKernel._lift_helper.<locals>.CSEProxyr?  r   rG  tuple[Any, ...]r  dict[str, Any]rr   r   c                   sB   d| 7 t ||i |} jt ||i ||dS )NrY  r  )r  r  )r   r?  rG  r  output_dtyper  dtype_handlerhelperhelper_name	overridesr{   r|   _default  s   z4TritonKernel._lift_helper.<locals>.CSEProxy._defaultN)r?  r   rG  rr  r  rs  rr   r   )ru   r~   r   rz  r{   ru  r{   r|   CSEProxy  s    r{  c                 s  rp  rt   rU  )r   r  r{   r{   r|   r     rw  return r  )rL   r   rI   r6  r   r  r  from_iterabler  r  rd   r$   r  rB   set_ops_handlerr  r  r   )	r   fnro  rk  rG  	signaturerd   r{  outputsr{   )r  rv  rk  rw  rx  ro  ry  r|   _lift_helper  s*   
 zTritonKernel._lift_helperr@  UCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]tuple[CSEVariable, ...]c                   st  j sJ jrJ dtdd jD }| t|}jr&J dg }g }tdd |D }t	j
jj |t||} j }t||D ]k\}	}
j
jj|	 dt|
 d|
d}j
jjd	| d
  d|
d}	||	 t|
}jsj
j|
d} }d|d< dd
| d}|
jrdnd}j| d| d
| d
| d || qQdd fdd}|d| d| d
| d|||}js! fdd|D }|t|t|}|t||} fddt||D }t|||D ]\}}}j| d| d
| d q
n|}|D ]}t|ts/J t||_q%t|S )NTODOc                 s  r  r  r  r&  r{   r{   r|   r     r  z$TritonKernel.scan.<locals>.<genexpr>z(ops.scan not supported inside ops.maskedc                 s  rp  rt   r>   r   r  r{   r{   r|   r     rw  r   r   r  r   r   rc  r  rf  rg  zfloat('nan')z-1r  c                 S     d dd | D S )Nr  c                 s      | ]}| d V  qdS ,Nr{   r  r{   r{   r|   r     r  z1TritonKernel.scan.<locals>.csv.<locals>.<genexpr>r   r  r{   r{   r|   csv  r   zTritonKernel.scan.<locals>.csvc           	        s   t |} fddt|D }tfdd|D r$fdd|D S fdd|D }j| d   t||D ]\}}rH|_j|| q?t	|S )Nc                       g | ]}  d | d  qS r   r{   rV  r  r2  r{   r|   r          z;TritonKernel.scan.<locals>.cse_multiple.<locals>.<listcomp>c                 3      | ]	} j |V  qd S rt   r  containsr   r  r   r{   r|   r     r  z:TritonKernel.scan.<locals>.cse_multiple.<locals>.<genexpr>c                      g | ]} j |qS r{   r  r  r  r   r{   r|   r     rb  c                   s   g | ]	} j j|d qS rR  r  r  )r   _dtyper   r{   r|   r     r%  r  )
r   r6  r(  r  r   r   r   r  r  r  )	r  r  r2  rk  rn  
cache_keysresult_varsr  r  r  r   r  r|   cse_multiple  s   z'TritonKernel.scan.<locals>.cse_multipleztl.associative_scan((r  c                   s&   g | ]} d | dt |jdqS )ztriton_helpers.select_one((z1), rbase == (RBLOCK - 1), dim=-1, keep_dims=True)r  )r>   r  )r   partial_scan_varcse_computer{   r|   r   2  s    
z%TritonKernel.scan.<locals>.<listcomp>c                   s,   g | ]\}} d | d| d|j dqS )ztl.where(roffset > 0, r   r   r  r  )r   	full_scanpartial_scanr  r{   r|   r   ;  s    z = tl.where(roffset > 0, ) r0  r/  r   r  r{  r!  r  r  r^  r_  r  r  r  r  r   r  r/  r   r  r  ri  r  rJ  r  r  r   r  rf  r   rh  r  r   )r   rk  r@  r  r2  broadcasted_valuesaccumulatorscombine_helper_fnr   r   r  value_dtyper4  r;  reduced_sizer  r  partial_scan_varspartial_reduce_vars	accs_nextfull_scan_varsr  acc_nextpartial_reducer  r{   )r  r  r   r|   scan  s   





zTritonKernel.scanstable
descendingc                   s|  j sJ jrJ dtdd jD }| t|}jr&J djs-J dt	j
jj  j }tdd D tt|ksPJ  fddt|D }d	d
 fdd}jd jspJ jd rzdnd}	t|dkrd|d  d|d  d|	 d| d| d| d}
||
t||}ntdt||D ]\}}||_|j|_qt|S )Nr  c                 s  r  r  r  r&  r{   r{   r|   r   Z  r  z$TritonKernel.sort.<locals>.<genexpr>z(ops.sort not supported inside ops.maskedz3ops.sort is only supported in persistent reductionsc                 s  rp  rt   r  r  r{   r{   r|   r   e  rw  c                   s2   g | ]\}} d | d   d| dqS )r   r   r   r  )r  )r   rW  r   )r  rk  r   r{   r|   r   g  s    z%TritonKernel.sort.<locals>.<listcomp>c                 S  r  )Nr  c                 s  r  r  r{   r  r{   r{   r|   r   o  r  z1TritonKernel.sort.<locals>.csv.<locals>.<genexpr>r  r  r{   r{   r|   r  n  r   zTritonKernel.sort.<locals>.csvc                   s   fddt |D }tfdd|D r fdd|D S  fddt |D }j| d  t||D ]\}}rG|_j|| q>t|S )Nc                   r  r  r{   rV  r  r{   r|   r   r  r  z;TritonKernel.sort.<locals>.cse_multiple.<locals>.<listcomp>c                 3  r  rt   r  r  r   r{   r|   r   s  r  z:TritonKernel.sort.<locals>.cse_multiple.<locals>.<genexpr>c                   r  r{   r  r  r   r{   r|   r   t  rb  c                   s   g | ]}j j | d qS rR  r  rV  )rk  r   r{   r|   r   u  r  r  )	r6  r(  r  r   r   r   r  r  r  )r  rn  r2  rk  r  r  r  r  r  )rk  r  r2  r|   r  q  s   z'TritonKernel.sort.<locals>.cse_multipler  rs   rnumelr   ztriton_helpers.sort_with_index(r   r   rD   z	, stable=z, descending=r   zUnhandled sort)r0  r/  r   r  r{  r!  r  rJ  r^  r_  r  r  r  r  r/  r  r   	enumerater3  r6  rJ  r   r   r  )r   rk  r  r  r  r2  r   r  r  r  r  r  r  	input_varr{   )r  r  rk  r   r|   sortQ  sJ   


zTritonKernel.sortc                   s~  | j s| js| js| js| js| jsdS dd | jD }| jrRt|dkrRt	|D ]d\}}| j
j|d1 |j}| jr?dnd}| jrFdn| d	}| j
d
| d| d| d|  d	 W d   n1 sjw   Y  | j
j|d d | || j
 W d   n1 sw   Y  q+| j
jt|d* | | j
 | j
| j  | j
| j | j
| j | j
| j W d   n1 sw   Y  tg t	|D ]}\}}| j
j|d d\ | j|j  D ]K\}}|t|d k r||d  }	| j|	j | }
t|	}t|	j|  fddt||
D }| j
t| j| | d| dtj | d qW d   n	1 s@w   Y  | j!"| j# |$  qn| j
| j  | j
| j | j
| j | j
| j | j
| j | jr| js| jr| j% d}| j
jd| ddd | j&'  | j
| j | j (  | j(  | j(  | j(  | j(  | j(  dS )a  
        Concat output code from index_code, loads, compute, stores,
        suffix into self.body.

        For pointwise kernels, this is called just once at the end.

        For reduction kernels, this generates a loop over the reduction
        axis.
        Nc                 S     g | ]}|j r|qS r{   r>  r&  r{   r{   r|   r     r  z-TritonKernel.codegen_body.<locals>.<listcomp>r   )r   rsplit_startr  
rsplit_endrV  zfor zoffset in range(r   zBLOCK):rD   c                   s   g | ]
\}}||   qS r{   r{   )r   curprevprev_num_iterr{   r|   r     s    
z = tl.advance(r   z + tl.program_id(1)zR
                if HAS_RSPLIT:
                    triton_helpers.x_grid_barrier(r  Tre  ))indexing_coder  r  r  r"  r#  r  r0  r   r  rf  r  r-  r/  r   r   rA  rE  r   r5  r)  r   r+  r   r   r   rV  r   rK   r(  rB   r  r  r  
invalidater$  cache_clearr9  r:  r  clear)r   
loop_treeslevelr   r-  
loop_startloop_endr  advancement	prev_treeprev_advancement
prev_blocksem_ptrr{   r  r|   codegen_body  s    	









zTritonKernel.codegen_bodyr  c                 C  s   g }|   rOg }| d|g  |D ]=}t|tr |t| qt|tr3|ttjj	
|j qt|tjrF|ttjj	
| qtdt| |S )Nr   z!Unsupported numel argument type: )rH  add_numel_to_call_argsrh  r  ri  r   r`   rB   r   r   	size_hint
inner_exprr   r  r  r  )r   rG  
numel_argsr  r{   r{   r|   kernel_benchmark_extra_args  s   

z(TritonKernel.kernel_benchmark_extra_argsc                 C  s  t  }| j \}}}}|g d |  t }g }t||D ]\}	}
dt| }t	j
|	}|r]|| dt	j
j|  dt	j
j|  d|  d|  d
 n||	t	j
jv rt	j
j|	 }|| dt	j
j|  dt	j
j|  d|j d|j d
 nKt|
trt	j
j|
j}d|
jv rd	}|| d
|  n,t|
trt	j
 }t	j
j|
j}|| d| d| d|
j d ntd|	 | | q#|!| "  |dd#| d W d    n1 sw   Y  |g d t	j
 }|j$}| S |dt	j
j%&| d | 0 |t	j
j%'| d| }|| d| d |t(t)j* d| d W d    n	1 s[w   Y  W d    n	1 skw   Y  |g d | A |dt	j
j%&| d |  |t	j
j%'| |dt(t)j* d W d    n	1 sw   Y  W d    n	1 sw   Y  |g d | / |d |d |d |d |d|  |d |d W d    |S 1 sw   Y  |S )N)r   r   zdef get_args():arg_z = rand_strided(r   z
, device='z	', dtype=r   r  r   r  z = torch.zeros(z*Don't find the buffer or const tensor for r|  r  )
r  zdef call(args):zwith re  streamz = get_raw_stream(z.run(*args, stream=)r  r  z def benchmark_all_configs(args):z.benchmark_all_configs(*args))r  r  zif __name__ == '__main__':z<from torch._inductor.runtime.benchmarking import benchmarkerr   zargs = get_args()z:ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40)z	num_gb = zgb_per_s = num_gb / (ms / 1e3)z<print(f"{ms:.3f}ms    {num_gb:.3f}GB    {gb_per_s:.2f}GB/s"))+rL   rG  python_argdefs
writelinesr  r  r%  r   r  rB   r   try_get_bufferr   r   
size_hintsget_size
get_stride
get_devicer  	constantsrm  r  devicer  rh  rQ   r  r  r?  rS   get_current_device_or_throwKeyErrorri  extendr  r   r   
device_opsdevice_guard
set_devicer   r7   KERNEL_NAME)r   num_gbr:  _argdefs	call_argsr  rY  name_cnt	var_namesarg_namearg_sigvar_namerC  const_tensorsymval_hintr  r%  current_devicer   stream_namer{   r{   r|   codegen_kernel_benchmark  s   
D@



'











z%TritonKernel.codegen_kernel_benchmarkc                 C  s   t dtjjdS )Nzl
            from torch._dynamo.testing import rand_strided
            {}
            import torch
        get_raw_stream)textwrapdedentrH  rB   r   r  import_get_raw_stream_asr   r{   r{   r|   imports_for_benchmark_kernel_  s   z)TritonKernel.imports_for_benchmark_kernelc                 C  s6   | j rdS | jr
dS | jr| jsJ dS | jrdS dS )Nr  r/  rJ  rP  	pointwise)r  r/  rJ  r0  r   r{   r{   r|   _get_heuristich  s   
zTritonKernel._get_heuristicc                  C  s   t jj t  tjtjtjj	tj
tjtjtjtjtjjtjjtjjd} t jjd ur/d| d< t r7d| d< tjrNtj| d< tj| d< tj| d< tj| d< tjr`tj| d	< tj| d
< tj| d< | S )N)backend_hash$are_deterministic_algorithms_enabledassert_indirect_indexingautotune_local_cacheautotune_pointwiseautotune_remote_cacheforce_disable_cachesdynamic_scale_rblockmax_autotunemax_autotune_pointwisemin_split_scan_rblockspill_thresholdstore_cubinTis_hiprx  profile_bandwidthprofile_bandwidth_regexprofile_bandwidth_output/profile_bandwidth_with_do_bench_using_profilingcoordinate_descent_tuning coordinate_descent_search_radius'coordinate_descent_check_all_directions)rv   utils_tritontriton_hash_with_backendr  r   r  r  r   r  r  r  r  r  r   r  r  r  ry  rz  rx  r  r  r  r  r	  r
  r  )inductor_metar{   r{   r|   inductor_meta_commont  s@   



z!TritonKernel.inductor_meta_commonc                    s8  t  }i }| j D ](\}}t|r| jsq
tjj|}t	|t
tjfs(d}ntt
|}|||< q
|d u r\|t  tj j}|dkrM|d n|d tjr\||   | j \ }	}	tD ]$\}
}t	|trttj|j}|tjjjv rt|jtjjj| |
< qitt  }| j D ]G}|| jj!v r|"| jj!|  || jj#v r|tjj$vr|| j$vr|"tt%| jj#| j& || jj'v r| jj'| }t	|t(rJ |"| qt) D ]\}}t	|t*r|j+t,j-kr|"|j qt.|}| / D ]}t|j0 d|j1}2|  2t3|j q fdd}| j4D ]}|j5r2| j6r2q&|j7d u r:q&||j08  d q&| j9rN|d	 t:| j; d
}|t<=tj i d}tjj>pjtjj?}| @ jAtB| jCttDjE||| jF| jG| jHd| I }| j9r| j6|d< d }tjstjJr| K d }||d< tLg|d< tMD ]}d|d | j< q|| _N| O  | jPD ]}|Qd || q| jRrd| S  d| jRjd|d|d	}nM| jr	| jTU }d| S  d|d| d|d|d}n/d}tV|dkr tVtWdkrd}nd}d| S  d|d| d |d|d!| jX d}|| |Qd"|pGttDjY d#dZd$d%  D  d& |[ ( | \| | j] D ]\}}|Q| d'|  qf|| j^ W d    n	1 sw   Y  tjr|| _| |` S )(Ni    cpuz"triton_helpers.set_driver_to_cpu()z"triton_helpers.set_driver_to_gpu()rV  c                   s*   t  r
t|   t| dd d S )NT)is_constexpr)r=   ri  rH   rF   )r  argdefsr  r{   r|   add_constexpr_arg  s   z6TritonKernel.codegen_kernel.<locals>.add_constexpr_argr   rc  )
size_dtyper  )r  r  r  )	grid_typer,  kernel_namemutated_arg_namesoptimize_memr,  num_loadnum_reductionrJ  g    eAkernel_num_gbconfigsrD   r  r   z$
                @triton_heuristics.z(
                    config=zI,
                    filename=__file__,
                    triton_meta=z$,
                    inductor_meta=z;
                )
                @triton.jit
            z!(
                    size_hints=z%,
                    reduction_hint=r   r  ztile_hint=TileHint.SQUARE,ztile_hint=TileHint.DEFAULT,r   zH
                    filename=__file__,
                    triton_meta=z*,
                    min_elem_per_thread=zdef r  c                 s  rv  rt   )	full_namer  r{   r{   r|   r   n  rw  z.TritonKernel.codegen_kernel.<locals>.<genexpr>rq  r  )arL   r1  r+  r8   r0  rB   r   r   symbolic_hintrh  r  r   rA  r,   r   r   r  r  r   benchmark_kernelr  rG  r  r  rQ   r   r   r  inv_precomputed_replacementsr?  r   r   	mutationsinput_buffersr  r  removed_buffersrM   
inner_nameoutput_buffersrP   r   rS   	zero_moderT   ZERO_ON_CALLr!  rx  r-  rV  ri  rF   r  r3  rJ  
tensor_dimr   r/  r_   rt  r(   r;  is_inferenceis_backward_get_grid_typeru   setr,  r7   DESCRIPTIVE_NAMEr,  r  r  r  r  estimate_kernel_num_bytesr[   r\   r-  r  r  r   r  r  r2  get_reduction_hintr   r]   r  r  r   r  codegen_static_numelsaliasesrf  r  r   ) r   r?  coder  r-  rV  
numel_hintr  device_typerY  rW  r  rU  mutated_argsmutationmutation_argargnamer   sizeargr  triton_meta_signaturer-  r  r  r  arg_numrw  heuristics_linereduction_hint	tile_hintoldnewr{   r  r|   codegen_kernel  s*  








	




	



,

zTritonKernel.codegen_kernelc                 C  sx   t jj| } t| tjtfrt| }t|}|S d}t jj	| |s:|dkr.t
d|  |d9 }t jj	| |r#|S )N   i @  z!Failed to find static RBLOCK for r   )rB   r   r   simplifyrh  r   rA  r  r,   statically_known_leqr  )r  r$  r{   r{   r|   _get_persistent_RBLOCK{  s   z#TritonKernel._get_persistent_RBLOCKc                 C  s&   zt |  W dS  ty   Y dS w )NTF)r  rG  r  )r  r{   r{   r|   has_persistent_RBLOCK  s   
z"TritonKernel.has_persistent_RBLOCKc                 C  s   ddd}| j D ]Z}|jr| jr)tjj|j}||r)||j	 dt
|  |jrU| jrU| jrB| | |j}d| d	}n| |j}||j	  d
|  |j	dkrb| jrb|d qdS )a  
        We get a small speedup from hard coding numels if they are static.

        This code stomps on the passed-in values by writing an constant to the top of the kernel.

        In a kernel like:
        def KERNEL_NAME(in_ptr0, in_ptr1, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):

        We would add
        xnumel = 4096
        r0_numel = 768

        After the signature, before the kernel code, if we decided to make these static. As its hardcoded, it becomes
        a better signal to triton on how to unroll and do some static indexing. So, it's not so much that downstream
        knows that its a static numel, as that you just plop a constant into the kernel.
        r  r   rr   rq   c                 S  s   t | tjtfS rt   )rh  r   rA  r  ro  r{   r{   r|   is_static_integer  r  z=TritonKernel.codegen_static_numels.<locals>.is_static_integerznumel = z*triton_helpers.constexpr_next_power_of_2((z + RSPLIT - 1) // RSPLIT)zBLOCK: tl.constexpr = r'  zXBLOCK: tl.constexpr = 1N)r  r   rr   rq   )r  r3  r0  rB   r   r   rE  rV  r   r-  r  rJ  r/  r  r  rG  r   r,  )r   r4  rI  r   simplified_tree_numelrV  r$  r{   r{   r|   r2    s    


z"TritonKernel.codegen_static_numels type[triton_heuristics.GridExpr]c                 C  s|   t dd | jD }| jr|dksJ tjS |dkrtjS |dkr0tt| j| jr-tj	S tj
S |dkr7tjS td| )Nc                 S  s   g | ]}t |j qS r{   )r  r3  r&  r{   r{   r|   r     rb  z/TritonKernel._get_grid_type.<locals>.<listcomp>rD   r   r   z"Unsupported number of dimensions: )rz  r  r/  r%   CooperativeReductionGridGrid1Dr   r   needs_yz_grid_overflowGrid2DWithYZOverflowGrid2DGrid3Dr  )r   rn  r{   r{   r|   r-    s   zTritonKernel._get_grid_typec                 C  s`   | j D ]*}t|jtjtjfr|j}ntjj	||}|j
r!| jr-|| |t| qd S rt   )r  rh  rV  r   rA  r   rB   r   wrapper_codegenerate_numel_exprr3  r0  ri  r  )r   r?  r  	arg_typesr   r  r{   r{   r|   r    s   

z#TritonKernel.add_numel_to_call_argsr  Optional[IRNode]c                 C  s~   t jj}|  | j \}}}}| ||| | jjD ]}|| q|j	||d|| j
d t| jjD ]}|| q5d S )NT)r   rT  r-  )rB   r   rR  write_triton_header_oncerG  r  r  workspace_argsgenerate_workspace_allocationgenerate_kernel_callr-  r5  generate_workspace_deallocation)r   r?  r  wrapperrY  r  rT  wsr{   r{   r|   call_kernel  s    zTritonKernel.call_kernelc                 C  s   t jj}| j \}}}}t||D ]0\}}t|trBt jjr,|	d| d| d qd| d}|	| d| d}|	| qd S )Nz:AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_check_inf_and_nan("z", z));zassert not z.isnan().any().item()z.isinf().any().item())
rB   r   rR  rG  r  r   rh  rR   cpp_wrapperr   )r   r[  rY  r  arg_signaturesr  arg_signaturer  r{   r{   r|   codegen_nan_check  s   


zTritonKernel.codegen_nan_checkr  c                 O  s   t |i |S rt   )r  )r   rG  r  r{   r{   r|   create_cse_var  r   zTritonKernel.create_cse_varentryrW   c                 C  sF   |j  d| | |j }|jjr| j| d S | j| d S rb  )	r?  r  r  r  rootr?  r  r   rf  )r   rc  r  r{   r{   r|   codegen_iteration_ranges_entry  s   z+TritonKernel.codegen_iteration_ranges_entryrX   c                 C  sn   |j d usJ | |j }| j}|dkrd| dnd}| jr*| jr*|jr*| d}d|j  d| | S )Nr  r   r   r   z + rsplit_startztl.arange(0, zBLOCK))r*  indexing_size_strrt  r/  rJ  r3  r-  r   )r   rc  rm  rt  r!  r{   r{   r|   rB     s   
z)TritonKernel.iteration_ranges_ranges_coder   c                 C  s0   | j }|  }dg| }d| d| d| dS )NrD   r'  r   r   )rt  r  )r   rc  r   rt  r  rm  r{   r{   r|   iteration_ranges_scalar_code  s   
z)TritonKernel.iteration_ranges_scalar_codec                 C  st   |j d usJ d|j  d}| |r#d| d|j d  d|j  d}|j||}| jdkr8| d	| j dS |S )
Nztl.program_id(r   r  z + tl.program_id(rD   z) * tl.num_programs(r  r  r   )r7  rN  	pid_cacher  rt  )r   rc  r#  pidr{   r{   r|   iteration_ranges_get_pid  s   
 
z%TritonKernel.iteration_ranges_get_pidc                 C  s0   |j dko|j o| j otjj|jt  S r  )	r7  has_zdimr/  rB   r   r   rF  rV  r+   )r   rc  r{   r{   r|   rN  $  s   
z#TritonKernel.needs_yz_grid_overflowr-  r  c                 C  s&   | j r| j |  d S t|  S )Nr   )r  r   r)   )r   r-  r{   r{   r|   rZ  ,  s   zTritonKernel.max_blockr   c                 C  s   | j sdS | jr#|j  d| jv r#| j|j  d dkr"dS ntjj|jdr.dS |j	r;| j
r;| |j}n|jdkrF| jrFd}n| |j}|j	rX| jrX||   }tjj|j|rs|jdkpr|jprtjj|jt S dS )NFr   rD   Tr'  )r  r  r-  r   rB   r   r   r   rV  r3  rJ  rG  r,  rZ  r/  rd  rL  r7  rk  rF  r+   )r   r   rZ  r{   r{   r|   r6  1  s,   
zTritonKernel._has_constant_maskc                 C  s"   | j d }|jdksJ | |S )Nr   r'  )r  r-  r6  )r   xtreer{   r{   r|   r<  [  s   

z TritonKernel._has_constant_xmaskr   r   c                 C  s6   | j D ]}| |r||j d q|d d S )Nr   rs   )r  r6  r  r-  )r   r   r   r{   r{   r|   r{  `  s
   

zTritonKernel.filter_masksc                 C  s   dd t tjd | j D S )Nc                 S  s   g | ]}t | qS r{   rI  r   r{   r{   r|   r   j  s    z7TritonKernel.get_reduction_prefixes.<locals>.<listcomp>)r4  r   r   r/  r   r{   r{   r|   get_reduction_prefixesh  s   z#TritonKernel.get_reduction_prefixesr  rL   c                 C  sp   dd | j D }dtdd |D }|d| |  dd | j D }t|}|d| |  d	S )
z^
        Generates code that flattens ND reduction numels, block sizes, etc. into 1D.
        c                 S  r  r{   )r3  r&  r{   r{   r|   r   t  r  z9TritonKernel.codegen_reduction_numels.<locals>.<listcomp>r  c                 s  r  )rV  Nr  r&  r{   r{   r|   r   u  r  z8TritonKernel.codegen_reduction_numels.<locals>.<genexpr>z	rnumel = c                 S  s   g | ]}|j rtj|j qS r{   )r3  r   r   r   r&  r{   r{   r|   r   y  s    
zRBLOCK: tl.constexpr = N)r  r   r!  r   r  r:   )r   r  reduction_treesr  	rn_blocksrR  r{   r{   r|   r.  o  s   z%TritonKernel.codegen_reduction_numelsr!  list[sympy.Symbol]c                   s   |   } fdd|D S )zK
        Helper to initialize symbols like rn_numel, rn_base, etc.
        c                   s&   g | ]}t j|  fi  qS r{   )r   r   )r   r-  r  r!  r{   r|   r     s   & z7TritonKernel._get_reduction_symbols.<locals>.<listcomp>)rm  )r   r!  r  rn_prefixesr{   rq  r|   rC    s   z#TritonKernel._get_reduction_symbolsr   c                   sB   |   }| jdddd  fddtt|d D tdg S )z
        Compute coefficients to convert ND reduction indices to linear indices.
        For example:
          rindex = r0_index * r1_numel * ... * rn_numel + ... + rn_index.
        rV  Tr   c                   s    g | ]}t  |d  d qS r   )r:   rM  	rn_numelsr{   r|   r     s    z<TritonKernel._get_reduction_index_coeffs.<locals>.<listcomp>rD   )rm  rC  r6  r   r   rA  )r   rr  r{   rs  r|   _get_reduction_index_coeffs  s   

z(TritonKernel._get_reduction_index_coeffs
multi_indsc                 C  s   |   }t||S )zK
        Compute linear reduction indices from N dimensional ones.
        )ru  r9   )r   rv  coeffsr{   r{   r|   rD    s   
z'TritonKernel._flatten_reduction_indicesc                 C  sd   | j dddd}| j dddd}| |}|d| |  | |}|d| |  dS )zX
        Generates code that converts ND reduction indices into linear indices.
        r   Tr   r   z
roffset = z	rindex = N)rC  rD  r   r  )r   r  
rn_offsetsrn_indsr=  rindexr{   r{   r|   rE    s   

z&TritonKernel.codegen_reduction_indicesr4  c                 C  s  |j }|jr||j d| d| d nP|jd u r2||j d| |  || d n4|jd urB| d| | }n	| || d}|| d| 	| d|
  d|j d| g | |r||  }|| d	| d
 d S || d|j d| d d S )Nr  z	offset + r@  z
offset = 0r   z	offset = r  r   zmask = tl.full(z, True, tl.int1)zmask = z < rV  )r-  r?  r   r?  r7  rB  r*  rg  r  rj  r   r6  r  )r   rc  r4  r'  r  r  r{   r{   r|   rA    s$    


"z,TritonKernel.iteration_ranges_codegen_header)r   TN)r  r  r  r  rr   rs   r  r  rr   r   r   r   )r   r   )r   )r?  r   r  r   r  r   rr   r  )r  r   rm  r   r  rq   r   rq   )r?  r   r   r   rt   )
r?  r   r   r   r   rJ   r  rA   rr   rs   NN)r  rJ   r  r  r  rJ   r  r  r  rq   r  r  r  r  rr   rJ   )r   r   r  r  rr   r   )
r  r  r  r  r  r@   r   r  rr   r  )r  r  )r?  r   r   r   r   r  )rk  rl  rr   r   )rk  rl  r@  r  r  r  rr   r  )
rk  rl  r  r  r  rq   r  rq   rr   r  )rr   r  )rr   rK  )r?  r   r  rU  r  )rr   r  )rc  rW   )rc  rX   rr   r   )rc  rX   r   r   rr   r   )rc  rX   rr   rq   )r-  r   rr   r  )r   rX   rr   rq   )r   r   rr   rs   )r  rL   rr   rs   )r!  r   rr   rp  rZ  )rv  r   rr   r   )rc  rX   r4  rL   rr   rs   )Mru   r~   r   r  ry  r   r  r  r  r  r3  r5  r0  r2  r1  rH  rI  rK  r   rL  r  r  r  r  r  r  r  r  r  r  r  rP  rV  r'  r  r0  r-  rd  r/  rj  r  r  r  r  r  r  r  r  r\  r  rC  rG  rH  r2  r-  r  r]  ra  rb  re  rB  rg  rj  rN  rZ  r6  r<  r{  r1   rm  r.  rC  ru  rD  rE  rA  r  r{   r{   r  r|   r  1  s   
 
'
%


  +
/


 	*
3
	
   
	
;


*
3
o@
dZ	

' `


&









*




r  c                	      s   e Zd ZU eZded< eejej	ej
ejejejejejgZd) fddZed*ddZdd Zdd Zd+d,ddZ	d-d.ddZd/d!d"Zd0d%d&Zd'd( Z  ZS )1TritonSchedulingz	type[Any]kernel_type	schedulerOptional[Scheduler]rr   rs   c                   sF   t  | |d u st|dsd S |jD ]}t|ttfr t|_qd S )Nr  )	r  r  r   r  rh  r0   r.   debug_triton_codedebug_device_str)r   r  r  r  r{   r|   r    s   
zTritonScheduling.__init__r  torch.devicec                 C  s*   t jjst jjrtg | jtjS | jS rt   )r   r   cooperative_reductionsforce_cooperative_reductionsr   backend_featuresrG   REDUCE_TO_SINGLE_ELEMENT)ry   r  r{   r{   r|   get_backend_features  s   z%TritonScheduling.get_backend_featuresc                   s   t jj}t||\}}|r|| tjrAddlm m	 t
fdd|D sC fdd|D }||j dd|  d S d S d S )	Nr   r-   ForeachKernelSchedulerNodec                 3  s    | ]}t | V  qd S rt   )rh  rm  )r  r{   r|   r     s    

z3TritonScheduling.codegen_comment.<locals>.<genexpr>c                   s   g | ]}t | r| qS r{   )rh  get_namerm  )r-   r{   r|   r     rP  z4TritonScheduling.codegen_comment.<locals>.<listcomp>z Fused node name list: r   )rB   r   rR  r5   r   r   debug_fusiontorch._inductor.schedulerr-   r  r   commentr   )r   node_scheduler[  origins_detailed_origins
node_namesr{   r  r|   codegen_comment  s"   

z TritonScheduling.codegen_commentc                 C  st  t jj}||jv r|j| }|S tjjrt|tjjnd}t|d d }d	d|||
 g}||j|< tjjr;|nd}|ttj|}|ttj|}|dd}tt| d\}	}
}t }t rnt|| |d	|d
 |j|dd t j }|d|j d d| }t||\}}|d| d | 7 }||| | tdrt ||| |S )Nr   r   rY  r   triton_z#pragma CMT#pyzasync_compile.triton(z, '''Tre  z''', device_str='z')z# kernel path: r  kernel_metadata)!rB   r   rR  src_to_kernelr   r   descriptive_namesr4   rC   r   next_kernel_suffixunique_kernel_namesreplacer   r7   r/  r  r"   r!   rf  rL   async_compileuse_process_poolr   r   r  r  r5   define_kernelr   r   is_metric_table_enabledlog_kernel_metadata)r   src_coder  r  r[  r  
fused_namekernel_category	subs_name	_basenamerY  kernel_pathcompile_wrapperr  metadata_commentr  detailed_originsr{   r{   r|   r    sD   

5




zTritonScheduling.define_kernelr  tuple[float, str]c                 C  s6   | j |dd}t|}| j||tdd |D dS )z
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        T)r!  c                 s  rv  rt   r  rm  r{   r{   r|   r   G  rw  z9TritonScheduling.benchmark_fused_nodes.<locals>.<genexpr>)r  )generate_kernel_code_from_nodesr#   r  benchmark_codegened_moduler   )r   r  n_spills_thresholdr  r8  r{   r{   r|   benchmark_fused_nodes?  s
   
z&TritonScheduling.benchmark_fused_nodesNr  Optional[OrderedSet[str]]c           	        s   t tjj}t  |tj  dfddfdd}fdd}|dur.|ntdg}t	d	|j
 | durUj
fW  d   W  d   S   jjzj  d
  W n8 ty } z,tjjrw t	d|| td|  j
fW  Y d}~W  d   W  d   S d}~ww j}t|dksJ |d
 j|krtdnt fddtjd
kr؈t fdd t	d| |  j
fW  d   W  d   S 1 sw   Y  W d   dS 1 s	w   Y  dS )z$Benchmark an already compiled moduleNc                     $    j d usJ tj j d d S Nr   z.kernel_perf__file__ospathsplitextr{   r8  r{   r|   cache_file_pathU     zDTritonScheduling.benchmark_codegened_module.<locals>.cache_file_pathc                    sD     } t | d}|t W d    d S 1 sw   Y  d S )Nwopenwriter   r  fd)r  msr{   r|   store_cacheY  s   "z@TritonScheduling.benchmark_codegened_module.<locals>.store_cachec                    sJ     } t j| r#t| }t| W  d    S 1 sw   Y  d S rt   )r  r  existsr  floatreadr  r  r{   r|   
load_cache^  s   

 z?TritonScheduling.benchmark_codegened_module.<locals>.load_cacheunknown%kernel src code for %s written to: %sr   z*Exception (%s) in compiling fused nodes %sinfrD   c                        j   d S r@  
clone_argsr{   rG  callwrapped_jit_functionr{   r|   r    r  z=TritonScheduling.benchmark_codegened_module.<locals>.<lambda>c                     s
   j   S rt   r  r{   rG  r  r{   r|   r    s   
 z+The fused kernel for %s took %.3f ms to run)r   rB   r   r6  r   r  r  r   r  debugr  get_argsr  r  r  	Exceptionr   r   .disallow_failing_autotune_kernels_TESTING_ONLYr  	launchersr   n_spillsr&   benchmark_gpur  )	r   r8  r  r  device_interfacer  r  r  r  r{   )rG  r  r  r8  r  r  r|   r  J  st   "(
Tz+TritonScheduling.benchmark_codegened_modulekernel_featuresrf   kernel_args	list[Any]kernel_kwargsrs  list[TritonKernel]c           	      C  s   | d}|otdd | D }| j}|rddlm} |}|r%d|d< | dr2d	|d
< d|d< t|jsC|	d
r?J d|d
< t
j||||}||i |}| |||S )Nr  c                 s  rv  rt   )is_split_scanr   r  r{   r{   r|   r     r  z9TritonScheduling.create_kernel_choices.<locals>.<genexpr>rD   )TritonSplitScanKernelFoverride_cooperative_reductionr  Toverride_persistent_reduction)contains_opr   scheduler_nodesr~  triton_split_scanr  r  rH  reduction_numelr  rB   r4  triton_kernel_kwargsadd_multi_kernel_choices)	r   r  r  r  is_scanr  r~  r  r  r{   r{   r|   create_kernel_choices  s*   

z&TritonScheduling.create_kernel_choicesr  r  c           
      C  s   |g}t jjs	|S |jo|d }|jo|d }|r,|| j|i |ddi |r`|jj	}t
jj|dr`|| j|i |ddi } |r`|jr`|| j|i |ddd t|dkr{|dd  D ]}	|j|	_ql|jdd d	 |S )
Nr  r  Fi   )r  r  rD   c                 S  r   rt   )rJ  )kr{   r{   r|   r    s    z;TritonScheduling.add_multi_kernel_choices.<locals>.<lambda>)r#  )r   r   multi_kernelrJ  r  r/  ri  r~  r2  r  rB   r   r   rF  r   must_keep_buffersr  )
r   r  r  r  kernelsoptional_persistentoptional_cooperativer  rg  kernel2r{   r{   r|   r    s^   



	
z)TritonScheduling.add_multi_kernel_choicesc                   s  fddfdd}fdd}dg }}d}t jj}t|t j_t jj}t|t j_tjdk}	tjdk}
| j|d	|	|
d	d
}|D ]\}}}dd |D }dd |D }|	t
tjd}t|td|j | \d ur|7 }|7 }|j qE  jjj  d  j}t|dksJ |d jdkrtd nt fddt fddtdtdd |D  |  |7 }|7 }|j qE|t j_|t j_|||fS )Nc                     r  r  r  r{   r  r{   r|   r     r  z@TritonScheduling.benchmark_combo_kernel.<locals>.cache_file_pathc                    sX     } t j| r*t| }tdd |  D W  d    S 1 s%w   Y  dS )Nc                 s  rp  rt   )r  )r   r  r{   r{   r|   r     rw  zNTritonScheduling.benchmark_combo_kernel.<locals>.load_cache.<locals>.<genexpr>r|  )r  r  r  r  r  r  r  r  r  r{   r|   r    s   
 z;TritonScheduling.benchmark_combo_kernel.<locals>.load_cachec                    sP     } t | d}|td t  W d    d S 1 s!w   Y  d S )Nr  r  r  r  )r  r  ms_cloner{   r|   r    s   "z<TritonScheduling.benchmark_combo_kernel.<locals>.store_cacher   g        T)subkernel_nodescustom_part_algorithmenable_autotunemixed_sizesonly_gen_src_codec                 S  s   g | ]}|  qS r{   )	get_nodesr  r{   r{   r|   r   !  r  z;TritonScheduling.benchmark_combo_kernel.<locals>.<listcomp>c                 S  s   g | ]}|D ]}|  qqS r{   r  )r   r  rn  r{   r{   r|   r   "  r  r  r  rD   r  c                     r  r@  r  r{   r  r{   r|   r  C  r  z9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>c                     s   j   d S r@  r  r{   r  r{   r|   r  F  s    zDThe fused kernel for %s took %.3f ms to run, %.3f ms to clone inputsc                 s  rv  rt   r  rm  r{   r{   r|   r   K  rw  z:TritonScheduling.benchmark_combo_kernel.<locals>.<genexpr>)rB   r   r%  r   inplaced_to_remover   combo_kernels_autotunecombo_kernel_allow_mixed_sizesgenerate_combo_kernel_coder  r   r7   r  r#   r  r  r  r  ri  r  r  r  r  r  r   r  r  r&   r  )r   	node_listr  r  total_ms	file_listtotal_clone_msremoved_buffers_originplaced_to_remove_origr  r  kernel_code_listr  rY  
node_groupfused_node_listsnamesr  r{   )rG  r  r  r8  r  r  r  r|   benchmark_combo_kernel  sz   





z'TritonScheduling.benchmark_combo_kernel)r  r  rr   rs   )r  r  )r  )rr   r  )r  N)r  r  rr   r  )r  rf   r  r  r  rs  rr   r  )r  r  r  r  r  rs  rr   r  )ru   r~   r   r  r~  r   r   rG   FOREACH	BUCKETIZEINPLACE_BUFFERSMASKED_SCATTER_WITH_INDEXSCANSORTTRITON_TEMPLATESTUPLE_REDUCTIONr  r  r   r  r  r  r  r  r  r  r	  r  r{   r{   r  r|   r}    s0   
 
:
W
%5r}  r  r-   r  c                 C  s
  g }|   }|d u st|tjsJ |r%|jd u r%||   d |S ddlm} | 	 }|d us5J | j
|}t|t|fsKJ dt| tj| tj}||   }|t_W d    n1 skw   Y  ||   d |t|d |S )Nz" Unfinalized multi template bufferr   )CUDACombinedSchedulingz]Scheduling backend should be SIMD or CUDACombined when generating debug Triton strings, got: z Triton code:z    )get_template_noderh  r   MultiTemplateBuffermake_kernel_renderri  r  0torch._inductor.codegen.cuda_combined_schedulingr  r  r  get_backendrZ   r  rB   r   set_current_devicer   generated_kernel_countr  r  rf  r  r  )r  linesmulti_templater  r  backendold_generated_kernel_counttriton_coder{   r{   r|   r  X  s2   
r  r   )r   r   r]  r   r^  r   rr   r   r{  )r  r  rr   r  )r  r  rr   r  )r  r  rr   rq   )r  r  rr   rq   )rr   rd   r[  )rp   rq   rr   r  )r  r-   rr   r  )
__future__r   r  r  r)  r^  r  loggingr  r  r  collections.abcr   r   r   typingr   r   r   r	   r
   r   r   sympy.printing.precedencer   rv   torch._loggingtorch.utils._pytreer  _pytreer  torch._dynamo.device_interfacer   torch._dynamo.utilsr   r   torch._prims_commonr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._tritonr   utils._sympy.symbolr   r   r   r   utils._sympy.value_rangesr   r   r   r   r   r  r    	codecacher!   r"   r#   ops_handlerr$   runtimer%   runtime.benchmarkingr&   runtime.hintsr'   r(   r)   r*   runtime.runtime_utilsr+   r,   r  r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   virtualizedr?   rH  r@   rA   rB   wrapper_benchmarkrC   block_analysisrE   commonrF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   simdrU   rV   rW   rX   rY   rZ   triton_utilsr[   r\   r]   r^   r_   r[  r`   typesra   rb   r  rd   re   simd_kernel_featuresrf   rg   	getLoggerru   r  _logginggetArtifactLoggerperf_hint_logschedule_log
fusion_logrk   r   r   r   	dataclassr   r   r   rn  r  r  r  r  r  r  r  r  r  r  r  r  r  _initialize_pointwise_overridesr  r  r   r  r  r   r  r  r  r}  r  r{   r{   r{   r|   <module>   s    @D 
  
 %








1   
y '$(
                   +   