o
    IhE                    @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZ d dlmZmZmZmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZm Z  d d	l!m"Z"m#Z#m$Z$m%Z% d
dl&m'Z' ddl(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3 ddl*m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z: ddl+m;Z;m<Z<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZI ddlJmKZKmLZLmMZM ddlNmOZO ddlPmQZQmRZRmSZSmTZT ddlUmVZV ddlWmXZXmYZYmZZZm[Z[ erd dl\m]Z]m^Z^m_Z_ e`eaZbejcdeadZeejcdeadZfejcdeadZgeT jhZieg d ZjejkG d!d" d"ZlG d#d$ d$elZmG d%d& d&elZnd9d+d,Zoed-eQeQd.ZpG d/d0 d0eSep eep ZqG d1d2 d2e<Zrejkd3d4G d5d6 d6ZsG d7d8 d8etZudS ):    )annotationsN)Counter)AnyCallableGenericno_type_checkOptionalTYPE_CHECKINGUnion)TypeVar)immutable_dict)
OrderedSet)FloorDivIdentityModularIndexing)free_symbol_is_type
prefix_strsymbol_is_typeSymT   )counters   )configir	scheduler)prologue_preserves_zero_mask)	code_hash)	MemoryDepStarDepWeakDep)IRNodeTritonTemplateBuffer)!indexing_dtype_strength_reduction)
green_textyellow_text)BaseSchedulerNodeBaseScheduling	WhyNoFuse)cache_on_selfexpr_fits_within_32bitget_dtype_sizeIndentedBufferPlaceholderprefix_is_reduction'set_kernel_post_grad_provenance_tracingsympy_index_symbolsympy_product
sympy_subsunique)ops
OpsWrapperV   )BlockPatternMatcher)CSEVariableindex_prevent_reorderingKernelPythonPrinter)MultiKernel)DisableReductionEnableReductionNodeScheduleMarkerSIMDKernelFeatures)IterableIteratorSequence
perf_hintsschedulefusion)zyxr0_r1_c                      sh   e Zd ZdZejjejjdd fddZee	e
dddZdddZee	e
d ddZ  ZS )!IterationRangesa  
    Each range tree represents multiple sets of iteration indexing
    in a single tiled dimension in the output kernel.

    If you have two loops ranges one (4, 3, 2) and another (4, 6),
    then the range tree will be:
            4 (i0)
        3 (i1)  6 (i3)
        2 (i2)
    Where i0 is shared between both loops, but then the split into
    different indexing vars.  All loop ranges must iterate over
    the same number of elements.
    )divisorlengthnamestrvar_listlist[sympy.Symbol]
var_rangesdict[sympy.Symbol, sympy.Expr]numel
sympy.Exprprefixkernel
SIMDKernelrootIterationRangesRootreturnNonec          
        sD   t    || _|| _|| _|| _|| _|| _|| _|| _	|	| _
d S N)super__init__rO   rQ   rS   rU   rW   rM   rN   rX   rZ   )
selfrO   rQ   rS   rU   rW   rX   rM   rN   rZ   	__class__ P/var/www/vscode/kcb/lib/python3.10/site-packages/torch/_inductor/codegen/simd.pyr`   `   s   

zIterationRanges.__init__boolc                 C  
   t | jS r^   )r-   rW   ra   rd   rd   re   is_reductionx   s   
zIterationRanges.is_reductionsympy.Symbolc                 C  rg   r^   )r/   rO   rh   rd   rd   re   symbol~      
zIterationRanges.symbolr   c                 C  s   dd t  D }|| j S )Nc                 S     i | ]\}}||qS rd   rd   ).0symtrW   rd   rd   re   
<dictcomp>       z(IterationRanges.symt.<locals>.<dictcomp>)r   itemsrW   )ra   prefix_to_symtrd   rd   re   ro      s   
zIterationRanges.symt)rO   rP   rQ   rR   rS   rT   rU   rV   rW   rP   rX   rY   rZ   r[   r\   r]   r\   rf   r\   rj   )r\   r   )__name__
__module____qualname____doc__sympySOner`   propertyr(   r   ri   rk   ro   __classcell__rd   rd   rb   re   rL   P   s    
rL   c                      sh   e Zd Z	d.d/ fddZd0ddZd1ddZd2ddZd3d!d"Zd4d&d'Zd5d)d*Z	d6d,d-Z
  ZS )7r[   NrO   rP   rU   rV   rW   indexintrX   rY   	pid_cacheOptional[dict[str, str]]is_looprf   
tensor_dimOptional[int]grid_dimhas_zdimr\   r]   c             	     sj   |d u ri }t  j|g i |||| d || _i | _|| _|r'| jr%|	d u s'J || _|| _|	| _|
| _	d S )N)rO   rQ   rS   rU   rW   rX   rZ   )
r_   r`   r   nodesr   ri   r   r   r   r   )ra   rO   rU   rW   r   rX   r   r   r   r   r   rb   rd   re   r`      s&   	
zIterationRangesRoot.__init__c                 C  s   d| j d| j dS )NzIterationRangesRoot(, z, ...))rO   rU   rh   rd   rd   re   __repr__   s   zIterationRangesRoot.__repr__c                 C  s   | j  D ]}|  qd S r^   )r   valuescache_clear)ra   noderd   rd   re   r      s   
zIterationRangesRoot.cache_clearrj   c                 C  s   t | j dS )Nr   )r/   rW   rh   rd   rd   re   	index_sym   s   zIterationRangesRoot.index_symrM   rN   IterationRangesEntryc                 C  s   t jj|| | jrt|  |}nt|  ||}|| jvrMt	| j
 tt jj |||| }|t jj| < | j|  || j| < || j|< | j| S )zF
        Lookup a given RangeTreeEntry, creating it if needed
        )r5   graphsizevarsstatically_known_equalsrU   r   r   r   r   r   rW   nextrX   iter_vars_countrange_tree_nodesrk   rQ   appendrS   )ra   rM   rN   exprr   rd   rd   re   lookup   s    


zIterationRangesRoot.lookuplengthslist[sympy.Expr]list[IterationRangesEntry]c                 C  s@   t jj}g }t|D ]}|| || || }q
g t|S r^   )rz   r{   r|   reversedr   r   )ra   r   rM   itervarsrN   rd   rd   re   construct_entries   s   
z%IterationRangesRoot.construct_entriesrR   c                 C  s   dd |  |D S )Nc                 S     g | ]}|  qS rd   )rk   )rn   erd   rd   re   
<listcomp>       z1IterationRangesRoot.construct.<locals>.<listcomp>)r   ra   r   rd   rd   re   	construct      zIterationRangesRoot.construct+tuple[list[sympy.Symbol], list[sympy.Expr]]c                   s   dd |j D }fdd|D }|jdd d tjj g g  fdd}|D ]}tjj|j	 sE|
 t|j	  |j	 || q+tjjj s_|
 tj  g tg tfS )	z,Figure out vars from this tree used in indexc                 S  s   g | ]	}t jj|qS rd   )r5   rX   r   getrn   srd   rd   re   r      s    z6IterationRangesRoot.vars_and_sizes.<locals>.<listcomp>c                   s    g | ]}|r|j  j kr|qS rd   rW   rn   nrh   rd   re   r      s     c                 S  s   t jjj| jtjdS )N)fallback)r5   r   r   	size_hintrM   r   unbacked_symint_fallbackrI   rd   rd   re   <lambda>   s    z4IterationRangesRoot.vars_and_sizes.<locals>.<lambda>keyc                   s(    |    | j  | j  d S r^   )r   rk   rN   )r   )rM   
index_varssizesrd   re   add   s   z/IterationRangesRoot.vars_and_sizes.<locals>.add)free_symbolssortrz   r{   r|   r5   r   r   r   rM   r   r   rU   r   )ra   r   r   r   r   rd   )rM   r   ra   r   re   vars_and_sizes   s"   
z"IterationRangesRoot.vars_and_sizesr^   )rO   rP   rU   rV   rW   rP   r   r   rX   rY   r   r   r   rf   r   r   r   r   r   rf   r\   r]   r\   rP   r\   r]   ru   )rM   rV   rN   rV   r\   r   )r   r   r\   r   )r   r   r\   rR   )r   rV   r\   r   )rv   rw   rx   r`   r   r   r   r   r   r   r   r~   rd   rd   rb   re   r[      s    
*





r[   c                      sd   e Zd Zd  fddZd!ddZd"ddZd#ddZd!ddZd$ddZd%ddZ	d&ddZ
  ZS )'r   rO   rP   rM   rV   rN   r   parentrL   r\   r]   c                   sP   t  j||j| |j|j|j|||j|jd	 || _t	
d | j| _|| _d S )N)	rO   rU   rQ   rS   rW   rM   rN   rX   rZ   )r_   r`   rU   rQ   rS   rW   rX   rZ   r   	functools	lru_cache_codegencodegenr   )ra   rO   rM   rN   r   r   rb   rd   re   r`     s   
zIterationRangesEntry.__init__c                 C  s.   d| j  d| j d| j d| j d| j dS )NzIterationRangesEntry(r   ))rO   rM   rN   r   rS   rh   rd   rd   re   r     s   .zIterationRangesEntry.__repr__c                   s$    fdd| _ dd | j _ | _d S )Nc                     s    S r^   rd   rd   rO   rd   re   r          z/IterationRangesEntry.set_name.<locals>.<lambda>c                   S     d S r^   rd   rd   rd   rd   re   r   !  r   )r   r   rO   )ra   rO   rd   r   re   set_name  s   
zIterationRangesEntry.set_namec                 C  s   | j   d S r^   )r   r   rh   rd   rd   re   r   $  s   z IterationRangesEntry.cache_clearc                 C  s   t j|  | jS r^   )r5   rX   codegen_iteration_ranges_entryrO   rh   rd   rd   re   r   '  s   zIterationRangesEntry._codegenr   c                 C  s   g }t | jtjr|S t | jttfsJ t| j| jjdd  D ]"}t |tjtjfsD|j	}t
|dkrDtdd |D rD|| q"|S )Nr6   r   c                 s  s    | ]	}t |tjV  qd S r^   )r   r   SIZEr   rd   rd   re   	<genexpr>4  s    
z8IterationRangesEntry.precomputed_args.<locals>.<genexpr>)
isinstancer   rz   Symbolr   r   typeargsIntegerr   lenallr   )ra   precomputed_argsargsymbolsrd   rd   re   r   +  s   
z%IterationRangesEntry.precomputed_argsr   c                 C  rg   r^   )hashrO   rh   rd   rd   re   __hash__:  rl   zIterationRangesEntry.__hash__otherobjectrf   c                 C  s   t |tsJ | j|jkS r^   )r   r   rO   )ra   r   rd   rd   re   __eq__=  s   zIterationRangesEntry.__eq__)rO   rP   rM   rV   rN   rV   r   rV   r   rL   r\   r]   r   )rO   rP   r\   r]   r   )r\   r   r\   r   )r   r   r\   rf   )rv   rw   rx   r`   r   r   r   r   r   r   r   r~   rd   rd   rb   re   r     s    





r   valueUnion[int, float]r\   rP   c                 C  s6   | t dkrdS | t dkrdS t| rdS t| S )Ninfzfloat("inf")z-infzfloat("-inf")zfloat("nan"))floatmathisnanrepr)r   rd   rd   re   constant_reprB  s   
r   CSEVariableType)bounddefaultc                      s  e Zd ZU dZeZded< ded< dZded< ded	< 	
	
	
dd fddZe	e
edddZdddZe	ddd Zdd!d"Zdd(d)Zdd+d,Zdd/d0Zdd6d7Zdd8d9Zdd:d;Zdd=d>Zdd?d@ZddBdCZddEdFZddGdHZddIdJZddMdNZddOdPZddRdSZddVdWZe dd\d]Z!e"e#j$j%fdd_d`Z&ddbdcZ'e"ddddeZ(ddfdgZ)ddhdiZ*ddjdkZ+ddldmZ,dddodpZ-ddrdsZ.ddtduZ/dddxdyZ0e1j2dd~dZ3dddZ4e dd Z5dd Z6dd Z7dd Z8dd Z9dd Z:dd Z;dddZ<  Z=S )rY   zo
    Common base class for Triton/Halide codegen which both use flattened indexing rather than loop nests.
    zCallable[[sympy.Expr], str]sexprkexprFrf   allow_block_ptrrP   kernel_nameNtilingdict[str, sympy.Expr]featuresr@   r   r   override_persistent_reductionOptional[bool]override_cooperative_reductionr\   r]   c                   s   |d u ri }t    | _|  _t  _t  _dd | D  _	g  _
i  _t  _|  _|d ur;|n   _|d urF|n   _   _d  _td d fdd}| _ | d S )Nc                 S  s    i | ]\}}|t jj|qS rd   )r5   r   r   simplify)rn   rW   valrd   rd   re   rp   h  s    z'SIMDKernel.__init__.<locals>.<dictcomp>r   rV   c                   s6   t jj|   }  jD ]} | |} q | S r^   )r5   r   r   simplify_with_rangesrS   range_treescombine_contiguous_dimscombine_modular_indexing_pairs)r   treerh   rd   re   simplify_indexing}  s   

z.SIMDKernel.__init__.<locals>.simplify_indexing)r   rV   )r_   r`   r   get_mutations	mutationsr+   bodyindexing_coderr   numelsr   r   	itertoolscountr   ri   inside_reduction should_use_cooperative_reductioncooperative_reductionshould_use_persistent_reductionpersistent_reductionwant_no_x_dimno_x_dimr   r   r   r   initialize_range_tree)ra   r   r   r   r   r   r   rb   rh   re   r`   Y  s8   




zSIMDKernel.__init__r   c                 C     t dd | jD S )Nc                 s      | ]}t |V  qd S r^   )r-   rn   rW   rd   rd   re   r         z0SIMDKernel.num_reduction_dims.<locals>.<genexpr>)sumr   rh   rd   rd   re   num_reduction_dims  s   zSIMDKernel.num_reduction_dimsdtypetorch.dtypec                 C     t r^   NotImplementedError)ra   r  rd   rd   re   dtype_to_str     zSIMDKernel.dtype_to_strc                 C  s   |  | j S r^   )r  r   select_index_dtyperh   rd   rd   re   index_dtype  s   zSIMDKernel.index_dtypec                 C     dS NFrd   rh   rd   rd   re   r    r  zSIMDKernel.want_no_x_dimr   ri   r   r  list[IterationRangesRoot]c                   s   t  fddtD }| p| }ddd}g d}	dd	g}
|r#|
}n	|r(|	}n|	|
 }|||}||	t}g }t|D ]6\}}t|}||}||}|d u rT|n|}|t| d
 | ||| ||oi| j ||d v d
 q<|S )Nc                 3      | ]	}| v r|V  qd S r^   rd   r  r   rd   re   r     s    z3SIMDKernel.construct_range_trees.<locals>.<genexpr>r\   dict[Any, int]c                   s    dd t  fdd| D D S )Nc                 S  rm   rd   rd   )rn   idxr   rd   rd   re   rp     s    
zPSIMDKernel.construct_range_trees.<locals>.filtered_index_map.<locals>.<dictcomp>c                 3  r  r^   rd   )rn   r   maskrd   re   r         zOSIMDKernel.construct_range_trees.<locals>.filtered_index_map.<locals>.<genexpr>)	enumerate)seqr  rd   r  re   filtered_index_map  s   z<SIMDKernel.construct_range_trees.<locals>.filtered_index_map)rI   rH   rG   rJ   rK   r   rG   )r   r   r   r   r   )r\   r  )r   all_prefixesr  r-   r   r   r[   r  )ra   r   r   ri   r   r  active_prefixesno_r_dimr   	grid_dimsreduction_dimstensor_dimstensor_dim_mapgrid_dim_mapr   irW   r   r   r   rd   r  re   construct_range_trees  sF   





z SIMDKernel.construct_range_treesdict[str, str]c                 C  s.   |  || j| j | j| j}| j| d S r^   )r*  r   r   ri   r   r  r   extend)ra   r   r   rd   rd   re   r    s   z SIMDKernel.initialize_range_treeindicesSequence[sympy.Expr]c                 C  r  )zr
        Hook called right before codegen with every index that will be
        used in the fused kernel.
        Nrd   )ra   r-  rd   rd   re   finalize_indexing  r   zSIMDKernel.finalize_indexingrO   r   rV   r   r8   c                 C  s,   | j }d| _ z| |||W || _ S || _ w r  )r   store)ra   rO   r   r   priorrd   rd   re   store_reduction  s
   zSIMDKernel.store_reductionc                 C  r  r  rd   rh   rd   rd   re   r     r  z+SIMDKernel.should_use_cooperative_reductionc                 C  r  r  rd   rh   rd   rd   re   r     r  z*SIMDKernel.should_use_persistent_reductionrT   c                 C  s   t tjdd | jD S )Nc                 s  s    | ]}|j  V  qd S r^   )rS   rr   rn   r   rd   rd   re   r     s    

z(SIMDKernel.var_ranges.<locals>.<genexpr>)dictr   chainfrom_iterabler   rh   rd   rd   re   rS     s
   zSIMDKernel.var_rangesc                 C  r  )Nc                 s  s    | ]
}t |jd uV  qd S r^   )r   r   r3  rd   rd   re   r     s    z0SIMDKernel.triton_tensor_ndim.<locals>.<genexpr>)r	  r   rh   rd   rd   re   triton_tensor_ndim  r   zSIMDKernel.triton_tensor_ndimr)  c                 C  s(   dg|    }d||< dd| dS )Nr]   :[r   ])r7  join)ra   r)  r   rd   rd   re   indexing_size_str  s   zSIMDKernel.indexing_size_str	list[str]c                 C  sL   dg|    }| jD ]}|jd u rq
|jr| jr#|j  d||j< q
|S )N1BLOCK)r7  r   r   ri   r   rW   upper)ra   r   r   rd   rd   re   dense_size_list  s   

zSIMDKernel.dense_size_listc                 C  s   |   }dd| dS )Nr9  r   r:  )rA  r;  ra   r   rd   rd   re   dense_size_str  s   zSIMDKernel.dense_size_strc                 C  sx   t |ts|S |jd }| j| }d u r|S t|||ji}tjj	
|}t||j |jtjj|jj iS )Nr   )r   r   r   r   r   r1   r   r5   r   r   r   rZ   r   r   rz   r{   r|   rU   rk   )ra   r   rI   	tree_node	new_indexrd   rd   re   r   	  s   

z)SIMDKernel.combine_modular_indexing_pairsr   r[   c                 C  s8   t jj| }r|\}}t| |||S | ||S r^   )r5   r   r   expand_floor_divr   _combine_contiguous_dims)ra   r   r   
expand_resrE  denominatorrd   rd   re   r     s   z"SIMDKernel.combine_contiguous_dimsc           
      C  s   t |tjtjfr|S ||\}}t|dkr|S tjj	||t
|g||\}}}||kr1|S ||}t|tt|||}	|	S )zI
        More aggressive simplification to merge contiguous dims
        r6   )r   rz   r   r   r   r   r5   r   r   _simplify_loopsr9   r   r1   r4  zip)
ra   r   r   r   r   	new_sizesreindex_prunenew_index_varsrE  rd   rd   re   rG  $  s   

z#SIMDKernel._combine_contiguous_dims'contextlib.AbstractContextManager[None]c                   s,    j d jp jtj fdd}| S )Nc                   3  sf     j  s jrJ d V  d S r   d _zd V  r)   W d _d S W d _d S d _w )NFT)r   ri   r   codegen_bodyrd   ra   should_flushrd   re   ctx;  s   



z)SIMDKernel.disable_reduction.<locals>.ctx)r   r   r   
contextlibcontextmanager)ra   rU  rd   rS  re   disable_reduction8  s   zSIMDKernel.disable_reductionr   rR   c                 G  s,   t |t | jksJ dd t|| jD S )Nc                 S  s   g | ]	\}}| |qS rd   )r   )rn   rN   rangesrd   rd   re   r   R  s    z)SIMDKernel.set_ranges.<locals>.<listcomp>)r   r   rK  r   rd   rd   re   
set_rangesP  s   
zSIMDKernel.set_rangesgroupsIterable[sympy.Expr]Sequence[Sequence[sympy.Expr]]Stuple[list[list[sympy.Expr]], list[list[Callable[[list[sympy.Expr]], sympy.Expr]]]]c              
     s  t dd |D rdd | D g fS tjjdd | D  fdd| D t d fdd}ddd}g }d}|D ]~}g }|D ]r}|drU|dd  qE|tk ru| dru|d7 }|tk ru| dsc|d tk r	|| r
|| st| }	t|| }
|||
|||	||d |
 qE|t||| qE|| q?t dd D sJ d d|  |fS )Nc                 s  s    | ]	}t |d kV  qdS r   Nr   )rn   rN   rd   rd   re   r   ^  r  z5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>c                 S     g | ]}g qS rd   rd   )rn   grouprd   rd   re   r   _      z6SIMDKernel._split_iteration_ranges.<locals>.<listcomp>c                 S  ra  rd   rd   )rn   _rd   rd   re   r   b  rc  c                   s   g | ]}  |qS rd   r   )rn   g)svrd   re   r   c  rq   r)  r   r   rV   r\   c                   sF    |}|  |stt|  || <  |  | tS r^   )r   statically_known_multiple_of	CantSplitr   r   r   )r)  r   
new_ranges	remainingrg  	var_countrd   re   	add_rangef  s   
z5SIMDKernel._split_iteration_ranges.<locals>.add_rangesizeidx1idx2(Callable[[list[sympy.Expr]], sympy.Expr]c                   s   d fdd}|S )N	flat_varsr   r\   rV   c                   s   |    |   S r^   rd   )rs  rp  rq  ro  rd   re   getterr  r   zISIMDKernel._split_iteration_ranges.<locals>.make_combined.<locals>.getter)rs  r   r\   rV   rd   )ro  rp  rq  ru  rd   rt  re   make_combinedo  s   z9SIMDKernel._split_iteration_ranges.<locals>.make_combinedr   r6   c                 S  s   t jjS r^   )rz   r{   Zero)rd  rd   rd   re   r   }  s    z4SIMDKernel._split_iteration_ranges.<locals>.<lambda>c                 s  s"    | ]}t jj|d kV  qdS )r6   Nr5   r   r   r   r   rd   rd   re   r          zfailed to set ranges  )r)  r   r   rV   r\   r   )ro  rV   rp  r   rq  r   r\   rr  )r   r5   r   r   r   r   r   r   r   statically_known_gtrh  ri  r   operator
itemgetter)r[  r   rn  rv  return_getters_groupscurrent_grouplength_groupreturn_gettersro  size1size2rd   rj  re   _split_iteration_rangesW  sb   
	z"SIMDKernel._split_iteration_rangesreduction_numelc                 C  sj   t jj}t|d dkr!|t|t|d | r!|d |gf}z	| || W dS  ty4   Y dS w )Nr6   r   TF)r5   r   r   r   r   r0   r  ri  )clsr[  r   r  r   rd   rd   re   is_compatible  s   zSIMDKernel.is_compatiblelist[list[sympy.Expr]]c                 C  sP   dd | j D }| js|D ]}t|rtjj||< qg | }| ||| jS )Nc                 S  s   i | ]}|j |jqS rd   )rW   rU   )rn   rtrd   rd   re   rp     rq   z3SIMDKernel.split_and_set_ranges.<locals>.<dictcomp>)	r   r   r-   rz   r{   r|   r   map_kernel_groups_to_node_sizesrZ  )ra   r   r   rW   r[  rd   rd   re   split_and_set_ranges  s   zSIMDKernel.split_and_set_rangesc                   sf   t |t |krtdd t||D r|| S | ||\}}g tj||   fdd|D S )a  
        We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1).

        To do this we need to split up the iteration space of i0 into something like:
            for i1 in s0:
              for i2 in s1:
                i0 = i1*s1 + i2
                ....

        This function matches and resplits lengths to the groups of
        this kernel to enable tiled + non-tiled fusions.
        c                 s  s.    | ]\}}t jjt|| d kV  qdS r_  r5   r   r   r   r0   )rn   rI   rf  rd   rd   re   r     s
    
z=SIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<genexpr>c                   s   g | ]} fd d|D qS )c                   s   g | ]}| qS rd   rd   )rn   fnr   rd   re   r     r   zISIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<listcomp>.<listcomp>rd   )rn   fnsr  rd   re   r     s    z>SIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<listcomp>)r   r   rK  r  r   r5  r6  )r  r[  r   rZ  rk  r~  rd   r  re   r    s   z*SIMDKernel.map_kernel_groups_to_node_sizesc                 C  s   t |tjS r^   )r   r   TMPra   r   rd   rd   re   is_indirect_indexing  s   zSIMDKernel.is_indirect_indexingc                   s   |  |rdS dgt| j }|jD ] }|| jvrq| j| }t|jts'J ||jj  |j	9  < qt
jjj t fddt|| j D S )NFr6   c                 3  s$    | ]\}} | |kV  qd S r^   rd   )rn   	idx_range
iter_rangere  rd   re   r     s
    
z,SIMDKernel.is_broadcasted.<locals>.<genexpr>)r  r   r   r   r   r   r   r[   r   rN   r5   r   r   r   anyrK  r   )ra   r   index_numelsrk   entryrd   re  re   is_broadcasted  s   




zSIMDKernel.is_broadcastedc                 C  s4   t |trddt| j| dS | | |S )a  
        Convert an index expr to a string that can be used in output code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the generated kernel.

        Index expressions often need to be passed in as arguments to the triton kernel.
        Rename_indexing and codegen_indexing keep track of the needed indices and add
        new parameters to the function signature.
        r9  r   r:  )r   listr;  mapindex_to_strr   rename_indexingr  rd   rd   re   r    s   
	zSIMDKernel.index_to_strc                 C  s   |  |}t|tjjj}t|tj	st|tj
r%|tjjj}t|tj
rV|tj
D ]"}|j}t|dkrUtdd |D rU|tjj|i}t||}q3|  |}t|tsb|n|jd }| |S )Nr   c                 s  s"    | ]}t |tjtjfV  qd S r^   )r   r   r   PRECOMPUTED_SIZEr   rd   rd   re   r     s
    
z.SIMDKernel.prepare_indexing.<locals>.<genexpr>)r   r1   r5   r   r   precomputed_replacementsr   atomsrz   floorceilingsubsr   r   lookup_precomputed_sizer   r   r   codegen_indexing)ra   r   ar   replacements
simp_indexrd   rd   re   prepare_indexing  s"   
 


zSIMDKernel.prepare_indexingreorderc                   s    fdd j D }|rIt|dkrItdd |D }ddd |d | D d| d  ks=J d	d |d | D t|d | |d |< |S )
Nc                   s   g | ]
}|j r
 jr|qS rd   )ri   r   rn   trh   rd   re   r   2  s    z1SIMDKernel.active_range_trees.<locals>.<listcomp>r6   c                 s  s    | ]}|j d v V  qdS )xyzNr   r  rd   rd   re   r   6  s    z0SIMDKernel.active_range_trees.<locals>.<genexpr> c                 s  s    | ]}|j V  qd S r^   r   r  rd   rd   re   r   7  s    zyxc                 S     g | ]}|j qS rd   r   r  rd   rd   re   r   7  s    )r   r   r	  r;  r   )ra   r  treesr   rd   rh   re   active_range_trees1  s   
2
zSIMDKernel.active_range_treesr   c                 C  s   t jj||  }t|jtdD ]6}|| jv rGi }| j| 	 D ]}t jj
|||< q!t|dkr@t| j| j|| j| _| j|   q|S )Nr   r   )r5   r   r   r   rS   sortedr   rP   r   r   r  r   r1   r   r   )ra   r   symr  psrd   rd   re   r  =  s   

zSIMDKernel.codegen_indexingc                 C     t d)NzNYI: codegen_nan_checkr  rh   rd   rd   re   codegen_nan_checkN     zSIMDKernel.codegen_nan_checkr   Optional[IRNode]c                 C  r  )NzNYI: call_kernelr  )ra   rO   r   rd   rd   re   call_kernelQ  r  zSIMDKernel.call_kernelr  Union[str, OpsWrapper]r   Iterator[str]c                 c  s\    | j }| j}|rt||}t|}|| _ || _z|V  W || _ || _dS || _ || _w )z:Context manager to add an additional mask to tl.load/storeN)
_load_mask_load_otherr3   logical_andr4   _unwrap)ra   r  r   r1  	prior_valrd   rd   re   
mask_loadsT  s   

zSIMDKernel.mask_loadsc                 C  s\   dd | j  D }t||}i }| jD ]}t|j}t||dit||di ||< q|S )a\  
        This gets the stride of the index for each of the tiling variables
        (technically, it does it at index 0)

        For example, if
        xindex = x0 + 512*x1 + 1024*r0
        x0 = (xindex//512)
        x1 = (xindex % 512)
        r0 = rindex // 1024

        this function would return
        {xindex: 512, rindex: 1024}
        c                 S  s   i | ]\}}||j qS rd   )r   )rn   kvrd   rd   re   rp   v      z2SIMDKernel.get_strides_of_load.<locals>.<dictcomp>r6   r   )r   rr   r1   r   r/   rO   )ra   r   index_to_tile_indexesindex_in_tile_varsstrides
range_treer   rd   rd   re   get_strides_of_loadh  s   


zSIMDKernel.get_strides_of_loadc                 C  s    t |trtt| |S | |S r^   )r   tupler  )r  r   rd   rd   re   _map_tuple_or_scalar  s   
zSIMDKernel._map_tuple_or_scalarc                 C  s0  g }t t| jj }| j \}}}}| j }tj	j
t| j }t|D ]i\}}||vr8|d q*tj	|}	tj	j
|	}
|
|krxtt  }d}|| D ]}t|ttfrj|d|  |d7 }qT||j qTt || }n|
}tj	|}t|}||| dt||k    q*t|S )a+  
        Try the best to estimate the total size (in bytes) of the
        kernel's inputs and outputs, which is used for estimating the memory
        throughput of this kernel. This information is used for checking how
        far we are from the peak memory bandwidth. It's important that
        we want to avoid overestimating the sizes of the inputs and outputs,
        because it can wrongfully give us a very large memory traffic value,
        which may be even larger than the theoretical bandwidth and thus
        become very misleading. This is particularly problematic for cases
        where we slice some inputs. In those cases, we should only count
        the size of the "slices" instead of the original inputs, because
        only the slices contribute to the real memory traffic.
        r   no_index_dep_r6   )r   r2   r   inplace_buffersr   python_argdefsr   buf_accessesr5   r   r   r   r0   r   r  r   	get_numelr   r   r   r   r   r   r   	get_dtyper*   r   r	  )ra   nbytesninplace_argsrd  	call_argsr  	out_numelr)  r   	arg_numelbuf_sizer-  no_index_dep_countdeprU   r  
dtype_sizerd   rd   re   estimate_kernel_num_bytes  s2   



 z$SIMDKernel.estimate_kernel_num_bytesc                 C  st  t | jjdkrt | jjdkrt | jjdkrdS | j \}}}}d}|D ]}tj|}|s2q'|	 }	t |	j
dkrt dd |	j
D dkrJq't|	j}
|du rW|
}q'||
krtd| d	d
|
 d|  }t| dd |D }dd |D }dd |D }dd |D }td| d| d| d| d| d }t|  dS q'td| d}t| dS )zr
        Print message if the kernel have mixed layout inputs.
        Only care about 4D tensor for now.
        r6   r   N   c                 S  s   g | ]}|d kr|qS )r6   rd   rn   rI   rd   rd   re   r     r  z.SIMDKernel.warn_mix_layout.<locals>.<listcomp>r   zExpected stride order z, but found stride orderrz  z for kernel c                 S  s4   g | ]}t j|rtt j| jnd qS r^   )r5   r   try_get_bufferr   get_stride_order
get_buffer
get_layoutstridern   rO   rd   rd   re   r     s    
c                 S  s.   g | ]}t j|rt j| jnd qS r^   )r5   r   r  r  r  ro  r  rd   rd   re   r     s    
c                 S  s0   g | ]}|t jjv rd n	|t jjv rdndqS )
GraphInputIntermediateBufferN)r5   r   graph_inputsname_to_bufferr  rd   rd   re   r     s    c                 S  r  rd   r   r  rd   rd   re   r          z  param names z
  buf names z
  strides z	
  sizes z
  sources 
z%All the inputs for the triton kernel z have uniform layout)r   r   input_buffersoutput_buffersr  r  r5   r   r  r  ro  r   r  r  r$   logwarningr#   )ra   r   argdefsr  
_signaturerd  uniform_stride_orderarg_namebuflayoutstride_ordermsgstride_order_list	size_listsource_listargdef_namesrd   rd   re   warn_mix_layout  s\   

	

zSIMDKernel.warn_mix_layoutc           	      C  sp   t ||d|}d| _t | jj|}t ||}d| _t ||}t ||}t ||d|}t	
|||fS )Nr	  FT)r3   	reductionr   
index_exprr   r  truedivsubmulr4   r  )	ra   r  r   sum_rnumelmeandxdx2m2rd   rd   re   welford_reduce_fallback  s   z"SIMDKernel.welford_reduce_fallbackc                 C  sD   t ||d|}t ||}t |}t ||d|}t||fS )Nmaxr	  )r3   r  r  expr4   r  )ra   r  r   vmaxr  r  vsumrd   rd   re    prepare_softmax_twopass_fallback  s
   
z+SIMDKernel.prepare_softmax_twopass_fallbackc                 C  r  r^   r  rh   rd   rd   re   codegen_kernel  r  zSIMDKernel.codegen_kernelc                 C  r   r^   rd   rh   rd   rd   re   rR  "  r  zSIMDKernel.codegen_bodyr  r   c                 C  r   r^   rd   )ra   r  rd   rd   re   r   %  r  z)SIMDKernel.codegen_iteration_ranges_entry)NNN)r   r   r   r@   r   r   r   r   r   r   r\   r]   r   )r  r  r\   rP   r   rt   )r   r   r   rf   ri   rf   r   r   r  rf   r\   r  )r   r+  r\   r]   )r-  r.  r\   r]   )rO   rP   r   rV   r   r8   r\   r]   )r\   rT   )r)  r   r\   rP   )r\   r=  )r   rV   r\   rV   )r   rV   r   r[   r\   rV   )r\   rP  )r   rV   r\   rR   )r[  r\  r   r]  r\   r^  )r[  r\  r   r]  r  rV   r\   rf   )r   r]  r\   r  )r[  r.  r   r]  r\   r  )r   rV   r\   rf   )r   rV   r\   rP   F)r  rf   r\   r  )r   rV   r\   rV   r   r^   )rO   rP   r   r  r\   r]   )r  r  r   r   r\   r  )r   rV   r\   rT   )r  r   )>rv   rw   rx   ry   pexprr   __annotations__r   r`   r}   r(   r   r
  r  r  r  r*  r  r/  r2  r   r   rS   r7  r<  rA  rC  r   r   rG  rX  rZ  staticmethodr  classmethodrz   r{   r|   r  r  r  r  r  r  r  r  r  r  r  rV  rW  r  r  r  r  r  r  r  r  rR  r   r~   rd   rd   rb   re   rY   O  s|   
 /



6













	

L




&



?GrY   c                   @  s.  e Zd ZU eZded< dd Zdd ZeZeZ	dd Z
dQddZedRddZdSddZdTddZdd ZdddUd!d"Zd#d$ Z	dVdWd,d-Zd.d/ Zeed0dXd2d3ZedYd8d9ZedZd<d=Zed[d?d@Zed\dBdCZeejjfd]dDdEZ dFdG Z!d^dHdIZ"dVdJdKZ#dLdM Z$dNdO Z%dPS )_SIMDSchedulingz	type[Any]kernel_typec                 C  s   t dd |D S )Nc                 s  s"    | ]}t jjt|V  qd S r^   r  r   rd   rd   re   r   -  ry  z*SIMDScheduling.group_fn.<locals>.<genexpr>)r  rB  rd   rd   re   group_fn,     zSIMDScheduling.group_fnc                   sd  t |tjst |tjrtj||S |j\}\}}|j\}\ t||}| r7| s7| r6|d n| rG| sG| rG|d | rc| rc| koV|k}|sa|d| | |S | s7| s7| kru|ks| s|d| | dS |	 D ]+}| r n$|
 | @ sq|j\}\}	}
||	kr||
ks|d||	||
  dS qt||fdD ]\}}| rt | t}|s|| d |  S q| |	 ||}| |	 ||}| |	 |	  ||}tjjr5d}t|d	krt|d	kr||  ko|kn  }n||k}nt|d	kr)||k}|s5|d
||| dS dS | s| r|dkrKdksMJ |  krt fdd|	 D si|d dS tjjr| st| |	 | |df dffv }|s|d |S dS | kr|d | kS | r| rJ | ||S )z
        Hook called by Scheduler to determine if the Triton backend
        can fuse node1 and node2.  These nodes might already be
        FusedSchedulerNodes.
        z&Split scan cannot fuse with reductionsz1numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)z5numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)Fz:numel/rnumel mismatch prologue mismatch (%s, %s), (%s, %s))node1node2z is not TritonTemplateBufferTr   ztiling mismatch (%s, %s, %s)r6   c                 3  s$    | ]}t  f| V  qd S r^   )rY   r  
get_rangesr   numel2rnumel2rd   re   r     s
    
z*SIMDScheduling.can_fuse.<locals>.<genexpr>z"nodes numel/rnumel incompatibilityzinvalid tiling for reductionznodes numel incompatibility)r   r   ForeachKernelSchedulerNodecan_fuserb  r'   is_split_scanri   is_template	get_nodesused_buffer_namesget_buffer_namesrK  get_template_noder!   select_tilingr   triton tiling_prevents_pointwise_fusionr   r    tiling_prevents_reduction_fusionr  r   can_fuse_horizontal)ra   r  r  rd  numel1rnumel1whyreduction_can_fuser   	pro_numel
pro_rnumelr   	node_nameis_triton_templatetiling1tiling2tiling3condis_reduction_tiling_validrd   r  re   r  /  s   





zSIMDScheduling.can_fusec              	     sb  g t tj  t t  t t   d fdd}fdd} fdd} fdd}tj fd	d
}fdd}	|D ]h}
|
v rMqF|
 ||
r|	|
rn|  W d    n1 siw   Y   r{||
s{pytnd ||
 qF||
r|  |
 W d    n1 sw   Y  qFt	d d d|
j
d  S )Nc                   s2   | j \}\}}| kr|kp|  ko|dkS Nr6   rb  r   rd  
node_numelnode_rnumelrU   r  rd   re   fits_in_main_body  s   z@SIMDScheduling.generate_node_schedule.<locals>.fits_in_main_bodyc                   s&   | j \}\}}| ko|dkodkS r6  r7  r8  r;  rd   re   fits_outside_reduction  s   zESIMDScheduling.generate_node_schedule.<locals>.fits_outside_reductionc                   s"   | j jD ]
}|j v r dS qdS )NTF)read_writesreadsrO   )r   read)current_loop_buffer_usagerd   re   expect_improved_memory_usage  s
   
zKSIMDScheduling.generate_node_schedule.<locals>.expect_improved_memory_usagec                   s    |  |   dd | jjD  |  r8t| tjr8t| j	t
jr8t| j	jt
js8 |   d S  dd | jjD  d S )Nc                 S  r  rd   r   r  rd   rd   re   r     r  zXSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loop.<locals>.<listcomp>c                 S  r  rd   r   r  rd   rd   re   r     r  )r   r   updater>  r?  ri   r   r   SchedulerNoder   r   ComputedBufferdataScanget_namewrites)r   )rA  donenode_schedulenot_ready_yet_nodesrd   re   schedule_node_in_loop  s   


zDSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loopc                   3  sn    rd t u r  nt r%t d t  d d V  t       d S )NrQ  r6   )r>   popr   r=   insertclearrd   )rA  maybe_split_indexrK  rL  rd   re   end_current_reduction_loop  s   


zISIMDScheduling.generate_node_schedule.<locals>.end_current_reduction_loopc                   s<   dkrdS  | j @ sdS |rt|d ttfrJ t S )Nr6   FrQ  )	ancestorsr   r>   r=   rf   )r   rK  )rL  r  rd   re   #requires_closing_previous_reduction  s   
zRSIMDScheduling.generate_node_schedule.<locals>.requires_closing_previous_reductionzunexpected group: (r   z) != r6   )r   r   r%   rP   rV  rW  r   r   r   r  rb  )ra   r   rU   r  r<  r=  rB  rM  rR  rT  r   rd   )rA  rJ  rQ  rK  rL  rU   r  re   generate_node_schedule  sD   





z%SIMDScheduling.generate_node_scheduler   <Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode]c                 C  sN   |  }t|dd dj\}\}}| |||}td| | t|||S )zK
        Given a set of pre-fused nodes, generate a Triton kernel.
        c                 S     t |  S r^   r   ri   r   rd   rd   re   r   #      z-SIMDScheduling.codegen_node.<locals>.<lambda>r   zSchedule:
 %s)r   r  rb  rU  schedule_logdebugcodegen_node_scheduler@   )ra   r   r   rd  rU   r  rK  rd   rd   re   codegen_node  s   
zSIMDScheduling.codegen_noderU   rV   buffers<Iterable[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]r\   rf   c                 C  sl   t t jj}t| sdS dd |D }tdd |D sdS tjj	| | |D ]
}tjj	|| q)dS )NFc                 S  s    g | ]}|  r|  qS rd   )has_tensor_outputr  storage_size)rn   r  rd   rd   re   r   8  s    
z9SIMDScheduling.can_use_32bit_indexing.<locals>.<listcomp>c                 s  r  r^   )r)   )rn   ro  rd   rd   re   r   >  r  z8SIMDScheduling.can_use_32bit_indexing.<locals>.<genexpr>T)
torchiinfoint32r  r)   r   r5   r   r   	guard_leq)rU   r^  int_max	buf_sizesro  rd   rd   re   can_use_32bit_indexing,  s   z%SIMDScheduling.can_use_32bit_indexingkernel_featuresr@   c              	   C  s"  |j }| ||j|j}| ||gd|i}|D ]}| || qt| |D ]9}t	| |
 }W d    n1 s>w   Y  | |||}tjjrSt|| td| ||_t||_q(~t|dkrnt|}n|\}t	| | D ]}	|	  q{W d    n1 sw   Y  | | ||j tjr|  tjr||d j tj j|jO  _tj j|jO  _tjjj rtj!r|d j"# }
| D ]5}	|	$ }||
vrq|	j%d usJ |	j%& }|d ur
t'd d  d7  < tjj(d|j)d| d	 q| *  d S )
Nr   z+Generating kernel code with kernel_name: %sr6   r   inductorintermediate_hookszrun_intermediate_hooks(r   r   )+rK  r$  rU   r  create_kernel_choices!codegen_node_schedule_with_kernelr<   merge_workspaces_inplacer5   set_kernel_handlerr  define_kernelr   traceenabledr.   r  r[  r   r   r   scheduler_nodesmark_runcodegen_commentr  nan_assertsr  r  r   removed_buffersinplaced_to_removewrapper_codesupports_intermediate_hooksgenerate_intermediate_hooksr   live_output_buffersrH  r   get_origin_noder   	writelinerO   free_buffers_in_scheduler)ra   ri  rK  r   kernelsrX   src_coder   final_kernelr   	live_outsrO   origin_noderd   rd   re   r\  H  st   







z$SIMDScheduling.codegen_node_schedulelist[SIMDKernel]c                 C  s   | j |i |gS r^   )r  )ra   ri  kernel_argskernel_kwargsrd   rd   re   rl    s   z$SIMDScheduling.create_kernel_choicesc              	   C  s   |t t  }i }|D ]0}|tu r||  q|tu r"|  q|  ||	 }|
t|j|  q||  |D ](}|tu rS||  qE|tu r\|  qEt|j ||	 }|| qEW d    d S 1 syw   Y  d S r^   )rV  	ExitStackr=   enter_contextrX  r>   closedecide_inplace_updater  r  rC  r4  fromkeys_bodyindexing_from_argsr   r/  keysr"   r   )ra   rK  rX   stackall_indexingr   r   rd   rd   re   rm    s4   


"z0SIMDScheduling.codegen_node_schedule_with_kernelFonly_gen_src_codeOptional[str]c                C  s  |j \}\}}|dksJ |j|j\}}	i }
| }g }|D ]+}| }|| ||@ rKt|dks7J ||
tt|< |j	
tt| g }q t|dksTJ | |se|g|D ]}|  q^|	 }|d |D ]}|||  qp|jt  W d   n1 sw   Y  |j D ]|\}}d| d}|
| g  }rtdd |D }td	| N ||9 |D ]'}t| dkrt|dkrt|r| j| O  _|||  q|jt  W d   n1 sw   Y  W d   n	1 sw   Y  qW d   n	1 s w   Y  t|ts7|d
 |jddd t | |j! D ]}d| d}|j|dd qB|d t|trb|}n|d |j"}W d   n	1 suw   Y  g |||}tj#r|$ d }|%  d| d|&|'  }|r|W  d   S | (|||}tj)j*rt+|| W d   n	1 sw   Y  | ,| |-||j tj. j/|j/O  _/tj. j0|j0O  _0| 1  dS )z
        Codegen a triton template

        If `only_gen_src_code` the src code will be returned instead of codegen'd into the wrapper
        r6   r   z<STORE_OUTPUT>Nz<LOAD_INPUT_>c                 s      | ]}|  V  qd S r^   )can_codegen_without_upcasts)rn   p_nrd   rd   re   r     s    
z2SIMDScheduling.codegen_template.<locals>.<genexpr>ztriton.codegen_upcast_to_fp32z<DEF_KERNEL>z	<ARGDEFS>F)strictg    eAr  )2rb  r   make_kernel_renderr!  r"  r   r   r   iterprologue_fused_inputsr   rt  set_subgraph_bodyr   r  r  cse
invalidater   named_input_nodesrr   r   rH  r   r   patchr   #prologue_fused_inputs_preserve_zeror   rP   finalize_hookr5   ro  r  codebenchmark_kernelr  imports_for_benchmark_kernelcodegen_kernel_benchmarkgetvaluerp  rq  rr  r.   ru  r  r   rw  rx  r  )ra   template_nodeepilogue_nodesprologue_nodesr  rd  _numelr  rX   renderbuf_name_to_prologue_grouptemplate_readsprologue_groupprologuenamesr   partial_code
input_namebuffersubgraph_namecan_codegen_without_upcastprologue_noder  rK  num_gbr   rd   rd   re   codegen_template  s   


.





zSIMDScheduling.codegen_templatec                 C  s   t jjt jj  d S r^   )r5   r   ry  r~  
device_opssynchronizerh   rd   rd   re   codegen_sync-  s   zSIMDScheduling.codegen_syncsubkernel_nodeslist[BaseSchedulerNode]custom_part_algorithmenable_autotunemixed_sizesr  list[tuple[str, Any, Any]]c              
   C  s  ddl m} dd |D }i i }}	t||D ]6\}
}t|dd dj\}\}}| |||}| |||}||||f|	|
< |j|t|||| d||
< q|j	|| |||	d	}t
d
t|dd |D  g }|D ]s}dd |D }|||d}t||D ]R\}
}| |	|
 d |||
  ||
 }|	|
 d }|st| t|D ]}|  qW d    n1 sw   Y  tj j|jO  _tj j|jO  _q~| }||||f qj|S )Nr6   )ComboKernelc                 S  r   rd   r   rn   r   rd   rd   re   r   :  r   z=SIMDScheduling.generate_combo_kernel_code.<locals>.<listcomp>c                 S  rW  r^   rX  r   rd   rd   re   r   =  rY  z;SIMDScheduling.generate_combo_kernel_code.<locals>.<lambda>r   )r   optimize_mask)r   triton_schedulingcustom_algorithm
kernel_mapnode_info_mapz1ComboKernels: %d nodes partitioned into %s groupsc                 S  s   g | ]}t |qS rd   r`  )rn   prd   rd   re   r   Q  r   c                 S  r   rd   r  r  rd   rd   re   r   U  r   )r  r  r   )triton_combo_kernelr  rK  r  rb  rU  r$  create_triton_kernelr@   horizontal_partitionr  r[  r   rm  create_sub_kernelr5   ro  r?   
only_nodesrt  r   rw  rx  r  r   )ra   r  r  r  r  r  r  fused_node_listssubkernel_mapnode_schedule_mappnr   rd  rU   r  rK  r   
partitionskernel_code_list
node_grouprX   	subkernelr   r  rd   rd   re   generate_combo_kernel_code0  sd   



z)SIMDScheduling.generate_combo_kernel_codec                 C  s   |  }|j}|j}tjdkptjdko|}| ||||}|D ]!\}}}	| ||g|}
| |g t	d|
 |
tjj|
 q |   d S )Nr6   z"ComboKernels: generated kernel %s.)get_subkernel_nodesuse_custom_partition_algor  r   combo_kernel_allow_mixed_sizesr  rp  ru  r  r[  r  r5   r   ry  r  )ra   combo_kernel_noder  r  r  r  r  r  rX   rd  r   rd   rd   re   codegen_combo_kernelm  s   
z#SIMDScheduling.codegen_combo_kernel    list[CandidateTiling]c           	        s   dk}d
 fdd}|  \}t|dkr!tdkr!g S |  \}|||r-|n||} fdd	|D }|S )Nr6   is_pointwiserf   r\   r  c                   s  t |jt |ksJ d|jd||j|jg}tdd tj|D s)J dd tj|D }tdd |jD }ddd}t	 
||g| dddg}|D ]}tjj|j|j}	t |	t |ksjJ z |	dd }
|
t |krzW qTtdd |	|
d D rW qTW n	 ty   Y qTw ||d|
 |||
d f}tjjtdd t||	D }|j|v r|d9 }t	|d r|d9 }t	|d r|d9 }tjj|tt| dkr|t	 
||d|
 |||
d g||jd qT|S )zX
            Compute tiling candidates by dividing up the iteration ranges.
            zrw.range_vars=z ranges=c                 s  s    | ]
}t |ttfV  qd S r^   )r   r   r   rn   r  rd   rd   re   r     s
    
zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>c                 S  s(   g | ]}|j tjjvrt|tr|qS rd   )rO   r5   r   rw  r   r   r  rd   rd   re   r     s    zISIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<listcomp>c                 S  r  rd   r   r  rd   rd   re   r     r  rY  r.  r\   rV   c                 S  s   t jjt| S r^   r  )rY  rd   rd   re   collapse_ranges  r  zNSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.collapse_rangesnoner   )r   rO   scorer6   c                 s  s    | ]}|d kV  qdS r_  rd   r   rd   rd   re   r     r  Nc                 s  s     | ]\}}|d kr|V  qdS r_  rd   )rn   ro  r  rd   rd   re   r     s    r   r   r  rO   )rY  r.  r\   rV   )r   
range_varsr?  rI  r   r   r5  r6  r   CandidateTilingcreate_partial_tilingr5   r   r   stride_hintsr   
ValueErrorr   r0   rK  rO   is_good_sizer   )r  rY  rwdep_sourcesdepswrite_namesr  tilingsr  r  splittiled_groupsr  )r  r  reduction_rangesrd   re   tile_ranges  s   (




z5SIMDScheduling.candidate_tilings.<locals>.tile_rangesc                   s*   g | ]}t  |j|j|jd qS )r  )r  complete_partial_tilingr   r  rO   )rn   r   )r  rU   r  rd   re   r     s    z4SIMDScheduling.candidate_tilings.<locals>.<listcomp>)r  rf   r\   r  )r  r   "pointwise_or_reduction_read_writes)	r  r   rU   r  r  r  pointwise_rangespartial_tilingsfull_tilingsrd   )r  rU   r  r  re   candidate_tilings  s   ^
z SIMDScheduling.candidate_tilings	pw_tilingr.  reduction_tilingr   c                 C  sF   g dt | d }ddgdt | }tg t||t||S )zK
        Create a tiling dict from pointwise and reduction splits.
        )rG   rH   rI   NrJ   rK   )r   r   rK  )r  r  r  pw_prefixesreduction_prefixesrd   rd   re   create_tiling  s
   zSIMDScheduling.create_tilingr   r  c                 C  s   |  |r|ng |s|S g S r^   )r  )r  r   r  rd   rd   re   r    s   
z$SIMDScheduling.create_partial_tilingr  c           	      C  sH   t | }d|v }|| }|t| g}|r||fn||f}| j| S )zb
        Given a tiling for only pointwise or reduction dimensions, adds the missing one.
        rI   )r  r   r0   r  )	r  r   rU   r  splitsr  total_numelmissing_tilingtiling_argsrd   rd   re   r    s   

z&SIMDScheduling.complete_partial_tiling"list[dict[str, tuple[sympy.Expr]]]c              
   C  s   |dk}t tttjf   }t|D ]}t|tj	sq|
 }|s+t|d dkr+q||r0dnd }|g}	dd |j D }
|
D ]~}g |j }tjj}tjj}t|D ]\}\}}||9 }|||ri nqW|||sqqB|d }|r}|d| n||d }g }|D ]3\}}t|j|}td|t|t t|}t||||}|dur|d n|g}| | q|	!| qB|	D ]2}tdt|t"j#j$ }|d }t%|d| }|ft&||d  }|'| (| )|||| qqt*|tdd}|S )	z
        Creates N-dimensional tiling candidiates, attempting to simplify loads/stores
        by tiling the kernel into higher dimensions.

        Returns a list of tilings ranked by dimensionality.
        r6   r   c                 S  s(   g | ]}t |trt|jd kr|qS )r   )r   r   r   rY  r  rd   rd   re   r   K  s    z1SIMDScheduling.get_nd_tilings.<locals>.<listcomp>Nr   T)r   reverse)+r   r4  rP   rz   Exprr>   filterr   r   rD  r  r   r>  reads_and_writesrY  rr   r{   r|   r5   r   r   r  statically_known_geqr   r7   get_subexpr_involving_symbolr   r  r   r   r   match_mod_div_block_exprr,  r   r   r%  	max_tilesr0   r  r   r  r  r  )r  rK  pointwise_numelr  r  r  r   node_rangesranges_to_tilenode_tilingsmemory_depsr  all_var_rangespointwise_vars_numelr   pointwise_end_idxvarrU   reduction_start_idxrS   index_tilingr   num_dimsmatch_resultdimsnode_tilingnum_leading_dimsfirst_trailing_dimcollapsed_leading_dimcollapsed_splitsranked_tilingsrd   rd   re   get_nd_tilings+  s   


zSIMDScheduling.get_nd_tilingsc                   s   dk}|  |g g}|stjjrtjjdkrBtjtjkr@t	
|D ]}tjjs?t| || dkr?ttd  |S q#|S tt  }t }t	
|D ]&}| || D ]}	|	j|v raqY|	jdurl||	j ||	  |	j7  < qYqPdd | D }
tjjdkr|rddd}tdt|
D ]}||
d |
| }|dur|g|
 }
 nqt|
dkrtd|
 tjjr| || |
 }
|
D ]ttsJ t fdd|D r߈  S q|S )z
        Heuristics to decide how to tile kernels.
        Currently, we tile based on stride-1 dimensions.

        Returns:
            `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel`

        r6   r   z
                                Reduction over non-contiguous dims.
                                Consider setting config.triton.tile_reductions to True.
                                Nc                 S  s   g | ]\}}|j qS rd   )r   )rn   candidate_tilingr  rd   rd   re   r     s    z0SIMDScheduling.select_tiling.<locals>.<listcomp>r   tiling0r   r1  r\   Optional[dict[str, sympy.Expr]]c                 S  s   | d |  dd}}|d | dd}}tjj|| dkr#d S tjj|| dk r;||f||f\}}\}}tjj|| dksHJ tjj||sRd S |t|||| d d}|S )NrI   rH   r6   r   rJ   )rG   rH   rI   rJ   )r   r5   r   r   r   rh  r   )r%  r1  a0a1b0b1
new_tilingrd   rd   re   convert_tiling_to_3d  s   z:SIMDScheduling.select_tiling.<locals>.convert_tiling_to_3dzpossibly bad tiling: %sc                 3  s4    | ]}t |tjrtj |  d V  qdS ))r  N)r   r   rD  rY   r  r   r  r  r  r   rd   re   r     s    

z/SIMDScheduling.select_tiling.<locals>.<genexpr>)r%  r   r1  r   r\   r&  )r  r   r%  tile_reductionsr  perf_hint_loglevelloggingWARNINGr>   r	  r   r  infotextwrapdedentr   rP   collectionsr   rO   r   r  most_commonrangeprefer_nd_tilingr#  r   r4  r   )r  rK  rU   r  r  default_tilingr   
seen_namescandidate_tilesr$  r"  r,  r)  new_3d_tilingrd   r-  re   r$    sv    




	zSIMDScheduling.select_tilingc                 C  r   r^   rd   rh   rd   rd   re   flush  r  zSIMDScheduling.flushc                 C  r  r  rd   rh   rd   rd   re   ready_to_flush  r  zSIMDScheduling.ready_to_flushc              	   C  s2  t dd |D set|dd dj\}\}}| |||}| |||}| j|t|||d}| || t	d|! t
| | }	W d    n1 sPw   Y  W d    n1 s_w   Y  n)|d |\}
}}t	d| | j|||
d	d
}	W d    n1 sw   Y  |	ttjd}	|	S )Nc                 s  r  r^   )r  r   rd   rd   re   r     r  zASIMDScheduling.generate_kernel_code_from_nodes.<locals>.<genexpr>c                 S  rW  r^   rX  r   rd   rd   re   r     rY  z@SIMDScheduling.generate_kernel_code_from_nodes.<locals>.<lambda>r   )r   r  r   Tr  triton_)r  r  rb  rU  r$  r  r@   rm  r   r  r5   ro  r  get_prologue_template_epiloguer  replacerP   r,   KERNEL_NAME)ra   r   r  rd  rU   r  rK  r   rX   r  r  templateepiloguerd   rd   re   generate_kernel_code_from_nodes  s>   


 
z.SIMDScheduling.generate_kernel_code_from_nodesc                 C  r   r^   rd   )ra   rK  rd   rd   re   ru  4  r  zSIMDScheduling.codegen_commentc                 C  r  r^   r  )ra   r  rK  rX   rd   rd   re   rp  7  r  zSIMDScheduling.define_kernelN)r   rV  )rU   rV   r^  r_  r\   rf   )ri  r@   )ri  r@   r\   r  )r\   r  r  )r  r  r  rf   r  rf   r  rf   r  rf   r\   r  )r\   r  )r  r.  r  r.  r\   r   )r   r.  r  rf   r\   r   )r   r   rU   rV   r  rV   r\   r   )r\   r  )r\   r   rt   )&rv   rw   rx   rY   r  r  r  r  can_fuse_verticalr(  rU  r]  r  rh  r\  rl  rm  r  r  r  r  r  r   r   r  r  r  r  r#  rz   r{   r|   r$  r>  r?  rF  ru  rp  rd   rd   rd   re   r  )  sN   
  	
`

C
#v	={
qr

r  T)frozenc                   @  s6   e Zd ZU ded< ded< dZded< edd	 ZdS )
r  r   r   r   r  Nr  rO   c                 C  s"   t jj| } | dko| d dkS )z@Somewhat arbitrary heuristic used to boost scores for some sizesr  r   rx  )r   rd   rd   re   r  A  s   zCandidateTiling.is_good_size)rv   rw   rx   r  rO   r  r  rd   rd   rd   re   r  ;  s   
 r  c                   @  s   e Zd ZdS )ri  N)rv   rw   rx   rd   rd   rd   re   ri  H  s    ri  )r   r   r\   rP   )v
__future__r   r6  rV  dataclassesr   r   r1  r   r|  r4  r   typingr   r   r   r   r   r	   r
   typing_extensionsr   rz   rb  torch._loggingtorch.fx.immutable_collectionsr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   r   _dynamo.utilsr   r  r   r   r   analyze_preserves_zero_maskr   	codecacher   dependenciesr   r   r   r    r!   optimize_indexingr"   runtime.runtime_utilsr#   r$   r%   r&   r'   utilsr(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   virtualizedr3   r4   r5   block_analysisr7   commonr8   r9   r:   r;   multi_kernelr<   simd_kernel_featuresr=   r>   r?   r@   collections.abcrA   rB   rC   	getLoggerrv   r  _logginggetArtifactLoggerr/  rZ  
fusion_logdoprintr  r!  	dataclassrL   r[   r   r   r   rY   r  r  	Exceptionri  rd   rd   rd   re   <module>   s   $4
8{
>
     _        
