o
    IhU                    @   sZ  U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZmZm Z  d d	l!m"Z"m#Z#m$Z$ d
dl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZD ddlEmFZFmGZGmHZHmIZI ddlJmKZKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZSmTZTmUZUmVZV ddlWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZe ejfdkZgehddd ZiejjkeldZmeg dZnddddddd d!d"d"d#
Zoeg d$Zpd%d&d'd(d)d*d+d,d-d.d/
Zqd0d1d2ZrejsejtgZuejvejwejsejtejxejyejzej{ej|g	Z}e~ej ed3< ejwejsejtejyejzgZe~ej ed4< d5d6 Zd7d8 Z		dd9eej fd:d;Zd<d= Zd>eRd?ejd@edAejdBejdCeLfdDdEZdFeeeNf dGedHedIejdJeeef f
dKdLZdMeRdNedOefdPdQZejhd9ejdRejfdSdTZejhd9ejdRejdUefdVdWZejh	dd9ejdRejdUee fdXdYZejG dZd[ d[ZG d\d] d]e5ZG d^d_ d_ZG d`da daeUZedb G dcdd ddeZede e  G dfdg dgeZG dhdi dieSZG djdk dkeZG dldm dmeZdne/dCeeej exf fdodpZG dqdr drZG dsdt dteZG dudv dveZG dwdx dxeZG dydz dze2ZG d{d| d|ZG d}d~ d~ZejG dd dZejG dd dZdS )    N)Sequence)Enum)AnyCallablecastOptionalUnion)dependencies)is_float_dtypeis_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)free_symbol_is_typesymbol_is_typeSymT   )counters   )	codecacheconfigcpp_buildercpu_vec_isairmetrics)LoopBody)BaseSchedulerNodeBaseSchedulingExternKernelSchedulerNodeForeachKernelSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)cache_on_selfget_bounds_index_exprget_fused_kernel_namehas_free_symbolsis_multi_outputs_templateis_welford_reductionparallel_num_threadsPlaceholdersympy_index_symbolsympy_index_symbol_with_prefixsympy_product
sympy_subs)NullKernelHandleropsOpsValueV   )BackendFeatureBracesBufferCSECSEVariableDataTypePropagationDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferKernel
KernelArgsOpOverridesOptimizationContext)_get_dtype_from_loopbodies_get_loop_bodycexprcexpr_indexcodegen_randCppCSEVariableDTYPE_TO_CPP
INDEX_TYPELocalBufferContextmay_unify_binary_op_mask_typepromote_args(template_fusion_with_epilogues_supportedunify_mask_base_typevalue_to_cppwin32c                   C   s   t rdS dS )Nz__declspec(dllexport) _IS_WINDOWS rS   rS   O/var/www/vscode/kcb/lib/python3.10/site-packages/torch/_inductor/codegen/cpp.pyget_export_declarationW      rU   schedule)+*^||minmaxrX   rY   rZ   r\   r]   argminargmaxr[   welford)
sumprodxor_sumr\   r]   r^   r_   anywelford_reducewelford_combine)
r]   r\   ra   rb   rc   re   rf   r^   r_   rd   z
at::Tensorlongdoubleboolzstd::stringzc10::ScalarTypezat::MemoryFormatz
at::Layoutz
at::Devicez
at::Scalar)
Tensorintfloatri   str
ScalarTypeMemoryFormatLayoutDevicenumberzstd::vectorzstd::optional)Listr   VECTORIZABLE_DTYPESMASKED_VECTORIZABLE_DTYPESc                 C   s   |t v rtj}| dv rdS | dkrdS | dv r`t| }|tjkr)| dv r)ttj }t|r3d| dnd	| d
}t|rCd	| dnd	| d}| dv rO|n|}| dv rW|S d| d| dS t| rldt|  dS t| )N)rc   ra   rd   r   rb   r4   )r]   r_   r\   r^   r^   r_   -std::numeric_limits<>::infinity()std::numeric_limits<>::min()>::max())r]   r_   )r]   r\   IndexValue<z>{0, }Welford<>())	DTYPE_LOWP_FPtorchfloat32rG   ri   rl   r
   r)   AssertionError)reduction_typedtypecdtypemin_varmax_varinit_varrS   rS   rT   reduction_init   s6   


r   c                 C   sL   t t|  }t| rd| dS | dv r$|tjkrt tj }d| dS |S )Nr~   >rv   r|   )rG   r;   r)   r   ri   rl   )r   r   scalar_typerS   rS   rT   reduction_acc_type   s   

r   indexc           
   	   C   s  |t jk}| dkr|rdnd}| d| d| S | dkr$| d| S | dkr/| d| S | d	kr:| d
| S | dv rI|  d| d| dS | dkrVd| d| dS | dkr|t|tre|\}}}	nt| |\}}}	d| d| d| d|	 d	S | dv rt|dr|jt jkr|js|d ur|  d| d| d| dS |  d| d| dS |d ur|  d| d| d| dS |  d| d| dS t| )Nra   |rX    rb    * rc    ^ rd    || )r\   r]   z_propagate_nan(, )re   welford_combine(rf   , {})rv   r   z	_combine(z, static_cast<float>(), )))	r   ri   
isinstancetuplereduction_projecthasattrr   is_vecr   )
r   var
next_valuer   	src_dtypeis_boolconjunctionmeanm2weightrS   rS   rT   reduction_combine   sB   

r   c                 C   s:   t | r| d| d| dfS | dv r| dS |S )Nz.meanz.m2z.weightrv   z.index)r)   )r   accrS   rS   rT   r     s
   
r   codeiter_varnew_iter_var
loop_startloop_endreturnc              
   C   s   t  }t g}|dt d| dt| d| dt| d| d  ||  t| j	D ]3\}}t
|ttfs?J d}	t
|trL|j}	|j}td	|  d	 | |}
|	r`t|	|
}
||
 q2W d   |S 1 sqw   Y  |S )
a  
    f(iter_var) is transformed to f(new_iter_var) under the inner loop
      \/
    for (new_iter_var = loop_start; new_iter_var < loop_end; new_iter_var++) {
        f(new_iter_var)
    }
    Please be careful while using this function,
    as the variable defined in f(iter_var) will be invalid outside the for loop.
    For example:
    auto tmp0 = in_ptr[x0]; ->
    for (new_x0 = start; new_x0 < end; new_x0++){
        auto tmp0 = in_ptr[new_x0];
    }
    The tmp0 is invalid outside the loop.
    zfor (r    = ; < ; ++)N\b)r6   
contextlib	ExitStack	writelinerH   rD   enter_contextindent	enumerate_linesr   rm   r:   namelineresub)r   r   r   r   r   transformed_codestack_r   deferred_namenew_linerS   rS   rT   move_code_under_inner_loop	  s8   



r   acc_varacc_typer   r   lenc              
   C   sz   t  }t rd|  d| d| dn
| d|  d| d}||  |d| d	d
d|  d||| ddg |S )a  
    MSVC don't support dynamic array(VLA). So we use std::unique_ptr here.
    Ref: https://stackoverflow.com/questions/56555406/creating-dynamic-sized-array-using-msvc-c-compiler
    MSVC is the only one compiler without VLA. support. Since MSVC can't get good performance here.
    We just use unique_ptr make it works on MSVC.
    For other compilers, we continue to use VLA to get best performence.
    auto z_arr = std::make_unique<z[]>();r   _arr[];for (int i = 0; i < ; i++){    z
_arr[i] = r   r}   )r<   r   
is_msvc_clr   
writelines)r   r   r   r   r   init_fncode_bufferacc_declrS   rS   rT   reduction_prefix_array9  s   
r   bufferr   new_namec                 C   st   t | jD ]2\}}t|ttfsJ t|tr'td|  d | |j|_qtd|  d | || j|< qd S )Nr   )r   r   r   rm   r:   r   r   r   )r   r   r   ir   rS   rS   rT   replace_acc_nameZ  s   
 "r   r   c                 C   s6   |  |s	tjjS ||d i}t| |}t||  S Nr4   )hassympySZeror/   simplify)r   r   replacement	new_indexrS   rS   rT   	stride_ati  s
   

r   
vec_lengthc                    s   d d fdd}fdd}| }t jddd}| tr+| t||} t jd	dd}| trA| t|||} t | } | |krPt| S | S )
a  
    Simplifies the index expression within the range of a vectorized loop.
    Given a vectorized loop variable `var` in the range of a loop with `vec_length`,
    this function transforms the `index` into an equivalent form. It handles
    simplifications for cases where `var` can be expressed as `vec_length * a + b`,
    where `b` ranges from 0 to `vec_length - 1`. The function reduces occurrences
    of `FloorDiv` and `ModularIndexing` in the `index` with best-effort optimizations.

    NOTE:
    The simplified index expression is intended for analysis purposes only, not
    for code generation. It replaces `FloorDiv` and `ModularIndexing` with free variables
    which are not dependent on the loop variable `var` in the vectorized range. Check
    https://github.com/pytorch/pytorch/pull/117221#discussion_r1449746217 for more details.

    Examples:
    1. If `var` is `x3` and `vec_length` is 16, and `x3 = 16*a + b`, then
       `FloorDiv(x3, div)` or `ModularIndexing(x3, div, mod)` becomes a free variable
       when `div` is divisible by 16.
    2. `ModularIndexing(x3, 1, mod)` can be simplified to `x3 + c` where `c` is a free
       variable when `mod` is divisible by 16.
    r   c                    s:   t | }t| krt d  } d7  |S )N_div_cr4   )r   r   gcdSymbol)divisorresult)div_freevar_idr   r   rS   rT   visit_indexing_div  s
   
z7simplify_index_in_vec_range.<locals>.visit_indexing_divc                    sx   t | |}t| krt d  } d7  |S | dkr:t|kr:t d   } d7  |S )N_mod_cr4   )r   r   r   r   )r   modulusr   )mod_freevar_idr   r   rS   rT   visit_modular_indexing  s   z;simplify_index_in_vec_range.<locals>.visit_modular_indexingr   T)integerr   )r   Wildr   r   replacer   r   simplify_index_in_vec_range)r   r   r   r   r   original_indexdivmodrS   )r   r   r   r   rT   r   u  s   


r   c                 C   s   |rt | ||} t| |S N)r   r   )r   r   r   rS   rS   rT   stride_at_vec_range  s   
r   c                   @   s"   e Zd ZU dZeed< eed< dS )ParallelDepthz{
    A class representing parallel depth.
    Includes the starting depth of parallelism and the depth of parallelism.
    parallel_depthstart_depthN)__name__
__module____qualname____doc__rk   __annotations__rS   rS   rS   rT   r     s   
 r   c                       s`   e Zd ZededefddZdddeeee	f  f fdd	Z
d
d Zdd Zdd Z  ZS )OuterLoopFusedSchedulerNodenode1node2c                 C   s   |j |j u sJ tdd ||fD sJ tdd ||fD rF| |j t|tu r/t| n|gt|tu r@t|  |S |g |S | |j ||g|S )Nc                 s   "    | ]}t |tttfv V  qd S r   )typer  r#   r!   .0noderS   rS   rT   	<genexpr>  s    
z3OuterLoopFusedSchedulerNode.fuse.<locals>.<genexpr>c                 s       | ]	}t |tu V  qd S r   r  r  r  rS   rS   rT   r
        )	schedulerallrd   r  r  listget_outer_nodes)clsr  r  outer_loop_fusion_depthrS   rS   rT   fuse  s,   	
z OuterLoopFusedSchedulerNode.fuser  r"   outer_fused_nodesc                    sR   || _ || _g }| j D ]}t|ttfsJ |t|  qt 	|| d S r   )
r  r  r   r#   r!   extendr  	get_nodessuper__init__)selfr  r  r  flatten_snodes_node	__class__rS   rT   r    s   
z$OuterLoopFusedSchedulerNode.__init__c                 C      | j S r   )r  r  rS   rS   rT   r       z+OuterLoopFusedSchedulerNode.get_outer_nodesc              
      s   dt dt dtdtdtf
 fdd tt|d D ]}|| j}||d  j} |||d	s1 d
S q|D ]0}ttj	|j
d | }t|j
|krdt|tjrdt|j
| tjrd|d |j
| k rd d
S q4dS )Nleft_loop_nestright_loop_nestloop_fusion_depthcurrent_checking_depthr   c                    s   | j sJ |j s
J | j |  |j | g d}t fdd|D s&dS |dks,J |d  }dkrS|d }|t| j k sAJ |t|j k sJJ | |||sSdS dS )N)r   sizeoffsetstepsc                 3   s$    | ]}t  |t |kV  qd S r   )getattr)r  attr_compareleft_loop_levelright_loop_levelrS   rT   r
    s    
zaOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner.<locals>.<genexpr>Fr4   r   T)loopsr  r   )r"  r#  r$  r%  outer_loops_attr_compare_list_innerr+  rT   r1  	  s.   



zNOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._innerr4   r   F,  T)LoopNestrk   ri   ranger   	loop_nest	functoolsreduceoperatormulrangesr   r   Integer)r  cpp_kernel_proxy_listr  idxr"  r#  cpp_kernel_proxyouter_rangesrS   r0  rT   "check_outer_fusion_loop_level_attr   sR   	*


z>OuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attrc                    sP   |d j }t|} fdd|D |_|d }||j_|jjd  j |j_|S )Nr   c                    s   g | ]	}|j  jqS rS   )r5  from_loop_levelr  )r  proxyr   rS   rT   
<listcomp>\  s    zJOuterLoopFusedSchedulerNode.merge_outer_fusion_kernels.<locals>.<listcomp>)kernel_groupOuterLoopFusedKernelinnerr5  kernelr.  r  )r  r<  rD  outer_loop_fused_kernelouter_fused_proxyrS   r   rT   merge_outer_fusion_kernelsV  s   

z6OuterLoopFusedSchedulerNode.merge_outer_fusion_kernels)r   r   r   classmethodr   r  r  r   r!   r#   r  r  r@  rJ  __classcell__rS   rS   r  rT   r    s    #Vr  c                   @   s<   e Zd ZddefddZdd Zdd Zd	d
 Zdd ZdS )RecordOptimizationContextrP   	func_namec                 C   s   || _ d | _d | _d S r   )rN  current_nodeopt_ctx)r  rN  rS   rS   rT   r  i  s   
z"RecordOptimizationContext.__init__c                 C   sr   t jsJ t jjsJ t jj| _| jd usJ tj| jjv r'| jjtj | _nt | _| jd us2J | j| j_| S r   )	r3   interpreterrO  r@   keymetarP  rN  ops_namer   rS   rS   rT   	__enter__n  s   


z#RecordOptimizationContext.__enter__c                 C   s(   | j sJ | js
J | j| j jtj< d S r   )rO  rP  rS  r@   rR  r  exc_typeexc_valexc_tbrS   rS   rT   __exit__|  s   

z"RecordOptimizationContext.__exit__c                 C   r  r   )rP  r   rS   rS   rT   get_opt_ctx  r!  z%RecordOptimizationContext.get_opt_ctxc                 C      | j sJ | j S r   )rO  r   rS   rS   rT   get_fx_node     
z%RecordOptimizationContext.get_fx_nodeN)rP   )	r   r   r   rm   r  rU  rZ  r[  r]  rS   rS   rS   rT   rM  h  s    rM  c                   @   s  e Zd ZdZedd Zedd Zedd Zedd
dZedd Z	edd Z
edd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zed d! Zed"d# Zed$d% Zed&d' Zed(d) Zed*d+ Zed,d- Zed.d/ Zed0d1 Zed2d3 Zed4d5 Zed6d7 Zed8d9 Zed:d; Z ed<d= Z!ed>d? Z"ed@dA Z#edBdC Z$edDdE Z%edFdG Z&edHdI Z'edJdK Z(edLdM Z)edNdO Z*edPdQ Z+edRdS Z,edTdU Z-edVdW Z.edXdY Z/edZd[ Z0ed\d] Z1ed^d_ Z2ed`da Z3edbdc Z4eddde Z5edfdg Z6edhdi Z7edjdk Z8edldm Z9edndo Z:edpdq Z;edrds Z<edtdu Z=edvdw Z>edxdy Z?edzd{ Z@ed|d} ZAed~d ZBedd ZCedd ZDedd ZEedd ZFedeGjHdeGjHfddZIedeGjHdeGjHfddZJedeGjHdeGjHfddZKedd ZLedd ZMdS )CppOverrideszMap element-wise ops to C++c                 C      d|  d|  d| dS )N	decltype()( + r   rS   abrS   rS   rT   add     zCppOverrides.addc                 C   r`  )Nra  rb   - r   rS   rd  rS   rS   rT   r     rh  zCppOverrides.subc                 C   r`  )Nra  rb  r   r   rS   rd  rS   rS   rT   r9    rh  zCppOverrides.mulNTc                 C   s   t | tsJ |d u r| j}tj| ||}tjjtjj|}|	d| |fd|i |t
v r>|tjkr>	 tj| ||| |S )Nto_dtyper   )r   rF   r   r3   rG  get_to_dtype_exprcsegeneratecomputeupdate_on_argsr   r   rl   cache_dtype_convert)xr   r   use_compute_typesexprcsevarrS   rS   rT   rj    s   zCppOverrides.to_dtypec                 C   s2   |t v sJ | dt ddt |  d|  dS )Nz missing from z.DTYPE_TO_CPPzc10::bit_cast<>(r   )rG   r   )rq  r   r   rS   rS   rT   to_dtype_bitcast  s   zCppOverrides.to_dtype_bitcastc                 C      d|  dS )Nz	std::abs(r   rS   rq  rS   rS   rT   abs  rV   zCppOverrides.absc                 C   rw  )Nz	std::sin(r   rS   rx  rS   rS   rT   sin  rV   zCppOverrides.sinc                 C   rw  )Nz	std::cos(r   rS   rx  rS   rS   rT   cos  rV   zCppOverrides.cosc                 C      d|  d|  dS )Nra  z)(-r   rS   rx  rS   rS   rT   neg     zCppOverrides.negc                 C   rw  )Nz	std::exp(r   rS   rx  rS   rS   rT   exp  s   zCppOverrides.expc                 C   rw  )Nz
std::exp2(r   rS   rx  rS   rS   rT   exp2  rV   zCppOverrides.exp2c                 C   rw  )Nzstd::expm1(r   rS   rx  rS   rS   rT   expm1  rV   zCppOverrides.expm1c                 C   rw  )Nz	std::erf(r   rS   rx  rS   rS   rT   erf  rV   zCppOverrides.erfc                 C   rw  )Nz
std::erfc(r   rS   rx  rS   rS   rT   erfc  rV   zCppOverrides.erfcc                 C   rw  )Nzcalc_erfinv(r   rS   rx  rS   rS   rT   erfinv  rV   zCppOverrides.erfinvc                 C   rw  )Nz
std::sqrt(r   rS   rx  rS   rS   rT   sqrt  rV   zCppOverrides.sqrtc                 C   rw  )Nz1 / std::sqrt(r   rS   rx  rS   rS   rT   rsqrt  rV   zCppOverrides.rsqrtc                 C   sB   t jj}|dkr|  d|  dS |d u rd|  dS td|)Naccuracy + decltype()(1)zstd::log1p(r   8unrecognized config cpp.inject_log1p_bug_TESTING_ONLY = r   cppinject_log1p_bug_TESTING_ONLYr   rq  bugrS   rS   rT   log1p  s   zCppOverrides.log1pc                 C   rw  )Nz	std::tan(r   rS   rx  rS   rS   rT   tan  rV   zCppOverrides.tanc                 C   rw  )Nz
std::tanh(r   rS   rx  rS   rS   rT   tanh	  rV   zCppOverrides.tanhc                 C   s   t rd|  dS d|  dS )z
        On windows std::signbit only support float type.
        Ref: https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/signbit?view=msvc-170
        z std::signbit(static_cast<float>(r   zstd::signbit(r   rQ   rx  rS   rS   rT   signbit  s
   
zCppOverrides.signbitc                 C      d|  d| dS )Nz	std::pow(r   r   rS   rd  rS   rS   rT   pow  r~  zCppOverrides.powc                 C   rw  )Nz	std::log(r   rS   rx  rS   rS   rT   log  rV   zCppOverrides.logc                 C   rw  )Nzstd::nearbyint(r   rS   rx  rS   rS   rT   round!  rV   zCppOverrides.roundc                 C   rw  )Nzstd::floor(r   rS   rx  rS   rS   rT   floor%  rV   zCppOverrides.floorc                 C   sF   |  d| }|  d| }d|  d| d| d| d| d| d	S )
N /  % ((z
 < 0) != (z	 < 0) ? (z != 0 ? z - 1 : z) : r   rS   )re  rf  quotremrS   rS   rT   floordiv)  s   *zCppOverrides.floordivc                 C   rw  )Nz
std::ceil(r   rS   rx  rS   rS   rT   ceil0  rV   zCppOverrides.ceilc                 C   rw  )Nzstd::trunc(r   rS   rx  rS   rS   rT   trunc4  rV   zCppOverrides.truncc                 C      |  d| S Nr  rS   rd  rS   rS   rT   truncdiv8  s   zCppOverrides.truncdivc                 C   r  )Nz
std::fmod(r   r   rS   rd  rS   rS   rT   fmod=  r~  zCppOverrides.fmodc                 C   rw  )Nzstd::isinf(r   rS   rx  rS   rS   rT   isinfA  rV   zCppOverrides.isinfc                 C   rw  )Nzstd::isnan(r   rS   rx  rS   rS   rT   isnanE  rV   zCppOverrides.isnanc                 C   rw  )Nzstd::lgamma(r   rS   rx  rS   rS   rT   lgammaI  rV   zCppOverrides.lgammac                 C   rw  )Nz
std::acos(r   rS   rx  rS   rS   rT   acosM  rV   zCppOverrides.acosc                 C   rw  )Nzstd::acosh(r   rS   rx  rS   rS   rT   acoshQ  rV   zCppOverrides.acoshc                 C   rw  )Nz
std::cosh(r   rS   rx  rS   rS   rT   coshU  rV   zCppOverrides.coshc                 C   rw  )Nz
std::sinh(r   rS   rx  rS   rS   rT   sinhY  rV   zCppOverrides.sinhc                 C   rw  )Nz
std::asin(r   rS   rx  rS   rS   rT   asin]  rV   zCppOverrides.asinc                 C   rw  )Nzstd::asinh(r   rS   rx  rS   rS   rT   asinha  rV   zCppOverrides.asinhc                 C   r  )Nzstd::atan2(r   r   rS   rq  yrS   rS   rT   atan2e  r~  zCppOverrides.atan2c                 C   rw  )Nz
std::atan(r   rS   rx  rS   rS   rT   atani  rV   zCppOverrides.atanc                 C   rw  )Nzstd::atanh(r   rS   rx  rS   rS   rT   atanhm  rV   zCppOverrides.atanhc                 C   r  )Nzstd::copysign(r   r   rS   r  rS   rS   rT   copysignq  r~  zCppOverrides.copysignc              	   C   s   d|  dd|  df}t dd |D rtdd |D S t }tjjjtjd}tjjj| j	d}|
d| d	 |
d
| d|  d| d tjj| ||f}t||D ]\}}tjj|| q[||fS )Nfrexp()[0])[1]c                 s   "    | ]}t jj|d uV  qd S r   r3   rG  rl  try_getr  	cache_keyrS   rS   rT   r
  x       z%CppOverrides.frexp.<locals>.<genexpr>c                 s       | ]
}t jj|V  qd S r   r  r  rS   rS   rT   r
  y      r   zint32_t r   r   z = std::frexp(, &r   )r  r   r6   r3   rG  rl  newvarr   int32r   r   rn  splicezipput)rq  
cache_keysr   exponentmantissacse_varsr  cse_varrS   rS   rT   frexpu  s   zCppOverrides.frexpc                 C   r  )Nzstd::hypot(r   r   rS   r  rS   rS   rT   hypot  r~  zCppOverrides.hypotc                 C   rw  )Nzstd::log10(r   rS   rx  rS   rS   rT   log10  rV   zCppOverrides.log10c                 C   rw  )Nz
std::log2(r   rS   rx  rS   rS   rT   log2  rV   zCppOverrides.log2c                 C   r  )Nzstd::nextafter(r   r   rS   r  rS   rS   rT   	nextafter  r~  zCppOverrides.nextafterc                 C   f   t jj}|dkr
dS |dkr|  dS |dkr|  d|  dS |d u r,d|  d	|  d
S td|)Ncompile_errorcompile error!runtime_error	; throw 1r  r  r  z	std::max(, decltype()(0))7unrecognized config cpp.inject_relu_bug_TESTING_ONLY = r   r  inject_relu_bug_TESTING_ONLYr   r  rS   rS   rT   relu     
zCppOverrides.reluc                 C   r  )Nzmin_propagate_nan(r   r   rS   rd  rS   rS   rT   minimum  r~  zCppOverrides.minimumc                 C   r  )Nzmax_propagate_nan(r   r   rS   rd  rS   rS   rT   maximum  r~  zCppOverrides.maximumc                 C   s   |  d| d| S )N ?  : rS   )re  rf  crS   rS   rT   where  s   zCppOverrides.wherec                 C   r  )Nzmod(r   r   rS   rd  rS   rS   rT   r     r~  zCppOverrides.modc                 C   s   t | t| S r   )rN   rG   )valr   rS   rS   rT   constant     zCppOverrides.constantc                 C   s8   t tj| }tjjjtjj|t| d}t	||S )Nbounds)
rC   r3   rG  rename_indexingrl  rm  rn  r%   r1   rj  )rs  r   idx_strr   rS   rS   rT   
index_expr  s
   zCppOverrides.index_exprc              	   C   s   t  }tjj }|d| d tj|( |  | }|d| d W d    n1 s4w   Y  W d    n1 sCw   Y  |d tjj	| t
|d| d}|  d| d| S )	Nr    = [&]return r   ra  z())r  z() : )r6   r3   rG  rl  r  r   swap_buffersr   rn  r  rN   )maskbodyotherr   body_varr   
other_coderS   rS   rT   masked  s    
zCppOverrides.maskedc                 C   r  )N && rS   rd  rS   rS   rT   logical_and  r  zCppOverrides.logical_andc                 C   
   d|  S )N!rS   re  rS   rS   rT   logical_not     
zCppOverrides.logical_notc                 C   r  )Nr   rS   rd  rS   rS   rT   
logical_or  r  zCppOverrides.logical_orc                 C   r  )N != rS   rd  rS   rS   rT   logical_xor  r  zCppOverrides.logical_xorc                 C   r`  )Nra  rb   & r   rS   rd  rS   rS   rT   bitwise_and  rh  zCppOverrides.bitwise_andc                 C   r|  )Nra  z)(~r   rS   r  rS   rS   rT   bitwise_not  r~  zCppOverrides.bitwise_notc                 C   r`  )Nra  rb   | r   rS   rd  rS   rS   rT   
bitwise_or  rh  zCppOverrides.bitwise_orc                 C   r`  )Nra  rb  r   r   rS   rd  rS   rS   rT   bitwise_xor  rh  zCppOverrides.bitwise_xorc                 C   s   t  }|d | W t| j }|d| d| d |d| d| d| d |  |d	|  d
 W d    n1 sEw   Y  |d	|  d| d|  d| d	 W d    n1 sfw   Y  |d |S )N[&]()constexpr decltype() max_shift = sizeof(z) * CHAR_BIT;$if ((static_cast<std::make_signed_t<>>() < 0) || ( >= max_shift))return decltype(z)(0);z#)(static_cast<std::make_unsigned_t<z) << r   ()r6   r   r   rG   r   re  rf  r   scalar_trS   rS   rT   bitwise_left_shift  s&   




zCppOverrides.bitwise_left_shiftc              
   C   s   t  }|d | Z t| j }|d| d| d| d |d| d| d| d	 |  |d
|  d|  d W d    n1 sKw   Y  |d
|  d|  d| d W d    n1 siw   Y  |d |S )Nr  r  r  z ) * CHAR_BIT - std::is_signed_v<z>;r  r  r  r	  r
  rb  z >> max_shift); >> r   r  r  r  rS   rS   rT   bitwise_right_shift  s"   



 
z CppOverrides.bitwise_right_shiftseedr'  c                 C   r  )Nznormalized_rand_cpu(r   r   rS   r  r'  rS   rS   rT   rand  r~  zCppOverrides.randc                 C   r  )Nz
randn_cpu(r   r   rS   r  rS   rS   rT   randn  r~  zCppOverrides.randnc              	   C   s   d|  d| d| d| d	S )Nzrandint64_cpu(r   r   rS   )r  r'  lowhighrS   rS   rT   	randint64!  s   zCppOverrides.randint64c                 C      d|  d|  d|  dS )Nra  z)(1) / (decltype(z)(1) + std::exp(-r   rS   rx  rS   rS   rT   sigmoid%  rh  zCppOverrides.sigmoidc              
   C   s   t  }d|  d}d|  d}|d | + |d|  d| d| d |d	|  d
| d| d |d W d    n1 sFw   Y  |d |S )Nra  )(0)r  r  auto left = z > 0 ? r  r   auto right = z < 0 ? return left - right;r  r6   r   r   )rq  r   scalar_zero
scalar_onerS   rS   rT   sign)  s   


zCppOverrides.signNT)Nr   r   r   r   staticmethodrg  r   r9  rj  rv  ry  rz  r{  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r   Exprr  r  r  r  r"  rS   rS   rS   rT   r_    s    


*






























































r_  r  c                       s  e Zd ZdZ fddZedd Zedd Zedd	 Zed
d Z	edd Z
edd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zed d! Zed"d# Zed$d% Zed&d' Zed(d) Zed*d+ Zed,d- Zed.d/ Zed0d1 Zed2d3 Zed4d5 Zed6d7 Zed8d9 Z ed:d; Z!ed<d= Z"ed>d? Z#ed@dA Z$edBdC Z%edDdE Z&edFdG Z'edHdI Z(edJdK Z)edLdM Z*edNdO Z+edPdQ Z,edRdS Z-edTdU Z.edVdW Z/edXdY Z0edZd[ Z1ed\d] Z2ed^d_ Z3ed`da Z4edbdc Z5eddde Z6edfdg Z7edhdi Z8edjdk Z9edldm Z:edndo Z;edpdq Z<edrds Z=edtdu Z>edvdw Z?edxdy Z@edzd{ ZAed|d} ZBed~d ZCedd ZDedd ZEedd ZFedd ZGedd ZHedd ZIedd ZJedd ZKedd ZLedd ZMedddZNedd ZOedd ZPedd ZQedd ZReSdd ZTeSdd ZU  ZVS )CppVecOverridesz.Map element-wise ops to aten vectorization C++c                    s^   t  |   fdd}tt D ]\}}t|dd tkr,|dvr,t |||j q S )Nc                    s    fdd}|S )Nc                     s@  dd | D }dd | D }t | }|rL|rLg }| D ]1}t|ttjfrFt|tjr5|js5t|tj	}nt
|tj	}t|trD|jn|}|| q|rjt|dkrYt|}ntjkrjt|dd  |dd < |r|rttjtsvJ fdd|D }|r|i |S tt}t|j}|d usJ || i |S )Nc                 S   s0   g | ]}t |ttjfst |tr|js|qS rS   )r   rk   r   r%  rF   r   r  argrS   rS   rT   rC  N  s    zJCppVecOverrides.__new__.<locals>.wrap.<locals>.wrapper.<locals>.<listcomp>c                 S   s    g | ]}t |tr|jr|qS rS   )r   rF   r   r'  rS   rS   rT   rC  T  s    r   r4   c                    s@   g | ]}t |tr|js tjtjtjfvrtj	|n|qS rS   )
r   rF   r   r&  r  r  r  r3   rG  	broadcast)r  new_argfuncrS   rT   rC  t  s    )r  r   rk   r   r%  	is_numberr1   r  r   int64r  r2   valueappendr   rK   r&  r  r3   rG  CppVecKernelr  r)  r   )argskwargsscalarsvectorsnew_argsr(  
scalar_opsscalar_func)r  r,  r  rS   rT   wrapperM  s@   



z6CppVecOverrides.__new__.<locals>.wrap.<locals>.wrapperrS   )r,  r9  )r  r  r+  rT   wrap@  s   Bz%CppVecOverrides.__new__.<locals>.wrapr  )r  r  )	r  __new__varsr&  itemsr)  r$  setattr__func__)r  r2  kargsr:  r   methodr  r   rT   r;  =  s   QzCppVecOverrides.__new__c                 C   r  )Nrc  rS   rd  rS   rS   rT   rg    r  zCppVecOverrides.addc                 C   r  )Nri  rS   rd  rS   rS   rT   r     r  zCppVecOverrides.subc                 C   r  Nr   rS   rd  rS   rS   rT   r9    r  zCppVecOverrides.mulc                 C   r  r  rS   rd  rS   rS   rT   truediv  r  zCppVecOverrides.truedivc                 C   
   |  dS )Nz.abs()rS   rx  rS   rS   rT   ry    r  zCppVecOverrides.absc                 C   rD  )Nz.sin()rS   rx  rS   rS   rT   rz    r  zCppVecOverrides.sinc                 C   rD  )Nz.cos()rS   rx  rS   rS   rT   r{    r  zCppVecOverrides.cosc                 C   rD  )Nz.exp()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.expc                 C   rD  )Nz.exp2()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.exp2c                 C   s   d|  d}|  d| S )Nra  r  z	.exp() - rS   )rq  vec_onerS   rS   rT   r    s   zCppVecOverrides.expm1c                 C   rD  )Nz.erf()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.erfc                 C   rD  )Nz.erfc()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.erfcc                 C   rD  )Nz	.erfinv()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.erfinvc                 C   rD  )Nz.sqrt()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.sqrtc                 C   L   t tjtsJ t | tsJ | jd usJ tj| j d|  d| dS )N( == r   r   r3   rG  r1  rF   r   _get_mask_typer  rS   rS   rT   eq      zCppVecOverrides.eqc                 C   s   t tjtsJ t | tsJ | jtjkr/|jtjksJ ttjj	| |f\}}| d| S | jd us6J tj
| j d|  d| dS )Nr  rG  r   )r   r3   rG  r1  rF   r   r   ri   rM   rn  rJ  )rq  r  x_casty_castrS   rS   rT   ne  s    zCppVecOverrides.nec                 C   rF  )NrG  r   r   rI  r  rS   rS   rT   lt  rL  zCppVecOverrides.ltc                 C   rF  )NrG  z > r   rI  r  rS   rS   rT   gt  rL  zCppVecOverrides.gtc                 C   rF  )NrG   <= r   rI  r  rS   rS   rT   le  rL  zCppVecOverrides.lec                 C   rF  )NrG   >= r   rI  r  rS   rS   rT   ge  rL  zCppVecOverrides.gec                 C   r  Nr  rS   r  rS   rS   rT   and_  r  zCppVecOverrides.and_c                 C   rD  )Nz.rsqrt()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.rsqrtc                 C      |  d| dS )Nz.pow(r   rS   rd  rS   rS   rT   r       zCppVecOverrides.powc                 C   rD  )Nz.log()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.logc                 C   rD  )Nz.round()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.roundc                 C   rD  )Nz.floor()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.floorc                 C   rD  )Nz.ceil()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.ceilc                 C   rD  )Nz.trunc()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.truncc                 C   rX  )Nz.fmod(r   rS   rd  rS   rS   rT   r  #  rY  zCppVecOverrides.fmodc                 C   rD  )Nz	.lgamma()rS   rx  rS   rS   rT   r  '  r  zCppVecOverrides.lgammac                 C      t | |\} }|  d| S rV  rJ   rd  rS   rS   rT   r  +     zCppVecOverrides.logical_andc                 C   r  N~rS   r  rS   rS   rT   r  0  r  zCppVecOverrides.logical_notc                 C   rZ  Nr   r[  rd  rS   rS   rT   r  4  r\  zCppVecOverrides.logical_orc                 C   rZ  Nr   r[  rd  rS   rS   rT   r  9  r\  zCppVecOverrides.logical_xorc                 C   rZ  rV  r[  rd  rS   rS   rT   r  >  r\  zCppVecOverrides.bitwise_andc                 C   r  r]  rS   r  rS   rS   rT   r  C  r  zCppVecOverrides.bitwise_notc                 C   rZ  r_  r[  rd  rS   rS   rT   r  G  r\  zCppVecOverrides.bitwise_orc                 C   rZ  r`  r[  rd  rS   rS   rT   r  L  r\  zCppVecOverrides.bitwise_xorc                 C   r  )Nz << rS   rd  rS   rS   rT   r  Q  r  z"CppVecOverrides.bitwise_left_shiftc                 C   r  )Nr  rS   rd  rS   rS   rT   r  U  r  z#CppVecOverrides.bitwise_right_shiftc                 C   s    t tjtsJ tj| | S r   )r   r3   rG  r1  load)r   r'  rS   rS   rT   	load_seedY  s   zCppVecOverrides.load_seedc                 C   .   t tjtsJ t }d|  d}t|||S )Nz)result[offset_idx] = normalized_rand_cpu(, offset[offset_idx]);r   r3   rG  r1  r6   rE   r  r'  r   rand_functionrS   rS   rT   r  ^  s
   
zCppVecOverrides.randc                 C   rc  )Nzresult[offset_idx] = randn_cpu(rd  re  rf  rS   rS   rT   r  g  s   zCppVecOverrides.randnc                 C   s>   t tjtsJ t }d|  d| d| d}t|||tjS )Nz#result[offset_idx] = randint64_cpu(z, offset[offset_idx], r   r   )r   r3   rG  r1  r6   rE   r   r.  )r  r'  r  r  r   rg  rS   rS   rT   r  n  s   zCppVecOverrides.randint64c                 C   s0   | j |j ks
J d|  dt| | d| S )Nz;remainder vec implementation expect the same inputs' dtype.z - (z) * )r   r&  r  rd  rS   rS   rT   	remainderu  s   zCppVecOverrides.remainderc                 C   rD  )Nz.tan()rS   r  rS   rS   rT   r  |  r  zCppVecOverrides.tanc                 C   rD  )Nz.tanh()rS   r  rS   rS   rT   r    r  zCppVecOverrides.tanhc                 C   rD  )Nz.reciprocal()rS   r  rS   rS   rT   
reciprocal  r  zCppVecOverrides.reciprocalc                 C   rD  )Nz.atan()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.atanc                 C   rD  )Nz.acos()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.acosc                 C   rD  )Nz.asin()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.asinc                 C   rD  )Nz.cosh()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.coshc                 C   rD  )Nz.sinh()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.sinhc                 C   rD  )Nz.log10()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.log10c                 C   rD  )Nz.log2()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.log2c                 C   rX  )Nz.nextafter(r   rS   r  rS   rS   rT   r    rY  zCppVecOverrides.nextafterc                 C   rX  )Nz
.copysign(r   rS   rd  rS   rS   rT   r    rY  zCppVecOverrides.copysignc                 C   rX  )Nz.atan2(r   rS   rd  rS   rS   rT   r    rY  zCppVecOverrides.atan2c                 C   rX  )Nz.hypot(r   rS   rd  rS   rS   rT   r    rY  zCppVecOverrides.hypotc              
   C   s:   d|  d}d|  d}| d| d|  d| d|  d
S )	Nra  r  z)(0.5)z * ((rc  z)/(ri  z)).log()rS   )rq  rE  vec_one_halfrS   rS   rT   r    s   "zCppVecOverrides.atanhc                 C   rD  )Nz.asinh()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.asinhc                 C   rD  )Nz.acosh()rS   rx  rS   rS   rT   r    r  zCppVecOverrides.acoshc                 C   r  )Nr  r  r  r  r  r  r  zat::vec::clamp_min(r  r  r  r  r  rS   rS   rT   r    r  zCppVecOverrides.reluc                 C   r  )Nra  z)(1)/(decltype(z)(1) + z.neg().exp())rS   rx  rS   rS   rT   r    rh  zCppVecOverrides.sigmoidc                 C   rD  )Nz.neg()rS   rx  rS   rS   rT   r}    r  zCppVecOverrides.negc                 C   s   t | jr| j|jksJ dd|  d| dS tdd | |fD s%J d|  d}tj|jdk rH| d	dtjj> d  d
| d| d}|  d| }d|  d| d| d}d|  d| d| d| d	}| d| d| d| d| d| dS )NzDdiv_floor_floating_vec implementation expect the same inputs' dtype.zdiv_floor_floating_vec(r   r   c                 s   s    | ]}t |jV  qd S r   )r   r   )r  itemrS   rS   rT   r
        z+CppVecOverrides.floordiv.<locals>.<genexpr>ra  r4   ::blend<ru  (1), r  rG  r  r  z(0))r  r   z	(0)) != (z(0)))z	::blendv(ri  r  )r
   r   r  r3   rG  _get_raw_num_vectorstiling_factor)re  rf  _tr  has_remis_negrS   rS   rT   r    s   
((zCppVecOverrides.floordivc                 C   sT   t j|jdk r#d| d}| ddt jj> d  d| d| d}|  d| S )Nr4   ra  r   rm  ru  rn  r  )r3   rG  ro  r   rp  )re  rf  rq  rS   rS   rT   r    s   (zCppVecOverrides.truncdivc                 C   R   | j tjkr |j tjksJ ttjj| |f\}}| d| S d|  d| dS )Nr  at::vec::minimum(r   r   r   r   ri   rM   r3   rG  rn  re  rf  a_castb_castrS   rS   rT   r    
   zCppVecOverrides.minimumc                 C   rt  )Nr   at::vec::maximum(r   r   rv  rw  rS   rS   rT   r    rz  zCppVecOverrides.maximumc                 C   s   |  d|  S rB  rS   r  rS   rS   rT   square
  r  zCppVecOverrides.squarec                 C   s   t tjtsJ |jtjkr2|jtjksJ ttjj| ||f\}}}d| d| d| d| d	S d| d| d| dtj	| |j d	S )Nra  
)::blendv(r   r   )
r   r3   rG  r1  r   r   ri   rM   rn  _get_mask_cast)re  rf  r  blendv_ablendv_bblendv_crS   rS   rT   r    s   
*zCppVecOverrides.wherec                 C   s   t  }d|  d}d|  d}d|  d| d| d| d|  d}d|  d| d| d|  d| d}|d |  |d	| d
 |d| d
 |d W d    n1 s^w   Y  |d |S )Nra  r  r  r}  r   r   r   r  r  r   r  r  r  r  )rq  r   vec_zerorE  blendv_lblendv_rrS   rS   rT   r"    s   $$


zCppVecOverrides.signNTc              
   C   s   |t jt jt jt jt jt jt jt jt j	f	v sJ t
 d| t| ts&J | j}tj| ||}tjjtjj|}|d| |fd|i |tv rX|t jkrXtj| ||| |S )Nz does not support rj  r   )r   ri   float64rl   bfloat16float16uint8int8r  r.  r   r   rF   r   r3   rG  rk  rl  rm  rn  ro  r   rp  )rq  r   r   use_compute_dtypesrs  rt  rS   rS   rT   rj  )  s*   
zCppVecOverrides.to_dtypec                 C   s@   t jj}|dkr|  d|  dS |d u r|  dS td|)Nr  r  r  z.log1p()r  r  r  rS   rS   rT   r  ?  s   
zCppVecOverrides.log1pc                    s8  t tjtsJ t }tjj }tj| G}|d| d tj	|( |
  | }|d| d W d    n1 sCw   Y  W d    n1 sRw   Y  W d    n1 saw   Y  |d tjj| |j | d} fdd}|jr|}	n||}	t|t  }
||
}t |tsJ ||jrft }|d tj	| |
  |d	| d
 |
  |d| d W d    n1 sw   Y  |d |
 H tjjtjj|	}tjjtjj|}t |tsJ |t |tsJ | |_ |_tjj}|d|||| d W d    n	1 s1w   Y  W d    n	1 sAw   Y  W d    n	1 sQw   Y  |d tjjtjj|}n)|jr}tjjtjj|  d|	 d| }ntjjtjj|  d| d|
 }|d| |||fi  |S )Nr   r  r  r   r  c                    s8    t jkrtj  d|  dS tj  d|  dS )N::from(r   rG  )r   ri   r3   rG  rJ  _get_vec_type)r   r  rS   rT   maskify_or_vecify[  s
   
z1CppVecOverrides.masked.<locals>.maskify_or_vecify[&]if (z.all_zero())elser  r  r  )r   r3   rG  r1  r6   rl  r  r  r   r  r   rn  r  r   r   rN   rG   rF   rm  	overridesr  ro  )r  r  r  r   r   new_maskr   	body_coder  body_code_vecr  other_code_vecbody_vec_varother_vec_varr  rt  rS   r  rT   r  K  s    





 
zCppVecOverrides.maskedc                 C   s   t tjtsJ tj| }tjjtjj }tj||}|dkr't	| |S |d urQtjj
jtjjt|t| d}t||}t |trI|j}tj||}ntjd ||tjj}|d| |fi  |S )Nr   r  r  )r   r3   rG  r1  r  itervars
tiling_idx_try_get_const_strider_  r  rl  rm  rn  rC   r%   r1   rj  r2   r/  arange_load_or_store_non_contiguousro  )rs  r   r   
tiling_varstrider=  r/  rt  rS   rS   rT   r    s&   
zCppVecOverrides.index_exprc              	   C   s  d|  dd|  df}t dd |D rtdd |D S t| j }tjjr+tjjntjj}t }tjj	j
tjd}tjj	j
| jd}|jd| fi d	 |jd| fi d	 tj| j}|d
krgd| dnd| d| d}||d
kr|d| dnd| d| d || d| d |d |  |d| dtjj d ||  dt| d |dtjj d |d| dtjj d |dt| d |  |d W d    n1 sw   Y  ||d
kr| dt| dn| d | d!t| d || d"| d#t| d W d    n	1 s-w   Y  |d$ tjj| ||f}	t||	D ]\}
}tjj	|
| qG||fS )%Nr  r  r  c                 s   r  r   r  r  rS   rS   rT   r
    r  z(CppVecOverrides.frexp.<locals>.<genexpr>c                 s   r  r   r  r  rS   rS   rT   r
    r  r  r  )r3  r4   at::vec::Vectorized<r   at::vec::VectorizedN<r   zat::vec::Vectorized<int32_t> r   zat::vec::VectorizedN<int32_t, > r   r  __at_align__ std::array<	> tmpbuf;.store(tmpbuf.data(), r   z!__at_align__ std::array<int32_t, z> tmpbuf_exponent;z> tmpbuf_mantissa;r   r   z@tmpbuf_mantissa[i] = std::frexp(tmpbuf[i], &tmpbuf_exponent[i]);z? = at::vec::Vectorized<int32_t>::loadu(tmpbuf_exponent.data(), z! = at::vec::VectorizedN<int32_t, z!>::loadu(tmpbuf_exponent.data(), r   z ::loadu(tmpbuf_mantissa.data(), z();)r  r   rG   r   r3   rG  	tail_sizerp  r6   rl  r  r   r  ro  _get_num_vectorsr   r   rD   rn  r  r  r  )rq  r  r   r&  r   r  r  n_vec
mantissa_tr  r  r  rS   rS   rT   r    sl   





zCppVecOverrides.frexpc                    s    fdd}|S )Nc                     s>  |rJ t j}t|tsJ t }|d | d j}||}|jr&|jn|j	}g }t
| } jdv }	|	r8dn|}
 jdkrEt
| d  n|
}
|  t| D ]D\}}t|tr|js^J |j|kseJ |d| d|j	 d	| d
 || d| dt| d |d| d qP|| qP|d|
 d|j	 d  | }|dt| d |  |d| d
 W d    n1 sw   Y  |	r|jrJ d}d| d| d}ndt| }|dkrd|
 d}n	d|
 d| d}|d| d| d W d    n	1 sw   Y  |d |S ) Nr  r   )r  r  r  ri   rv  r  r   z> tmpbufr   z.store(tmpbufz	.data(), r   tmpbufz[i]z> tmpbuf_out;r   r   ztmpbuf_out[i] = ztmpbuf_out.data()at::vec::VecMask<,z>::fromztmpbuf_out.data(), r4   r  z>::loaduz at::vec::VectorizedN<r  rG  r  )r3   rG  r   r1  r6   r   r   r  r  rp  rG   r   r   r   rF   r   rD   r0  )r2  r3  rG  r   	vec_dtyper  r&  scalar_argsr   output_maskoctypeargidxr(  res	load_argsload_fnr8  rS   rT   rF    sb   










 z)CppVecOverrides._scalarize.<locals>.innerrS   )r  r8  rF  rS   r  rT   
_scalarize  s   9zCppVecOverrides._scalarizec                 C   sV   t t}t t D ]\}}t|tr(||vr(| |j}||_t	| |t| q
d S r   )
r<  r&  r_  r=  r   r$  r  r?  r   r>  )r  vec_varsr   rA  r,  rS   rS   rT   _initialize_scalarize%  s   z%CppVecOverrides._initialize_scalarizer#  )Wr   r   r   r   r;  r$  rg  r   r9  rC  ry  rz  r{  r  r  r  r  r  r  r  rK  rO  rP  rQ  rS  rU  rW  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rb  r  r  r  rh  r  r  ri  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r}  r  r  r  r  r|  r  r"  rj  r  r  r  r  rK  r  r  rL  rS   rS   r  rT   r&  :  sB   ]









































































L

8
<r&  cppvecc                   @   s   e Zd Zedd ZdS )CppTile2DOverridesc                 C   s(   t tjtsJ tj| } t| |S r   )r   r3   rG  CppTile2DKerneltransform_indexingr&  r  )rs  r   rS   rS   rT   r  4  s   zCppTile2DOverrides.index_exprN)r   r   r   r$  r  rS   rS   rS   rT   r  3  s    r  c                       s  e Zd ZeZeZdZdZ fddZ	e
efddZdd ZdNd
ee fddZejdd Z	dOdejfddZdejdefddZdejdejfddZdejdejfddZdd Zdejd ejd!ed"efd#d$Zd%edejfd&d'ZdNd(d)Z d*e!e"ef d+ed,ed-e#j$fd.d/Z%dNd ee& fd0d1Z'd2d3 Z(d4d5 Z)d6d7 Z*d8d9 Z+d:d; Z,d<d= Z-e.defd>d?Z/d@dA Z0ejdBdC Z1dDdE Z2dFdG Z3dHdI Z4				dPd
edJee dKeej fdLdMZ5  Z6S )Q	CppKernelr   r   c                    s   t  | i | _g | _d | _g | _g | _d | _t | _	g | _
t | _t | _t | _t | _t | _d| _t | _t| j| jdd| _t| j| jdd| _t | _t | _|| _i | _g | _d S )NFtmp_acc)name_prefixwrecps)r  r  active_rangesinner_itervarscall_rangesr:  r  reduction_depthr<   reduction_prefixreduction_prefix_generatorsreduction_suffixparallel_reduction_prefixparallel_reduction_suffixlocal_reduction_initlocal_reduction_storesis_reductionnon_parallel_reduction_prefixr7   newvar_prefixsuffixreduction_cseweight_recps_csepreloads
poststoresnum_threadsreduction_omp_decreduction_var_names)r  r2  r  r  rS   rT   r  A  s2   

zCppKernel.__init__c           
      C   s   t jjr| js| jd | d}t jjrdnt }| d}	| j| d| d||| d | jt|||||| | j	|	 d| d | j
d| d	d
d| d||||	|d ddg d S )Nz(int max_threads = omp_get_max_threads();_localmax_threadsz	_arr[tid]r   r   r   zfor (int tid = 0; tid < z; tid++)r   r   r   r}   )r   r  dynamic_threadsr  r   r*   r  r  r   r  r  r   )
r  r   r   r   r   reduction_combine_fnreduction_init_fn	acc_localr  acc_local_in_arrayrS   rS   rT   _gen_parallel_reduction_buffersf  s:   	



z)CppKernel._gen_parallel_reduction_buffersc                 C   s$   | j D ]}t| j|| d qd S )Nr  )r  r   stores)r  var_namerS   rS   rT   %update_stores_with_parallel_reduction  s   
z/CppKernel.update_stores_with_parallel_reductionNr   c                 C   s   |d u sJ t  }t 1}t| dr%|| j | | ||  || j	 || j
 || j W d    n1 sAw   Y  t| drQ|| j | jrl| jD ]}| j| \}}t||| d||}qW|S )Ncodegen_inner_loops_tail)r6   r   r   r   r  r  r  r   r   loadsrn  r  r  r  r  r   )r  r   r   r=  startendrS   rS   rT   gen_body  s$   




zCppKernel.gen_bodyc                 c   s`    | j }|rt||}t|tr|j}t|tsJ tj|_	|| _ z	|V  W || _ dS || _ w )z>Context manager to add an additional mask to loads and stores.N)

_load_maskr1   rW  r   r2   r/  rF   r   ri   r   )r  r  priorrS   rS   rT   r    s   
zCppKernel.maskedr4   r   r   c                 C   s(   | j | }||| | i}t||}|S r   )r  r/   )r  r   scaleitervar_idxr'  r   r   r   rS   rS   rT   scale_index_with_offset  s   

z!CppKernel.scale_index_with_offsetr   c                 C   s   t | |S )z
        Convert an index expr to a string that can be used in cpp code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the cpp kernel.
        )rC   r  r  r   rS   rS   rT   index_to_str  s   zCppKernel.index_to_stritervarc                    s   t  fdd|jD S )z]
        Check if an index has free symbol CppCSEVariable that depends on `itervar`.
        c                 3   sF    | ]}|j jjv rtjj|j  trjj|j   V  qd S r   )r   rl  varname_mapr   rF   
depends_onr  sr  r  rS   rT   r
    s    z6CppKernel.index_indirect_depends_on.<locals>.<genexpr>)rd   free_symbolsr  r   r  rS   r  rT   index_indirect_depends_on  s   z#CppKernel.index_indirect_depends_onc                 C   s   ||j v p
| ||S r   )r  r  r  rS   rS   rT   index_depends_on  s   zCppKernel.index_depends_onc                 C   s   t t| j| jS r   )dictr  r  r:  r   rS   rS   rT   
var_ranges     zCppKernel.var_rangesrs  r&  lowerupperc                 C   s   |s|sd S t |tj}|rt|tjj}tj	j
}n tj	j
}z| jtj	_
t|tjj}W |tj	_
n|tj	_
w | j}|rFtj	| |nd }	| ||rOdnd |	| j}
| jj||
dd d S )N0F)
assignment)r   r   TMPr1   r  r   r.  r/  r3   rG  rn  r  sexprr  indirect_assertr  rl  rm  )r  rs  r&  r  r  indirectrt  r   prior_computesize_strr   rS   rS   rT   check_bounds  s"   

zCppKernel.check_boundsr   c                 C   sR   | j |}| |}| dt| d}| j| j|}|d| ||fi  |S )N[]ra  )r2  inputr  rD   rl  rm  r  ro  )r  r   r   r   r   rt  rS   rS   rT   ra    s   
zCppKernel.loadc                 C   s   d|v sJ | j |}| |}|d u r#| dt| d| d}nB|dkr^tjjs>| jdkr>| dt| d| d}n'tj	
|}dt|  d	| d
}d| dt| d| d}ntd| | jt|| d S )Nbufr	  ] = r   
atomic_addr4   z] += zstatic_cast<ru  r   zatomic_add(&z], r   store mode=)r2  outputr  rD   r   r  r  r  r3   graph	get_dtyperG   NotImplementedErrorr  r   r:   )r  r   r   r/  moder   r   r   rS   rS   rT   store  s   
zCppKernel.storer   r   rtyper   c                    s$   ddt t f fdd}|S )Nr&  c                    s6   | d u r d  d dS t  | S )Nr   r   r   )r   )r&  r   r   r   r   r  rS   rT   rF  $  s   z.CppKernel._gen_reduction_prefix.<locals>.innerr   )r   rk   )r  r   r   r  r   r   rF  rS   r  rT   _gen_reduction_prefix  s    zCppKernel._gen_reduction_prefixc                 C   s    | j D ]
}| j|| qd S r   )r  r  r  )r  r&  gen_fnrS   rS   rT   finalize_reduction_prefix3  s   
z#CppKernel.finalize_reduction_prefixc              	   C   s"  |dv }|||f}|| j jv r| j j| S | j j| jd| dd}| j|  d| _|r0|n|}t||}	| j| 	||	||t
 | jd usKJ | j| j }
t| jd t| jD ]}|
| j|  | j|  }
q\| j| dt||||
 d | ||	|| t||}|| j j|< |S )	Nr_   r^   
reduction FwriteTr4   r   r   )r  reduction_cacherm  r  r  r0  r  r   r  r  r   r  r  r4  r   r:  r  r   r   r  r   )r  r   r   r   r/  argmax_or_argminreduction_keyr   
init_dtyper   r   r   r   rS   rS   rT   	reduction7  s6   



zCppKernel.reductionc              
   C   sB   |  |}| j|}| jt|| dt| d| d d S )Nr	  r  r   )r  r2  r  r  r   r:   rD   )r  r   r   r/  r   rS   rS   rT   store_reductionV  s
   
zCppKernel.store_reductionc                    s    j r) j t|t| ksJ  j  dt| dt|  jt|ks(J n&t|t|  _  fdd j D  _dd tt jD  _t| _ jd  j  j jd  fS )NrH  rc  c                       g | ]}  |qS rS   r  )r  rq  r   rS   rT   rC  e      z(CppKernel.set_ranges.<locals>.<listcomp>c                 S      g | ]}t tj|qS rS   r-   r   XBLOCKr  nrS   rS   rT   rC  f      
)r  r   r  r   r:  r4  r  )r  lengthsreduction_lengthsrS   r   rT   
set_ranges]  s   
zCppKernel.set_rangesc                 C   s&   | j d usJ tjjjt| j ddS )N    fallback)r  r3   r  sizevars	size_hintr.   r   rS   rS   rT   r5  p  s   
zCppKernel.size_hintc                    s6  t tsJ t 	jd usJ t |jtr!|j| 	n| 	|jd uo4|jj	 j
t }jrOrD
  n
	 | n	dkr^
 r^|   dtf fddddd	ddtd	tf 	
fd
dddtd	tf fdd		ddtd	tdtffdd|   t |jtrt tjtrtjjrtjj}| D ]C}tfdd| jD }t| j  }d| dt!| d}	|" }
 #d| d|
 d|	 d  #| d|
 d|
 d q| W d    d S 1 sw   Y  d S ) Nr4   
_loop_nestc                    s    fdd}   }t|tr|jD ]   qd S t|ts#J  jd ur/| r/|  t }|	
  | W d    d S 1 sKw   Y  d S )Nc                     s"    j sJ  j j } | jo| jS r   )r.  r   r  parallel)root)r6  	par_depthrS   rT   is_parallel_reduction  s   
zOCppKernel.codegen_loops_impl.<locals>.gen_kernel.<locals>.is_parallel_reduction)
get_kernelr   rE  rF  CppKernelProxyr.  r  r   r   r   r   r  )r6  r:  rG  r   )r   gen_loop_nestr9  )r6  rT   
gen_kernel  s   



"z0CppKernel.codegen_loops_impl.<locals>.gen_kernelFc                 S   sB   |r| j }|r| j| }|S | j}|r|| j }|S || j }|S r   )r  r  r  r  r  )rG  r7  	is_suffixr  prefixrS   rS   rT   get_reduction_prefix_suffix  s   


zACppKernel.codegen_loops_impl.<locals>.get_reduction_prefix_suffixr   depthc                    s  |   }| js	J | j| }t s}|jr.|s.||jdd}|r)|    | rF|jrF |j	rF|j
s@J  |j	 | | r]|jr]|j
rY |j
   |jru|s} ||jdd W d    d S W d    d S W d    d S 1 sw   Y  d S )NF)r?  T)r;  r.  r   r   r  r7  r   r   r  r  r  close)r6  rB  in_reductionrG  loopstack_outerr  )r   gen_loop_atrA  is_reduction_loopthreadsworksharingrS   rT   gen_loop_with_reduction  s@   










"z=CppKernel.codegen_loops_impl.<locals>.gen_loop_with_reductionc                    s   t  9}| js
J | j| }| }|d u r 	 W d    d S  | |   | |d |j W d    d S 1 s@w   Y  d S r   )r   r   r.  linesr   r   r   r  )r6  rB  r   rE  
loop_lines)r   r=  rS   rT   rG    s   



"z1CppKernel.codegen_loops_impl.<locals>.gen_loop_atrD  c                    s4   | j d u s|t| j kr |  d S | || d S r   )r.  r   )r6  rB  rD  )r>  rK  rS   rT   r=    s   z3CppKernel.codegen_loops_impl.<locals>.gen_loop_nestc                    r%  rS   r&  )r  size_valr   rS   rT   rC    s    z0CppKernel.codegen_loops_impl.<locals>.<listcomp>zstd::make_unique<z []>(r   zstd::unique_ptr<z	 []> buf_r   r   z* z = buf_z.get();)FF)r   F)r   )$r   r<  r*   r  rG  rE  decide_parallel_depthmax_parallel_depthr.  r   r  r   r   r   rC  r7  mark_parallelsingler   r   r3  rk   ri   r3   local_buffer_contextrI   local_buffersvaluesr.   
get_layoutr&  rG   r   rC   get_namer  )r  r5  r   rJ  r   rT  local_bufferlocal_buf_sizelocal_buf_dtypeallocatelocal_buffer_namerS   )r   r>  rG  r=  rK  rA  rH  r9  r  rI  rJ  rT   codegen_loops_implv  s   




!




$zCppKernel.codegen_loops_implc                 C   s   t | }| ||| d S r   )r3  buildr]  )r  r   rJ  r5  rS   rS   rT   codegen_loops	  s   
zCppKernel.codegen_loopsc                 C   s   t jjrdS dS )NAOTI_TORCH_CHECKTORCH_CHECK)r3   r  aot_moder   rS   rS   rT   assert_function		  s   zCppKernel.assert_functionc           	      C   s   | j d usJ | j |j|j|j  }|  }d}d}|D ]-}tjjj|dd}|d| ks2||kr4 n|| tjj	k r> n|d7 }||9 }|| }qtjj
r[|dkr[t|dkr[d}t||jdS )Nr4   r   r1  r2  r   r   r   )r  r   r   r5  r3   r  r4  r   r  min_chunk_sizer  r   r   )	r  rP  rI  r:  seqparrB  rs  hintrS   rS   rT   rO  	  s.   

zCppKernel.decide_parallel_depthc                 c   s    | j | j| j| jf}t | _ t | _t | _| j | _d V  | j| j  | j| j | j| j |\| _ | _| _| _d S r   )r  rn  r  rl  r<   cloner  r  )r  r  rS   rS   rT   write_to_suffix-	  s   zCppKernel.write_to_suffixc                 O   s   t |i |S r   )rF   )r  r2  r3  rS   rS   rT   create_cse_var:	     zCppKernel.create_cse_varc                 C   s   dt |  d| dS )Nzc10::convert<ru  r   )rG   )r  srcr   r   rS   rS   rT   rk  =	     zCppKernel.get_to_dtype_exprc                 C   s    |  |||}| j|| d S r   )rk  rl  r  )r  dst	dst_dtyperm  r   rs  rS   rS   rT   rp  @	  s   zCppKernel.cache_dtype_convertr@  r   c           
         s   |d u rd}j sdS g   fdd}|d ur/|j v sJ j | \}}||||s.dS nj  D ]\}}|\}}||||sE dS q4d }	|	r[|d| d|	 d	 dS dS )
NrP   Tc                    s   | |krdS d }t jD ]\}}||kr|} nqttkr/|r/| dkr/|j| kr/d} | dt|    | dt|  dS )NFr   r4   rT  r   T)r   r  r  r  r:  r0  rD   )r  r  r   var_idr   _var
conditionsr  rS   rT   genP	  s"   z)CppKernel.codegen_conditions.<locals>.genFr  zif(rG  r   )r  r=  joinr   )
r  r   r@  r   ru  r  r  rr  _rangejoined_conditionsrS   rs  rT   codegen_conditionsD	  s,   
zCppKernel.codegen_conditionsr   )r4   r  r   NN)7r   r   r   r_  r  rC   r  r  r  r  r   r   r  r  r   r6   r  r   contextmanagerr  r   r%  r  rm   r  r   r  r  r  ri   r  ra  r  r   r8   r   r   r  rk   r  r#  r$  r0  r5  r]  r_  propertyrc  rO  rj  rk  rk  rp  ry  rL  rS   rS   r  rT   r  ;  s    +
)


 


 
r  c                       s  e Zd ZeZ	d? fdd	ZdejdejfddZ	de
jd	efd
dZde
jd	efddZde
jd	efddZe
jfde
jd	efddZdede
jd	efddZ	d?dedejde
jdee fddZ			d@dee dejde
jdee deeeef  ded	ee fddZdedejf fd d!Z	dAd"eeef dedejde
jdef
d#d$Zd?d%d&Zd'd( Zd)d* Zd+ed	efd,d-Z ded.ejd	efd/d0Z!d1d2 Z"d3d4 Z#d?d5d6Z$ddde
j%fdeej d7ee d8ee
j fd9d:Z&d? fd;d<	Z' fd=d>Z(  Z)S )Br1  Nc                    s\   t  || t | _| jsJ |dksJ d|| _|| _|| _|r)|| _d S || _d S )Nr   z0Expect pass in Non-Zero tiling_factor explicitly)	r  r  r   pick_vec_isavec_isarp  r  r  	num_elems)r  r2  r  rp  r  r  r  rS   rT   r  x	  s   

zCppVecKernel.__init__r   r  c                    s`     ||rd S  fdd|jD D ]}t|tsJ |jr! d S qt|| j}|jr.|S d S )Nc                 3   *    | ]}t |tjr jj|j V  qd S r   r   r   r  rl  r  r   r  r   rS   rT   r
  	      

z5CppVecKernel._try_get_const_stride.<locals>.<genexpr>)r  r  r   rF   r   r   rp  r-  )r  r   r  indirect_varr  rS   r   rT   r  	  s   

z"CppVecKernel._try_get_const_strider   r   c                 C   s0   t | j|j d | j  }|dksJ |S )N   r4   )mathr  rp  itemsizer~  	bit_widthr  r   num_vectorsrS   rS   rT   r  	  s
   zCppVecKernel._get_num_vectorsc                 C   s   | j |j d | j  S )Nr  )rp  r  r~  r  )r  r   rS   rS   rT   ro  	  s   z!CppVecKernel._get_raw_num_vectorsc                 C   s8   |  |}|dkrdt|  dS dt|  d| dS )Nr4   r  r   r  r  )r  rG   r  rS   rS   rT   r  	  s   
zCppVecKernel._get_vec_typec                 C   s.   |t jkrdS | |}dt|  d| dS )NrP   r  r  r   )r   ri   r  rG   r  rS   rS   rT   rJ  	  s   

zCppVecKernel._get_mask_typer  c                 C   s<   |j tjksJ t|| |}| dt|  d| dS )Nz.template cast<r  r   )r   r   ri   reprr  rG   )r  r  r   r  rS   rS   rT   r~  	  s   
zCppVecKernel._get_mask_castr   	load_maskc           
      C   s   t | }| |}d}|r%|js| tj d| d}n| |tj }|dkr2| dt| n|}|tjkrE|   d| d}	|	S |rU| d| d| d| dn| 	| d	| d
t| j
 d}	|	S )a  
        Get a load line str that loads a vector from `var` at `index` of type `dtype`.
        If `load_mask` is not None, we do a masked load accordingly.
        Notes on the `dtype`:
        1. We always load `self.tiling_factor` number of elements regardless of the `dtype`.
           It means we load half of the vector lanes for 16-bit data types and quarter of the
           vector lanes for 8-bit data types.
        2. `torch.bool` and `torch.uint8` could mean masks and we load them as float mask vectors.
        Nr  r   r   rc  z.template loadu<r  ru  ::loadu(r   )rG   r  r   rJ  r   rl   r~  rD   ri   r  r  )
r  r   r   r   r  cpp_typer  load_mask_strloadbufr   rS   rS   rT   _get_vec_load_line	  s    

 zCppVecKernel._get_vec_load_lineFr   store_value
accu_storec                    s  |r
|dus
J d|r|sJ  du rj  dtjdtffdddtjdtffddd	tdtf fd
d}t }|d | c |}	|}
dt|  d|
 d}|| |rr|| dt	|	 d t
jj  d}i }fdd|jD D ]}t|tsJ |jr||}| d| d||< qj|j|d}d}jdur|rJ dtjtsJ jjjrӈj d| d}nj d}t r|dj  n	|dj  |d| d| d t	j d! | d"  | } t h}t	|}|D ]}td#|  d# || |}q|dur8| d| dn| }|rN|d$| d ||  |rg|rVd%nd&}|| d'| d(| d) n|d*| d+| d, W d   n	1 s~w   Y  W d   n	1 sw   Y  |sd-d.|}|d/| d, W d   n	1 sw   Y  |d0 |r|d,  | dS j |}t|tsJ d1|_|S )2a  
        Load or store a vector in a non-contiguous way. The vector is initialized from an array that is
        filled in an inner loop over the tiling factor.
        :param var: buffer to load from or store to, i.e. `var[transformed(index)]`. If None, we load the index
                    as index expression, i.e. `transformed(index)`.
        :param index: index into the `var` or the index expression by its own if `var` is None.
                      The `index` could contain indirect indexing or the tiling itervar. When used in
                      the inner loop, the index is transformed as follows:
                      1. the index is linearized along the tiling dim.
                      2. the indirect indexing vector variables are transformed into arrays over the tiling dim.
        :param dtype: data type of `var` or `index` if `var` is None.
        :param buffer: the code buffer to write the generated code to. If None, we write to `self.loads`.
        :param store_value: the value to store. If None, we load the vector.
        :param accu_store: whether accumulate the store_value to store_ptr. If True, a store_value should be provided
        :return: a CppCSEVariable that represents the loaded vector or None if it is a store.
        Nzstore var must be providedr   r   c                        | j dk r jd| j   S  jS N   )r  r  r  r   rS   rT   get_result_size	     
zCCppVecKernel._load_or_store_non_contiguous.<locals>.get_result_sizec                    r  r  )r  rp  r  r   rS   rT   get_tiling_size	  r  zCCppVecKernel._load_or_store_non_contiguous.<locals>.get_tiling_sizevec_varc                    s   | j sJ t }|d | C | j}|d usJ |tjkr#tj}|}|}|dt|  d| d |  dt	| d}|| |d W d    n1 sWw   Y  |d j
 |}t|tsoJ |S )	Nr  r  r   r  r  r   zreturn tmpbuf;r  )r   r6   r   r   r   r   ri   rl   rG   rD   rl  rm  r   rF   )r  r   r  result_sizetiling_sizer   rt  r   r  r  r  rS   rT   vec_to_array
  s*   





z@CppVecKernel._load_or_store_non_contiguous.<locals>.vec_to_arrayr  r  r   r  r  r   r1  c                 3   r  r   r  r  r   rS   rT   r
  ,
  r  z=CppVecKernel._load_or_store_non_contiguous.<locals>.<genexpr>r	  r
  r  r'  zunexpected store with load maskz.is_masked(r   z != 0z#pragma GCC unroll z#pragma unroll 
for (long  = 0; r   r   r   r   r  +==r   z tmpbuf[r   ztmpbuf[r  r   ztmpbuf.data()r   r  r  T)r  r   r   rk   rF   r6   r   r   rG   rD   r,   r  r  r  r   r   r  r  r   is_gccrp  r  r   r   r   r   r   r  r  rl  rm  )r  r   r   r   r   r  r  r  r   r  r  result_declareitervar_innerreplacementsr  	array_varr  r   index_crhsr   	load_linert  rS   r  rT   r  	  s   





  
@

z*CppVecKernel._load_or_store_non_contiguousr   c           	         s   | j |}| |}tj|}| j| j }| ||}|dkr(t	 
||S |dkr>| |||| j}| j| j|}n| |||}t|tsLJ |d| ||fi  d|_|S )Nr   r4   ra  T)r2  r  r  r3   r  r  r  r  r  r  ra  r  r  rl  rm  r  r  r   rF   ro  r   )	r  r   r   r   r   r  r  r   rt  r  rS   rT   ra  h
  s   
zCppVecKernel.loadr/  c                 C   s*  t |tst |tr|jsJ || j| j }| dt| }| ||}t }	|dkr|r^|t	j
krD| jdu rD| | d| dn| | d| dt| j d}
d| d|
 d}|t	j
kru| jdu ru|	| d| d	 |	S |	| d| dt| j d	 |	S | j||||	||d
 |	S )a2  
        Get a store line buffer that stores `value` into `var` at `index` of `dtype`. It handles
        both contiguous and non-contiguous store cases.
        :param value: Vectorized type templaterized on `dtype`.
        :param var: buffer to store into.
        :index: index into the `var`.
        rc  r4   Nr  r   r   rG  .store(r   )r   r  r  )r   rm   rF   r   r  r  rD   r  r<   r   rl   r  r  r  r   r  )r  r/  r   r   r   r  r  var_exprr  r   ra  rS   rS   rT   _get_store_line|
  s:   
 	zCppVecKernel._get_store_linec                    sd  d v sJ t |tsJ ||js| |}| j }| |}tj	 }|d u rC| 
||||}| j| fdd d S |dkrtjjsj| jdkrj| j
| |||dd}| j| fdd d S | |}| tj}	t| }
t|tjj}t |tr|jsJ d	|
 d
|	 d
| d| d
| d
| d}| jt | d S td| )Nr  c                    
   t  | S r   r:   rx  r   rS   rT   <lambda>
     
 z$CppVecKernel.store.<locals>.<lambda>r  r4   T)r  c                    r  r   r  rx  r  rS   rT   r  
  r  zatomic_add_vec<r   ru  r   r  )r   rF   r   r)  r2  r  r  r3   r  r  r  r  r  mapr   r  r  r  r  r   r.  rG   r1   r  r/  r   r:   r  )r  r   r   r/  r  r   r   r   n_srcn_idxr   r   rS   r  rT   r  
  s8   


*zCppVecKernel.storec               
   C   s  |t v sJ |dv }| j| jk}|r|n|}t|tsJ ||js'| |}|||f}|| jjv r8| jj| S d}	|	 dt	|  d}
t
||}| ||}| jj| jd| dd}t|tscJ | d}d	| }|  j| ||g7  _d
| _| j| ||||t | j| ||||| j tdd | j| jd  }|dkr=| jd usJ | j| ||||| j tdd | j| jd  }| j| jkr| jnd}t||| _| j| jjvr| jj| jd| j dd| _| j| jj| j< | j|  | t!j"j#rdnt$ }| j%|  || n| jj| j | _| j&r(|n|}| j'| d| (|||d
 d nD| jd usEJ | j)| j }t*| jd t+| j)D ]}|| j|  | j)|  }qV||||d}| j'| d| j(||fi | d | j,||||| j(| jd | j,||||t-td |dkr| j,||||| j(| jd |t.j/k}|r]t0|r| 1|dv sJ dd| d}d| d}| j2| dt-||| d ng|r| d| d}n[|r	|dv rd| d}nL|dksJ | d}n?d| (|d d! d" }|t.j/k}|rt.j3n|}d#t	|  d}
d$t	|  d%| 1| d}| d&|
 d'|
 d(| d%| d
}| j2| dt-||||d) d |}n|}t0|ryd	| }| j2| dt-||| d t4||}|| jj|< |S )*Nr  zat::vecz::Vectorized<r   r  Fr  _vecmasked_Tc                 S      | | S r   rS   r  rS   rS   rT   r  
      z(CppVecKernel.reduction.<locals>.<lambda>re   c                 S   r  r   rS   r  rS   rS   rT   r    r  r4   r  r   r   )r   r   horizontal_reductionr   )r  r  )r4   r   z4Welford reduction does not support VectorizedN (N>2)zwelford_vec_reduce_all(r   z_vec_reduce_all()rd   ra   r]   r  z.all_zero()r\   z.all_masked()z	{ return rq  r  z; }r  zat::vec::vec_reduce_all<r   z([](z& x, z& y) r  )5VECTORIZABLE_RTYPESr  r  r   rF   r   r)  r  r  rG   r   reduction_acc_type_vecrm  r  r  r  r  r0  r  r   reduction_init_vecr6  r7  r:  rp  r   weight_recp_vec_ranger  rn  weight_recps_valr  r   welford_weight_reciprocal_vecr   r  r  r*   r  r  r  reduction_combine_vecr  r4  r   r  r   r   ri   r)   r  r  rl   r   ) r  r   r   r   r/  r   r  r"  r!  vec_nsvecr   acc_type_vecr   acc_vecmasked_acc_vecreduction_sizereduction_factorr  acc_vec_r   r   r3  r   r   masked_next_valuereduce_all_bodyr  vec_reduce_all_functmpvarmasked_tmpvarr   rS   rS   rT   r#  
  s<  





	
	


	


"


zCppVecKernel.reductionc                    s  |  |}| j }tj }|jr|tjkr|ntj	ntj
}tj|}tj|}t }	| j| jkrL|	| dt| dt|  d| d nf||krt|  d| }
|tjkrk| d| tj d}n.||  krudkrn nd	t|  d| d
}nd	t|  d| dt|  d| d| d
}|	d|
 d| d |
}|	| |||| | j|	 fdd d S )Nr	  z] = static_cast<ru  r   r   z.template cast<bool,r   r4   at::vec::convert<r   r  r   r   r   c                    r  r   r  rx  r  rS   rT   r    r  z.CppVecKernel.store_reduction.<locals>.<lambda>)r  r2  r  r3   r  r  is_floating_pointr   rh   rl   r.  rG  r  r<   r  r  r   rD   rG   ri   r  r  r  r  )r  r   r   r/  r   	out_dtyper   out_num_vectorssrc_num_vectorsr   converted_valueconvertrS   r  rT   r$    sF   
"
zCppVecKernel.store_reduction
scalar_varc                 C   s   |j rJ |jtjkr| j| j|   d|j d}n|jd us$J | j| j| 	|j d|j d}t
|ts>J |j|_|j|_d|_ |S )Nr  r   rG  T)r   r   r   ri   rl  rm  rn  rJ  r   r  r   rF   dependent_itervars)r  r  r  rS   rS   rT   r)    s   
zCppVecKernel.broadcastr  c              	   C   sb   |j rJ |jd usJ | j| j| |j d| d| d}t|ts(J |j|_d|_ |S )Nz	::arange(r   r   T)r   r   rl  rm  rn  r  r   rF   )r  r   r  rt  rS   rS   rT   r    s   
zCppVecKernel.arangec           
      C   s   t | }| |}t|rd| dS |dv rNt| }| ||}|dkr6t|r/d| dnd| d}nt|r@d| dnd| d	}| d
| dS |dkrY|   dS t||}| d
| d}	|tj	kr{|dv sqJ |   d| dS |	S )Nr~   r   rv   r^   ry   rx   r{   rw   rz   rG  r   rd   z	::from(0))r\   r]   ra   r  )
r;   r  r)   rG   r  r
   rJ  r   r   ri   )
r  r   r   r   vec_typer   r   r  scalar_initvec_initrS   rS   rT   r    s2   




zCppVecKernel.reduction_init_vecc                 C   s   t | }| |}t|rd| dS |dv rD| |}| tj}|tjkr6dttj  d| d| dS dt|  d| d| dS |tjkrT|dv sOJ | 	  S |S )Nr~   r   rv   zIndexValueVec<r   )r\   r]   rd   ra   )
r;   r  r)   r  r   r.  ri   rG   rl   rJ  )r  r   r   r   r  r  r  rS   rS   rT   r    s   




z#CppVecKernel.reduction_acc_type_vecc                 C   s>   |rt | j|n| j}t|}d| | d| j d| dS )Nzstatic WeightRecp<r  rG  r   )r   r  rD   r  r  )r  r   r  vec_num_range_threadvec_num_range_thread_exprrS   rS   rT   r    s   z*CppVecKernel.welford_weight_reciprocal_vecr  r   c                 C   s  |t jk}|dkr-| jrd| d| dt| j dS |r$| d| S d| d| dS |dkrU| jrCd| d| dt| j dS |rL| d	| S d
| d| dS |dkr{| jrkd| d| dt| j dS |rodnd}	| d|	 d| S |dkr| jrd| d| dt| j dS | d| S |dkr| jrd| d| dt| j dS | d| S |dkr|r| jrd| d| dt| j d| j d	S d| d| d| j dS | jrd| d| dt| j dS d| d| dS |dkr:t|tr
|\}
}}nt||\}
}}| jr+d| d|
 d| d| dt| j dS d| d|
 d| d| d	S |dv r|d usFJ t| }|t jkrUtt j	 }| 
|}| 
t j}d}d}|d ur~|d uspJ dt|  }d| }| jr| d| d| d| | d | d| | dt| j dS | d| d| d| | d | d| | dS |d!krt|tr|jt jksJ ttjj|f\}| d| S t)"Nr]   zmax_masked_reduce(r   r   r   r{  r\   zmin_masked_reduce(r  ru  ra   zsum_masked_reduce(r   rX   r   rb   zprod_masked_reduce(r   rc   zxor_sum_masked_reduce(r   re   r   r  rf   r   z}, r   rv   rP   z_combine_vec<ru  rd   )r   ri   r  rD   r  r   r   r   rG   rl   r  r.  rm   r  rF   r   rM   r3   rG  rn  r  )r  r   r   r   use_weight_recpsr   r  r   r   r   r   r   r   r   r  r  t_extra	arg_extrarS   rS   rT   r  
  s   

&
*




0
z"CppVecKernel.reduction_combine_vecc           	   	      s  t |tsJ |jd usJ |js(t |tr|jrd| d}t ||||S |}|}|r:| |j d| d}|rH| |j d| d}|rf|rfd| d| d| d| d	}| d| d| }n#|rw| d| }| d| }n|s{J | d| }| d| }| |j d| d}|r|js| |j d| d}d| d| d}| jr| |j d| |j d	| d
t	| j d}d| d}| j
 d| d| dS )NrG  z).all_masked()r   rR  z) & (r   z) | ~(z::set(z::from(1), (r   z, "index out of bounds: z"))r   rF   r   r   r  r  r  rJ  r  rD   rc  )	r  r   r  r  r  lower_scalarupper_scalarcond
cond_printr  rS   rT   r  i  sF   zCppVecKernel.indirect_assertc           	         s  t |tsJ |jst |||S t| }| |}t| }| |}d| d}|tjkrG|tjkrG| 	| d| d| d| d}|S |tjkr^|tjkr^| d| d| d}|S ||kr||  krldkryn nd	| d| d}|S d	| d| d| d| d| d}|S )
NrG  r   z::from<r  ru  z.to<r   r4   r  )
r   rF   r   r  rk  rG   r  r   ri   rJ  )	r  rm  r   r   src_cpp_typer  dst_cpp_typedst_num_vectorsrs  r  rS   rT   rk    s(   

"$zCppVecKernel.get_to_dtype_exprr   )NNF)F)*r   r   r   r&  r  r  r   r%  r   r  r   r   rk   r  rl   ro  rm   r  rJ  rF   r~  r   r  r<   r   ri   r  ra  r  r  r#  r$  r)  r  r  r  r  r   r  r  rk  rL  rS   rS   r  rT   r1  u	  s    

*
 


,! >&"

_%r1  c                       s   e Zd ZdZeZ		d fdd	Zdd Zdd Z	dd	d
Z	de
dejf fddZd fdd	Zdd Z fddZdejdejfddZ  ZS )r  an  
    A vector kernel that handles the 2d tiles with the tile size defined in `tiling_factor` on
    the inner-most loop level and one of the outer loop level (`outer_tiling_idx`). When the data
    tile is accessed in a contiguous way from the outer loop axis, a transposition is applied on the
    tile to make the access contiguous from the inner-most loop axis. Then, the same vectorization
    logic from its parent `CppVecKernel` is leveraged for load/store/compute. The transposed tile load
    and store are generated into kernel.preloads and kernel.poststores buffers.

    The loop structure looks like below:
    for ...
      for i_outer ...
        for ...
          for inner_most ...
            // generated by CppTile2DKernel
            float tmp0[16*16]; at::vec::transpose_mxn<...>(tmp0, in_ptr0 + ..., ...); // into kernel.preloads
            float tmp1[16*16]; // into kernel.preloads
            for i_inner ... { // the kernel inner loop
              vectorized loads/compute/stores (e.g., load tmp0, store tmp1) // into kernel.loads/compute/stores
            }
            at::vec::transpose_mxn(out_ptr0 + ..., tmp1, ...) // into kernel.poststores
          for inner_most ... (tail)
            // generated by CppVecKernel
            ...
      for i_outer ... (tail)
        for ...
          for ...
            // generated by CppKernel
            ...
    Nc                    sP   t  ||||d | || _|| _|| _|r|n|| _|r |n|| _d| _d S )Nr4   T)r  r  tiling_indicesinner_tail_sizeouter_tail_sizeinner_num_elemsouter_num_elemsinner_is_tiling_idx)r  r2  r  rp  r  r  r  r  rS   rT   r    s   	
zCppTile2DKernel.__init__c                 C   s   t | j| j  dS )Nr1  )r,   r  	outer_idxr   rS   rS   rT   inner_itervar  rn  zCppTile2DKernel.inner_itervarc                 C   sh   | j | j }| j | j }t||| j}t||| j}| jd u o3|dko3||o3|| o3|| S r   )r  r  r  r   rp  r  r   )r  r   	outer_var	inner_varouter_strideinner_striderS   rS   rT   need_vec_transpose  s   


z"CppTile2DKernel.need_vec_transposec                 C   s  t j|}| j}| dt| }d}	tt|| j| j | j }
t| j }|r4|	|}}	||
}
}d}| j	|A rC| j
| j}}n| j| j
}}|rR|dkrRdnd}t|tjr]|jrft|tjr|jsdt|  d| d	| d
|
 d
|	 d
| d
t| d
t| d}n!dt|  dt| dt| d| d	| d
|
 d
|	 d
| d}|r| j }n| j|s| jj| j|dd}nd}| j|}|rt| }d| d| d}| d| d| d| d| d
}| j| |dt|}|r| jt|| |S | j| |S )Nrc  __place_holder__Tr  truefalseztranspose_mxn<r  ru  r   r   Fr  zalignas(std::max(std::size_t(z), alignof(z)))r   r	  rY   r   )r3   r  r  rp  rD   r   r  r  r  r  r  r  r   r   r%  r-  rG   rl  r  containsrm  r  getr   r   rm   r  r:   )r  r   r   r   is_store
store_moder   factorrm  ro  ld_srcld_dstneed_defineMNr  load_or_storetile_var	cpp_dtypealignasdefine_linerS   rS   rT   gen_transposed_tile_load_store  s|   



&"z.CppTile2DKernel.gen_transposed_tile_load_storer   r   c                    s   | j |}| |}|  }| |rT| j|||dd}| dt|| j  }tj	
|}| |d|}| j| j|}	|	d| ||fi  t|	tsOJ d|	_|	S | |}
t ||
S )NF)r  rc  r   ra  T)r2  r  r  r  r  r  rD   r  r3   r  r  r  rl  rm  r  ro  r   rF   r   r  r  ra  )r  r   r   r   rF  r  r  r   r   rt  r   r  rS   rT   ra  '  s"   


zCppTile2DKernel.loadc                    s  d|v sJ t |tsJ ||js| |}| j|}|  }| |}| |rt| j	|||d|d}| dt
|| j  }| jsRtj|ttjtjg v ra| d| dt
| j d}	n| d| d}	| jt||	 d S | |}
t ||
|| d S )Nr  T)r  r  rc  r  r   r   )r   rF   r   r)  r2  r  r  r  r  r  rD   r  r  r3   r  r  r   r   r  r  r  r   r:   r  r  r  )r  r   r   r/  r  r   rF  r  storebufr   r   r  rS   rT   r  =  s*   




zCppTile2DKernel.storec                 C   sj   |   }| jr|d| d| dt| j d| d	 d S |d| d| dt| j d| d	 d S )Nr  r  r   r   r   )r  r  r   rD   r  r  )r  r   rF  rS   rS   rT   r  Z  s   ""z#CppTile2DKernel.codegen_inner_loopsc                    sz   t  ||}| jd | jk r| jnt| j\| _| _| j| jd kr0| j| _| j	| _
d| _|S | j| _| j| _
d| _|S )Nr4   r   FT)r  r0  r  r  reversedr  r  r  r  r  r  r  r  r  )r  groupreduction_groupr<  r  rS   rT   r0  e  s   
zCppTile2DKernel.set_rangesr   c                 C   s   | j || j|  dS )Nr  )r  r  r  r  rS   rS   rT   r  w  s
   z"CppTile2DKernel.transform_indexingrz  r   )r   r   r   r   r  r  r  r  r  r  rm   r   r%  ra  r  r  r0  r  rL  rS   rS   r  rT   r    s    
<r  _bodyc                 C   s   | j gt| j  }d}d}|D ]Q}|jjD ]J}|jdks#|jdv r$q|jdvr+d}t|dr_|j	r_t
j|j	v s;J |j	t
j }|jrI|jtvrLd}q|dur[||jkrZtd q|j}qd}qq||fS )	z
    Returns the low precision data type (torch.float16/torch.bfloat16) contained in the nodes
    and if all the nodes can codegen with this data type without converting to float.
    Otherwise returns None and True.
    NFplaceholder)	get_indexr  )ra  r  ry  r}  r  TrS  z.bf16 and fp16 are mixed in the scheduler node.)
root_blockr  	subblocksrU  r  nodesoptargetr   rS  r@   rR  r   r   warningswarn)r  
sub_blocks_lowp_fp_type	_use_fp32	sub_blockr  rP  rS   rS   rT   get_loop_body_lowp_fp  s,   


r  c                       sF   e Zd ZdZ fddZdeee ee f fddZdd Z	  Z
S )	TilingSelectz
    Implement the heuristic to select the tiling factors and tiling indices.
    In the future, we can implement advanced heuristic in a subclass.
    c                    s   t    d S r   )r  r  r   r  rS   rT   r    rl  zTilingSelect.__init__r   c           "         s  t |}t|}|sJ tdd |D rg g fS tj}t|d d   r7t fdd|dd  D r7 }t j	|d}| 
|||}|rt|dd d	\}}	t|t|	 }
tjjr_d
d }dd }dd }dd tt|
D }t|}|d | ||d  }}i }i }|D ]}|jgt|j  }|D ]p}|jjD ]i}|jdv r|jdkrdnd}|j||f|j| jd  }|||r|||||}|jdkr|d u rn|dvr||j| t|jtr|jds|jdv s|j|vrd||j< q||j  d7  < qqqt| }t| }d}d}||ks0|dkr4|| |kr4g g fS |	s_|r_t|dkr_t ||d  gs_||d  |d k r_|dk r_g g fS |t!v rt j	|d}|D ]N} | dk r{| t|
 } | dk s| t|
krqnt |
rt"jj#j$|
|  dd}!|!|k rt"jj#%|!| |d } nqn|
|  |k r|d } nqnt|dkr|g|fS t|dkr||g|fS g g fS )Nc                 s       | ]}|t vV  qd S r   )rt   r  r   rS   rS   rT   r
        z-TilingSelect.select_tiling.<locals>.<genexpr>r   c                 3   s     | ]}t |d   kV  qdS )r   N)r  )r  	loop_body_lowp_fp_dtyperS   rT   r
    s
    
r4   r  c                 S      t | d S r   r   sizesrS   rS   rT   r        z,TilingSelect.select_tiling.<locals>.<lambda>rR  c                 S   s&   ||d  }t | ||}|jr|S d S Nr   )r   r-  )r   r  rp  r  r  r  rS   rS   rT   _try_get_stride  s   z3TilingSelect.select_tiling.<locals>._try_get_stridec                 S   s(   | |vr
d|| < d S ||   d7  < d S r   rS   )	node_namenon_contig_indexing_op_counterrS   rS   rT   _update_negative_op_count  s   z=TilingSelect.select_tiling.<locals>._update_negative_op_countc                 S   sD   t |dko!t | dko!|d dkr|d n|d t |  t | k S Nr4   r   r"  )r  r  rS   rS   rT   _is_valid_indices  s   
z5TilingSelect.select_tiling.<locals>._is_valid_indicesc                 S   r(  rS   r)  r+  rS   rS   rT   rC    r-  z.TilingSelect.select_tiling.<locals>.<listcomp>)r  ra  r  r  r   r   r4   masked_subblock)r1   r  r  r  gQ?#   r  
   r2  )&rB   rA   rd   r   rl   r  r  r   r}  	nelements_select_tiling_indicesr]   r   r   r  enable_tiling_heuristicsr4  r   r  r  r  rU  r  r  r  r  indexing_from_argsr2  r   rm   
startswithra   r'   r   r3   r4  r5  guard_lt)"r  fn_listvar_sizes_listloop_bodies
all_dtypesr   rp  r  r	  r
  r  r(  r+  r-  r  r  r<  reduction_vars
op_counterr*  r  r  r  r  arg_idxr   r  op_numnon_contig_indexing_op_numratio_thresholdquantity_thresholdfactor_lowptiling_indice
call_rangerS   r  rT   select_tiling  s   




















zTilingSelect.select_tilingc                 C   s  g }t ||D ]\}}tj|g|R  }|dd t|j|jD 7 }qtt  }g }	tt  }
tt  }|D ][}|j	D ]U}t
d|jsFq<t|||}|dkrQq<|dkrn|t|jdd   |	t|jdd   q<tdd |j	D r|
t|jdd   q<|t|jdd   q<q7||
 | }t|dd	 d
\}}t|t| }t|dkr|d gS |rt|dd  S ||
@ | }t|}t|dkr|d |v r|d |d kr|S t||	jd
dd  S )Nc                 S      g | ]}|j qS rS   )r   )r  deprS   rS   rT   rC  h  s    z7TilingSelect._select_tiling_indices.<locals>.<listcomp>z^d\d+$r   r4   c                 s       | ]	}t |tjV  qd S r   )r   r   SIZEr  rS   rS   rT   r
  w  r  z6TilingSelect._select_tiling_indices.<locals>.<genexpr>c                 S   r!  r   r"  r#  rS   rS   rT   r  |  r%  z5TilingSelect._select_tiling_indices.<locals>.<lambda>r&  r  r   )r  r	   extract_read_writes	itertoolschainreadswritesr   rk   r  r   searchr   r   rg  r0  r  r]   r   sortedcount)r  r8  r9  rp  	all_indexfn	var_sizesrwcontig_varscontig_vars_listnon_contig_stride_constnon_contig_stride_otherr   r   r  contig_onlyr	  r
  num_itervarscontig_and_const_stridecontig_vars_sortedrS   rS   rT   r3  _  sL    




z#TilingSelect._select_tiling_indices)r   r   r   r   r  r   r  rk   rF  r3  rL  rS   rS   r  rT   r    s    
 ,r  c                       s   e Zd Z fddZdd ZdefddZdefd	d
Zdd Z	dd Z
dd Zdee fddZdd Zdd Zd!dee fddZdeded fdd Z  ZS )"r<  c                    s:   t  |j|jj || _d | _d | _t	 | _
g | _d S r   )r  r  r2  wsr  rD  r5  r  r   r}  picked_vec_isakernelsr  rD  r  rS   rT   r    s   

zCppKernelProxy.__init__c                 C   s&   |D ]}t |tsJ t| qd S r   )r   r#   r9   propagate_scheduler_node)r  r  r  rS   rS   rT   data_type_propagation  s   z$CppKernelProxy.data_type_propagationscheduler_nodec                 C   s<   t |jtsdS t| t|jd d uot|jd  S )NTr   r4   )r   r  r   r9   rc  r  )r  re  rS   rS   rT   is_lowp_fp_scheduler  s   
z#CppKernelProxy.is_lowp_fp_schedulerr  c                 C   s@   dt jjfdd}|jgt|j  }|D ]}||j qd S )N	sub_graphc              	      sJ  dt jjdtt j fdddt jjdtt j fdddt jjdt jffdd	dt jjdt jffd
ddt jjdt jffdd}t| j}g |D ]}|jdv r|  tv rt	 fdd|j
D rnqP|jd }| |$ | jd||t jfd|fdd t jd7  _W d    n1 sw   Y  qP|jdkr|  tv r|j\}}}}}|| rqPtj|| | | jd||fd|| t jd7  _W d    n1 sw   Y  qP|jdkr'|j\}}}	}
|tv r&t jt jt jt jfv sJ |tv rt jnt j|	|
f|_qP|jdkrR|jd tv rR|j\}}
 t	 fdd|j
D rJqP||
t jf|_qP|jdkr|jd tv r|j\}} t	 fdd|j
D ruqP| ||t jf|_qP|jdkr|j\}}}|tv r|||s| | | jd|||fd|| t jd7  _W d    n	1 sw   Y  tv rt	fdd|j
D s|jd }| |$ | jd||t jfd|fdd t jd7  _W d    n	1 sw   Y  qP	 qPd t jjffd!d"}||  d S )#Nr	  r   c                 S   sd   | j dkrtj| jd S | j dkr| jd S | j dkr0t| jdkr)| jd S | jddS dS )	z6Get input dtype for nodes that may consumes lowp fp dtr  r4   rv  r  rj  r   r   N)r  r3   r  r  r2  r   r3  r  r	  rS   rS   rT   get_input_dtype  s   




z]CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.get_input_dtypec                 S   sZ   | j dkrt| jdksJ tj| jd S | j dv r!| jd S | j dkr+| jd S dS )	z6Get output dtype for nodes that may produce lowp fp dtra  r   r4   )rj  r  r  r  rv  r   N)r  r   r2  r3   r  r  rh  rS   rS   rT   get_output_dtype  s   




z^CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.get_output_dtypedtc                    s   |t v sJ  | |kS )z]Check if the given node produces output with expected low precision floating point data type.)r   r	  rk  )rj  rS   rT   is_lowp_fp_source  s   z_CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_sourcec                    s2   |t v sJ  |  }r||kS | jdkrdS dS )zZCheck if the given node accept input with expected low precision floating point data type.rj  TF)r   r  )r	  rk  input_dtype)ri  rS   rT   is_lowp_fp_sink  s   
z]CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_sinkc                    s$   |  ot  fdd| jD S )zCheck if the node is a lowp fp sources which are all directly fed to ops that accepts lowp fp input
                thus no need to promote to float
                c                 3       | ]}| V  qd S r   rS   r  userrk  ro  rS   rT   r
        

z}CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source_no_promote.<locals>.<genexpr>r  usersrl  )ro  rm  )rk  rT   is_lowp_fp_source_no_promote  s   zjCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source_no_promote)ra  r  c                 3   rp  r   rS   rq  rs  rS   rT   r
    rl  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>r   rj  r2  c                       |  uS r   rS   r,  to_type_noderS   rT   r    r  zVCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<lambda>r4   r  r#  r  r  c                 3   rp  r   rS   rq  rs  rS   rT   r
    rl  c                 3   rp  r   rS   rq  rs  rS   rT   r
  "  rl  rv  c                 3   rp  r   rS   rq  )r   ro  rS   rT   r
  J  rl  c                    ry  r   rS   rz  r{  rS   rT   r  R  r  rg  c                    s"   dt jjf fdd}||  d S )Nrg  c                    s   dt jjfdd dd | jD } fdd|D }|D ]7}| D ]0\}| jv rRtfdd|D sCv rRtd	d |D rRjd
 }| |  q"q| j	d u r_| 
  d S d S )Nto_nodec                 S   s   t dd | jD S )Nc                 s   s    | ]}|j d kV  qdS )rj  Nr  r  usrrS   rS   rT   r
  a  rl  zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_to.<locals>.<genexpr>ru  )r}  rS   rS   rT   _used_by_to`     zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_toc                 S   s   g | ]	}|j d kr|qS )rj  r~  r  rS   rS   rT   rC  c  s    zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<listcomp>c                    s   g | ]} |r||j iqS rS   )rv  r  )r  rS   rT   rC  f  s
    c                 3   s$    | ]}|j d   j d  kV  qdS r  Nrx  r  rh  rS   rT   r
  l     " zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<genexpr>c                 s   s    | ]
}|j d  tv V  qdS r  )r2  r   r  rS   rS   rT   r
  o      
r  )r   fxNoder  r=  r  all_input_nodesreplace_all_uses_with
erase_nodeowning_modulelint)rg  all_to_nodesall_to_nodes_and_users
node_usersrv  val_nodeto_lowp_fp_legalized_nodes)r  r	  rT   _eliminate_duplicate_to_nodeY  s2   





zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node)r   r  Graph)rg  r  r  rS   rT   eliminate_to_dtypeX  s   )z`CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype)r   r  r  r   r   r  r  r  r   r  rv  r2  inserting_aftercall_methodrl   r  r   cpp_to_dtype_countr3   r  r  inserting_beforereplace_input_withr  r  r.  r0  r  )rg  rw  sub_graph_nodesr  r1   r   r   	value_varr   r   r/  rq  r  rS   )rk  r   ri  rj  ro  rm  r  r|  rT   add_to_dtype  s   




	








	,zDCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype)r   r  r  r  r  r  rU  r  )r  r  r  r  r  rS   rS   rT   legalize_lowp_fp_dtype_loopbody  s    ]z.CppKernelProxy.legalize_lowp_fp_dtype_loopbodyc                    s   t  fdd|D rJ|D ]:}|jjgt|jj  }|D ](}|jjD ]!}|jdv rE|j	s0J t
j|j	v s8J |j	t
j }|jtv sEJ q$qqd S |D ]}t|tsUJ t|jts]J |j}| si | qLd S )Nc                 3   s$    | ]}t |to |V  qd S r   )r   r#   rf  r  r  r   rS   rT   r
    s
    
z8CppKernelProxy.legalize_lowp_fp_dtype.<locals>.<genexpr>)ra  r  )r  r  r  r  r  rU  r  r  r  rS  r@   rR  r   r   r   r#   r   is_memory_copyr  )r  r  r  r  r  fx_noderP  r  rS   r   rT   legalize_lowp_fp_dtype  s8   




z%CppKernelProxy.legalize_lowp_fp_dtypec                     sx  t  t ks
J | jtdd d\|  fdd} fdd|t}tj j|jO  _tj j|jO  _t	
|| _| jrO| jsa|g| _| dd  | j|  d S tjjjdd	 t }| \}}t |t |ksJ d
}tt }	tdd |	D rd}d}
d }|rd}|d }|d }t | jj|kr| jj| j}| jj| j}|o| }
t |dkrt jd7  _| jj|d |d d}|t|d |d }|j |j! }|j"d|j!fi|_#tj$j%r|r|t|d |d |}n|}|j"g|_&|j"|j!|j fi|_#||g| _|}nt |dkr|d t | jd kr:|d |d ks<J t jd7  _| jj|d |d d}d|j!f|j!|j fd}|j |j! }| jj|d |d d}d|j!f|j!|j fd}|j |j! }|t'|d |}|j"|d |j"|d i|_#g }tj$j%r|rdD ]3\}}|dkr|nd }|dkr|nd }|t'|d |||}|j"|| |j"|| i|_#|(| qn;|t|d |d }|j"|d |j"|d i|_#|j"g|_&|(| |j"|d |j"d|j fi|_#|j"|j"g|_&|(| |g| | _|}n|g| _| |
| | j|  W d    d S 1 s5w   Y  d S )Nc                 S   r!  r   r"  r#  rS   rS   rT   r    r%  z2CppKernelProxy.codegen_functions.<locals>.<lambda>r&  c                    sP    j | g|R  }t jd8  _| |W  d    S 1 s!w   Y  d S r   )
new_kernelr   generated_kernel_count)r  r2  rG  )rD  runrS   rT   codegen_kernel  s
   $z8CppKernelProxy.codegen_functions.<locals>.codegen_kernelc              	      s   |  \}}d}t D ]L\}}|fttdffv r-|r'J ||| qd}|dfksBJ d| d d |   ||d W d    n1 sVw   Y  qd S )NFrS   Tzunexpected group: r  r   )r0  r  r   rL  rM  rj  )rG  r<  r<  	in_suffixrT  rU  )r8  r	  r
  r9  rS   rT   r    s*   
z-CppKernelProxy.codegen_functions.<locals>.runFinplace_buffersTc                 s   r  r   )ru   r  rS   rS   rT   r
    r  z3CppKernelProxy.codegen_functions.<locals>.<genexpr>r   r4   )r  r   maintailr  )r  )r  r  )r  r  r  ))r   rD  r]   r0  r  r3   r  removed_buffersinplaced_to_remover3  r^  r5  r`  r  ra  aggregate_reduction_buffers
set_kernelr   	_inductorr   patchr  rF  rA   rB   rd   r.  r  r   generated_cpp_vec_kernel_counttiler1  r&  
tiled_sizer   r  r  enable_loop_tail_vecr  r  r0  ) r  r8  r9  r  scalar_kerneltiling_selecttiling_factorsr  could_masked_vecr;  _inner_loop_reduction_outer_not_outer_loopinner_loop_reductionouter_loop_levelinner_loop_levelouter_loop_reductionrE  
vec_kernelr  tail_kernel
outer_loopr?  r  
inner_loopinner_rangesr  tile2d_kernelouter_rinner_r_inner_tail_size_outer_tail_sizerG  rS   )r8  r	  rD  r
  r  r9  rT   codegen_functions  s  	













 $z CppKernelProxy.codegen_functionsc                 C   s.   |D ]}|  | t| q| || d S r   )r  r9   propagate_loopbodyr  )r  r:  r9  r  rS   rS   rT   codegen_loop_bodies`  s   
z"CppKernelProxy.codegen_loop_bodiesr  c                    s   |  | | | t|dksJ dd   fdd|D }ttjtr6tjjr6dd fdd|D }d	d |D }| || d S )
Nr4   c                 W   s0   |    |   ttjtr| j| S | |S r   )decide_inplace_updatemark_runr   r3   rG  r0   r  codegen)r	  
index_varsrS   rS   rT   rT  l  s
   

z(CppKernelProxy.codegen_nodes.<locals>.fnc                    s   g | ]}t  |qS rS   )r6  partialr  )rT  rS   rT   rC  t  s    z0CppKernelProxy.codegen_nodes.<locals>.<listcomp>c                 S   s   t j| }| |_|S r   )r3   rS  localize_functionoriginal_fn)rT  
wrapped_fnrS   rS   rT   wrap_fn{  s
   z-CppKernelProxy.codegen_nodes.<locals>.wrap_fnc                    s   g | ]} |qS rS   rS   )r  rT  )r  rS   rT   rC    s    c                 S   s   g | ]}|j d  qS )r4   )r	  r  rS   rS   rT   rC    r'  )	r  rd  r   r   r3   rS  rI   rT  r  )r  r  r8  r9  rS   )rT  r  rT   codegen_nodesf  s   


zCppKernelProxy.codegen_nodesc                 C   s   |  | j|| d S r   )r]  r5  )r  r   rJ  rS   rS   rT   r_    r  zCppKernelProxy.codegen_loopsc                 C   s   | j D ]}|  qd S r   )ra  r  r  rG  rS   rS   rT   r    s   

z4CppKernelProxy.update_stores_with_parallel_reductionNr   c              	   C   st   |d usJ d}| j D ],}t }|||r(d}||  ||  W d    n1 s2w   Y  qd S )N
C10_LIKELYC10_UNLIKELY)ra  r   r   ry  r   r   r  r  )r  r   	if_prefixrG  r   rS   rS   rT   r    s   

zCppKernelProxy.gen_bodyinner_loop_reduction_outer_notr  	LoopLevelc                    s   d fdd} j d }|r|sJ || n|   j|j  j|j  j|j  j|j  j|j  j|j  j	|j	 d S )Nr  r  c              	      s  t  jdks	J  jd } jd }t|tsJ t|tkr5||j |   j	|j|j  n|   j	|j t
 }t }||d| jr]||  |	|j W d    n1 sgw   Y  t \}||d| jr||  t|tkr|j}|D ]}| d| j dt| j d}t|j|| t|j|| q|	t|j| j| j d	| j| j n|	|j W d    n1 sw   Y  | _d S )
Nr   r   r  r  r  r   z_tail - r
  r  )r   ra  r   r1  r  r  r  rp  r  r  r6   r   r   ry  r   r   r   r  r  rD   r  r   r  r   r&  )r  main_loop_kerneltail_loop_kernel
suffix_bufr   r<  r   r   r   rS   rT   !aggregate_reduction_prefix_suffix  sf   






zUCppKernelProxy.aggregate_reduction_buffers.<locals>.aggregate_reduction_prefix_suffixr   )r  r  )
ra  r  r  r  r  r  r  r  r  r  )r  r  r  r  main_kernelrS   r   rT   r    s   
8
z*CppKernelProxy.aggregate_reduction_buffersr   )r   r   r   r  rd  r#   rf  r   r  r  r  r  r  r  r_  r  r   r6   r  ri   r  rL  rS   rS   r  rT   r<    s&    
 b 9!
r<  c                       s$   e Zd Z fddZdd Z  ZS )rE  c                    s   t  |j|jj g | _d S r   )r  r  r2  r_  r  rF  rb  r  rS   rT   r    s   
zOuterLoopFusedKernel.__init__c              	   C   sr   g }dd | j D }|D ]}|j}|d usJ ||tt||j |jd|j qtt|jt	||jdS )Nc                 S   s   g | ]}|  qS rS   )r;  )r  r5  rS   rS   rT   rC    s    z>OuterLoopFusedKernel.decide_parallel_depth.<locals>.<listcomp>rd  )
rF  r  r0  rO  r   r   r   r   r\   r]   )r  rP  rI  kernels_parallel_depthnested_kernelsrG  r  rS   rS   rT   rO    s.   
z*OuterLoopFusedKernel.decide_parallel_depth)r   r   r   r  rO  rL  rS   rS   r  rT   rE    s    rE  c                   @   s   e Zd ZdZdZdZdS )ReasonFusedNodessame_vars_reducecompatible_reductioncompatible_ranges_no_reductionN)r   r   r   SAME_VARS_REDUCECOMPATIBLE_REDUCTIONCOMPATIBLE_RANGES_NO_REDUCTIONrS   rS   rS   rT   r  
  s    r  c                       sd  e Zd ZdZeejejgZe	de
jdee fddZ fddZdefd	d
Zdd Zdd Zdd Zdee fddZdd Zdd Zdd ZdededefddZdd Zdd  Zd!d" Zd#d$ Zd%ee  fd&d'Z!d(e"fd)d*Z#d(e$e"e%e f fd+d,Z&d(edefd-d.Z'd/ed0e(e d1e(e fd2d3Z)d4d5 Z*d6d7 Z+d8d9 Z,d?d;d<Z-d=d> Z.  Z/S )@CppSchedulingi  devicer   c                 C   r  r   )backend_features)r  r  rS   rS   rT   get_backend_features  s   z"CppScheduling.get_backend_featuresc                    s"   t  | |r|   d| _d S NF)r  r  reset_kernel_group_ready_to_flush)r  r  r  rS   rT   r     s   
zCppScheduling.__init__statusc                 C   
   || _ d S r   r  )r  r  rS   rS   rT   _set_flush_status&     
zCppScheduling._set_flush_statusc                 C   s   t dd |D S )Nc                 s   s$    | ]}t ttjjj|V  qd S r   )r   r  r3   r  r4  r   r  rS   rS   rT   r
  *  r  z)CppScheduling.group_fn.<locals>.<genexpr>)r   )r  r$  rS   rS   rT   group_fn)  r  zCppScheduling.group_fnc                 C   s   t  | _d S r   )KernelGrouprD  r   rS   rS   rT   r  ,  s   z CppScheduling.reset_kernel_groupc                    s  |  s|  rt||S | r| rJ t||S | ||tjkrt|t	tfs0J t|t	tfs9J |j
\}\}}|j
\}\}}|dkrO|dksUJ ||f fdd t|t|k re|n|}t|t	snJ t|t|k rx|n|}	 |	}
|j|
d |j
\}\}}|j
\}\}}||krt||S  |}t|	t	r|	j|d n!t|	tsJ |	jD ]}t|t	sJ |j|d qt|	j|	j}	|j
\}\}}|j
\}\}}||ksJ ||ft||S | ||rt||| ||S t||S )NrS   c           	         s   t | trAt| jdksJ | jd }tt  }| jD ]} |\}}|d u r)|}||ks5J ||| jf|| q|t|fS t | tsHJ | j	}t |t
jsSJ | \}}}|jt|j fS r'  )r   r!   r   snodesr   r   updater  r#   r	  r   ComputedBufferget_default_sizes_bodyr  indexing_exprsrU  )	r	  r  r  snodevexprscomp_bufferr   r  get_indexing_ranges_exprsrS   rT   r  A  s    


z5CppScheduling.fuse.<locals>.get_indexing_ranges_exprs)extra_indexing_constraints)
is_foreachr    r  is_templater!   _why_fuse_nodesr  r  r   r#   r	  r   recompute_size_and_bodyr  r  can_fuse_vertical_outer_loopr  _get_outer_loop_fusion_depth)r  r  r  r   vars1reduce1vars2reduce2node_to_recompref_noderef_indexing_constraints#node_to_recomp_indexing_constraintsr  rS   r  rT   r  /  s`   


zCppScheduling.fusec                 C   sb   |j \}\}}|j \}\}}||kr||krtjS |dkr&||| kr&tjS | ||r/tjS d S )NrS   )r	  r  r  r  &_can_fuse_nodes_with_compatible_rangesr  )r  r  r  r   r  r  r  r  rS   rS   rT   r    s   zCppScheduling._why_fuse_nodesc                 C   s  |j \}\}}|j \}\}}|dko|dk}t|t|k}	t|dkp+t|dk}
|r2|	r2|
s4dS t|t|k r>|n|}t|t|k rJ|n|}t|trSdS t|tsZJ t|jtj	rcdS t|jtj
slJ |jj }d }t|trtttdf   }|jD ]}t|jtj	r nt|jtj
sJ |t|jj  qt|dkrdS ttt|}nt|tsJ t|jtj
sJ |jj }||krdS dS )NrS   r4   F.T)r	  r  rb   r   r   r!   r#   r	  r   TemplateBufferr  dataget_sizer   r   r   r  rg  r  nextiter)r  r  r  r   r  r  r  r  c1c2c3r  r  ranges2ranges1
ranges_setr  rS   rS   rT   r    sB   


z4CppScheduling._can_fuse_nodes_with_compatible_rangesc                 C   sN   t |ttfs	J t |ttfsJ tdd ||fD rdS | ||d uS )Nc                 s   s    | ]}t |tV  qd S r   )r   r  r  rS   rS   rT   r
    rt  z:CppScheduling._can_fuse_horizontal_impl.<locals>.<genexpr>F)r   r!   r#   rd   r  r  r  r  rS   rS   rT   _can_fuse_horizontal_impl  s   z'CppScheduling._can_fuse_horizontal_implc                 C   sD   |  s|  r
dS t| t|  tjjkrdS | ||S r  )r  r   r  r   r  max_horizontal_fusion_sizer!  r   rS   rS   rT   can_fuse_horizontal  s   z!CppScheduling.can_fuse_horizontalr  r  c                 C   sR   |   }r't|jtjo&t|jtjo&t|jjdko&|jjd 	 |j
kS dS )Nr4   r   F)get_template_noder   layoutr   MultiOutputLayoutr	  MultiOutputr   inputsrW  r   )r  r  r  template_bufrS   rS   rT   can_fuse_multi_outputs_template  s   z-CppScheduling.can_fuse_multi_outputs_templatec                 C   sX  d}t dd ||fD s|S t|tr| d n|}t|ttfs%J t|tr0| d n|}t|ttfs;J |j\}\}}|j\}\}	}
|dkr[|	dkr[|dkr[|
dkr[|S t dd ||fD rq|j|jkro|jS |S tt	|t	|	}|dkr|d | |	d | krt
dd ||fD rt|tu r|n|}|j|kr|S |S |S |S )	Nr   c                 s   r  r   )r  r  r!   r#   r  rS   rS   rT   r
    s    
z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>r  rS   c                 s   r  r   r  r  rS   rS   rT   r
    r  r4   c                 s   r  r   r  r  rS   rS   rT   r
    s    
)r  r   r  r  r!   r#   r	  r  r\   r   rd   r  )r  r  r  DISABLE_OUTER_LOOP_FUSION_node1_node2r   r  r  r  r  r  _compare_noderS   rS   rT   r    sL    
z*CppScheduling._get_outer_loop_fusion_depthc                 C   sJ   |   o$|   o$| |j@ o$| ||o|   o$| ||dkS r   )r  get_operation_names	ancestorsr!  r  r  r   rS   rS   rT   r
    s   
z*CppScheduling.can_fuse_vertical_outer_loopc                 C   s   |  ||rdS dS r,  )r
  r   rS   rS   rT   get_fusion_pair_priority(  s   z&CppScheduling.get_fusion_pair_priorityc                 C   sT   |  rdS |  rt||g\}}|  o|S | ||o#|  p)| ||S r  )r  rL   r  r!  r
  )r  r  r  template_fusion_supportedr   rS   rS   rT   can_fuse_vertical/  s   
zCppScheduling.can_fuse_verticalr  c                    s  t dd |D r|S ddd}d}d}d}|D ]}t|jtjs$J |j \}}}|j D ]i\}	|	t	D ]_ t  fdd|j
D rR |krR }|d7 }|dkr^|      S t jd tjjjr jd |j
v rdurt fdd|j D r jd d	kr jd  jd d
}|}q:q1q|s|S dfdd}
|D ]}||kr|j|
d q|D ]}||kr|j|
d q|S )aI  
        Apply loop split optimization.
        When one of the indexing_exprs contains a division, we eliminate the division by splitting the loop
        to avoid non-contiguous loads, subject to the following conditions:
            1. No reduction and no mudular index for all nodes.
            2. The indexing_exprs of all nodes contain only one (or more, but all the same) division,
               where the divisor is an integer and not too small (the divisor > 8), the dividend is
               one of the iter_vars, and this var, i.e. the dimension that needs to be split, is
               contiguous in all other indexing_exprs.

        For example, if the node's var_ranges: {z0: 2, z1: 9216, z2: 960} and indexing_exprs:
        {'index0': 8847360*z0 + 960*z1 + z2, 'index1': 32*z0 + (z2//30), 'index2': z2},
        we will split z2 -> 30*z2 + z3, then the node's var_ranges will be changed to
        {z0: 2, z1: 9216, z2: 32, z3: 30} and indexing_exprs will be changed to
        {'index0': 8847360*z0 + 960*z1 + 30*z2 + z3, 'index1': 32*z0 + z2, 'index2': 30*z2 + z3}.
        c                 s   s@    | ]}t |jd  d  dkptdd |jj D V  qdS )r4   r   c                 s   s    | ]}| tV  qd S r   )r   r   )r  rs  rS   rS   rT   r
  Q  rt  z9CppScheduling.try_loop_split.<locals>.<genexpr>.<genexpr>N)r   r	  rd   r  r  rU  r  rS   rS   rT   r
  O  s    

z/CppScheduling.try_loop_split.<locals>.<genexpr>Nr   Fc                 3   s    | ]}  |V  qd S r   )r   )r  r   )div_exprrS   rT   r
  e  rl  r4   c                 3   s0    | ]\}}|krt | jd  dv V  qdS )r   r.  N)r   r2  )r  name_expr_)r4  r   rS   rT   r
  p  s    r  Tc                    s   | \}}|\}}| }| }||  ||< ||d  tj||dd\\}	}
}|	 }||d }||  | ||< t|||g||	|} sY|jt	|j
 f ||f||	|ffS )Nr4   r  )r@  )r   copyinsertr	   index_vars_no_squeezepopr   r   r  r  r  rU  )r$  r  r<  
index_sizereduce_sizer  reduce_vars	split_idxnew_index_sizenew_index_varsr   r  	iter_varsdivisor_var)r  split_number	split_varrS   rT   
loop_split  s.   
z0CppScheduling.try_loop_split.<locals>.loop_split)recompute_sizes_body_func)r  rF  )rd   r   r	  r   r  r  r  r=  findr   rA  r2  r   corenumbersr;  r  r	  )r  r  num_div	div_expr_	match_divmatched_noder	  r   original_bodyrs  rE  rS   )r4  r  r   rC  rD  rT   try_loop_split<  sl   

zCppScheduling.try_loop_splitr	  c                    s   | j tj}g  g t|tsJ dtf fdd}||si|t_     tjjj	dd, |
 D ]}t|ttfsCJ | }t}|| || q8W d   dS 1 sbw   Y  dS dS )a  
        Generate the code for the outer loop fused scheduler node.
        1. Codegen with fused outer loop: depends on the analysis of
            the outer loop fused scheduler node, with or without the local buffer.
        2. If failed, fallback to standard codegen.
        r	  c              	      s  t tsJ     dtfdd g }i t fdd D rtt   D ]t t	s:J 
   sMt dkrNq1 d tfddjD rˈj}t |tjskJ | }jt  }fd	d
}| r| sq1t|j|j|j|d |j|d }fdd}d}|||}	|	stj| dt| |d}	||	 g |	j< |	j | q1tj}
t|dkr|D ]}|jdusJ |
 ||j  qڈ D ]"}t |t!t	fsJ t"}|#|  | |  q$js3|
j%D ]
}t&j'j%(| q	 W d   dS t)j*t)j+tt|
j,d -}.|g t/j01 W d   dS 1 s`w   Y  dS )zN
            Codegen code with fused outer loop and local Buffer.
            r	  c                 S   sH   t | ttfs	J |  }t|dd dj\}\}}t|t| }|S )Nc                 S   s   t |  S r   )rk   r  rx  rS   rS   rT   r    r%  z~CppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_ranges.<locals>.<lambda>r&  )r   r#   r!   r  r]   r	  r   )r	  r  r   r	  r
  r  rS   rS   rT   get_call_ranges  s   
zlCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_rangesc                 3   s&    | ]}t  |jd  kV  qdS )r4   N)r   r  r  )rP  r	  rS   rT   r
    s
    
zfCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.<genexpr>r4   r   c                 3   s    | ]
}|j   v V  qd S r   )r	  r  rq  rh  rS   rT   r
    r  c                     st   d d} t jj D ]\}} | | 7  | |9 } qj } fdd|o9tfddjD S )Nr   r4   c                    s   |  kS r   rS   rx  )contiguous_index_exprrS   rT   is_contiguous_index  s   zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.is_contiguous_indexc                 3   s2    | ]}t |jto |jj V  qd S r   )r   r	  r#   r  get_read_exprrW  rq  )rR  scheduler_bufferrS   rT   r
    s    
zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.<genexpr>)r  r  r  r=  get_write_exprrW  r  rv  )r  r   r4  write_index_expr)rT  re  )rQ  rR  rT   is_all_write_read_contiguous  s   

zyCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguousNc                    s<   |D ]}| |j krtfdd |j D r|  S qd S )Nc                 3   s>    | ]}|j d urt fddtjjj|j  jD V  qd S )Nc                 3   s    | ]
}|j   v V  qd S r   )r	  rW  rq  visited_scheduler_nodesrS   rT   r
    s
    
zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>.<genexpr>)r   r  r3   r  r  name_to_bufrv  )r  global_bufferrX  rS   rT   r
    s    
	zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>)r%  r  r   )local_buffer_layoutrT  	local_buf)local_to_global_buffersrY  rS   rT   try_share_local_buffer  s   zsCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_bufferlocal_buffer_datar   )r   r%  F)local_buffer_numberT)2r   r  clearr   r  r  r   rm   r  r#   rg  rW  r  r   get_outputsrv  r	  r   r  rV  r  is_contiguousFixedLayoutr  r   r&  r  Bufferr0  r   rI   r2  add_local_bufferr!   r<  r  r@  r  r3   r  remover   !cpp_outer_loop_fused_inner_countsCppOuterLoopFusedCountrT  rJ  finalize_kernelrL  rM  from_iterable)r	  rT  r[  global_buffer_layoutsize_offsetrW  r\  r_  local_buf_prefixlocal_buffer_usedscoperX  r  r>  removed_bufferouter_fusion_cpp_kernel_proxyr<  rD  
nodes_list)rP  r^  r	  rT  re  rY  rT   $try_outer_loop_fusion_with_local_buf  s   	
	






$$zSCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_bufFr  N)rD  r   r  r   r  rb  r   r  r   r  r  r!   r#   r  r<  r  rk  )r  r	  r  rv  r  _nodesr>  rS   rt  rT   codegen_outer_loop_node  s*   
 "
"z%CppScheduling.codegen_outer_loop_nodec                 C   sp   | j }t|tr| | n| }| |}t|}|| ||| | 	 }|t
jkr6| d dS dS )zC
        Turn an set of pre-fused nodes into a C++ kernel.
        TN)rD  r   r  rx  r  rO  r<  r  rk  _get_scheduled_num_argsr  MAX_FUSED_KERNEL_ARGS_NUMr  )r  r	  rD  r  r>  args_numrS   rS   rT   codegen_nodel  s   



zCppScheduling.codegen_nodec                 C   s   t |tot |jtjS r   )r   r#   r	  r   CppTemplateBuffer)r  r	  rS   rS   rT   is_cpp_template  s   zCppScheduling.is_cpp_templatetemplate_nodeepilogue_nodesprologue_nodesc                 C   s  |rJ dd |D }t d d  d7  < t d d  t|7  < | |s*J dtt|}|j\}\}}|dks<J ttj|j}d	d |D }t	d
d |D sWJ ddd }|||j
|}	|j||	|d\}
}|
 t|jsx|  |D ]}|  qz| }W d   n1 sw   Y  t|
 |g|}| |||
j}W d   n1 sw   Y  t|jrt|jdksJ d|jd jD ]}t|jtsJ dt|jjtjsJ d|j  q|
|| tj j|
jO  _|   dS )zG
        Codegen a CPP template, possibly with fused epilogues
        c                 S   s   g | ]}t |ttfr|qS rS   )r   r#   r!   )r  epilogue_noderS   rS   rT   rC    s    z2CppScheduling.codegen_template.<locals>.<listcomp>inductorcpp_templated_kernel_counterr4   cpp_epilogue_fusion_counterzlTemplate node passed to CppScheduler.codegen_template must be a SchedulerNode that wraps a CppTemplateBufferrS   c                 S   rG  rS   rh  r+  rS   rS   rT   rC    s    c                 s   rI  r   )r   r   r  r+  rS   rS   rT   r
    r  z1CppScheduling.codegen_template.<locals>.<genexpr>z9Epilogue nodes must all be instances of ir.ComputedBufferc                    s>    sdS |   |v sJ ||    j}t fdd|D  S )NFc                 3   s(    | ]}t |jto|jj v V  qd S r   )r   r	  r   rq  r  rS   rT   r
    s    

zZCppScheduling.codegen_template.<locals>.template_buffer_has_other_users.<locals>.<genexpr>)rW  rv  r  )template_bufferoutputs_by_namer  rv  rS   r  rT   template_buffer_has_other_users  s   
zGCppScheduling.codegen_template.<locals>.template_buffer_has_other_users)$flag_template_buffer_has_other_usersr  NzSMulti outputs template should be with 1 output template buffer of MultiOutputLayoutr   z?Multi outputs template should be with ExternKernelSchedulerNodez7Multi outputs template has multi users with MultiOutput)r   r   r~  r   r#   r	  r   r}  r	  r  r  make_kernel_renderr(   r  r3   set_kernel_handlerdefine_kernelr2  outputsrv  r   r   r'  call_kernelr  r  free_buffers_in_scheduler)r  r  r  r  r   rnumelctbepilogue_ir_nodesr  r  rG  renderr	  src_codenode_schedulekernel_namerr  rS   rS   rT   codegen_template  sl   	





zCppScheduling.codegen_templatec                 C   s
   | j  S r   )rD  get_num_argsr   rS   rS   rT   ry    r  z%CppScheduling._get_scheduled_num_argsc                 C   r  r   r  r   rS   rS   rT   ready_to_flush  r!  zCppScheduling.ready_to_flushc                 C   s   d S r   rS   r   rS   rS   rT   codegen_sync  s   zCppScheduling.codegen_syncNc                 C   s  t jj}tjjrt|tjjnd}dd|| g}t jj	r!|nd}|
ttj|}|
ttj|}|
dd}|d}|d|}	|||	d	   d
}
t }|d u r\| jjn|}| \}}}t jj	sr|d|d |j|dd t jj	s|d |j|| d|
d |S )NrP   r   r  rG  z#pragma CMTz//z
extern "C"r   r4   z;
zasync_compile.cpp_pybinding(z, '''T)stripz''')F)gpucpp_definition)r3   r  wrapper_coder   r  descriptive_namesr&   rv  next_kernel_suffixcpp_wrapperr   rm   r+   KERNEL_NAMEDESCRIPTIVE_NAMErfindrG  r<   rD  r2  cpp_argdefsr   r  r  getvalue)r  r  r  kernel_argsr9  
fused_namer  kernel_decl_name
first_char	last_charkernel_definitioncompile_wrapperr2  r   	arg_typesrS   rS   rT   r    s8   

zCppScheduling.define_kernelc                 C   sF   | j  }|r| || j j}| j tjj| |   | 	d d S r  )
rD  codegen_groupr  scheduled_nodesr  r3   r  r  r  r  )r  r  r  rS   rS   rT   flush  s   
zCppScheduling.flushr   )0r   r   r   rz  r   r5   INPLACE_BUFFERSREDUCE_TO_SINGLE_ELEMENTr  rK  r   r  r  r  ri   r  r  r  r  r   r  r  r  r!  r#  r   r*  r  r
  r1  r3  r  r#   rO  r  rx  r   r!   r|  r~  r   r  ry  r  r  r  r  rL  rS   rS   r  rT   r    sd    R8	
6o
 B

W
%r  c                       sL   e Zd Z fddZdd Zdd Zdd Zdd
efddZdd Z	  Z
S )r  c                    sH   t    t | _t | _t| j| _t	 | _
| j
| j g | _d S r   )r  r  r>   r2  r6   
loops_codeWorkSharingr_  r   r   r   r   r  r   r  rS   rT   r    s   


zKernelGroup.__init__c                 G   s   || j t g|R  S r   )r2  r*   )r  r  r2  rS   rS   rT   r  !  rn  zKernelGroup.new_kernelc                 C   s*   |  j |7  _ | j}| j}||| d S r   )r  r  r_  r_  )r  r  r  r   r_  rS   rS   rT   rk  $  s   zKernelGroup.finalize_kernelc                 C   s   | j  \}}}t|}|S r   )r2  r  r   )r  arg_defs
_call_args
_arg_typesr{  rS   rS   rT   r  *  s   zKernelGroup.get_num_argsNr   c              	   C   sh  | j   | js
dS t }tjjotjdv }|r|	dg |
t  |d u r.ttjn|}|d u r9ttjn|}| j \}}}dd|}t }|
d| d| d| d	 | G |rtjj}	|	d urtd
t|	 d nd}
|	d|
|  dg | j D ]\}}|
d| d| d q|| j W d    | S 1 sw   Y  | S )NrP   )linuxrO   z!#include <ATen/record_function.h>z,
   zextern "C" z void rG  r   graph_r   zRECORD_FUNCTION("z#", c10::ArrayRef<c10::IValue>({}));r   r   r   )r   rC  r  r6   r   r  enable_kernel_profilesysplatformr   r   r   
cpp_prefixrm   r+   r  r  r2  r  ljustrv  rU   r   r3   r  graph_idaliasesr  r  r  )r  r   r   r  r  r  r  r   func_export_declr  r@  oldnewrS   rS   rT   r  /  s>   


zKernelGroup.codegen_groupc                 C   s&   | j  \}}}|j||d|d d S )NF)tritonr  )r2  r  generate_kernel_call)r  r9  r  r   	call_argsr  rS   rS   rT   r  W  s   
zKernelGroup.call_kernelr   )r   r   r   r  r  rk  r  rm   r  r  rL  rS   rS   r  rT   r    s    	(r  c                   @   s<   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd ZdS )r  c                 C   s    || _ d| _d | _t | _d S r  )r   in_parallelr  r   r   r   )r  r   rS   rS   rT   r  _  s   zWorkSharing.__init__c                 C   sz   | j r|| jkr|   | j s;|| _d| _ tjjr | jd n
| jd| d | j	| j
  | jd d S d S )NTz#pragma omp parallelz!#pragma omp parallel num_threads(r   zint tid = omp_get_thread_num();)r  r  rC  r   r  r  r   r   r   r   r   )r  rI  rS   rS   rT   r7  e  s   zWorkSharing.parallelc                 C   s   | j r	| jd | j S )Nz#pragma omp single)r  r   r   r   rS   rS   rT   rR  u  s   zWorkSharing.singlec                 C   s   | j   d| _d S r  )r   rC  r  r   rS   rS   rT   rC  z  s   

zWorkSharing.closec                 C   s   | j   | S r   )r   rU  r   rS   rS   rT   rU  ~  s   
zWorkSharing.__enter__c                 C   s   | j ||| d S r   )r   rZ  rV  rS   rS   rT   rZ    r  zWorkSharing.__exit__N)	r   r   r   r  r7  rR  rC  rU  rZ  rS   rS   rS   rT   r  ^  s    r  c                   @   s   e Zd ZU dZeej ed< dZeej ed< ej	j
Zejed< ej	j
Zejed< ej	jZejed< dZeed< d	Zeed
< d	Zeed< d	Zeed< d	Zeed< dd Zdd Zdd ZdS )r  Nr   r&  r'  r  r(  r   r7  Fsimd_ompsimd_vec	collapsedr  c                 C   s$   t  }|r| | _d S d| _d S r'  )r   r}  r2  simd_nelements)r  r`  rS   rS   rT   __post_init__  s   	zLoopLevel.__post_init__c                 C   sP   t |}t| j| j}||_d|_t|j|| |_| j	|_	d|_
| j|_|S )NTF)r   r;  r  r   r&  r(  r  r   r  r7  r  r  )r  r  sympy_factorrE  rS   rS   rT   r    s   
zLoopLevel.tilec           	      C   sZ  t | j}t | j}tjjr||krd S | jr#| jdkr#d| j dnd}| jrEd}| jdkr8|d| j d7 }| jrD|	dd| }n| j
rKd}n| jrTd	| }n| js^t r^d
}nd}t d| j d| }| j d| }| jjr| j dt | j }n| j dt | j dt | j d}d| d| d| d}| js|s|gS ||gS )Nr4   zsimd simdlen(z) rP   z#pragma omp forz
 collapse(r   z for z#pragma omp z#pragma GCC ivdepr   r  <r  z+=(z == 0 ? 1 : zfor(r   )rD   r'  r&  r   r  no_redundant_loopsr  r  r7  r   r  r  r   r  rH   r   r(  r-  r  )	r  offset_expr	size_exprsimdline1
offset_strr  	steps_strline2rS   rS   rT   rL    sH   




zLoopLevel.lines)r   r   r   r   r   r   r%  r  r&  r   r   r'  r  Oner(  r7  rk   r  ri   r  r  r  r  r  rL  rS   rS   rS   rT   r    s   
 	r  c                   @   s   e Zd ZU dZdZeee  ed< dZ	ee
 ed< ede
fddZdd Zed	d
 Zdd Zdd Zde
fddZdd ZdefddZdS )r3  aV  
    A loop-nest-like structure. It is built with the `build` method
    as a loop nest and then will perform loop-tiling at some depth.

    A typical case is for vectorization, where we typically do loop-tiling
    at the innermost loop level. A more complicated case is when we do
    2D tiling at both the innermost and outer levels.
    Nr.  rG  c           
      C   sz   | j }| j}| j}|dusJ d}tt||D ]\}\}}t||}|s)|g}n|| ||kr6| j|_qt|}	|	S )z4Build a LoopNest with the given `kernel` as the leafN)	r  r:  r  r   r  r  r0  r  r3  )
rG  r  r:  r  r.  loop_idxr   r&  rE  r5  rS   rS   rT   r^    s   

zLoopNest.buildc                 C   s
   t | jS r   )ri   r.  r   rS   rS   rT   __bool__  r  zLoopNest.__bool__c                 C   s   | j du rtdddS d}d}| j d j}td}| j D ]}|j|kr& n
||j }|d7 }q|t| j k rtt|tjrtt| j | jtjrt|d | j | jk rt|}d}| j | j}t|t| j D ]}| j | j|kro n|d7 }qct||dS )a  
        Maximal allowed depth for parallelism: All reduction or non-reduction levels.
        When the range of the first inner loop beyond the maximum parallel depth is much
        larger than the range of all outer loops within the maximum parallel depth,
        change the starting depth of parallelism to the first inner loop and recalculate
        the maximum parallel depth.
        Nr   rd  r4   r2  )	r.  r   r  r   r;  r&  r   r   r4  )r  r   	max_depthr  
loop_sizesrE  r   rS   rS   rT   rP    s2   
	






zLoopNest.max_parallel_depthc                 C   s   |j |  j ksJ d| jd usJ t| j|j ksJ | j|j }|j |_|jr1t jd7  _t	|jd |j D ]}d| j| _
q:d S )Nz?Parallel depth cannot exceed the maximal allowed parallel depthr4   T)r   rP  r.  r   r   r7  r  r   parallel_reduction_countr4  r  )r  r9  rE  r   rS   rS   rT   rQ  )  s   zLoopNest.mark_parallelc                 C   s*   | j sJ | j | || j |< | j | S )z
        Do loop-tiling at the `depth` level with `factor`.
            for (x0 = 0; x0 < x0_end; x0++)
            ->
            for (x0 = 0; x0 < x0_end; x0 += factor)
        See details in Note [tiled_size].
        )r.  r  )r  rB  r  rS   rS   rT   r  6  s   

zLoopNest.tiler   c                 C   r\  r   rG  r   rS   rS   rT   r;  B  r^  zLoopNest.get_kernelc                 C   r  r   r  r  rS   rS   rT   r  F  r  zLoopNest.set_kernellevelc                 C   sH   | j sJ t| j |ksJ |t| j krd n| j |d  }t|| jS r   )r.  r   r3  rG  )r  r  r.  rS   rS   rT   rA  I  s   
 zLoopNest.from_loop_level)r   r   r   r   r.  r   r  r  r  rG  r  r$  r^  r  r$   rP  rQ  r  r;  r  rk   rA  rS   rS   rS   rT   r3    s   
 	
&r3  rz  r   )r   dataclassesr6  rL  r  r8  r   r  r  collections.abcr   enumr   typingr   r   r   r   r   r   r   torch.fxtorch._inductorr	   torch._prims_commonr
   r   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   _dynamo.utilsr   rP   r   r   r   r   r   r   r  r   r  r   r   r   r    r!   r"   r#   utilsr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   virtualizedr0   r1   r2   r3   commonr5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   	cpp_utilsrA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   r  rR   	lru_cacherU   _logginggetArtifactLoggerr   schedule_logNATIVE_OMP_RTYPESRTYPE_TO_CPPr  PYTHON_TO_CPPCONTAINER_PYTHON_TO_CPPr  r  r   r  rl   ri   r  r  r  r.  rt   r  r   r  ru   r   r   r   r   r   r%  rm   r   rk   r   r   r   r   r   	dataclassr   r  rM  r_  _initialize_pointwise_overridesr&  r  r  r  r1  r  r   r  r  r<  rE  r  r  r  r  r  r3  rS   rS   rS   rT   <module>   sl  
 $8	8@

	"
.
0


!>
 !!   
1     
z    >      4  ], f    [#      G(U