o
    Ihh                    @  s
  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlmZ d dlm Z! d d	l"m#Z# d d
l$m%Z% d dl&m'Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZC ddlDmEZEmFZFmGZGmHZHmIZImJZJ erd dlKmLZLmMZMmNZN ddlOmPZPmQZQmRZRmSZS ddlTmUZU ddlVmWZWmXZXmYZY ddlZm[Z[ edZ\eeeX geWf Z]e^e[ Z_ee`ejaf Zbe`ZcejdeefdZgehefZid&d d!ZjG d"d# d#ejZke=d$d%G d&d' d'ZlejmG d(d) d)ZnejmG d*d+ d+ZoejmG d,d- d-ZpejmG d.d/ d/ZqejmG d0d1 d1Zreeleneoeqepf Zsi Ztd2eud3< G d4d5 d5Zvi Zwd6eud7< 	d'd(d?d@ZxG dAdB dBeZyd)dEdFZzd*dIdJZ{d+dLdMZ|	Nd,d-dPdQZ}e~dd.dRdSZd/dYdZZd0d\d]Zd1d^d_Zejejejejid`da ejejejejejejejejejejejfD Zdbeudc< d2didjZG dkdl dlZG dmdn dne(Z'G dodp dpZe
jdqe
jdrZd3dtduZG dvdw dwe6eeFe ZejmG dxdy dyZed4i dzeejd{d| d}d~deejdd| dd| dddeejdd| dd| dddeejdd| dd| dddeejdd| dd| dddeejdd| dd| dddeejdd| dd| dddeejdd| dd| dd| dddeejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd| dd| dddeejdd| dd| dddeejdd| dd| dddeejdd| dd~deejdd| dd~deejdd| dd| dddeejdd| dd| dddeejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~Zd eud< d5ddZG dd de:ZG dd de<ZG d	d
 d
eZejmG dd dZG dd dZe ZG dd dZG dd dZede`dZedeedZereejeHeeeedf f f ZG dd deeef ZG dd dZG dd deee ZejmG dd dZe~dd6d d!ZG d"d# d#ZG d$d% d%e7ZdS (7      )annotationsN)autoEnum)chain)	AnyCallablecastClassVarGeneric
NamedTupleOptionalTYPE_CHECKINGUnion)TypeVar)ELEMENTWISE_TYPE_PROMOTION_KIND)_pytree)
OrderedSet)int_oo)PythonPrinter)free_symbol_is_typesymbol_is_typeSymT)bound_sympyValueRanges   )configmetrics)DtypePropagationOpsHandler)BasicMathOpsMixinDefaultHandler)boolean_opsDeferredLineBasegenerate_assertIndentedBufferir_dataclass
ScopedDict	sympy_dotsympy_index_symbol
sympy_substriton_typeunique)ops
OpsHandlerOpsValueReductionType	StoreModeV)IteratorMutableMappingSequence)BufferChoiceCallerFixedLayoutIRNodeLoopBody)BaseScheduling	SchedulerSchedulerNode   PythonWrapperCodegen_TschedulemsgstrreturnNonec                 C  s    t tjrt d|  d S d S )NzData type propagation: %s)schedule_logisEnabledForloggingDEBUGdebug)rB    rK   R/var/www/vscode/kcb/lib/python3.10/site-packages/torch/_inductor/codegen/common.pydata_type_loggerP   s   rM   c                   @  s4   e Zd ZdZdZdZedddZedddZdS )WorkspaceZeroModer   r=   r   abrD   c                 C  s:   | |ks	|t jkr| S | t jkr|S td| d|d)NzWorkspaceZeroMode.combine(, ))rN   UNINITIALIZEDNotImplementedErrorrO   rP   rK   rK   rL   combineZ   s
   
zWorkspaceZeroMode.combine	zero_fillboolc                 C  s   | rt jS t jS N)rN   ZERO_ON_CALLrS   )rW   rK   rK   rL   	from_boolb   s   zWorkspaceZeroMode.from_boolN)rO   rN   rP   rN   rD   rN   )rW   rX   rD   rN   )	__name__
__module____qualname__rS   rZ   ZERO_PER_GRAPHstaticmethodrV   r[   rK   rK   rK   rL   rN   U   s    rN   T)frozenc                   @  s   e Zd ZU dZded< ded< ded< ded	< d
Zded< ejZded< e	d0d1ddZ
e	d2ddZe	d3ddZe	d3ddZd4ddZeZd5ddZd6d!d"Zed6d#d$ZeZeZeZd7d&d'Zd7d(d)Zd8d*d+Zd9d-d.Zd/S ):WorkspaceArga2  A temporary buffer used for a single kernel, then discarded.

    Not registered as a traditional buffer since there are no users,
    so it would be dead code eliminated.

    Args:
        nbytes: The size of the buffer in bytes.
        zero_fill: Whether the buffer should be initialized to zero.

    
sympy.ExprcountrN   	zero_modetorch.devicedevicerC   
outer_namews_ptr
inner_nametorch.dtypedtype
workspace_prefixrD   c                 C  s   |  t tjj S rY   )nextr0   graphworkspace_id)rn   rK   rK   rL   unique_name}   s   zWorkspaceArg.unique_namerO   rP   rX   c                 C  s$   | j |j ko| j|jko| j|jkS rY   )rj   rl   rg   rU   rK   rK   rL   can_join   s   "zWorkspaceArg.can_joinc                 C  s0   t | j|j t| j|j| j| j| j| jdS N)rd   re   rl   rg   rj   rh   )	rb   rd   rN   rV   re   rl   rg   rj   rh   rU   rK   rK   rL   join   s   
zWorkspaceArg.joinc                 C  s\   | j |j kr| j|jkr| j|jksJ tt| j|jt| j	|j	| j | j| j| j
dS rt   )rl   rg   rj   rb   sympyMaxrd   rN   rV   re   rh   rU   rK   rK   rL   maximum   s   (zWorkspaceArg.maximumc                 C     | j S rY   rg   selfrK   rK   rL   
get_device      zWorkspaceArg.get_devicec                 C  ry   rY   )rl   r{   rK   rK   rL   	get_dtype   r~   zWorkspaceArg.get_dtyper6   c                 C  s&   ddl m} || j| j| jgdgdS )Nr   )r6   r=   )rg   rl   sizestride)irr6   rg   rl   rd   )r|   r6   rK   rK   rL   
get_layout   s   zWorkspaceArg.get_layoutc                 C  s   |   S rY   )r   r{   rK   rK   rL   layout      zWorkspaceArg.layoutlist[sympy.Expr]c                 C  s   | j gS rY   )rd   r{   rK   rK   rL   get_size      zWorkspaceArg.get_sizec                 C  s
   t jjgS rY   )rv   SOner{   rK   rK   rL   
get_stride      
zWorkspaceArg.get_stridec                 C  ry   rY   )rh   r{   rK   rK   rL   get_name   r~   zWorkspaceArg.get_name	list[str]c                 C  s   g S rY   rK   r{   rK   rK   rL   get_inputs_that_alias_output      z)WorkspaceArg.get_inputs_that_alias_outputN)rm   )rn   rC   rD   rC   )rO   rb   rP   rb   rD   rX   )rO   rb   rP   rb   rD   rb   )rD   rf   )rD   rk   )rD   r6   )rD   r   rD   rC   )rD   r   )r\   r]   r^   __doc____annotations__rj   torchuint8rl   r`   rr   rs   ru   rx   r}   get_device_or_errorr   r   propertyr   get_output_specmaybe_get_output_specmaybe_get_layoutr   r   r   r   rK   rK   rK   rL   rb   i   s:   
 







rb   c                   @  sB   e Zd ZU ded< ded< ded< ejjZded< dZd	ed
< dS )	TensorArgrC   namebufferrk   rl   rc   offsetNOptional[str]alias_of)	r\   r]   r^   r   rv   r   Zeror   r   rK   rK   rK   rL   r      s   
 r   c                   @  s,   e Zd ZU ded< ded< ed
ddZd	S )SizeArgrC   r   rc   exprrD   r   c                 C     d S rY   rK   r{   rK   rK   rL   r         zSizeArg.alias_ofNrD   r   )r\   r]   r^   r   r   r   rK   rK   rK   rL   r      s
   
 r   c                   @     e Zd ZU ded< dS )ConstexprArgrC   r   Nr\   r]   r^   r   rK   rK   rK   rL   r         
 r   c                   @  r   )TMADescriptorArgrC   r   Nr   rK   rK   rK   rL   r      r   r   c                   @  s*   e Zd ZU ded< ded< dZded< dS )DeviceCodegenSchedulingConstructor
schedulingWrapperConstructorwrapper_codegenNOptional[WrapperConstructor]cpp_wrapper_codegen)r\   r]   r^   r   r   rK   rK   rK   rL   r      s   
 r   zdict[str, DeviceCodegen]device_codegensc                   @  s   e Zd Zd+ddZd,dd	Zd-d
dZd,ddZd-ddZd-ddZd-ddZ	d-ddZ
d-ddZd-ddZd-ddZd-ddZd-ddZd-d d!Zd-d"d#Zd-d$d%Zd.d(d)Zd*S )/DeviceOpOverridesr   rC   rD   c                 C     t rY   rT   r|   r   rK   rK   rL   import_get_raw_stream_as   r   z*DeviceOpOverrides.import_get_raw_stream_as
device_idxintc                 C  r   rY   r   r|   r   rK   rK   rL   
set_device   r   zDeviceOpOverrides.set_devicec                 C  r   rY   r   r{   rK   rK   rL   synchronize   r   zDeviceOpOverrides.synchronizec                 C  r   rY   r   r   rK   rK   rL   device_guard   r   zDeviceOpOverrides.device_guardc                 C  r   rY   r   r{   rK   rK   rL   cpp_device_guard   r   z"DeviceOpOverrides.cpp_device_guardc                 C  r   rY   r   r{   rK   rK   rL   cpp_aoti_device_guard  r   z'DeviceOpOverrides.cpp_aoti_device_guardc                 C  r   rY   r   r{   rK   rK   rL   cpp_stream_guard  r   z"DeviceOpOverrides.cpp_stream_guardc                 C  r   rY   r   r{   rK   rK   rL   cpp_aoti_stream_guard  r   z'DeviceOpOverrides.cpp_aoti_stream_guardc                 C  r   rY   r   r{   rK   rK   rL   cpp_getStreamFromExternal
  r   z+DeviceOpOverrides.cpp_getStreamFromExternalc                 C  r   rY   r   r{   rK   rK   rL   kernel_header  r   zDeviceOpOverrides.kernel_headerc                 C  r   rY   r   r{   rK   rK   rL   kernel_driver  r   zDeviceOpOverrides.kernel_driverc                 C  r   rY   r   r{   rK   rK   rL   cpp_stream_type  r   z!DeviceOpOverrides.cpp_stream_typec                 C  r   rY   r   r{   rK   rK   rL   aoti_get_stream  r   z!DeviceOpOverrides.aoti_get_streamc                 C  r   rY   r   r{   rK   rK   rL   cpp_kernel_type  r   z!DeviceOpOverrides.cpp_kernel_typec                 C  r   rY   r   r{   rK   rK   rL   cpp_device_ptr  r   z DeviceOpOverrides.cpp_device_ptrc                 C  r   rY   r   r{   rK   rK   rL   tma_descriptor_helpers  r   z(DeviceOpOverrides.tma_descriptor_helpersidxOptional[tuple[str, str]]c                 C  r   rY   r   )r|   r   rK   rK   rL   cpp_global_scratch"  r   z$DeviceOpOverrides.cpp_global_scratchNr   rC   rD   rC   )r   r   rD   rC   r   )r   r   rD   r   )r\   r]   r^   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rK   rK   rK   rL   r      s$    















r   zdict[str, DeviceOpOverrides]device_op_overrides_dictrg   device_schedulingr   device_wrapper_codegenr   device_cpp_wrapper_codegenr   c                 C  s   t |||t| < d S rY   )r   r   )rg   r   r   r   rK   rK   rL   register_backend_for_device?  s   r   c                   @  sH   e Zd Ze Ze Ze Ze Ze Ze Z	e Z
e Ze Ze ZdS )BackendFeatureN)r\   r]   r^   r   FOREACH	BUCKETIZEINPLACE_BUFFERSMASKED_SCATTER_WITH_INDEXSCANSORTTUPLE_REDUCTIONPREFER_STORE_LOOP_ORDERTRITON_TEMPLATESREDUCE_TO_SINGLE_ELEMENTrK   rK   rK   rL   r   J  s    
r   Union[torch.device, str, None]OrderedSet[BackendFeature]c                 C  sf   | d u rt  S t  t| tjr| j}nt| tsJ | }t|} t|}|s*J |d }|| S rY   )	r   init_backend_registration
isinstancer   rg   typerC   get_scheduling_for_deviceget_backend_features)rg   device_typescheduling_ctorr   rK   rK   rL   r   W  s   

r   featurerX   c                 C  s   t |tsJ |t| v S )zSee also V.graph.has_feature)r   r   r   )rg   r   rK   rK   rL   has_backend_featurei  s   r   Optional[SchedulingConstructor]c                 C  s   | t v r	t |  jS d S rY   )r   r   rz   rK   rK   rL   r   q     r   Fcpp_wrapperc                 C  s$   | t v rt |  }|r|jS |jS d S rY   )r   r   r   )rg   r   wrapper_codegen_objrK   rK   rL   get_wrapper_codegen_for_deviceu  s   r   c                    s  ddl m}  ddlm} ddlm} ddlm} ddlm	} ddl
m} ddlm} dd	lm} dd
lm} tdd u rS| ||d td fdd|tjjrP|n| tdd u ri||dtdfdd|| tdd u rvtd||| tdd u rtd||| tj }	|	dkrt|	d u rddlm}
 z%|
d}|
d}|
d}|r|r|rt|	||| W d S W d S W d S W d S  ty   Y d S w d S d S )Nr=   )CppScheduling)CppWrapperCpu)CppWrapperCpuArrayRef)CppWrapperGpu)CUDACombinedScheduling)HalideScheduling)MetalScheduling)TritonSchedulingr>   cpu)cpphalidetritonc                       t j | S rY   )r   cpu_backendr   )cpu_backendsrK   rL   <lambda>      z+init_backend_registration.<locals>.<lambda>cuda)r   r   c                   r   rY   )r   cuda_backendr   )cuda_backendsrK   rL   r     r   xpumpsprivateuseoner   )_get_custom_mod_func
Schedulingr?   CppWrapperCodegen)r   r   cpp_wrapper_cpur   cpp_wrapper_cpu_array_refr   cpp_wrapper_gpur   cuda_combined_schedulingr   r   r   r  r   r   r   wrapperr?   r   r   r   aot_inductorallow_stack_allocationr   _C_get_privateuse1_backend_name torch.utils.backend_registrationr  RuntimeError)r   r   r   r   r   r   r   r   r?   private_backendr  r   r   r   rK   )r   r  rL   r     s   
	


r   indexSequence[sympy.Expr]
index_varssizesr   c                 C  s$   ddl m} g | t|||S )Nr   )FlexibleLayout)r   r  r&   contiguous_strides)r  r  r  r  rK   rK   rL   index_prevent_reordering  s   r  device_op_overridesc                 C  s   |t | < d S rY   )r   )rg   r  rK   rK   rL   register_device_op_overrides  s   r  c                 C  sB   t | tsJ tsddlm}m} ddlm} ddlm} t|  S )Nr=   )cpu_device_op_overridesmps_device_op_overrides)r  )	r   rC   r    r  r   r  r  r  )rg   r  r   r  xpu_op_overridesrK   rK   rL   get_device_op_overrides  s   r#  c                 C  s   i | ]}||qS rK   rK   ).0rl   rK   rK   rL   
<dictcomp>  s    r%  zdict[torch.dtype, torch.dtype]DTYPE_TO_COMPUTATION_DTYPEop_nameargsr   kwargsOptional[torch.dtype]c                 O  s   | t  v rtjS | dv rd|v r|d S |d S | dv rtjS | dv r&tjS | dkr6d|v r2|d S |d S | dkrFd|v rB|d S |d S | d	v rT|d }tj|S | d
krdd|v r`|d S |d S dS )zK
    Given op name and a list of input dtypes, deduce the output dtype
    )to_dtype
index_exprrl   )randrandn)	get_index	randint64	load_seed	reductionr=   constant)loadstorestore_reductionto_dtype_bitcastN)r    r   rX   floatint64r0   rp   r   )r'  r(  r)  buf_namerK   rK   rL   deduce_output_dtype_by_name  s$   
r=  c                   @  sd   e Zd ZdddZdd
dZdddZdddZd ddZd!ddZe	d"ddZ
e	d#ddZdS )$DataTypePropagationbodyr9   rD   rE   c                 C  s8   || _ d|jji| _|j D ]
\}}|j| j|< qd S Nroot)r?  
root_blockrp   graphs	subblocksitems)r|   r?  kvrK   rK   rL   __init__-  s   zDataTypePropagation.__init__nodetorch.fx.Noder*  c                 C  sV   |j }dd |D }t|dkrd S tdd |D }|sd S ttjdd |D S )Nc                 S  s(   g | ]}t |tjjr|jd kr|qS )placeholder)r   r   fxNodeopr$  nrK   rK   rL   
<listcomp>7  s    zCDataTypePropagation.deduce_node_dtype_by_inputs.<locals>.<listcomp>r   c                 s  s.    | ]}t j|jv o|jt j jd uV  qd S rY   )OptimizationContextkeymetarl   rO  rK   rK   rL   	<genexpr>=  s    
zBDataTypePropagation.deduce_node_dtype_by_inputs.<locals>.<genexpr>c                 S  s   g | ]	}|j tj jqS rK   )rT  rR  rS  rl   rO  rK   rK   rL   rQ  G      )all_input_nodeslenall	functoolsreducer   promote_types)r|   rI  inputsinput_nodesall_input_nodes_propagatedrK   rK   rL   deduce_node_dtype_by_inputs5  s   z/DataTypePropagation.deduce_node_dtype_by_inputsrk   c                 C  s"   | j |j }| |}|sJ |S rY   )rC  targetpropagate_graph)r|   rI  	sub_graphrl   rK   rK   rL   deduce_node_dtype_by_subgraphJ  s   
z1DataTypePropagation.deduce_node_dtype_by_subgraphc                 C  s   |j dkrd S |jdkrt|jdkrd S |jtjkr#| |jd S t|jts+J |j	dr6| 
|S t|jg|jR i |j }d urJ|S | |S )NrK  outputr=   r   masked_subblock)rN  ra  rX  r(  operatorgetitemdeduce_node_dtyper   rC   
startswithrd  r=  r)  r`  )r|   rI  output_dtyperK   rK   rL   ri  P  s(   


z%DataTypePropagation.deduce_node_dtyperp   torch.fx.Graphc                 C  sf   |j sJ d }|j D ]&}tj|jv r|jtj }nt }| ||_||jtj< |jdkr0|j}q
|S )Nre  )nodesrR  rS  rT  ri  rl   ra  )r|   rp   graph_dtyperI  opt_ctxrK   rK   rL   rb  k  s   


z#DataTypePropagation.propagate_graphc                 C  s   |  | jd S r@  )rb  rC  r{   rK   rK   rL   	propagate}     zDataTypePropagation.propagatec                 C  s   | |  S rY   )rp  )clsr?  rK   rK   rL   propagate_loopbody     z&DataTypePropagation.propagate_loopbodyr<   c                 C  sB   ddl m} ddlm} t||sJ t|j|sJ t|jS )Nr   r8   )r<   )	loop_bodyr9   	schedulerr<   r   _bodyr>  rs  )rr  rI  r9   r<   rK   rK   rL   propagate_scheduler_node  s
   z,DataTypePropagation.propagate_scheduler_nodeN)r?  r9   rD   rE   )rI  rJ  rD   r*  )rI  rJ  rD   rk   )rp   rl  rD   r*  )rD   r*  )r?  r9   rD   r*  )rI  r<   rD   r*  )r\   r]   r^   rH  r`  rd  ri  rb  rp  classmethodrs  rx  rK   rK   rK   rL   r>  ,  s    





r>  c                      s&   e Zd Zdddd fd
dZ  ZS )r   T)simplifypr   rc   rz  rX   r{  rD   rC   c                  s6   |rt |tjrttjdrtjj|}t 	|S )Nsizevars)
r   rv   Exprhasattrr0   rp   r|  rz  superdoprint)r|   r   rz  r{  	__class__rK   rL   r    s   zPythonPrinter.doprint)r   rc   rz  rX   r{  rX   rD   rC   )r\   r]   r^   r  __classcell__rK   rK   r  rL   r     s    r   c                   @  s  e Zd ZdZed1ddZed2dd	Zed2d
dZed2ddZed2ddZ	ed2ddZ
ed2ddZed2ddZed2ddZed2ddZed2ddZed2ddZed3d d!Zed4d%d&Zed4d'd(Zed4d)d*Zed5d,d-Zed4d.d/Zd0S )6OpDecompositionsz!
    Decomposes inductor ops
    valueOpVarTrD   c                 C  s   | S rY   rK   )r  rK   rK   rL   identity     zOpDecompositions.identityxc                 C  s   t t dtj| S Nr=   )r+   truedivr4  r   int32r  rK   rK   rL   
reciprocal     zOpDecompositions.reciprocalc                 C  s   t | | S rY   )r+   mulr  rK   rK   rL   square  rt  zOpDecompositions.squarec                 C  s   t t dtjt | S r  )r+   subr4  r   float32erfr  rK   rK   rL   erfc     zOpDecompositions.erfcc                 C  s   t t t | t | S rY   )r+   r  expr  r  r  rK   rK   rL   erfcx  s   zOpDecompositions.erfcxc                 C  s   t t | t dtjS r  )r+   r  r  r4  r   r  r  rK   rK   rL   expm1  r  zOpDecompositions.expm1c              	   C  &   t t | t dtd tjS )Nr=   
   r+   r  logr4  mathr   r  r  rK   rK   rL   log10     &zOpDecompositions.log10c              	   C  r  )Nr=   r   r  r  rK   rK   rL   log2  r  zOpDecompositions.log2c              
   C  s"   t t | t tdtjS )Nr   )r+   r  r  r4  r  r  r   r  r  rK   rK   rL   exp2  s   "zOpDecompositions.exp2c              	   C  s   t t | t dtjS r  )r+   r  addr4  r   r  r  rK   rK   rL   log1p  r  zOpDecompositions.log1pc                 C  .   t dtj}t |t |t t | S r  )r+   r4  r   r  r  r  r  negr  onerK   rK   rL   sigmoid      zOpDecompositions.sigmoidc                 C  s   t | t dtjS Nr   )r+   rx   r4  r   r  r  rK   rK   rL   relu  r  zOpDecompositions.reluyzc                 C  s   t t | ||S rY   )r+   r  r  r  r  r  rK   rK   rL   fma  s   zOpDecompositions.fmarO   rl   rk   c                 C     t t | |S rY   )r+   r+  floorrO   rl   rK   rK   rL   floor_to_int     zOpDecompositions.floor_to_intc                 C  r  rY   )r+   r+  ceilr  rK   rK   rL   ceil_to_int  r  zOpDecompositions.ceil_to_intc                 C  r  rY   )r+   r+  truncr  rK   rK   rL   trunc_to_int  r  zOpDecompositions.trunc_to_intrP   c              	   C  sT   t | |}t t |t dtjt t |t |}t |t 	|||S r  )
r+   modand_ner4  r   r  signbitwherer  )rO   rP   rcondrK   rK   rL   	remainder  s   zOpDecompositions.remainderc                 C  r  rY   )r+   r+  roundr  rK   rK   rL   round_to_int  r  zOpDecompositions.round_to_intN)r  r  rD   r  r  r  rD   r  )r  r  r  r  r  r  rD   r  )rO   r  rl   rk   rD   r  rO   r  rP   r  rD   r  )r\   r]   r^   r   r`   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rK   rK   rK   rL   r    sL    r  z[a-z0-9_.]+|\([^)]*\)|)flagsstringc                 C  s   | d dkst | dk rdS d}t| dd  D ]$\}}|dkr%|d7 }n|dkr-|d8 }|dkr<|t | d kr< dS q|dksCJ dS )Nr   (r   Fr=   rR   T)rX  	enumerate)r  rd   icharrK   rK   rL   _all_in_parens  s   
r  c                   @  s  e Zd ZedddZedd
dZedddZedddZedddZedddZ	edddZ
edddZedddZedddZedddZedd!d"Zedd#d$Zedd%d&Zedd'd(Zedd)d*Zedd,d-Zedd1d2Z	3	3ddd;d<ZddBdCZddEdFZ	GdddJdKZddLdMZddRdSZddZd[Zdd^d_Z	G	GdddidjZddkdlZdGe j!d3dmdnddvdwZ"ddydzZ#dd{d|Z$edd~dZ%e&dddZ'e&dddZ(dGS )OpOverridesr  r  rD   c                 C  s,   t | tst| st| r| S d|  dS Nr  rR   )r   CSEVariable_RE_PAREN_NOT_NEEDED	fullmatchr  )r  rK   rK   rL   paren  s   zOpOverrides.parenr  Union[bool, float, int]rl   rk   c                 C  s   t | S rY   )repr)r  rl   rK   rK   rL   r4    r   zOpOverrides.constantr  c                 C  r  r  )r+   r4  r   r  r  r  libdevice_expr  r  rK   rK   rL   libdevice_sigmoid  r  zOpOverrides.libdevice_sigmoidc                 C  
   t | S rY   )r+   absr  rK   rK   rL   libdevice_abs     
zOpOverrides.libdevice_absc                 C  r  rY   )r+   sqrtr  rK   rK   rL   libdevice_sqrt  r  zOpOverrides.libdevice_sqrtc                 C  r  rY   )r+   cosr  rK   rK   rL   libdevice_cos  r  zOpOverrides.libdevice_cosc                 C  r  rY   )r+   sinr  rK   rK   rL   libdevice_sin!  r  zOpOverrides.libdevice_sinc                 C  r  rY   )r+   r  r  rK   rK   rL   libdevice_log%  r  zOpOverrides.libdevice_logc                 C  r  rY   )r+   r  r  rK   rK   rL   r  )  r  zOpOverrides.libdevice_expc                 C  s   dt |  S )N~r  r  r  rK   rK   rL   bitwise_not-     zOpOverrides.bitwise_notrO   c                 C  s   t |  dS )Nz == 0r  )rO   rK   rK   rL   logical_not1  r  zOpOverrides.logical_notr  c                 C     t |  dt | S )Nz & r  r  r  rK   rK   rL   bitwise_and5     zOpOverrides.bitwise_andc                 C  r  )Nz | r  r  rK   rK   rL   
bitwise_or9  r  zOpOverrides.bitwise_orc                 C  r  )Nz ^ r  r  rK   rK   rL   bitwise_xor=  r  zOpOverrides.bitwise_xorc                 C  r  )Nz << r  r  rK   rK   rL   bitwise_left_shiftA  r  zOpOverrides.bitwise_left_shiftc                 C  r  )Nz >> r  r  rK   rK   rL   bitwise_right_shiftE  r  zOpOverrides.bitwise_right_shiftrP   c                 C  s   t | |S rY   )r+   r  rU   rK   rK   rL   int_truedivI  s   zOpOverrides.int_truedivr   rC   r   c                 C  s   t | t|S rY   )r+   r5  rv   Integer)r   r   rK   rK   rL   r2  P  r  zOpOverrides.load_seedTvarr   Union[sympy.Expr, int]checkrX   wrap_negsympy.Symbolc                 C  s   t t|S rY   )r'   rC   )r|   r  r   r  r  rK   rK   rL   indirect_indexingT  s   zOpOverrides.indirect_indexingr   rc   lowerupperrE   c                 C     t t| j d)Nz,: check_bounds should be handled by CSEProxyrT   r   r\   r|   r   r   r  r  rK   rK   rL   check_bounds]     zOpOverrides.check_boundsr  c                 C  r  )Nz$: load should be handled by CSEProxyr  r|   r   r  rK   rK   rL   r5  d     zOpOverrides.loadNmoder/   c                 C  r  )Nz%: store should be handled by CSEProxyr  r|   r   r  r  r   rK   rK   rL   r6  i  r  zOpOverrides.storec                 C  r  )Nz/: store_reduction should be handled by CSEProxyr  r|   r   r  r  rK   rK   rL   r7  p  r  zOpOverrides.store_reduction	src_dtypereduction_typer.   !Union[OpVarT, tuple[OpVarT, ...]]c                 C  r  )Nz): reduction should be handled by CSEProxyr  r|   rl   r  r  r  rK   rK   rL   r3  u     zOpOverrides.reductiondtypestuple[torch.dtype, ...]
combine_fnFCallable[[tuple[OpVarT, ...], tuple[OpVarT, ...]], tuple[OpVarT, ...]]valuestuple[OpVarT, ...]c                 C  r  )Nz$: scan should be handled by CSEProxyr  r|   r  r
  r  rK   rK   rL   scan     	zOpOverrides.scanstable
descendingc                 C  r  )Nz$: sort should be handled by CSEProxyr  r|   r  r  r  r  rK   rK   rL   sort  r  zOpOverrides.sort
boundaries.tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]boundary_indicesindexing_dtyperightsorter Optional[tuple[str, sympy.Expr]]sorter_indicesOptional[OpVarT]c                 C  r  )Nz): bucketize should be handled by CSEProxyr  r|   r  r  r  r  r  r  r  rK   rK   rL   	bucketize  s   
zOpOverrides.bucketizec                 C  r  )Nz2: halide_clamp only implemented for Halide backendr  )r|   r  r   r  rK   rK   rL   halide_clamp  r  zOpOverrides.halide_clampr=   )constraintsrl   is_purepackr]  asmr!  r   r"  r#  r   c                G  r  )Nz<: inline_asm_elementwise only implemented for Triton backendr  )r|   r$  r!  rl   r"  r#  r]  rK   rK   rL   inline_asm_elementwise  r  z"OpOverrides.inline_asm_elementwiser(  c                 G  r  )Nz.: ops.output should not appear at codegen timeAssertionErrorr   r\   )r|   r(  rK   rK   rL   re    r  zOpOverrides.outputc                 C  r  )Nz3: ops.placeholder should not appear at codegen timer&  r|   r  rK   rK   rL   rK    r  zOpOverrides.placeholderCallable[..., OpVarT]c                   s   d fdd	} |_ d
|_|S )Nr|   r  r(  r   r)  rD   r  c                   s   t t| j d  )Nz does not implement ops.r  r|   r(  r)  r   rK   rL   unimplemented  s   z1OpOverrides._unimplemented.<locals>.unimplementedT)r|   r  r(  r   r)  r   rD   r  )r\   is_unimplemented)r   r,  rK   r+  rL   _unimplemented  s   zOpOverrides._unimplementedc                 C  s2   t | |d }t t|d }| p||kpt |ddS )Nr-  F)getattrr,   )rr  r   fn
default_fnrK   rK   rL   _is_unimplemented  s   zOpOverrides._is_unimplementedra  c                 C  s   |dv sJ |t  D ]7\}}t||}|d u r(| |r't| || | q|| jvs8J d| d| j ||_t| |t| qd S )N)r   r   cppvecr   r  zmultiple definitions of z on )	pointwise_overrides_datarE  r/  r2  setattrr.  __dict__r\   r`   )rr  ra  funcnamedataimplrK   rK   rL   _initialize_pointwise_overrides  s   

z+OpOverrides._initialize_pointwise_overrides)r  r  rD   r  )r  r  rl   rk   rD   r  r  )rO   r  rD   r  )r  r  r  r  rD   r  r  )r   rC   r   r  rD   r  TT)
r  r  r   r  r  rX   r  rX   rD   r  
r   rc   r   rc   r  rX   r  rX   rD   rE   )r   rC   r  rc   rD   r  rY   )
r   rC   r  rc   r  r  r   r/   rD   rE   )r   rC   r  rc   r  r  rD   rE   )
rl   rk   r  rk   r  r.   r  r  rD   r  )r  r	  r
  r  r  r  rD   r  )
r  r	  r  r  r  rX   r  rX   rD   r  NN)r  r  r  r  r  r  r  rk   r  rX   r  r  r  r  rD   r  )r  r  r   rc   r  rX   rD   r  )r]  r  r$  rC   r!  r   rl   rk   r"  rX   r#  r   rD   r  )r(  r  rD   rE   )r  r   rD   r  )r   rC   rD   r)  r   rC   rD   rX   )ra  rC   rD   rE   ))r\   r]   r^   r`   r  r4  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r2  r  r  r5  r6  r7  r3  r  r  r  r   r   r  r%  re  rK  r.  ry  r2  r:  rK   rK   rK   rL   r     s    

	





	


r  c                   @  s\   e Zd ZU ded< ded< dZded< dZded< ejZd	ed
< dZ	ded< dZ
ded< dS )OverridesDatarC   r   zCallable[..., str]r   NzOptional[Callable[..., str]]r   r3  r   type_promotion_kindr   r  )r\   r]   r^   r   r   r3  r   DEFAULTr@  r   r  rK   rK   rK   rL   r?    s   
 
r?  airy_aic                 C     d|  dS )Nzairy_ai_forward(rR   rK   r  rK   rK   rL   r         r   special_airy_ai)r@  r   r   	bessel_j0c                 C  rC  )Nzbessel_j0_forward(rR   rK   r  rK   rK   rL   r     rD  c                 C  rC  )Nzlibdevice.j0(rR   rK   r  rK   rK   rL   r     rD  special_bessel_j0)r@  r   r   r   	bessel_j1c                 C  rC  )Nzbessel_j1_forward(rR   rK   r  rK   rK   rL   r     rD  c                 C  rC  )Nzlibdevice.j1(rR   rK   r  rK   rK   rL   r     rD  special_bessel_j1	bessel_y0c                 C  rC  )Nzbessel_y0_forward(rR   rK   r  rK   rK   rL   r   	  rD  c                 C  rC  )Nzlibdevice.y0(rR   rK   r  rK   rK   rL   r   
  rD  special_bessel_y0	bessel_y1c                 C  rC  )Nzbessel_y1_forward(rR   rK   r  rK   rK   rL   r     rD  c                 C  rC  )Nzlibdevice.y1(rR   rK   r  rK   rK   rL   r     rD  special_bessel_y1digammac                 C  rC  )Nzcalc_digamma(rR   rK   r  rK   rK   rL   r     rD  c                 C  
   |  dS )Nz
.digamma()rK   r  rK   rK   rL   r        
 )r@  r   r3  r   r  c                 C  rC  )Nzcalc_erfcx(rR   rK   r  rK   rK   rL   r     rD  c                 C  rC  )Nzlibdevice.erfcx(rR   rK   r  rK   rK   rL   r     rD  special_erfcxr  c                 C     d|  d| d| dS )Nz	std::fma(rQ   rR   rK   r  rK   rK   rL   r   #      c                 C  rR  )Nzfmadd(rQ   rR   rK   r  rK   rK   rL   r   $  rS  c                 C  rR  )Nzlibdevice.fma(rQ   rR   rK   r  rK   rK   rL   r   %  rS  )r@  r   r3  r   r   igammac                 C     d|  d| dS Nzcalc_igamma(rQ   rR   rK   r  rK   rK   rL   r   +      igammacc                 C  rU  Nzcalc_igammac(rQ   rR   rK   r  rK   rK   rL   r   0  rW  gammaincc                 C  rU  rV  rK   r  rK   rK   rL   r   5  rW  special_gammainc	gammainccc                 C  rU  rY  rK   r  rK   rK   rL   r   :  rW  special_gammaincci0c                 C  rC  )Nzcalc_i0(rR   rK   r  rK   rK   rL   r   ?  rD  c                 C  rC  Nzlibdevice.cyl_bessel_i0(rR   rK   r  rK   rK   rL   r   @  rD  c                 C  rO  )Nz.i0()rK   r  rK   rK   rL   r   A  rP  )r@  r   r   r3  r   i0ec                 C  rC  )Nz	calc_i0e(rR   rK   r  rK   rK   rL   r   F  rD  c                 C  rO  )Nz.i0e()rK   r  rK   rK   rL   r   G  rP  special_i0ei1c                 C  rC  )Nzcalc_i1(rR   rK   r  rK   rK   rL   r   L  rD  c                 C  rC  Nzlibdevice.cyl_bessel_i1(rR   rK   r  rK   rK   rL   r   M  rD  
special_i1i1ec                 C  rC  )Nz	calc_i1e(rR   rK   r  rK   rK   rL   r   R  rD  special_i1elog_ndtrc                 C  rC  )Nzcalc_log_ndtr(rR   rK   r  rK   rK   rL   r   W  rD  special_log_ndtrmodified_bessel_i0c                 C  rC  )Nzmodified_bessel_i0_forward(rR   rK   r  rK   rK   rL   r   ]  rD  c                 C  rC  r_  rK   r  rK   rK   rL   r   ^  rD  special_modified_bessel_i0modified_bessel_i1c                 C  rC  )Nzmodified_bessel_i1_forward(rR   rK   r  rK   rK   rL   r   c  rD  c                 C  rC  rc  rK   r  rK   rK   rL   r   d  rD  special_modified_bessel_i1modified_bessel_k0c                 C  rC  )Nzmodified_bessel_k0_forward(rR   rK   r  rK   rK   rL   r   i  rD  special_modified_bessel_k0modified_bessel_k1c                 C  rC  )Nzmodified_bessel_k1_forward(rR   rK   r  rK   rK   rL   r   n  rD  special_modified_bessel_k1ndtrc                 C  rC  )Nz
calc_ndtr(rR   rK   r  rK   rK   rL   r   t  rD  special_ndtrndtric                 C  rC  )Nzcalc_ndtri(rR   rK   r  rK   rK   rL   r   y  rD  special_ndtri	polygammac                 C  s   |  d| d| d|  dS )Nz == 0 ? calc_digamma(z) : calc_polygamma(rQ   rR   rK   r  rK   rK   rL   r   ~  s    scaled_modified_bessel_k0c                 C  rC  )Nz"scaled_modified_bessel_k0_forward(rR   rK   r  rK   rK   rL   r     rD  !special_scaled_modified_bessel_k0scaled_modified_bessel_k1c                 C  rC  )Nz"scaled_modified_bessel_k1_forward(rR   rK   r  rK   rK   rL   r     rD  !special_scaled_modified_bessel_k1spherical_bessel_j0c                 C  rC  )Nzspherical_bessel_j0_forward(rR   rK   r  rK   rK   rL   r     rD  special_spherical_bessel_j0zetac                 C  rU  )Nzzeta(rQ   rR   rK   r  rK   rK   rL   r     rW  special_zetachebyshev_polynomial_tc                 C  rU  )Nzchebyshev_polynomial_t_forward(rQ   rR   rK   r  rK   rK   rL   r     rW  special_chebyshev_polynomial_tchebyshev_polynomial_uc                 C  rU  )Nzchebyshev_polynomial_u_forward(rQ   rR   rK   r  rK   rK   rL   r     rW  special_chebyshev_polynomial_uchebyshev_polynomial_vc                 C  rU  )Nzchebyshev_polynomial_v_forward(rQ   rR   rK   r  rK   rK   rL   r     rW  special_chebyshev_polynomial_vchebyshev_polynomial_wc                 C  rU  )Nzchebyshev_polynomial_w_forward(rQ   rR   rK   r  rK   rK   rL   r     rW  special_chebyshev_polynomial_wlegendre_polynomial_pc                 C  rU  )Nzlegendre_polynomial_p_forward(rQ   rR   rK   r  rK   rK   rL   r     rW  special_legendre_polynomial_pshifted_chebyshev_polynomial_tc                 C  rU  )Nz'shifted_chebyshev_polynomial_t_forward(rQ   rR   rK   r  rK   rK   rL   r     rW  &special_shifted_chebyshev_polynomial_tshifted_chebyshev_polynomial_uc                 C  rU  )Nz'shifted_chebyshev_polynomial_u_forward(rQ   rR   rK   r  rK   rK   rL   r     rW  &special_shifted_chebyshev_polynomial_ushifted_chebyshev_polynomial_vc                 C  rU  )Nz'shifted_chebyshev_polynomial_v_forward(rQ   rR   rK   r  rK   rK   rL   r     rW  &special_shifted_chebyshev_polynomial_vshifted_chebyshev_polynomial_wc                 C  rU  )Nz'shifted_chebyshev_polynomial_w_forward(rQ   rR   rK   r  rK   rK   rL   r     rW  &special_shifted_chebyshev_polynomial_whermite_polynomial_hc                 C  rU  )Nzhermite_polynomial_h_forward(rQ   rR   rK   r  rK   rK   rL   r     rW  special_hermite_polynomial_hhermite_polynomial_hec                 C  rU  )Nzhermite_polynomial_he_forward(rQ   rR   rK   r  rK   rK   rL   r     rW  special_hermite_polynomial_helaguerre_polynomial_lc                 C  rU  )Nzlaguerre_polynomial_l_forward(rQ   rR   rK   r  rK   rK   rL   r     rW  special_laguerre_polynomial_lzdict[str, OverridesData]r4  r   c                   s.   t  fddtjjtjjtjjtjjfD S )Nc                 3  s    | ]} |v V  qd S rY   rK   r$  r  r+  rK   rL   rU    s
    
z$is_buffer_removed.<locals>.<genexpr>)anyr0   rp   removed_bufferskernelinplaced_to_remover+  rK   r+  rL   is_buffer_removed  s   r  c                      s6   e Zd ZdZd fddZdd	d
ZdddZ  ZS )DeferredLinezHA line that can be 'unwritten' by adding name to V.graph.removed_buffersr   rC   linec                   s$   t  | || _t|trJ d S rY   )r  rH  r   r   r!   )r|   r   r  r  rK   rL   rH    s   zDeferredLine.__init__rD   r   c                 C  s   t | js| jS d S rY   )r  r   r  r{   rK   rK   rL   __call__  s   
zDeferredLine.__call__c                 C  s   t | j|S rY   )r  r   )r|   r  rK   rK   rL   	_new_line  s   zDeferredLine._new_line)r   rC   r  rC   r   )r  rC   rD   r  )r\   r]   r^   r   rH  r  r  r  rK   rK   r  rL   r    s
    
r  c                   @  s   e Zd Zd	d
ddZdS )BracesBufferr=   r   r   rD   'contextlib.AbstractContextManager[None]c                   s   t jd fdd}| S )NrD   Iterator[None]c                  3  s    t  D ]} d  jd7  _qt   D ]}  jd8  _d qd V  t   D ]} d  jd7  _q0t  D ]}  jd8  _d qCd S )N{r=   })range	writeline_indent)_r   r|   rK   rL   ctx  s   

z BracesBuffer.indent.<locals>.ctx)rD   r  )
contextlibcontextmanager)r|   r   r  rK   r  rL   indent  s   zBracesBuffer.indentN)r=   )r   r   rD   r  )r\   r]   r^   r  rK   rK   rK   rL   r    s    r  c                   @  s   e Zd ZU ded< ded< dS )InplacedBufferrC   rj   r   other_namesNr   rK   rK   rK   rL   r  	  s   
 r  c                   @  s,   e Zd ZU ded< dZded< d
ddZd	S )ArgNamerC   r   FrX   is_constexprrD   c                 C  s   | j  | jr
d S d S )Nz : tl.constexprr!  )r   r  r{   rK   rK   rL   	full_name  s   zArgName.full_nameNr   )r\   r]   r^   r   r  r  rK   rK   rK   rL   r    s   
 r  c                   @  s   e Zd ZdddZdS )
RemovedArgrD   rC   c                 C  s   dS )NREMOVEDrK   r{   rK   rK   rL   __str__  r   zRemovedArg.__str__Nr   )r\   r]   r^   r  rK   rK   rK   rL   r    s    r  c                   @  s   e Zd ZedIdd	ZdJddZdKddZedLddZdMddZdMddZ	dNddZ
dOdd ZdPd"d#ZdQd&d'ZdRd)d*ZdSd,d-ZdTd/d0ZdUd4d5ZdVd8d9ZdWd;d<ZdXd>d?ZdYdAdBZdZdCdDZd[dFdGZdHS )\
KernelArgsrn   rC   odict6Union[dict[_T, Union[str, RemovedArg]], dict[_T, str]]r   r@   rD   c                 C  s6   | |t}t|tr|  t|  ||< }|S |S rY   )getr  r   r  rX  )rn   r  r   result
new_resultrK   rK   rL   _lookup!  s
   
zKernelArgs._lookuprE   c                 C  s"   i | _ i | _i | _i | _g | _d S rY   )input_buffersoutput_buffersinplace_buffersr|  workspace_argsr{   rK   rK   rL   rH  -  s
   
zKernelArgs.__init__c              
   C  s&   d dtt| j| j| j| jgS )NzKernelArgs({})rQ   )formatru   mapr  r  r  r  r|  r{   rK   rK   rL   __repr__4  s   zKernelArgs.__repr__r   rX   c                 C  s
   t | tS rY   r   r  r+  rK   rK   rL   _buffer_is_marked_removedC  s   
z$KernelArgs._buffer_is_marked_removedc                 C  s   t jjrt jjj||}|t jjvsJ ||| jv r$tt| j| S || j	v r2tt
| j	| jS |dr?| d| j|S | d| j|S )Nseedin_ptr)r0   rp   rv  mutation_real_namer  r  r  r   rC   r  r  rj   rj  r  r  r   rK   rK   rL   inputH  s   


zKernelArgs.inputc                 C  sZ   t jjrt jjj||}|t jjvsJ ||| jv r%tt| j| j	S | 
d| j|S )Nout_ptr)r0   rp   rv  r  r  r  r  r   r  rj   r  r  r   rK   rK   rL   re  T  s   
zKernelArgs.output
input_nameoutput_namec                 C  s   || j vsJ || j v r%| j | }t|trJ |j| || j |< d S dd | j  D }dd | j  D }tt|t| }td| ||g}|| j |< || j |< d S )Nc                 S  s   g | ]	}t |ts|qS rK   r  r$  valrK   rK   rL   rQ  d      z+KernelArgs.make_inplace.<locals>.<listcomp>c                 S  s   g | ]	}t |tr|qS rK   r  r  rK   rK   rL   rQ  i  r  
in_out_ptr)	r  r   r  r  appendr  rX  r*   r  )r|   r  r  bufalive_buffersr  inplace_buffer_idxrK   rK   rL   make_inplace\  s&   


zKernelArgs.make_inplacenbytesrc   rW   tuple[str, int]c                 C  s   t |t|tj t  d}t| jD ]+\}}t 	||r2|j
}t ||| j|< |j|f  S |j|jkr>|j|jks@J q| j| |jdfS )a  
        Allocate or extend a workspace buffer of nbytes bytes.

        This function manages the allocation of a workspace buffer. It either creates
        a new WorkspaceArg or extends an existing one.

        Note:
        - Calling this function will in-place mutate the args by adding or updating
        a WorkspaceArg.
        - The codegen for generating the Python argdefs and call_defs will check
        this field and allocate the buffer accordingly.
        - A new argument "ws_ptr" will be present in the generated code.

        Args:
            nbytes (sympy.Expr): The number of bytes to allocate.
            zero_fill (bool): Whether to initialize the buffer to zero.

        Returns:
            Tuple[str, int]: A tuple containing:
                - "ws_ptr": A string identifier for the workspace pointer.
                - offset: An integer representing the byte offset in the workspace.
        )rd   re   rg   rh   r   )rb   rN   r[   r0   rp   get_current_device_or_throwrr   r  r  rs   rd   ru   rj   rh   r  )r|   r  rW   argr  existing_argr   rK   rK   rL   	workspacev  s   
zKernelArgs.workspacemin_sizec              	   C  sh   t j }t|tjtjdd|j d|j	 |d}| j
D ]}|j|jkr*||ks*J q| j
| |jS )a  
        Lazily allocate a graph-wide semaphores buffer with at least min_size.  This is a single buffer shared by
        all kernels and zero initialized once at graph start.  Each kernel must leave the buffer zeroed on exit.

        Warning: multiple calls to this function will return the same buffer.

        Args:
            min_size: the number of int32 semaphores required

        Returns:
            name of the semaphores buffer
        sem_ptrsemaphores_r  )rd   re   rl   rj   rh   rg   )r0   rp   r  rb   rN   r_   r   uint32r   r  r  rj   r  )r|   r  current_devicer  r  rK   rK   rL   
semaphores  s   

zKernelArgs.semaphoresr  r   c                   sx   t |tsJ t||ft|}|| jv r| j| S  | j v r5  t fdd| j D    | j|<  S )Nc                 3  s    | ]
}|  rd V  qdS )r=   N)rj  )r$  rG  r+  rK   rL   rU    s    z)KernelArgs.seed_offset.<locals>.<genexpr>)r   r   r   rv   r  r|  r  sum)r|   r   r  rK   r+  rL   seed_offset  s   


"
zKernelArgs.seed_offsetr  c                 C  sD   t |tjsJ t||f|jdkrd| j|< dS | d| j|S )Nr  ks)r   rv   Symbolr   r   r|  r  r   rK   rK   rL   r     s
   

zKernelArgs.sizeIterator[str]c                 C  s   t | j | j | j S rY   )r   r  keysr  r|  r{   rK   rK   rL   
call_names  s   zKernelArgs.call_namesr   c                 C  sX   | j |d}|durt|ts|jS | j|d}|dur%t|ts%|S | j|dS )z;
        Returns inner name of a given outer name.
        N)r  r  r   r  rj   r  r  )r|   r   inplacedr  rK   rK   rL   arg_name  s   zKernelArgs.arg_namer  rl   rk   c                 C  s   |S rY   rK   )r|   r  rl   rK   rK   rL   wrap_ptr_arg  r   zKernelArgs.wrap_ptr_argr   
SymbolLikec                 C  s   t |S rY   )rC   )r|   r   rK   rK   rL   wrap_size_arg  r   zKernelArgs.wrap_size_arg&tuple[list[str], list[str], list[str]]c                 C  s  ddl m}m} g }g }g }t| j D ]5}t|trq|jd }|j	}t
j|}	||	 }
||
 d|  || ||	 ||
 d q| j D ]1\}}|| jv rZqPt
j|}	||	 }
|d|
 d|  || ||	 |d|
 d qP| j D ]4\}}|| jv st|trqt
j|}	||	 }
||
 d|  || ||	 ||
 d q| j D ]*\}}|d| d|  || | |d|  t
jjrt
jj| q| jrJ d|||fS )	Nr=   )DTYPE_TO_CPP
INDEX_TYPEr-  z* *zconst  zWorkspace not supported on CPU )	cpp_utilsr  r  r*   r  r  r   r  r  rj   r0   rp   r   r  r  r  rE  r  r|  r  wrapper_codeensure_size_computedr  )r|   r  r  	call_argsarg_defs	arg_typesr  outerinnerrl   	cpp_dtypemaybe_innerrK   rK   rL   cpp_argdefs  sN   



zKernelArgs.cpp_argdefs?tuple[list[ArgName], list[str], list[KernelArgType], list[Any]]c           	   
   C  s  g }g }g }g }t | j D ]9}t|trq|t|j ||jd  |t	j
|jd  |t|j|jd t	j
|jd d qt| j | j D ]2\}}|| jv sbt|trcqT|t| || |t	j
| |t||t	j
|d qT| j D ]*\}}|t| || |t| |t|| t	j
jrt	j
j| q| jD ]}|t|j ||j || ||j q||||fS )Nr-  )r   r   rl   )r*   r  r  r   r  r  r  rj   r  r0   rp   r   r   r   r  rE  r  r|  r   r   r  r  r  rh   rl   )	r|   r  r  r  precompile_argsr  r  r  r  rK   rK   rL   python_argdefs  s\   





zKernelArgs.python_argdefsIterator[tuple[str, str]]c                 c  s    t | j D ]:}t|trq|jD ].}|tjjv s!|tj	jv r"q|| j
v r0| j
| |jfV  || jv rAtt| j| |jfV  qqd S rY   )r*   r  r  r   r  r  r0   rp   r  r  r  rj   r  r   rC   )r|   r  otherrK   rK   rL   aliases@  s   



zKernelArgs.aliasesc                 C  s(   t | j|ttot | j|ttS rY   )r   r  r  r  r  r  r   rK   rK   rL   
is_removedO  s
   zKernelArgs.is_removedOrderedSet[str]c                 C  sn   t  }t| j D ]}t|trq
||jd  q
| j	 D ]\}}|| jv s.t|tr/q || q |S )Nr-  )
r   r*   r  r  r   r  r  r  r  rE  )r|   	live_outsr  r  r  rK   rK   rL   live_output_buffersW  s   
zKernelArgs.live_output_buffersN)rn   rC   r  r  r   r@   rD   rC   rD   rE   r   )r   r   rD   rX   r   )r  rC   r  rC   rD   rE   )r  rc   rW   rX   rD   r  )r  rc   rD   rC   )r   rC   r  r   rD   rC   )r   r  rD   rC   )rD   r  )r   rC   rD   r   )r  rC   rl   rk   rD   rC   )r   r  rD   rC   )rD   r  )rD   r  )rD   r  r>  )rD   r  )r\   r]   r^   r`   r  rH  r  r  r  re  r  r  r  r  r   r  r  r  r  r  r   r  r  r  rK   rK   rK   rL   r     s.    






)







)
1
r  c                      sX   e Zd ZdZ	dd fd	d
Zd ddZd!ddZd"ddZd#ddZd ddZ	  Z
S )$r  aD  A CSEVariable is just a name for an expression but it is useful to be able to annotate them on a backend dependent basis.
    To do so, the backends can simply overload `Kernel.create_cse_var`
    The "CSEVariable.update_on_args" method gives you a hook for annotations
    See example of TritonCSEVariable in triton.py
    Nr   rC   boundsValueRanges[Any]rl   r*  c                   s4   t    t|tsJ || _|| _d| _|| _d S r  )r  rH  r   r   r   r	  	use_countrl   )r|   r   r	  rl   r  rK   rL   rH  k  s   

zCSEVariable.__init__rD   c                 C  ry   rY   r+  r{   rK   rK   rL   r  x  r~   zCSEVariable.__str__r   c                 C  s
   t | jS rY   )hashr   r{   rK   rK   rL   __hash__{  r   zCSEVariable.__hash__r  objectrX   c                 C  s   t |to
|j| jkS rY   )r   r  r   )r|   r  rK   rK   rL   __eq__~  r   zCSEVariable.__eq__r(  r   r)  rE   c                 C  r   rY   rK   )r|   r   r(  r)  rK   rK   rL   update_on_args  r   zCSEVariable.update_on_argsc                 C  s   | j j d| jdS r  )r  r\   r   r{   rK   rK   rL   r    r   zCSEVariable.__repr__rY   )r   rC   r	  r
  rl   r*  r   )rD   r   )r  r  rD   rX   )r   rC   r(  r   r)  r   rD   rE   )r\   r]   r^   r   rH  r  r  r  r  r  r  rK   rK   r  rL   r  d  s    




r  AugmentedKeyT)defaultCSEVariableType)boundr  .c                   @  s   e Zd ZdZ							dAdBddZdCddZdDddZdDddZdEd d!ZdFd$d%Z	dGd'd(Z
dHd*d+ZdId,d-Ze d.d.dd/dJd:d;Ze dfdKd<d=Ze dfdLd?d@ZdS )MCSEz Common subexpression eliminationr!  tmpNrn   rC   suffixname_prefixiter_buffersOptional[itertools.count[int]]store_cache.Optional[MutableMapping[str, CSEVariableType]]reduction_cache<Optional[MutableMapping[ReductionCacheKey, CSEVariableType]]varname_map$Optional[dict[str, CSEVariableType]]c                 C  sP   || _ || _i | _|| _|pi | _|pi | _|pt | _t	 | _
|p$i | _d S rY   )rn   r  _cacher  r  r  	itertoolsrd   iter_buffer_idsr   invalidated_storesr  )r|   rn   r  r  r  r  r  r  rK   rK   rL   rH    s   
zCSE.__init__	keep_varsOrderedSet[CSEVariable]rD   rE   c                   s`   g | j  D ]\}}| vr| j |= | j| q r+ fdd| j D | _d S i | _d S )Nc                   s   i | ]\}}| v r||qS rK   rK   )r$  rF  rG  r%  rK   rL   r%    s    z"CSE.invalidate.<locals>.<dictcomp>)r  rE  r$  r  r!  )r|   r%  r   r  rK   r'  rL   
invalidate  s   
zCSE.invalidatetyping.Selfc              	   C  s(   t | | j| j| j| j| j| j| jdS )N)rn   r  r  r  r  r  r  )r   rn   r  r  r#  r  r  r  r{   rK   rK   rL   clone  s   z	CSE.clonec                 C  s0   |   }t| j|_t| j|_t| j|_|S )zNReturn a copy of using ScopedDict so changes to *_cache aren't visible in self)r*  r%   r!  r  r  )r|   new_cserK   rK   rL   scoped_copy  s
   zCSE.scoped_copy	cache_keyr  c                 C  s
   t t|S )z@Override this method to augment cache key with backend specifics)r   r  r|   r-  rK   rK   rL   augment_key  r  zCSE.augment_keyr  r  c                 C  s   || j | |< d S rY   r!  r/  )r|   r-  r  rK   rK   rL   put     zCSE.putrX   c                 C  s   |  || jv S rY   )r/  r!  r.  rK   rK   rL   contains  rq  zCSE.containsOptional[CSEVariableType]c                 C  s   | j | |d S rY   )r!  r  r/  r.  rK   rK   rL   try_get  r2  zCSE.try_getc                 C  s   | j | | S rY   r0  r.  rK   rK   rL   r    rq  zCSE.getT)r	  write
assignmentrl   r   r#   r   CUnion[str, CSEVariable, OpsValue, IndentedBuffer, DeferredLineBase]r	  r
  r6  r7  rl   r*  c             	   C  s  t |tr|j}|s|sJ t |tr&|j||_| jd7  _tt|S t |t	r0|
 }nt |tr9|j}n	t |ts@J |}| |}|s| ||}| || |rtjjrdtjjj|dd t |t	r|rv|| j | d || || j |S t |tr|sJ ||| j | d|j | j  |S |r| j | d| | j }	n| | j }	||	 |rtjjr|d urd| j | dt| d}
||
 |S |j||_| jd7  _|S )	Nr=   T)	only_oncez =z = tl.static_assert(
.dtype == rR   )r   r-   r  r  r	  tightenr  r   r  r#   getvaluer!   r  rC   r5  newvarr1  r0   r  current_nodecodegen_originating_infor  rn   splicer  r  r   test_configsruntime_triton_dtype_assertr)   )r|   r   r   r	  r6  r7  rl   r-  r  r  assert_linerK   rK   rL   generate  sb   










 

zCSE.generatec                 C  s2   | j  t| j }tj|||}|| j|< |S rY   )r  ro   r#  r0   r  create_cse_varr  )r|   r	  rl   var_namer  rK   rK   rL   r>    s   
z
CSE.newvarr   c                   s8   t  | jv fdd tj ||}|| j < |S )Nc                     s
   d  S )Nzduplicate name: rK   rK   r+  rK   rL   r   0  rP  zCSE.namedvar.<locals>.<lambda>)r   _check_valuer  r0   r  rF  )r|   r   r	  rl   r  rK   r+  rL   namedvar)  s   
zCSE.namedvar)r!  r!  r  NNNN)rn   rC   r  rC   r  rC   r  r  r  r  r  r  r  r   )r%  r&  rD   rE   rD   r)  )r-  rC   rD   r  )r-  rC   r  r  rD   rE   )r-  rC   rD   rX   )r-  rC   rD   r4  )r-  rC   rD   r  )r   r#   r   r8  r	  r
  r6  rX   r7  rX   rl   r*  rD   r  )r	  r
  rl   r*  rD   r  )r   rC   r	  r
  rl   r*  rD   r  )r\   r]   r^   r   rH  r(  r*  r,  r/  r1  r3  r5  r  r   unknownrE  r>  rI  rK   rK   rK   rL   r    s:    








Fr  c                      s2   e Zd Zd fddZdddZdddZ  ZS )CodeGenrD   rE   c                   s   t    t | _d S rY   )r  rH  r  	ExitStack
exit_stackr{   r  rK   rL   rH  8  s   
zCodeGen.__init__r)  c                 C  s   | j   | S rY   )rN  	__enter__r{   rK   rK   rL   rO  <  s   
zCodeGen.__enter__exc_typer   exc_valexc_tbc                 C  s   | j ||| d S rY   )rN  __exit__r|   rP  rQ  rR  rK   rK   rL   rS  @  r2  zCodeGen.__exit__r  rJ  rP  r   rQ  r   rR  r   rD   rE   )r\   r]   r^   rH  rO  rS  r  rK   rK   r  rL   rL  7  s    
rL  c                      sZ  e Zd ZU dZded< dZded< dZded< 	dwdx fddZej	dyddZ
ej			dzd{ddZd|d!d"Zd|d#d$Zd}d&d'Z	d~dd*d+Zdd2d3Zdd:d;Zdd>d?ZddAdBZ		dzddLdMZeddNdOZ	d~ddWdXZdd[d\Zdd]d^Zd fd`daZd fdfdgZddhdiZddjdkZddldmZddodpZddrdsZ ddudvZ!  Z"S )Kernelr!  rC   newvar_prefixr  Nz'Optional[Callable[[], OpsHandler[Any]]]	overridesTr(  Optional[KernelArgs]increase_kernel_countrX   rD   rE   c                   s   t    |rt jd7  _|pt | _t | _t | _t | _	d| _
d| _t| j| j| _tt  | _tt  | _d | _d | _d | _d | _tt  | _tt  | _i | _d| _d | _d S )Nr=   r   )r  rH  r   generated_kernel_countr  r(  r#   loadscomputestoresnum_loadnum_reductionr  rW  r  cser   rC   must_keep_buffersstore_buffer_names
_load_mask_load_otherr?  node_to_boundsr  r  inplace_update_buffersmin_elem_per_threadkernel_name)r|   r(  rZ  r  rK   rL   rH  I  s*   

zKernel.__init__rI  r<   r  c                 c  s:    | j }|| _ |j  | _z	d V  W || _ d S || _ w rY   )r?  rw  r	  
get_boundsrf  )r|   rI  priorrK   rK   rL   set_current_nodek  s   zKernel.set_current_nodelbr#   cbOptional[IndentedBuffer]sbc           	      c  s    |d u r|}|d u  }rt  }| j}| j}| j}| j}|| _|| _|| _| | _zd V  W || _|| _|| _|| _|rC|rEJ dd S d S || _|| _|| _|| _|r[|r[J dw )Nz$unexpected store inside swap_buffers)r#   r\  r]  r^  ra  r,  )	r|   rm  rn  rp  disallow_storesr\  r]  r^  ra  rK   rK   rL   swap_buffersu  s:   
zKernel.swap_buffersr   r  rc   r  c                 C  r   rY   r   r  rK   rK   rL   r5    r   zKernel.loadc                 C  s,   | j }z| j| _ | ||W || _ S || _ w )z+A load the depends on an index we have read)r\  r]  r5  )r|   r   r  rk  rK   rK   rL   indirect_load  s
   zKernel.indirect_loadr  c                 C  r   rY   r   r  rK   rK   rL   r7    r   zKernel.store_reductionr   r/   c                 C  r   rY   r   r  rK   rK   rL   r6    r  zKernel.storerl   rk   r  r  r.   +Union[CSEVariable, tuple[CSEVariable, ...]]c                 C  r   rY   r   r  rK   rK   rL   r3       zKernel.reductionr  r	  r
  UCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]r  tuple[CSEVariable, ...]c                 C  r   rY   r   r  rK   rK   rL   r    s   zKernel.scanr  r  c                 C  r   rY   r   r  rK   rK   rL   r    ru  zKernel.sortdict[sympy.Symbol, sympy.Expr]c                 C  r   rY   r   r{   rK   rK   rL   
var_ranges  r   zKernel.var_rangesr  r  r  r  r  r  r  r  Optional[CSEVariable]c                 C  r   )z3
        See [Note: Inductor bucketize op]
        r   r  rK   rK   rL   r    s   zKernel.bucketizec                 C  r   rY   r   r{   rK   rK   rL   assert_function  r   zKernel.assert_functionr  Union[CSEVariable, str]r  r   r  mask!Optional[Union[CSEVariable, str]]c              	   C  s   t |tr	t|}t |tsJ |d u st |tsJ |d u s&t |ts&J |rD|rDd| d| d| d| d	}| d| d| }n|rP| d| }|}n|sTJ | d| }|}|rhd| d| d}| j d| d| dS )	Nr  z <= z) & (z < rR   z) | ~(z, "index out of bounds: z"))r   r  rC   r{  )r|   r  r  r  r}  r  
cond_printrK   rK   rL   indirect_assert  s"   
zKernel.indirect_assertr   r   c                 C  r   rY   r   r  rK   rK   rL   r    r  zKernel.check_boundsc                 C  r   rY   r   r(  rK   rK   rL   index_to_str  r   zKernel.index_to_strr)  c                   sF   t    | js
J | jtt| |   | jt|  | S rY   )	r  rO  rX  rN  enter_contextr0   set_ops_handlerCSEProxyset_kernel_handlerr{   r  rK   rL   rO    s   

zKernel.__enter__rP  r   rQ  rR  c                   s   |    t ||| d S rY   )remove_kernel_local_buffersr  rS  rT  r  rK   rL   rS    s   zKernel.__exit__c                   s   t jjsdS tfdd| jD }tt   | jD ]}|| jvr4|| jjvr4	||r4 
| q D ]3}|| jjv re| jj| }t|trKq7t fdd|jD }|r^| | | j
| q7| | q7dS )z
        Any buffers that are both created and have a last use in the
        same kernel can be removed.

        Note that V.graph.scheduler can be None when codegening triton template
        kernels.
        Nc                 3  s(    | ]}| j v r j |  V  qd S rY   )name_to_bufdefining_op_namer$  r  )rv  rK   rL   rU    s    
z5Kernel.remove_kernel_local_buffers.<locals>.<genexpr>c                 3  s    | ]}| v V  qd S rY   rK   rO  )names_to_removerK   rL   rU  ,  s    )r0   rp   rv  r   rc  rC   rb  r(  r  $can_buffer_be_removed_through_fusionr  r  r   r  rY  r  remove_inplace_bufferr  remove_buffer)r|   fused_node_namesr   r  removerK   )r  rv  rL   r    s6   





z"Kernel.remove_kernel_local_buffersc                 C  (   t d| t| jj|< | j| d S )Nzremove_buffer(%r))r  rJ   r  r(  r  r  r  r   rK   rK   rL   r  3  s   zKernel.remove_bufferc                 C  r  )Nzremoving_inplace_buffer(%r))r  rJ   r  r(  r  r  r  r   rK   rK   rL   r  ;  s   zKernel.remove_inplace_buffer;Union[list[sympy.Expr], tuple[sympy.Expr, ...], sympy.Expr]c                   s\   t |ttfr fdd|D S tjj|}t|jdd d} fdd|D }t	||S )Nc                   s   g | ]}  |qS rK   )rename_indexingr  r{   rK   rL   rQ  F  s    z*Kernel.rename_indexing.<locals>.<listcomp>c                 S  ry   rY   r+  )srK   rK   rL   r   H  s    z(Kernel.rename_indexing.<locals>.<lambda>)rS  c                   s0   i | ]}t |tjtjtjfr| j|qS rK   )r   r   UNBACKED_INTSIZEPRECOMPUTED_SIZEr(  r   r  r{   rK   rL   r%  I  s    z*Kernel.rename_indexing.<locals>.<dictcomp>)
r   listtupler0   rp   r|  rz  sortedfree_symbolsr(   )r|   r  sorted_symbolsreplacementsrK   r{   rL   r  @  s   

zKernel.rename_indexingr)  c                 O  s   t |i |S rY   )r  r*  rK   rK   rL   rF  W  s   zKernel.create_cse_varr7   c                 C  s   |du rdS | j | S )zC
        Returns arg name of a given input or output node.
        N)r(  r  r   )r|   rI  rK   rK   rL   r  Z  s   zKernel.arg_name)NT)r(  rY  rZ  rX   rD   rE   )rI  r<   rD   r  r=  )rm  r#   rn  ro  rp  ro  rD   r  r   rC   r  rc   rD   r  r   rC   r  rc   r  r  rD   rE   rY   
r   rC   r  rc   r  r  r   r/   rD   rE   
rl   rk   r  rk   r  r.   r  rt  rD   rt  r  r	  r
  rv  r  rw  rD   rw  
r  r	  r  rw  r  rX   r  rX   rD   rw  )rD   rx  r  r  r  r  r  r  r  rk   r  rX   r  r  r  rz  rD   r  r   )
r  r|  r  r   r  r   r}  r~  rD   rC   r<  )r  rc   rD   rC   rJ  rU  r  r   rC   rD   rE   )r  r  rD   rc   )r(  r   r)  r   rD   r  )rI  r7   rD   r   )#r\   r]   r^   rW  r   r  rX  rH  r  r  rl  rr  r5  rs  r7  r6  r3  r  r  ry  r  r   r{  r  r  r  rO  rS  r  r  r  r  rF  r  r  rK   rK   r  rL   rV  D  sL   
 "	





	


	


	

'


rV  c                   @  s2   e Zd ZU dZded< dZded< dZded	< dS )
rR  ro  zClassVar[str]rS  Nr*  rl   r!  rC   ops_name)r\   r]   r^   rS  r   rl   r  rK   rK   rK   rL   rR  c  s   
 rR  c                  C  s.   zdd l } | j| jdW S  ty   Y d S w )Nr   )	undefined)jinja2EnvironmentStrictUndefinedImportError)r  rK   rK   rL   
jinja2_envk  s   r  c                   @  s\   e Zd ZdZe	d!d"d	d
Zed#ddZed$ddZd%ddZd&ddZ	d'ddZ
d S )(KernelTemplatezg
    Base class for defining kernel templates.

    Children classes: TritonTemplate, CUDATemplate
       sourcerC   num_indentsr   indents_spacingrD   c                   sD   |  d}t|dkr fdd|dd  D |dd < d|S )NTr=   c                   s   g | ]
}d    | qS )r  rK   )r$  r  r  r  rK   rL   rQ    s    z6KernelTemplate.indent_except_first.<locals>.<listcomp>r!  )
splitlinesrX  ru   )r  r  r  linesrK   r  rL   indent_except_first~  s   


z"KernelTemplate.indent_except_firstr   c              
   C  sj   t  }|d u r	d S tj|jd< ddlm} z|| W S  |y4 } zG dd d|}|||d }~ww )Nr  r   )TemplateSyntaxErrorc                      s(   e Zd Zd
 fddZddd	Z  ZS )zIKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxErrororiginal_errorr  rD   rE   c                   s$   t  |j|j|j|j || _d S rY   )r  rH  messagelinenor   filenamer  )r|   r  r  rK   rL   rH    s   
zRKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxError.__init__rC   c                 S  s   d| j  d}|d| j d7 }t| jdrs| jjd}|d7 }td| j d }tt|| j d }t	||D ]:}|| j d krd||d  d	||  d7 }t| jd
rc|dd| jj
d   d 7 }q8||d  d||  d7 }q8|S )NzError in template at line 
zError message: r  z	Context:
r   r   r=   z: --> columnz     r  z^
z:     )r  r  r~  r  r  splitmaxminrX  r  r  )r|   
error_infor  startendr  rK   rK   rL   r    s*   zQKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxError.__str__)r  r  rD   rE   r   )r\   r]   r^   rH  r  r  rK   rK   r  rL   DetailedTemplateSyntaxError  s    	r  )r  r  r  filtersr  r  from_string)r  envr  er  rK   rK   rL   _template_from_string  s   
z$KernelTemplate._template_from_string	fake_outsUnion[list[Buffer], Buffer]Callable[[str], torch.dtype]c                   sJ   t jj t| ttfrdd | D n|  |  id	 fdd}|S )
Nc                 S  s   i | ]	}|  | qS rK   )r   r   r  rK   rK   rL   r%    rV  z2KernelTemplate._fake_get_dtype.<locals>.<dictcomp>r   rC   rD   rk   c                   s    | }|d ur|S  | S rY   )r  )r   r  _get_dtype_reallookuprK   rL   r     s   
z1KernelTemplate._fake_get_dtype.<locals>.get_dtype)r   rC   rD   rk   )r0   rp   r   r   r  r  r   )r  r   rK   r  rL   _fake_get_dtype  s   zKernelTemplate._fake_get_dtyper   rE   c                 C  s
   || _ d S rY   r+  r   rK   rK   rL   rH    r   zKernelTemplate.__init__choices	list[Any]r)  Optional[NotImplementedError]c              
   K  sf   z| | jdi | W dS  ty2 } ztjd|t| t tjk d |W  Y d}~S d}~ww )a%  
        Maybe generates a new ChoiceCaller and appends it into existing choices.
        Returns None if success, otherwise returns the error.

        choices: A list of ChoiceCallers.
        kwargs: Additional kwargs to be passed to self.generate() to generate a new ChoiceCaller.
        Nz3Cannot Append Choice: %s. KernelTemplate type is %s)
stack_inforK   )	r  rE  rT   r  infor   getEffectiveLevelrH   INFO)r|   r  r)  r  rK   rK   rL   maybe_append_choice  s   z"KernelTemplate.maybe_append_choicer5   c                 K  r   )zM
        Generates a ChoiceCaller instance from the given arguments.
        r   )r|   r)  rK   rK   rL   rE    s   zKernelTemplate.generateN)r  )r  rC   r  r   r  r   rD   rC   )r  rC   rD   r   )r  r  rD   r  r  )r  r  r)  r   rD   r  )r)  r   rD   r5   )r\   r]   r^   r   r`   r  r  r  rH  r  rE  rK   rK   rK   rL   r  w  s    
,

r  c                      s   e Zd Zd ZdR fddZdSddZdTddZ		dUdVddZdWd$d%ZdXd'd(Z	dYd*d+Z
	,dZd[d/d0Zd\d1d2Zd]d9d:Zd^dAdBZd_dEdFZ	,	,d`dadPdQZ  ZS )br  r  Kernel[Any]parent_handlerOpsHandler[Any]c                   s.   t    ddlm} | | _|| _|| _d S )Nr   ValueRangeAnalysis)r  rH  r	  r  vr_analysisr  r  )r|   r  r  r  r  rK   rL   rH    s
   

zCSEProxy.__init__r   rC   r(  tuple[Any, ...]r)  dict[str, Any]rD   r   c                   s^   | j g R i t| j i t dd fdd}t|S )	Nr   rG  rC   rD   r  c                   s   t jjd ur!t j j}|dkrtjdkn
|dkrtjdknd}nd}|r8dkr-j}nt	 i }nd }t j
jjt j
j| |d}tjjrn|rnddlm} t|ttfr^| }t j
jd	| d
|| d d7 |  |S )Nr   r   r  Fmasked)r	  rl   r   )r)   r:  r;  rR   r=   )r0   rp   r  r  r   r   r   r  rl   r/  r  ra  rE  r]  rB  rC  torch._inductor.codegen.tritonr)   r   r  r  r  r  )rG  
device_strtriton_backendrk  csevarr)   r(  r	  dtype_handlerr)  r   
output_idxr  rK   rL   do_cse  sJ   

z!CSEProxy._default.<locals>.do_cse)rG  rC   rD   r  )_bound_variabler/  r  r   pytreetree_map)r|   r   r(  r)  r  rK   r  rL   _default  s   3zCSEProxy._defaultr
  c                   s   ddl m} ddlm} ttj|rt S tj	j
  j|kr8| jjdur8t| jjts.J | jj t S tjrgt||rgt fdddD rOt S |rSJ ddd}tt||}t| j|| S t S )z
        If the variable comes from an FX node, we forward the bound we have already computed
        Else, if the variable when codegen'ing another op, we try to compute its bounds
        r   r  )TritonTemplateKernelNc                 3  s    | ]}| j v V  qd S rY   )ra  )r$  r  fx_noderK   rL   rU  D	  s    z+CSEProxy._bound_variable.<locals>.<genexpr>)set_indirectr3  r  r  r   rD   c                 S  s(   t | tr| jS t | tjrt| S | S rY   )r   r  r	  rv   r}  r   r  rK   rK   rL   arg_to_boundM	  s
   
z.CSEProxy._bound_variable.<locals>.arg_to_bound)r  r   rD   r   )r	  r  select_algorithmr  r   r0   r  r   rK  interpreterr?  ra  rf  dictr  r   compute_all_boundsr~  r  r  r  r/  r  )r|   r   r(  r)  r  r  r  
arg_boundsrK   r  rL   r  2	  s    
zCSEProxy._bound_variableTr  r  r   r  r  rX   r  r  c                 C  sX  t |tr
t|}t |tjsJ ||jjdk r|r;t|t	|t
j}|jjdkr:t|d}t|||}n|}t }|jt krtt |tjrt|jtt d@ }t|j| |j| }|jjdkrt|jtdt@ }	||	B }| jjj| jj||d}| j|||}
t|r|jjdk }t |tj p|jj|k  }| j|
||| |
S )Nr   r-  )r	  )r   r   rv   r  r}  r	  r  r+   r  r,  r   longr  ltr  r   rK  Numberr   r  ra  rE  r]  r  r  r"   r  )r|   r  r   r  r  stmr  
new_bounds
neg_boundspos	sympy_varassert_lowerassert_upperrK   rK   rL   r  Y	  s:   


zCSEProxy.indirect_indexingr   rc   r  r  rE   c                 C     | j ||||S rY   )r  r  r  rK   rK   rL   r  	  s   zCSEProxy.check_boundsr  c                 C  s|   || j jjv rtj j| t|tjr| j 	||S | j jj
}||v r(|| S | j ||}|jdkr<| j  jd7  _|S r  )r  ra  r$  r0   rb  r  r   r   TMPrs  r  r5  r  r_  )r|   r   r  r  outrK   rK   rL   r5  	  s   

zCSEProxy.loadr  c                 C  sX   || j jj|< | j jr&|tjjv r(| j j|}| D ]}|| j jj|< qd S d S d S rY   )	r  ra  r  r?  r0   rp   name_to_buffer
get_outputget_mutations)r|   r   r  r  
other_namerK   rK   rL   _update_store_cache	  s   zCSEProxy._update_store_cacheNr   r/   c                 C  sF   | j j| |d u r| || |tjjvr!| j j||||dS d S )N)r   )r  rc  r  r  r0   rp   r  r6  r  rK   rK   rL   r6  	  s   zCSEProxy.storec                 C  s:   | j j| | || |tjjvr| j |||S d S rY   )r  rc  r  r  r0   rp   r  r7  r  rK   rK   rL   r7  	  s
   zCSEProxy.store_reductionrl   rk   r  r  r.   rt  c                 C  s"   | j  jd7  _| j ||||S r  )r  r`  r3  r  rK   rK   rL   r3  	  s   zCSEProxy.reductionr  r	  r
  rv  r  rw  c                 C  s   | j |||S rY   )r  r  r  rK   rK   rL   r  	  s   	zCSEProxy.scanr  r  c                 C  r   rY   )r  r  r  rK   rK   rL   r  	  s   zCSEProxy.sortr  r  r  r  r  r  r  r  rz  c              	   C  s   | j |||||||S )a  
        [Note: Inductor bucketize op]

        Inputs:
        -------
        values: the values to be bucketized.
        boundaries: a tuple containing
          (a) the name of the boundaries tensor (which must be sorted, unless
          the sorting tensor is present),
          (b) the length of the tensor in the last dimension (i.e. the length of
          one set of boundaries),
          (c) the number of elements in the underlying storage (i.e. the length
          of the flattened tensor, ignoring striding), and
          (d) the stride of the tensor in the last dimension.
        boundary_indices: indices into a flattened version of the boundaries
        tensor, of the same size and shape as "values".  Each index points to
        the first element in the set of boundaries to be used for the
        corresponding value.
        indexing_dtype: the dtype to use when indexing into the boundaries
        tensor.  This must be int64 or int32.  This additionally specifies the
        dtype of the return value.
        right: see "Details" below.
        sorter: an optional tuple containing
          (a) the name of an optional sorting tensor, used to access unsorted
          boundaries without reordering the boundaries tensor, and
          (b) the stride of the tensor in the last dimension.
        The values in the sorting tensor are used as indices into the *last*
        dimension of the boundaries tensor, with all other indices matching.
        The size of the sorting and boundaries tensors must be equivalent.
        sorter_indices: must be present if the sorting array is present; see
        "boundary_indices" for the equivalent definition for the boundaries
        tensor.

        Output:
        -------
        The buckets each value belongs in, within a given set of boundaries.  0
        indicates a position before the first boundary, and len(boundaries_set)
        represents a position after the last boundary.

        Details:
        --------
        Given a value and a set of boundaries, calculate the bucket that each
        value belongs to.  This works differently in 1-D and N-D cases.

        for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [0, 4, 4, 8], right=True
        return =   [[ 0, 1, 1, 1], [1, 3, 3, 4]].

        for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [[0, 4], [4, 8]], right=True
        return =   [[ 0, 1, 1, 1], [0, 1, 1, 2]]

        Note that in the N-D boundaries case, the shape of "values" and
        "boundaries" must match in every dimension _except_ the last.

        When right == False, bucket i refers to range (boundaries[i], boundaries[i+1]].
        When right == True,  bucket i refers to range [boundaries[i], boundaries[i+1]).

        Boundaries must be non-decreasing, or a sorter must be provided which
        would re-index offsets in a non-decreasing order (e.g. the second output
        of torch.sort(offsets)).  Otherwise, the result is undefined.
        )r  r  r  rK   rK   rL   r  	  s   FzCSEProxy.bucketize)r  r  r  r  )r   rC   r(  r  r)  r  rD   r   )r   rC   r(  r   r)  r   rD   r
  r;  )
r  r  r   r  r  rX   r  rX   rD   r  r<  r  )r   rC   r  r  rD   rE   rY   r  r  r  r  r  r=  r  )r\   r]   r^   r   rH  r  r  r  r  r5  r  r6  r7  r3  r  r  r  r  rK   rK   r  rL   r    s(    

=+
1







r  )rB   rC   rD   rE   rY   )
rg   rC   r   r   r   r   r   r   rD   rE   )rg   r   rD   r   )rg   r   r   r   rD   rX   )rg   rC   rD   r   )F)rg   rC   r   rX   rD   r   r  )r  r  r  r  r  r  rD   r   )rg   rC   r  r   rD   rE   )rg   rC   rD   r   )r'  rC   r(  r   r)  r   rD   r*  )r  rC   rD   rX   rK   r>  )rD   r   )
__future__r   r  dataclassesenumrZ  r"  rH   r  rg  retypingr   r   r   r   r   r   r	   r
   r   r   r   r   typing_extensionsr   rv   r   torch.fxtorch._prims_commonr   torch.utilsr   r  torch.utils._ordered_setr   torch.utils._sympy.numbersr   torch.utils._sympy.printersr   _PythonPrintertorch.utils._sympy.symbolr   r   r   torch.utils._sympy.value_rangesr   r   r!  r   r   dtype_propagationr   ops_handlerr   r   utilsr    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   virtualizedr+   r,   r-   r.   r/   r0   collections.abcr1   r2   r3   r   r4   r5   r6   r7   ru  r9   rv  r:   r;   r<   r  r?   r@   r   r   r   rC   r  r  r  _logginggetArtifactLoggerr\   rF   	getLoggerr  rM   rN   rb   	dataclassr   r   r   r   r   KernelArgTyper   r   r   r   r   r   r   r   r   r   	lru_cacher   r  r  r#  bfloat16r:  float16rX   r  float64int8int16r  r;  r   uint16r  uint64r&  r=  r>  r  compile
IGNORECASEr  r  r  r?  r  INT_TO_FLOATr4  r  r  r  r  r  r  r  r  r  r  r  r  rl   ReductionCacheKeyr  rL  rV  rR  r  r  r  rK   rK   rK   rL   <module>   sV   ,4 
^	6M*b
V e
&,49>CHOU[`flrw}             $  )  .  3  8  =  B  G  L  Q  V  [  c	  F$ %  !s