o
    Ih&V                    @  s  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
mZmZmZ d dlmZmZ d dlmZ d dlmZ d dl	mZmZmZmZmZmZmZmZmZ d dlmZmZmZ d d	l m!Z! d dl"Z"d d
l"m#Z#m$Z$m%Z% d dl&m'  m(  m)Z* d dl+m,  m-Z. d dl/Z0d dl1Z0d dl2m-  m3Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z>m?Z?m@Z@mAZAmBZB d dlCmDZD d dlEmFZFmGZGmHZHmIZImJZJmKZK d dlLmMZM d dlNmOZOmPZPmQZQ d dlRmSZS ddlTmUZUmVZV ddlWmXZXmYZYmZZZ ddlVm[Z[m\Z\m]Z]m^Z^m_Z_ ddl`maZa ddlbmcZcmdZdmeZemfZf ddlgmhZh ddlimjZjmkZk ddl-mlZlmmZmmnZnmoZompZpmqZqmrZrmsZsmtZtmuZumvZvmwZwmxZxmyZymzZzm{Z{ ddl|m}Z}m~Z~mZ er{d dlmZ dd lmZ dd!lmZ dd"l-mZ neZd#ed$< zd dlZejZd%ZW n ey   dZd&ZY nw ed'Zed(Zed)Zeee#f Zd#ed*< eeee#f Zd#ed+< eeZejejd,d-Ze0j}jZ	 eed.eed.f d/d0eeeeeed.f d.d/d0f   f Zd#ed1< ejd%d2G d3d4 d4Zdfd9d:Zdgd>d?ZdhdCdDZdhdEdFZdidLdMZg dNZg dOZ	djdkdTdUZdldWdXZ	djdkdYdZZedmdnd_d`Zedmdodbd`Z	%dmdpded`ZdqdidjZdrdmdnZdsdpdqZdsdrdsZdtdwdxZdud}d~ZdvddZG dd0 d0Zetd&d2G dd dZetG dd deZdwddZetG dd deZetG dd deZededededededdZded< 	%dmdxddZetG dd deZeee# ee# ge~f ZG dd deZG dd deZG dd deZetG dd deZetG dd deZetG dd deZdyddZdyddZ	%	&		&	dzd{ddZÐd|ddZetG dd deZetG ddÄ deŃZetG ddń deŃZetG ddǄ deŃZetG ddɄ deŃZetG dd˄ deɃZetG dd̈́ deŃZetG ddτ deŃZG ddф deʃZetG ddӄ deZetG ddՄ de΃ZetG ddׄ de΃ZАd}ddڄZѐd~dd܄ZG ddބ dރZetG dd deӃZG dd deԃZG dd deԃZG dd deԃZG dd deZG dd deՃZetG dd deӃZG dd deԃZetd&d2G dd deZetd&d2G dd deeZG dd de܃ZG dd deރZG dd deރZetG dd deZetG dd deZetd&d2G dd de݃ZG dd  d e݃ZG dd deZeeeeeeeeeeef  f ZG dd dZG dd deZG dd deZG d	d
 d
eZG dd deZetd&d2G dd de݃ZG dd deZG dd deZetd&d2G dd deZetd&d2G dd deZG dd deZG dd deZG dd de܃ZG dd deZG dd  d eZG d!d" d"eZG d#d$ d$eZG d%d& d&eZG d'd( d(eZG d)d* d*eZG d+d, d,eZG d-d. d.eZG d/d0 d0eZG d1d2 d2eZ G d3d4 d4eZetd&d2G d5d6 d6ZG d7d8 d8eZetd&d2G d9d: d:eZetG d;d< d<eӃZG d=d> d>eZejG d?d@ d@eZG dAd. d.eZG dBdC dCeZ	etd&d2G dDdE dEeZ
ddHdIZetd&d2G dJdK dKeZetd&d2G dLdM dMeZddQdRZetd&d2G dSdT dTeZG dUdV dVeZG dWdX dXeZetG dYdZ dZeZetG d[d\ d\eZG d]d^ d^eZG d_d` d`eZddddeZdS (      )annotationsN)	GeneratorIterableSequence)AbstractContextManagernullcontext)Enum)partial)	AnyCallableClassVarLiteralOptionaloverloadTYPE_CHECKINGTypeVarUnion)assert_neverNever	TypeAlias)patch)ExprIntegerSymbol)identity)GraphModuleSerializer)can_auto_functionalize)metrics)compute_required_storage_lengthis_boolean_dtypeis_float_dtypemake_channels_last_strides_for
StrideType)get_schema_info)compute_unbacked_bindingsfree_unbacked_symbolsrebind_unbackedresolve_unbacked_bindingsShapeEnvSymTypes
OrderedSet)CleanDivFloorDivModularIndexing)SymT   )configdependencies)BackendFeatureget_scheduling_for_deviceindex_prevent_reordering)Depextract_free_unbacked_symbols#extract_input_node_reduction_rangesextract_read_writesvar_builder)LoopBody)OpCounterCSEOpCountResultReductionType	StoreMode)benchmarker)DevicePropertiesReductionHint)argsortargsort_symcache_on_selfceildivconvert_shape_to_inductorconvert_shape_to_symintdeveloper_warningget_kernel_metadatair_dataclass
is_dynamicis_gpu	sympy_dotsympy_index_symbolsympy_index_symbol_with_prefixsympy_product
sympy_subs)opsOpsValueV)Node)CUDATemplate)GraphLowering)IndentedBufferr   rW   TF_T_U_V_IntLike_NumLikez  prefix	TensorBoxr   IRNode_NodeOrNodes)frozenc                   @  s.   e Zd ZU ded< ded< ded< ded< d	S )
GraphPartitionSignaturez5dict[str, Union[IRNode, sympy.Expr, TorchBindObject]]input_nodeslist[IRNode]output_nodeszdict[str, bool]input_deallocationboolskip_cudagraphN__name__
__module____qualname____annotations__ rq   rq   F/var/www/vscode/kcb/lib/python3.10/site-packages/torch/_inductor/ir.pyre      s
   
 re   node_or_nodesOptional[_NodeOrNodes]returnNonec                   s   d fdd  |  d S )Nnodesrt   ru   rv   c                   s   | d u rd S t | ttfr| D ]} | qd S t | tr*|  D ]} | q!d S t | ttttt	j
jjttttf	sEJ dt|  dd S )NzFound zE, which is not a supported top level IR node. See [Note: Inductor IR])
isinstancelisttupledictvalues
ExpandViewDynamicScalarAssertScalarra   sympylogicboolalgBooleanr   intEffectfulKernelShapeAsConstantBuffertype)rw   node_check_tensorboxrq   rr   r      s2   


z%validate_ir.<locals>._check_tensorbox)rw   rt   ru   rv   rq   )rs   rq   r   rr   validate_ir   s   r   namestrCallable[..., OpsValue]c                   s    t  tsJ d fdd}|S )	Nargsobjectkwargsru   rT   c                    s   t t | i |S N)getattrrS   r   r   r   rq   rr   fn      zops_wrapper.<locals>.fn)r   r   r   r   ru   rT   )rx   r   )r   r   rq   r   rr   ops_wrapper   s   r   orderSequence[int]&Callable[[Sequence[_T]], Sequence[_T]]c                   s(   t t| tt|  d fdd}|S )NindexSequence[_T]ru   c                   0   t  t ks
J  fddtt  D S )Nc                      g | ]} |  qS rq   rq   .0i)r   	inv_orderrq   rr   
<listcomp>       z4inverse_reorder.<locals>.reindex.<locals>.<listcomp>lenranger   r   r   rr   reindex      z inverse_reorder.<locals>.reindexr   r   ru   r   )r{   zipr   r   r   r   rq   r   rr   inverse_reorder   s   r   c                   s   d fdd}|S )Nr   r   ru   c                   r   )Nc                   r   rq   rq   r   )r   r   rq   rr   r      r   z1same_reorder.<locals>.reindex.<locals>.<listcomp>r   r   r   r   rr   r      r   zsame_reorder.<locals>.reindexr   rq   r   rq   r   rr   same_reorder   s   r   reindex1&Callable[[Sequence[_U]], Sequence[_V]]reindex2&Callable[[Sequence[_T]], Sequence[_U]]&Callable[[Sequence[_T]], Sequence[_V]]c                   s   d fdd}|S )Nr   r   ru   Sequence[_V]c                       | S r   rq   r   r   r   rq   rr   r        z fuse_reindexing.<locals>.reindex)r   r   ru   r   rq   )r   r   r   rq   r   rr   fuse_reindexing  s   r   )   r      r0   )   r   r   r   r0   seq(Sequence[Union[int, torch.SymInt, Expr]]	shape_envOptional[ShapeEnv]c                 C  s"   |du r
t | }|S t|| }|S )z1
    Convert strides to fill order (argsort)
    N)rC   rD   )r   r   
sorted_idxrq   rq   rr   get_fill_order  s
   
r   Sequence[Union[int, Integer]]c                   s0   dd t | D   fddtt| D }|S )z
    Convert stride order to fill order
    For channel last format,

    stride order = [3, 0, 2, 1] and fill order = [1, 3, 2, 0]
    c                 S     i | ]\}}||qS rq   rq   r   idxposrq   rq   rr   
<dictcomp>%      z+stride_order2fill_order.<locals>.<dictcomp>c                      g | ]} | qS rq   rq   r   lookuprq   rr   r   &      z+stride_order2fill_order.<locals>.<listcomp>)	enumerater   r   )r   
fill_orderrq   r   rr   stride_order2fill_order  s   r   c                 C  s>   t | |}dd tt| D }t|D ]\}}|||< q|S )z)
    Convert strides to stride order
    c                 S     g | ]}d qS r   rq   r   _rq   rq   rr   r   1      z$get_stride_order.<locals>.<listcomp>)r   r   r   r   )r   r   r   outr   elemrq   rq   rr   get_stride_order*  s
   

r   xLiteral[None]guard_shaperj   c                 C     d S r   rq   r   r   rq   rq   rr   ir_node_to_tensor7     r   torch.Tensorc                 C  r   r   rq   r   rq   rq   rr   r   ;  r   Optional[IRNode]Optional[torch.Tensor]c                   s   | d u rd S |st jjj nt  fdd|  D }t| r, fdd|  jD }nt	
|}|  }|  }t|}t|}t jjj  tj||||d }W d    |S 1 s_w   Y  |S )Nc                      g | ]} |qS rq   rq   r   sshape_fnrq   rr   r   J  r   z%ir_node_to_tensor.<locals>.<listcomp>c                   r   rq   rq   r   r   rq   rr   r   M  r   )sizestridedtypedevice)rU   graphsizevars	size_hintr   get_sizeis_storage_and_layout
get_layoutr   FlexibleLayoutcontiguous_strides	get_dtype
get_devicerH   r   suppress_guardstorchempty_stridedzero_)r   r   r   r   r   r   trq   r   rr   r   ?  s.   

valueOptional[Sequence[_T]] Optional[Sequence[Optional[_T]]]c                 C  s   t | tr
| s
d gS | S r   )rx   ry   r   rq   rq   rr   may_convert_to_optional[  s   r  2Union[IRNode, OutputSpec, torch.device, None, str]Optional[str]c                 C  sb   t | ts	| d u r| S t | tjr| jS t | ttfr!t|  S t	d|  dt| j
 d d S )Nzget_device_type(: ))rx   r   r   r   r   rb   
OutputSpecget_device_typer   r   rm   r   rq   rq   rr   r  e  s    r  &Union[IRNode, torch.device, None, str]c                 C  sl   t | }|dv rtt| ddkrdS dS |d u s"t| }d u r$dS ddlm} t|ts1J t||S )N)cpucuda_backendtritonTFr0   )TritonScheduling)	r  r   r1   r4   codegen.tritonr  rx   r   
issubclass)r   r   device_schedulingr  rq   rq   rr   	is_tritonq  s   
r  c                 C  s   t | dkS )Nr  )r  r	  rq   rq   rr   is_cpu  r   r  Union[Buffer, TensorBox]	alignmentr   c                   s~   t tr d u rdS t fddtt d D }tjj	
 d dkp:tjj	
 d dk}|o>|S )NFc                 3  s.    | ]}t jj |   d kV  qdS )r   N)rU   r   r   r   
get_strider   r  r   rq   rr   	<genexpr>  s
    
z-is_aligned_realized_tensor.<locals>.<genexpr>r0   )rx   rb   maybe_get_strideallr   r   r  rU   r   r   r   r   )r   r  aligned_stridesaligned_last_dimrq   r  rr   is_aligned_realized_tensor  s   r  strides1Sequence[_IntLike]strides2shapec                 C  s   t |t | krt | t |ksJ t|| |D ]'\}}}tjj|dr&qtjj||s?tjj|tjj|ks? dS qdS )zP
    Returns true if the strides are equal, ignoring dimensions of size 1 .
    r0   FT)r   r   rU   r   r   statically_known_leqstatically_known_equalssymbolic_hint)r   r"  r#  dims1s2rq   rq   rr   significant_strides_equal  s   $r*  tensorUnion[TensorBox, BaseView]strides"Sequence[Union[int, torch.SymInt]]c                 C  s   t | s| S tdd t||  D r| S t||  |  s"| S t| \}}g |j}t|  D ]\}}t	j
j|drE|| ||< q3t|j|j|j||j}tt||dS )a  
    Tries to match the strides of the tensor to those in the meta_strides. Strides of insignificant
    dimensions - size 0 or 1 - will be updated.

    If there are real stride differences (NHWC vs NCHW), or the tensor is not realized, then the input will be returned
    c                 s  $    | ]\}}t jj||V  qd S r   rU   r   r   r%  r   r(  r)  rq   rq   rr   r    
    
z2try_match_insignificant_strides.<locals>.<genexpr>r0   datalayout)r   r  r   r  r*  r   as_storage_and_layoutr   r   rU   r   r   r$  FixedLayoutr   r   r   offsetra   ReinterpretView)r+  r-  storage
old_layout
new_strider   r   
new_layoutrq   rq   rr   try_match_insignificant_strides  s,   

r>  c                   @  s  e Zd ZU e Zded< ejddZded< ejddZ	ded< ejddZ
d	ed
< eejdddZdddZdddZdddZdddZdddZdd!d"Zddd'd(Z	#ddd,d-Zdd/d0Zdd2d3Zdd5d6Zdd8d9Zdd;d<Zdd>d?Zdd@dAZddCdDZddFdGZe ddIdJZ!ddLdMZ"ddNdOZ#ddQdRZ$dddVdWZ%ddYdZZ&dd\d]Z'dd^d_Z(ddadbZ)ddddeZ*ddgdhZ+ddidjZ,ddkdlZ-ddmdnZ.dddqdrZ/ddudvZ0ddwdxZ1ddydzZ2dd{d|Z3	ddddZ4dddZ5dddZ6	ddddZ7dddZ8dddZ9dddZ:dddZ;dddZ<dddZ=dddZ>dddZ?dddZ@dddZAdddZBdddZCdddZDeErTe dddZFdSS dSS )rb   zClassVar[OrderedSet[Any]]_current_originsF)initOrderedSet[Any]originsOptional[list[str]]	tracebackOptional[torch.fx.Node]origin_nodeOrderedSet[Node]ru   Generator[None, None, None]c                 c  s.    t j}|| B t _z	d V  W |t _d S |t _w r   )rb   r?  )rB  oldrq   rq   rr   current_origins  s   
zIRNode.current_originsattrr   r   r
   rv   c                 C  s   t | || d S r   )r   __setattr__)selfrK  r   rq   rq   rr   _post_init_setattr  s   zIRNode._post_init_setattrc                 C  s<   |  dt| j |  dtjrt nd  |  dd  d S )NrB  rD  rF  )rN  r+   r?  r1   debug_ir_tracebackrD  format_stackrM  rq   rq   rr   __post_init__  s
   zIRNode.__post_init__OrderedSet[str]c                 C     t dd |  D S )Nc                 s      | ]}|j V  qd S r   r   r   deprq   rq   rr   r        z(IRNode.get_read_names.<locals>.<genexpr>r+   	get_readsrQ  rq   rq   rr   get_read_names     zIRNode.get_read_namesc                 C     | j S r   )rD  rQ  rq   rq   rr   get_traceback     zIRNode.get_tracebackc                 C  r]  r   rF  rQ  rq   rq   rr   get_origin_node  r_  zIRNode.get_origin_nodeOptional[Operation]c                 C  r   r   rq   rQ  rq   rq   rr   get_defining_op  r   zIRNode.get_defining_opTshortenrj   Sequence[str]c                 C  s:   dt | dd }|rt|dkr|d d  d}|gS )Nzorigins=rB   @   =   z...)r   r   )rM  rd  rB  rq   rq   rr   common_repr  s   zIRNode.common_reprlinesSequence[object]	multilinec                 C  sb   t |t | | }t tt|}|r&td|}t| j d| dS t| j d| dS )Nz,
z(
z
)(r  )ry   ri  mapr   indentjoinr   rm   )rM  rj  rd  rl  	new_linesrq   rq   rr   
str_helper  s   zIRNode.str_helpertorch.dtypec                 C  r]  r   r   rQ  rq   rq   rr   r     r_  zIRNode.get_dtypeOptional[torch.dtype]c                 C      z|   W S  ty   Y d S w r   )r   NotImplementedErrorrQ  rq   rq   rr   maybe_get_dtype  
   
zIRNode.maybe_get_dtypeLayoutc                 C     t dt|  d)Nz#get_layout() is not implemented by !rw  r   rQ  rq   rq   rr   r     r   zIRNode.get_layoutOptional[Layout]c                 C  rv  r   )r   rw  rQ  rq   rq   rr   maybe_get_layout  ry  zIRNode.maybe_get_layoutr  c                 C     |   S r   )r   rQ  rq   rq   rr   get_output_spec%     zIRNode.get_output_specOptional[OutputSpec]c                 C  rv  r   )r  rw  rQ  rq   rq   rr   maybe_get_output_spec(  ry  zIRNode.maybe_get_output_specc                 C  s   t |  tS )z4True for single tensor output (excludes MultiOutput))rx   r  rz  rQ  rq   rq   rr   has_tensor_output.  s   zIRNode.has_tensor_outputSequence[Expr]c                 C  r{  )Nz!get_size() is not implemented by r|  r}  rQ  rq   rq   rr   r   2  r   zIRNode.get_sizeOptional[Sequence[_IntLike]]c                 C  rv  r   )r   rw  rQ  rq   rq   rr   maybe_get_size5  ry  zIRNode.maybe_get_size.Union[_IntLike, sympy.Rel, Sequence[_IntLike]]c                 C  r  r   r   rQ  rq   rq   rr   r#  ;     zIRNode.shaper   c                 C     t |  S r   )rQ   r   rQ  rq   rq   rr   	get_numel?  r   zIRNode.get_numelc                 C     t jjt|  dS Nr   rU   r   r   is_expr_static_and_truer   Eqr  rQ  rq   rq   rr   is_zero_elementsB     zIRNode.is_zero_elementsr  c                 C     t dt|  )a)  
        If the IRNode refers to data which has not been materialized (e.g.,
        it is a Pointwise/Reduction that could potentially have more
        compute fused into it), realize the IRNode into physical memory,
        ending the possibility of fusing into it, but allowing, e.g., multiple
        users to access the data without having to recompute.

        Check StorageBox.realize for a particularly notable implementation.

        TODO(ezyang): I think, in principle, every IRNode should have an
        implementation of this, and most of the time no-op is OK, but you
        really do have to audit each IRNode for this, so for now, raise
        an error if it's not implemented.  Note that some code in graph.py
        will catch this thrown error and suppress it with a warning.
        zrealize NYI on r}  rQ  rq   rq   rr   realizeE  s   zIRNode.realizeNwriterOptional[IndentedBuffer]c                 C  r  )Nzcodegen_reference NYI on r}  rM  r  rq   rq   rr   codegen_referenceW     zIRNode.codegen_referenceOptional[torch.device]c                 C  r   r   rq   rQ  rq   rq   rr   r   Z  r   zIRNode.get_devicetorch.devicec                 C  s   |   }|d us
J |S r   r   rM  r   rq   rq   rr   get_device_or_error]  s   zIRNode.get_device_or_errorc                 C     dS NFrq   rQ  rq   rq   rr   has_exceeded_max_readsb  r   zIRNode.has_exceeded_max_reads$Callable[[Sequence[Expr]], OpsValue]c                 C     t t| jr   rw  r   rm   rQ  rq   rq   rr   make_loadere     zIRNode.make_loader Callable[[Sequence[Expr]], Expr]c                 C  r  r   r  rQ  rq   rq   rr   make_indexerh  r  zIRNode.make_indexerr!  c                 C  r  r   r  rQ  rq   rq   rr   r  k  r  zIRNode.get_stridec                 C  rv  r   )r  rw  rQ  rq   rq   rr   r  n  ry  zIRNode.maybe_get_stridec                 C  r  r   r  rQ  rq   rq   rr   get_namet  r  zIRNode.get_namec                 C  rv  r   )r  rw  rQ  rq   rq   rr   maybe_get_namew  ry  zIRNode.maybe_get_name	thresholdOptional[int]c                 C  r  r  rq   rM  r  rq   rq   rr   has_large_inner_fn}  r   zIRNode.has_large_inner_fnusersr   c                 C  r   r   rq   rM  r  rq   rq   rr   
mark_reuse  r   zIRNode.mark_reusec                 C  r   r   rq   rQ  rq   rq   rr   realize_hint  r   zIRNode.realize_hintc                 C  r  r   r  rQ  rq   rq   rr   unwrap_view  r  zIRNode.unwrap_viewc                 C  r  r   r  rQ  rq   rq   rr   freeze_layout  r  zIRNode.freeze_layoutr   	list[int]allow_paddingc                 C  r  r   r  rM  r   r  rq   rq   rr   freeze_layout_with_stride_order     z&IRNode.freeze_layout_with_stride_orderc                 C  r  r   r  rM  r   rq   rq   rr   freeze_layout_with_fill_order  r  z$IRNode.freeze_layout_with_fill_orderr   list[_IntLike]c                 C  r  r   r  rM  r   rq   rq   rr   freeze_layout_with_same_order  r  z$IRNode.freeze_layout_with_same_orderexact_stridesc                 C  r  r   r  rM  r  r  rq   rq   rr    freeze_layout_with_exact_strides  r  z'IRNode.freeze_layout_with_exact_stridesdependencies.ReadWritesc                 C  r  r   r  rQ  rq   rq   rr   get_read_writes  r  zIRNode.get_read_writesOrderedSet[Dep]c                 C  
   |   jS r   r  readsrQ  rq   rq   rr   rZ       
zIRNode.get_readsc                 C  r  r   )r   rZ  rQ  rq   rq   rr   	num_reads  r   zIRNode.num_readsr]   c                 C  r  r   r  rQ  rq   rq   rr   get_storage_numel  r  zIRNode.get_storage_numelOrderedSet[Symbol]c                 C  r  r   r  rQ  rq   rq   rr   get_unbacked_symbol_uses  r  zIRNode.get_unbacked_symbol_usesc                 C  r  r   r  rQ  rq   rq   rr   get_reduction_type  r  zIRNode.get_reduction_typeSequence[sympy.Expr]c                 C  r  r   r  rQ  rq   rq   rr   get_reduction_size  r  zIRNode.get_reduction_sizec                 C  r  r  rq   rQ  rq   rq   rr   	is_extern  r   zIRNode.is_externc                 C  r  r  rq   rQ  rq   rq   rr   is_no_op  r   zIRNode.is_no_opr   c                 C  r  r   r  r  rq   rq   rr   constant_to_device  r  zIRNode.constant_to_devicec                 C  r  r   r  rQ  rq   rq   rr   get_mutation_names  r  zIRNode.get_mutation_namesc                 C  r  r   r  rQ  rq   rq   rr   get_operation_name  r  zIRNode.get_operation_namec                 C  r  r   r  rQ  rq   rq   rr   get_inputs_that_alias_output  r  z#IRNode.get_inputs_that_alias_outputc                 C  r   r   rq   rQ  rq   rq   rr   r     r   zIRNode.dtype)rB  rG  ru   rH  )rK  r   r   r
   ru   rv   ru   rv   ru   rS  )ru   rC  ru   rE  ru   rb  T)rd  rj   ru   re  )TT)rj  rk  rd  rj   rl  rj   ru   r   ru   rs  )ru   ru  ru   rz  )ru   r~  ru   r  )ru   r  ru   rj   ru   r  )ru   r  )ru   r  ru   r   ru   r  r   r  r  ru   r   ru   r  ru   r  ru   r  ru   r  ru   r!  ru   r   r  r  ru   rj   r  r   ru   rv   ru   rb   Fr   r  r  rj   ru   rv   r   r  ru   rv   r   r  ru   rv   r  r  r  rj   ru   rv   ru   r  ru   r  ru   r   ru   r]   ru   r  ru   r  r   r  ru   rb   ru   re  )Grm   rn   ro   r+   r?  rp   dataclassesfieldrB  rD  rF  staticmethod
contextlibcontextmanagerrJ  rN  rR  r[  r^  ra  rc  ri  rr  r   rx  r   r  r  r  r  r   r  propertyr#  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rZ  r  r  r  r  r  r  r  r  r  r  r  r   r   rq   rq   rq   rr   rb     s   
 













































c                   @  s   e Zd Zd.ddZd/ddZd0d	d
Zd1ddZd2ddZd3ddZd3ddZ	d4ddZ
d5ddZd6ddZd7d d!Zd8d#d$Zd9d&d'Zd9d(d)Zd:d+d,Zd-S );	Operationru   rv   c                 C  s
   d | _ d S r   operation_namerQ  rq   rq   rr   rR    r  zOperation.__post_init__r  c                 C     t r   rw  rQ  rq   rq   rr   r     r   zOperation.get_devicerE  c                 C     t | dsJ | jS NrF  )hasattrrF  rQ  rq   rq   rr   ra       zOperation.get_origin_noderA  c                 C  r  )NrB  )r  rB  rQ  rq   rq   rr   get_origins  r	  zOperation.get_originsr   c                 C  s   | j d usJ | j S r   r  rQ  rq   rq   rr   r    r	  zOperation.get_operation_namerj   c                 C  r  r  rq   rQ  rq   rq   rr   r    r   zOperation.is_externc                 C  r  r  rq   rQ  rq   rq   rr   r    r   zOperation.is_no_opr  c                 C  r  r   r  rQ  rq   rq   rr   r    r   zOperation.get_read_writesr   c                 C  s   ||   v S r   )r[  )rM  r   rq   rq   rr   
is_user_of  r   zOperation.is_user_ofrS  c                 C  rT  )Nc                 s  rU  r   r   rV  rq   rq   rr   r    rX  z+Operation.get_read_names.<locals>.<genexpr>rY  rQ  rq   rq   rr   r[    r\  zOperation.get_read_namesr  c                 C  r  r   r  rQ  rq   rq   rr   rZ    r  zOperation.get_readslist[Buffer]c                 C  r  r   r  rQ  rq   rq   rr   get_outputs  r   zOperation.get_outputsOrderedSet[sympy.Symbol]c                 C     t  S r   r*   rQ  rq   rq   rr   get_unbacked_symbol_defs  r_  z"Operation.get_unbacked_symbol_defsc                 C  r  )a  
        Returns the unbacked symbols which are required to be in scope in
        order to successfully perform codegen for this buffer.  For example,
        a buffer that corresponds to an extern kernel call that takes i0 as
        an argument would return {i0} here.  This is used to generate necessary
        dependencies that ensure we actually bind i0 in codegen before you
        try to use it.

        Note that this is NOT transitive; in particular, if this buffer takes
        in as input another buffer with dynamic shape (e.g., (i0,)), we will
        not report it here, because you will already have a dependency
        on that buffer, which will eventually have a dependency on i0 if
        necessary.
        r*   rQ  rq   rq   rr   r    s   z"Operation.get_unbacked_symbol_usesr   c                 C  r  )z
        Gets extra global memory size needed by this buffer.
        Some algorithms (e.g. group gemm) may require extra global memory in the generated code.
        r   rq   rQ  rq   rq   rr   get_workspace_size  s   zOperation.get_workspace_sizeNr  r  r  )ru   rA  r  r  r  )r   r   ru   rj   r  r  ru   r  ru   r  r  )rm   rn   ro   rR  r   ra  r
  r  r  r  r  r  r[  rZ  r  r  r  r  rq   rq   rq   rr   r    s     













r  c                      s$  e Zd ZU ded< ded< ded< ded< dNddZdOddZdP fddZdQddZeZdRddZ	dSddZ
dTddZdTd d!ZedUd&d'ZeejfdVd*d+ZedWd-d.ZdXd0d1ZedQd2d3ZdYdZd8d9ZdNd:d;Zd[d=d>Zd\d@dAZd]dCdDZd^dFdGZd_dIdJZd`dLdMZ  ZS )aLoopsr  r   rs  r   Callable[..., Any]inner_fnr!  rangesru   r  c                 C  s&   t  jg dd | jD |  R  S )Nc                 s      | ]}t |V  qd S r   r%   r   erq   rq   rr   r        z1Loops.get_unbacked_symbol_uses.<locals>.<genexpr>)r+   unionr  inner_fn_free_unbacked_symbolsrQ  rq   rq   rr   r    s
   zLoops.get_unbacked_symbol_usesnamesre  r   c                   sF     d jj dt j  g fdd|D  d jg S )N'c                   s    g | ]}| d t  | qS =)r   )r   r   rQ  rq   rr   r   "  s     z!Loops._to_str.<locals>.<listcomp>origin_node=)rr  r   r   r   r   inner_fn_strrF  )rM  r  rq   rQ  rr   _to_str  s   zLoops._to_strrv   c                   s   t    d S r   )superrR  rQ  	__class__rq   rr   rR  &  r  zLoops.__post_init__c                 C  
   |  dS )Nr  r%  rQ  rq   rq   rr   __str__)  r  zLoops.__str__r  c                 C  r]  r   r   rQ  rq   rq   rr   r   .  r_  zLoops.get_devicerE  c                 C  r]  r   r`  rQ  rq   rq   rr   ra  1  r_  zLoops.get_origin_noder  c                 C  r]  r   r*  rQ  rq   rq   rr   r   4  r_  zLoops.get_sizec                 C  r]  r   r*  rQ  rq   rq   rr   get_pointwise_size7  r_  zLoops.get_pointwise_sizer   r
   r   ra   c                 O  sN   | dd }| dd }| |i |}|d| |d|p |j t|S )NrF  rD  )poprN  rD  ra   create)clsr   r   rF  tbrrq   rq   rr   r0  :  s   
zLoops.creater`   r/   c                   s    fddt | D S )Nc                   s*   g | ]\}}|d krt jjnt |qS r0   )r   SZerorP   )r   nr   r_   rq   rr   r   J      z Loops._index.<locals>.<listcomp>)r   )r  r`   rq   r_   rr   _indexH  s   
zLoops._indexr=   c              	   C  s   t t }t|2 ttdd | j|    |	 W  d    W  d    S 1 s0w   Y  W d    d S 1 s@w   Y  d S Nallow_indexingT)
r<   rU   MockHandlerset_ops_handlerr   r   r   r  inner_fn_argsgetvalue)rM  	opcounterrq   rq   rr   inner_fn_opcountO  s   RzLoops.inner_fn_opcountSequence[Sequence[_IntLike]]c                 C  s   |  | jfS r   )r9  r  rQ  rq   rq   rr   r>  Y  r  zLoops.inner_fn_argsc                 C  s   t jj| jg|  R  S r   )rU   KernelFormatterHandlerir_to_stringr  r>  rQ  rq   rq   rr   r$  \  s
   zLoops.inner_fn_strNr  r  rj   c                 C  s&   |d u rd}t |tj}|  j|kS r  )maxr1   realize_opcount_thresholdrA  num_opsr  rq   rq   rr   r  b  s   zLoops.has_large_inner_fnc                 C  s   |  | j}t| j|S r   )r9  r  r7   r  )rM  r   rq   rq   rr   r  h  s   z$Loops.inner_fn_free_unbacked_symbolsr  c                 C  sv   t tdd* |  r t|  |  |  jW  d    S t|  |  jW  d    S 1 s4w   Y  d S r:  )	r   r   r   r  r9   r  r   r  r  rQ  rq   rq   rr   rZ  l  s   $zLoops.get_readsrS  c                 C     t |  jS r   )r+   rA  read_buffersrQ  rq   rq   rr   r[  z  r  zLoops.get_read_namesr   c                 C  rH  r   )r   rA  rI  rQ  rq   rq   rr   r  }  r  zLoops.num_readsr  c                 C  r{  )Nz+get_reduction_size() is not implemented by r|  r}  rQ  rq   rq   rr   r       zLoops.get_reduction_sizer  c                 C  r{  )Nz+get_reduction_type() is not implemented by r|  r}  rQ  rq   rq   rr   r    rJ  zLoops.get_reduction_typerb   c                 C  r{  )Nz+constant_to_device() is not implemented by r|  r}  r  rq   rq   rr   r    rJ  zLoops.constant_to_devicer  )r  re  ru   r   r  r  r  r  r  )r   r
   r   r
   ru   ra   )r  r!  r`   r/   ru   r  )ru   r=   ru   rB  r   r  r  r  r  r  r  r  ) rm   rn   ro   rp   r  r%  rR  r,  __repr__r   ra  r   r.  classmethodr0  r  r/   INDEXr9  rE   rA  r>  r$  r  r  rZ  r[  r  r  r  r  __classcell__rq   rq   r'  rr   r    s>   
 







	





r  r   Union[Expr, Sequence[Expr]]r   rs  rT   c                C  s"   |j rttd|S td|S )Nnanr   )is_floating_pointrS   constantfloat)r   r   rq   rq   rr   nop_loader_fn  s   rU  c                   @  s>   e Zd ZdddZdddZdd	d
ZdddZdddZdS )	Pointwiseru   r  c                 C  s   |   rtt| jdS | jS Nrt  )r  r	   rU  r   r  rQ  rq   rq   rr   r    s   zPointwise.make_loaderr  c                 C  s   g S r   rq   rQ  rq   rq   rr   r    r   zPointwise.get_reduction_sizer  c                 C  r   r   rq   rQ  rq   rq   rr   r    r   zPointwise.get_reduction_typeoutput_nameindexer!Callable[[Sequence[Expr]], Never]varsr  rv   c                 C  s"   |   }t|p	d||||S Nunnamed)r  rS   storerM  rX  rY  r[  loaderrq   rq   rr   store_output  s   zPointwise.store_outputr   r  rb   c                 C  s.   |   }ttd||}t|| j|| jdS FMove this to a given device. Requires that all reads are to constants.override_devicer   r   r  r  )r  r   r   ConstantBufferrV  r   r  rM  r   r`  rq   rq   rr   r    s
   zPointwise.constant_to_deviceNr  r  r  rX  r  rY  rZ  r[  r  ru   rv   r  )rm   rn   ro   r  r  r  ra  r  rq   rq   rq   rr   rV    s    



	rV  c                   @  s6   e Zd ZU ded< dZded< dd
dZdddZdS )Scatterr  output_indexerNr?   scatter_moder   r  ru   rb   c                 C  s6   |   }ttd||}t|| j|| j| j| jdS )rc  rd  )r   r   r  r  rj  rk  )	r  r   r   rf  ri  r   r  rj  rk  rg  rq   rq   rr   r    s   zScatter.constant_to_devicerX  r  rY  rZ  r[  r  rv   c                 C  s6   |   }|d u r
d}tj||| |||| jdS )Nr]  )mode)r  rS   r^  rj  rk  r_  rq   rq   rr   ra    s   zScatter.store_outputr  rh  )rm   rn   ro   rp   rk  r  ra  rq   rq   rq   rr   ri    s
   
 
ri  
logical_ormaximumminimummuladdbitwise_xor)anyrE  minprodsumxor_sumz"dict[str, Callable[..., OpsValue]]REDUCTION_COMBINE_FNreduction_typearg_break_ties_leftCallable[..., object]c                   sR   t v rt  S dv rd fdd}|S d	kr"ddd}|S td )Nargmaxargminatuple[object, object]bru   tuple[OpsValue, OpsValue]c                   s   | \}}|\}}dkrt ||}nt ||}t ||}trCt ||}t ||}	t |t ||	}t |t ||	} rKt ||nt ||}
t |t ||
}t |||t |||fS )Nr~  )	rS   ltgteqr    nerm  logical_andwhere)r  r  a_valuea_indexb_valueb_indexmaskequala_isnanb_isnantierz  r   ry  rq   rr   argmax_combine_fn  s&   
z3get_reduction_combine_fn.<locals>.argmax_combine_fnwelford_combine#tuple[OpsValue, OpsValue, OpsValue]c                 S  sR   | \}}}|\}}}|| }|| }	||	 }
|||
  || || | |
  |	fS r   rq   )r  r  a_meana_m2a_weightb_meanb_m2b_weightdelta
new_weight	w2_over_wrq   rq   rr   welford_combine_fn  s   


z4get_reduction_combine_fn.<locals>.welford_combine_fnzunknown reduction_type=)r  r  r  r  ru   r  )r  r  r  r  ru   r  )rx  rw  )ry  r   rz  r  r  rq   r  rr   get_reduction_combine_fn  s   
r  c                      s:  e Zd ZU ded< ded< ded< ded< ddddZeZde fddZdfddZdgddZdhddZ	did!d"Z
djd$d%Zded&d'Zdkd+d,Ze	-dldmd8d9Zednd<d=Zeejd-fdod@dAZedpdDdEZedpdFdGZedqdKdLZedrdQdRZedsdYdZZedtd^d_Zedud`daZedvdbdcZ  ZS )w	Reductionr!  reduction_rangesr>   ry  rs  	src_dtyperB   reduction_hintru   r   c                 C  r)  )N)r  r  ry  r+  rQ  rq   rq   rr   r,  ,  r  zReduction.__str__r  c                   s"   t   t jdd | jD  B S )Nc                 s  r  r   r  r  rq   rq   rr   r  3  r  z5Reduction.get_unbacked_symbol_uses.<locals>.<genexpr>)r&  r  r+   r  r  rQ  r'  rq   rr   r  1  s   z"Reduction.get_unbacked_symbol_usesr  c                 C  r]  r   )r  rQ  rq   rq   rr   r  6  r_  zReduction.get_reduction_sizer  c                 C  r]  r   )ry  rQ  rq   rq   rr   r  9  r_  zReduction.get_reduction_typerX  rY  rZ  r[  r  reduction_varsSequence[Symbol]rv   c              	   C  s4   t | j| j| j| ||}t |pd|||S r\  )rS   	reductionr   r  ry  r  store_reduction)rM  rX  rY  r[  r  r   rq   rq   rr   r  <  s   
zReduction.store_reductionr   c                 C     t | jt | j S r   )r   r  r  rQ  rq   rq   rr   index_lengthK  r   zReduction.index_lengthSequence[Sequence[Expr]]c                 C  s$   |  | j}|  | jtj}||fS r   )r9  r  r  r/   R0_INDEXrM  r   rindexrq   rq   rr   r>  N  s   zReduction.inner_fn_argsc                 C  s*   |  | j}|  | jtj}t| j||S r   )r9  r  r  r/   r  r7   r  r  rq   rq   rr   r  S  s   z(Reduction.inner_fn_free_unbacked_symbolsr   r  rb   c              
   C  s>   |   }ttd||}t|| j|| j| j| j| j	t
jdS )rc  rd  r   r   r  r  r  ry  r  r  )r  r   r   rf  r  r   r  r  ry  r  rB   DEFAULTrg  rq   rq   rr   r  X  s   zReduction.constant_to_deviceN	dst_dtyper  r   r  %Union[ReductionType, Literal['scan']]reduction_numelr   
input_noder   tuple[ReductionHint, _IntLike]c	           #   
   C  s  d!dd}	t jj|}
t jjt|}|dkp(t j| tj o(|dvo(tj	}|	|
r1|	|s6t
jd	fS t| }|j}d
}|rWtjt jj| dd}tjt jj| dd}nd"dd}|}|d	kr||
|}|d	krpt
j|fS |d urt|trttdd t|\}}W d    n1 sw   Y  |d ur|d urt jjt|| }|
|krtd||||| t
jdfS t
j|fS |
|ks||d d
 krt
jd	fS t| |||||dkr|nd|t
jd}d#dd}||\}}|r||\}}t|dkrt
jd	fS t|  |! \\}}}d}d}|D ].}t jj"||} t jj#| |t$|% }!t&dd  |!D }"|"rA|d	7 }q|d	7 }q||krTt
j||
|fS t
j'||
|fS )$Nr   r   ru   rj   c                 S  s   t | ttfS r   )rx   r   r   r	  rq   rq   rr   
_is_statics  r  z(Reduction.num_splits.<locals>._is_staticscanr|  r0       T)inner_reductionFreduction_numel_hintr   
numel_hintc                 S  r  Nr0   rq   )r  r  rq   rq   rr   inner_reduction_splits     z4Reduction.num_splits.<locals>.inner_reduction_splitsr;  zUse previous IRNode's range and reduction_ranges instead of split. current ranges: %s, current reduction ranges: %s, current split: %d, new ranges: %s, new reduction ranges: %sr  r   rv  r  r3  r  tuple[Sequence[Expr], bool]c                   s   t d t|  |  |  d| d}| }|jd usJ dd |jD }g }d}t|jdd dD ]7 t	 fd	d
|D ri|
 j  jtjjv ritjj j }t|jdd }|  t|jdd |krid}q2||fS )Nr   r   r   r   r5  r4  c                 S  s&   g | ]}t |trt |tjs|qS rq   )rx   r   r   Numberr   r3  rq   rq   rr   r     s    
zBReduction.num_splits.<locals>.get_read_indices.<locals>.<listcomp>Fc                 S  r]  r   r   r	  rq   rq   rr   <lambda>      z@Reduction.num_splits.<locals>.get_read_indices.<locals>.<lambda>keyc                 3  s    | ]	}| j jv V  qd S r   )r   free_symbolsr  mdrq   rr   r        zAReduction.num_splits.<locals>.get_read_indices.<locals>.<genexpr>r   T)ComputedBufferr   r   r   r   r  
range_varssortedr  r  appendr   r   rU   r   name_to_bufferr   r5  decide_layout)r3  cbread_writesr  indiceschangedbuforiginal_striderq   r  rr   get_read_indices  s6   	z.Reduction.num_splits.<locals>.get_read_indicesr   c                 s  s    | ]}|d kV  qdS r0   Nrq   r   rq   rq   rr   r    r  z'Reduction.num_splits.<locals>.<genexpr>)r   r   ru   rj   )r  r   r  r   ru   r   )r3  r  ru   r  )(rU   r   r   r&  rQ   has_featurer3   REDUCE_TO_SINGLE_ELEMENTr1   split_reductionsrB   r  rA   r0  multi_processor_count	functoolsr	   choicesreduction_split_factorINNERrx   ra   r   r   r   r8   logdebugr  r   r2   index_vars_squeezer   r  simplify_with_rangesstride_hintsry   keysr  OUTER)#r   r  r  r  r  r  ry  r  r  r  r  r  should_splitpropsnum_smmin_elements_per_threadr  outer_reduction_splitssplit
new_rangesnew_reduction_rangesextracted_numel_hintr3  r  r  r  r   r  ranges1	num_outer	num_innerr   jr-  outerrq   rq   rr   
num_splitsg  s   
	











!

zReduction.num_splits<Callable[[Sequence[_IntLike], Sequence[_IntLike]], OpsValue](Callable[[Sequence[_IntLike]], OpsValue]c                   sn   dd D t || d fdd|d	v r3td
d
t dfddfddS S )z1Convert inner_fn from a reduction to an pointwisec                 S     g | ]	}t jj|qS rq   )rU   r   r   evaluate_static_shaper   r   rq   rq   rr   r     s    z2Reduction._unroll_reduction_fn.<locals>.<listcomp>r   r!  ru   r
   c                   s,   t  fddtjdd D  D S )Nc                 3  s    | ]} |V  qd S r   rq   )r   r  )r   value_fnrq   rr   r    s
    
z=Reduction._unroll_reduction_fn.<locals>.fn.<locals>.<genexpr>c                 S  s   g | ]}t |qS rq   )r   r  rq   rq   rr   r     r   z>Reduction._unroll_reduction_fn.<locals>.fn.<locals>.<listcomp>)r  reduce	itertoolsproductr   )
combine_fnr  r  r   rr   r     s   z*Reduction._unroll_reduction_fn.<locals>.fnr~  r}  Nr  r  c                   s*   dd |D }| |t  |tjfS )Nc                 S     g | ]}t |qS rq   )r   expandr   rq   rq   rr   r   0  r   zDReduction._unroll_reduction_fn.<locals>.value_fn.<locals>.<listcomp>)rS   
index_exprr   int64r   r  )flatten_indexr  rq   rr   r  -  s   z0Reduction._unroll_reduction_fn.<locals>.value_fnc                   s    | d S r  rq   r   )r   rq   rr   r  6  s    z0Reduction._unroll_reduction_fn.<locals>.<lambda>)r   r!  ru   r
   )r   r!  r  r!  ru   r  )r  r7  r   r   r  )r  r  ry  r  rq   )r  r  r   r  r  r  rr   _unroll_reduction_fn  s$   
	zReduction._unroll_reduction_fnr  ra   c
                   s  t jjt}
|
dkrDd fdd}|d|d|d|dd	 v s0J  d
d fdd}tj|||t|dS |
dkredv rTd fdd}ndfdd}tj| ||dS t	|
t
rt jj|
tjk rt|dkst|jrtj| | ||dS | | |||
|		\}}|tjkr|}|dkr|	d usJ t|	\}}|d usJ |d usJ | | |||||
S |dkr| | ||||	S tt| |||dS )Nr   valr   ru   Union[bool, float, int]c                   sH    t jkr	t| S  jrt| tjsJ t| S t| tjs J t| S r   )	r   rj   rR  rx   typingSupportsFloatrT  SupportsIntr   r
  r  rq   rr   py_cnstN  s   
z!Reduction.create.<locals>.py_cnstr0   )rv  rw  ru  rs  z* not supported for zero-dimension tensors!r   r   rT   c                   s   t   S r   rS   rS  r   )r  ry  rtypes_to_initsrq   rr   const_fnd     z"Reduction.create.<locals>.const_fnre  r  c                   s   t d S r  r  r   r  rq   rr   r   r  r   zReduction.create.<locals>.fnc                      dd D } | |S )Nc                 S     g | ]}t jjqS rq   r   r5  r6  r   rq   rq   rr   r   x  r   z0Reduction.create.<locals>.fn.<locals>.<listcomp>rq   r   reduction_index)r  r  rq   rr   r   w     
r  r  )r
  r   ru   r  )r   r   ru   rT   )rU   r   r   simplifyrQ   r  rV  r0  ry   rx   r   r   r1   unroll_reductions_thresholdrM   r   r	  r  rB   r  r8   !create_multilayer_existing_rangescreate_multilayerra   r  )r1  r   r  r  r  r  r  ry  r  r  r  r  r  r   hintr  r  r  rq   )r  r  r  ry  r  rr   r0  ;  s   

zReduction.creater   #Union[_NumLike, Sequence[_NumLike]]c                 C  s   | dv rt |rtdS t|rdS t|jS | dv r0t |r$tdS t|r*dS t|jS t|r6dnd}t|r>dnd}|||||||f|||ftd|fd	|  S )
N)rE  r}  z-infF)rt  r~  infTr   r0   )rv  ru  rw  rs  welford_reducer  online_softmax_reduce)r    rT  r   r   iinfort  rE  )ry  r   zeroonerq   rq   rr   default_accumulator  s0   
zReduction.default_accumulatorc                 C  s   | dkrdS t | |S )Nr#  r   )r  r(  ry  r   rq   rq   rr   default_value  s   zReduction.default_valuer  r]   r  c                 C  sP   | dkr|S | dkr|dkr|t jkrt jS | dkr&|dkr&|t jkr&t jS |S )Nr     i      )rB   r  
OUTER_TINY)r  r  r  rq   rq   rr   _multilayer_second_step_hint  s   
z&Reduction._multilayer_second_step_hintr`  
block_sizedefaultr{  c                   sF   t |gtjjt| d d	 fdd}|S )
Nr   r   r  r  ru   rT   c                   sh   |\}| ^ }| |  d fdd}r1t t  tjt tj}t ||S | S )Nru   rT   c                     s    gS r   rq   rq   )r  r`  	new_indexr   rq   rr   body  r  zCReduction._multilayer_wrap_loader.<locals>.wrapper_fn.<locals>.body)ru   rT   )rS   r  r  r   int32masked)r   r  reduction_blockr2  r  r/  r0  r`  	need_maskr  r   )r  r1  rr   
wrapper_fn  s   
z5Reduction._multilayer_wrap_loader.<locals>.wrapper_fn)r   r  r  r  ru   rT   )Viewdynamic_reshape_indexerrU   r   r   r  r   r  )r1  r`  r  r  r  r/  r0  r8  rq   r6  rr   _multilayer_wrap_loader  s   
z!Reduction._multilayer_wrap_loader@Callable[[Sequence[sympy.Expr], Sequence[sympy.Expr]], OpsValue]original_rangesoriginal_reduction_rangesr  Sequence[Integer]r  c                   sN   t dd D sJ dt|t|t| d fd	d
}|S )Nc                 s  s    | ]}|d kV  qdS r  rq   r  rq   rq   rr   r  4  r  zDReduction._multilayer_wrap_loader_existing_ranges.<locals>.<genexpr>z8Only enabled for numel_hint == 1, found original_ranges=merged_indexr  new_reduction_indexru   rT   c                   s:   | d t  }| t d  } |t|t| S r   )r   rz   )r@  rA  original_idxr1  r`  r=  r   rq   rr   r8  ;  s   zEReduction._multilayer_wrap_loader_existing_ranges.<locals>.wrapper_fn)r@  r  rA  r  ru   rT   )r  r9  r:  rz   )r1  r`  r=  r>  r  r  r8  rq   rC  rr   '_multilayer_wrap_loader_existing_ranges+  s   	z1Reduction._multilayer_wrap_loader_existing_rangesr8  
list[Expr]list[Integer]c                   s   |t jt jfvr
|nt j}t|||||||	|}|  |  d
 fdd}tj	j
t|}| |
||}||dt| ksDJ tt|||||t|d |	||d	S )a
        Break a large reduction up into multiple smaller reductions
        recursively
        r   r!  r  ru   rT   c                   s    g | |S r   rq   r  intermediate_loaderrq   rr   intermediate_fnp  s   z;Reduction.create_multilayer_helper.<locals>.intermediate_fnNr  )r   r!  r  r!  ru   rT   )r   float16bfloat16rT  r  r0  r  r  rU   r   r   r   rQ   r.  r   ra   )r1  r   r  r  r8  r=  r>  r  r  ry  r  r  intermediate_dtypeintermediaterJ  r  rq   rH  rr   create_multilayer_helperH  sD   
z"Reduction.create_multilayer_helperc
                 C  sb   t |}
t|
|d  |}| ||}| |||
|||}| ||||||g |||g|||	S )rG  r0   )rQ   r-   r*  r;  rO  )r1  r   r  r  r  r  r  ry  r  r  r  r/  r0  r8  rq   rq   rr   r    s&   
zReduction.create_multilayerc                 C  s8   |  |||||}| ||||||g ||||	d|
S )rG  r  )rD  rO  )r1  r   r  r  r  r=  r>  r  r  ry  r  r8  rq   rq   rr   r    s(   
z+Reduction.create_multilayer_existing_rangesr  r  r  r  
rX  r  rY  rZ  r[  r  r  r  ru   rv   r  ru   r  r  r   )r   r  r  rs  r  rs  r  r   r  r!  r  r!  ry  r  r  r   r  r   ru   r  )
r  r  r  r!  ry  r   r  rs  ru   r  )r   r  r  rs  r  rs  r  r  r  r  r  r  ry  r>   r  rB   r  r   ru   ra   ry  r   r   rs  ru   r!  )r  r]   r  r   r  rB   ru   rB   )r`  r   r  r!  r  r]   r  r]   r/  r]   r0  r!  ru   r{  )r`  r<  r=  r  r>  r  r  r?  r  r?  ru   r<  )r   r  r  rs  r  rs  r8  r  r=  r  r>  r  r  rE  r  rF  ry  r>   r  r]   r  rB   ru   ra   )r   r  r  rs  r  rs  r  r  r  r  r  r  ry  r>   r  r]   r  rB   ru   ra   )r   r  r  rs  r  rs  r  r  r=  r  r>  r  r  rF  r  rF  ry  r>   r  rB   ru   ra   )rm   rn   ro   rp   r,  rL  r  r  r  r  r  r>  r  r  r  r  r	  rM  rB   r  r0  r(  r*  r.  r;  rD  rO  r  r  rO  rq   rq   r'  rr   r  $  sT   
 








 $/
 #?&r  c                      s2   e Zd ZU ded< d fddZd ddZ  ZS )!MultiOutputReductionr   output_indexr   r  r  rs  	inner_fns)Union[INNER_FN_TY, Sequence[INNER_FN_TY]]r  r?  r  ry  r>   r  r  rB   c
              
     sX   t  r f t dkr d }
nd fdd	}
t j|||
|||||d
 |	| _d S )Nr0   r   r   r  reduction_idxru   tuple[OpsValue, ...]c                   s   t  fddD S )Nc                 3  s    | ]}| V  qd S r   rq   r   r   r   rW  rq   rr   r        z@MultiOutputReduction.__init__.<locals>.loader.<locals>.<genexpr>)rz   rZ  rU  rZ  rr   r`    s   z-MultiOutputReduction.__init__.<locals>.loaderr  )r   r  rW  r  ru   rX  )callabler   r&  __init__rT  )rM  r   r  rU  r  r  ry  r  r  rT  r`  r'  r\  rr   r^    s    


zMultiOutputReduction.__init__rX  r  rY  rZ  r[  r  r  r  ru   rv   c              	   C  sZ   t | j| j| j| ||}t|ttfsJ t	| || j
 }t |p'd|||S r\  )rS   r  r   r  ry  r  rx   rz   ry   r   rT  r  )rM  rX  rY  r[  r  r|   r   rq   rq   rr   r    s   

z$MultiOutputReduction.store_reduction)r   r  r  rs  rU  rV  r  r?  r  r?  ry  r>   r  rs  r  rB   rT  r   rP  )rm   rn   ro   rp   r^  r  rO  rq   rq   r'  rr   rS    s   
 %rS  c                   @  s"   e Zd ZeejdfdddZdS )OnlineSoftmaxReductionNr   r  r  rs  r  r  r  r  r  r  
num_outputr   r  rB   r  r   ru   Sequence[TensorBox]c
                   s<   t  fddt|D }
|
D ]}|  q|
S )z>
        Create the reduction disregarding splitting.
        c                 3  s.    | ]}t t d |	V  qdS )r$  N)ra   r0  rS  r   
output_idxr   r  r  r  r  r  r  rq   rr   r  %  s"    
z0OnlineSoftmaxReduction.create.<locals>.<genexpr>)rz   r   r  )r1  r   r  r  r  r  r  r`  r  r  resultsr   rq   rd  rr   r0    s   
zOnlineSoftmaxReduction.create)r   r  r  rs  r  rs  r  r  r  r  r  r  r`  r   r  rB   r  r   ru   ra  )rm   rn   ro   rM  rB   r  r0  rq   rq   rq   rr   r_    s
    
r_  c                   @  s<   e Zd ZeejfdddZedddZedddZ	dS )WelfordReductionr   r  r   rs  rU  Sequence[Callable[..., Any]]r  rF  r  ry  r>   r  rB   ru   ra  c              
     s6  dv sJ t jjt}dfdd}	|dkr-|	d}
|	d}|	d}|
||fS |d	krWdfdd dkrL d |	d|	d	fS t fddD S tjd |d\}}tj	kro||d	kr| 
|S fddtdD }|D ]}|  q|S )N)r#  r  r
  r   ru   ra   c                   s&   d fdd}t j|tdS )	Nr   r  ru   rT   c                   s   t  S r   r  r   )r   r
  rq   rr   r  K  s   z8WelfordReduction.create.<locals>.const.<locals>.inner_fnre  r   r  ru   rT   rV  r0  ry   )r
  r  )r   r   r  r  rr   constJ  s   z&WelfordReduction.create.<locals>.constr   r0   r`  4Callable[[Sequence[Expr], Sequence[Expr]], OpsValue]c                   s&   d fdd}t j|tdS )	Nr   r  ru   rT   c                   r  )Nc                 S  r  rq   r  r   rq   rq   rr   r   d  r   zKWelfordReduction.create.<locals>.copy.<locals>.inner_fn.<locals>.<listcomp>rq   )r   r  )r`  r  rq   rr   r  c  r  z7WelfordReduction.create.<locals>.copy.<locals>.inner_fnre  ri  rj  )r`  r  )r   r   r  r  r`  rr   copy`  s   z%WelfordReduction.create.<locals>.copyr#  c                 3  s    | ]} |V  qd S r   rq   rY  )rn  rq   rr   r  q  r  z*WelfordReduction.create.<locals>.<genexpr>)ry  r  c                   s*   g | ]}t t |	qS rq   )ra   r0  rf  rb  )r   r   rU  r  r  r  ry  rq   rr   r     s     z+WelfordReduction.create.<locals>.<listcomp>r   )r
  r   ru   ra   )r`  rl  ru   ra   )rU   r   r   r  rQ   rz   r  r  rB   r  r  r   r  )r1  r   r   rU  r  r  ry  r  r  rk  meanm2weightr   r  re  r   rq   )rn  r   r   rU  r  r  r  ry  rr   r0  ;  sT   



zWelfordReduction.creater   r!  c                 C  r  )N)r   r   r   rq   r)  rq   rq   rr   r*    r  zWelfordReduction.default_valuer  r]   c	              
     s$  t tjjt d }	|	r9|dkr9dfd
d}
j||d t|
ddt|
ddf|d|dS t	d   t
|t fdd|D g | g||}|D ]}|  qadddtjjt |}||}t
|tfdd|D |gd|S )rG  r   r  r   r  rW  r   r   ru   rT   c                   s   t | S r   r  )r   rW  r   rt  rq   rr   rS    s   z4WelfordReduction.create_multilayer.<locals>.constantr  r0   )r   r   rU  r  r  ry  r  r  c              	   3  s&    | ]}j | d dV  qdS )r   )r0  N)r;  )r   r`  )r/  r1  r  r  r  rq   rr   r    s    	
z5WelfordReduction.create_multilayer.<locals>.<genexpr>r   r  r`  r  c                 S  s   |g | |S r   rq   )r   r  r`  rq   rq   rr   intermediate_loader_fn  s   zBWelfordReduction.create_multilayer.<locals>.intermediate_loader_fnc                 3  s     | ]}t  | d V  qdS )rm  N)r	   r  r   )rr  rq   rr   r    
    
N)r   r  rW  r  r   r   ru   rT   )r   r  r  r  r`  r  ru   rT   )rQ   rU   r   r   r  r   r  r  r	   r-   rf  r0  rz   r  r   r.  )r1  r   r   rU  r  r  ry  r  r  r7  rS  intermediatesr   r  rq   )r/  r1  r   rr  r  r  r  rr   r    sb   

	


z"WelfordReduction.create_multilayerN)r   r  r   rs  rU  rg  r  rF  r  rF  ry  r>   r  rB   ru   ra  rR  )r   r  r   rs  rU  rg  r  rF  r  rF  ry  r>   r  r]   r  rB   ru   ra  )
rm   rn   ro   rM  rB   r  r0  r  r*  r  rq   rq   rq   rr   rf  :  s    	xrf  c                      s   e Zd ZU ded< ded< ded< ded< ded	< d
ed< ded< ded< dH fddZdI fddZdJdd ZdKd!d"ZdLd$d%ZdMd&d'Z	dMd(d)Z
dNd*d+ZdOd-d.ZdHd/d0Zeejfd1d2dPd<d=ZedQdFdGZ  ZS )RScanrF  scan_rangesr   =Callable[[tuple[Any, ...], tuple[Any, ...]], tuple[Any, ...]]r  zFCallable[[Sequence[_IntLike], Sequence[_IntLike]], Sequence[_IntLike]]r   rB   r  r   rT  tuple[torch.dtype, ...]dtypestuple[Callable[..., Any], ...]rU  ru   r  c                   :   t   t jdd | jD  B t jdd | jD  B S )Nc                 s  r  r   r  r  rq   rq   rr   r  ,  r  z0Scan.get_unbacked_symbol_uses.<locals>.<genexpr>c                 s  r  r   r  r  rq   rq   rr   r  -  r  )r&  r  r+   r  rv  r   rQ  r'  rq   rr   r  &  s   zScan.get_unbacked_symbol_usesrv   c                   0   t | jt | j t | jksJ t   d S r   )r   r  rv  r   r&  rR  rQ  r'  rq   rr   rR  0     "zScan.__post_init__rX  r  rY  %Callable[[Sequence[_IntLike]], Never]r[  r  	scan_varsr  c                   sR   |  || t fdd| jD }t| j| j|}t|p d| || j S )Nc                 3      | ]}| V  qd S r   rq   r   r  rh  rq   rr   r  <  r  z'Scan.store_reduction.<locals>.<genexpr>r]  )	r   rz   rU  rS   r  ry  r  r^  rT  )rM  rX  rY  r[  r  r|   resultrq   rh  rr   r  4  s   zScan.store_reductionc                 C  r  )Ncustomrq   rQ  rq   rq   rr   r  B     zScan.get_reduction_typer  c                 C  r]  r   )rv  rQ  rq   rq   rr   r  F  r_  zScan.get_reduction_sizec                 C  r]  r   r   rQ  rq   rq   rr   r   I  r_  zScan.get_sizec                 C  r]  r   r*  rQ  rq   rq   rr   r.  L  r_  zScan.get_pointwise_sizec                 C  r  r   )r   r  rv  rQ  rq   rq   rr   r  O  r   zScan.index_lengthrB  c                 C  .   |  | j}|  | jtj}| ||}|fS r   )r9  r  rv  r/   r  r   rM  r   r  r   rq   rq   rr   r>  R     zScan.inner_fn_argsc                 C  4   |  | j}|  | jtj}| ||}t| j|S r   )r9  r  rv  r/   r  r   r7   r  r  rq   rq   rr   r  X     z#Scan.inner_fn_free_unbacked_symbolsT)can_fallback_to_atenr   r  +tuple[Callable[[Sequence[Expr]], Any], ...]axisr  rj   r   r
   Sequence[Optional[TensorBox]]c                  s  g d    d d    g	t jtjs$d gt S tdkr9t jtjs9d gt S t jj}
|
t		}ttksNJ |

t|drgfddttD S | jd d  	|d\}t
|dkrtjjd u ptotdkotdk}|s|rd gt S d}nt
d 	fdd	
fddttD }|D ]}|  q|S )Nr0   c                   &   g | ]}t j | | d qS re  rV  r0  r   rT  r   ry  rU  r   rq   rr   r         zScan.create.<locals>.<listcomp>r   )r   r   r  r  pointwise_rangesrv  r  
scan_numelz3.3.0r   r  
scan_indexru   rE  c                   H   t |t ks
J t | t ksJ g | d   ||  d  S r   r   )r   r  )r  r  rv  rq   rr   r         zScan.create.<locals>.reindexc                   sB   g | ]}t 	d| | 
 |d qS ))r   r   ry  r  rU  r   r  rv  r  r   r  rT  rq   )ra   r0  r  )r  r   ry  rU  r   r  r  r   rv  	scan_typer   rq   rr   r     s*    )r   r  r  r  ru   rE  )rU   r   r  r3   SCANr   TUPLE_REDUCTIONr   r  rQ   r  r   Ler   r  ru  r   versionhip
has_tritontriton_version	SplitScanr  )r1  r   ry  rU  r   r  r  r  r  r   r   r  r  supports_splitre  r  rq   )r  r  r   ry  rU  r   r  r  r   rv  r  r   rr   r0  ^  sV    







zScan.creater   rs  r  r  r  r  r   r  c	           
   
     s*   d
 fdd}	t j||||	||d|d	S )Nr   r  rW  ru   rT   c                   s$   g | d   ||  d  S r   rq   rZ  r  r  rq   rr   r8    s   $z#Scan.num_splits.<locals>.wrapper_fnr  )r   r  r  r  r  r  ry  r  )r   r  rW  r  ru   rT   )r  r  )
r1  r   r   r  r  r  rv  r  r  r8  rq   r  rr   r    s   zScan.num_splitsr  r  )
rX  r  rY  r~  r[  r  r  r  ru   rv   r  r  r  r  rK  )r   r  ry  rx  rU  r  r   rF  r  r   r  rw  r  rB   r  rj   r   r
   ru   r  )r   r  r   rs  r  r  r  r   r  rF  rv  rF  r  rw  r  r   ru   r  )rm   rn   ro   rp   r  rR  r  r  r  r   r.  r  r>  r  rM  rB   r  r0  r  rO  rq   rq   r'  rr   ru    s4   
 








	aru  c                   @     e Zd ZdS )r  Nrm   rn   ro   rq   rq   rq   rr   r    s    r  c                      s   e Zd ZU ded< ded< ded< ded< ded	< d
ed< ded< ded< ded< d9 fddZd: fddZd;dd Zd<d!d"Zd=d#d$Zd=d%d&Z	d=d'd(Z
d>d)d*Zd?d,d-Zd9d.d/Zeejfd@d7d8Z  ZS )ASortrF  sort_rangesr   z:Callable[[Sequence[Expr], Sequence[Expr]], Sequence[Expr]]r   rB   r  r   rT  rx  ry  rz  rU  rj   stable
descendingru   r  c                   r{  )Nc                 s  r  r   r  r  rq   rq   rr   r    r  z0Sort.get_unbacked_symbol_uses.<locals>.<genexpr>c                 s  r  r   r  r  rq   rq   rr   r    r  )r&  r  r+   r  r  r   rQ  r'  rq   rr   r    s   zSort.get_unbacked_symbol_usesrv   c                   r|  r   )r   r  r  r   r&  rR  rQ  r'  rq   rr   rR    r}  zSort.__post_init__rX  r  rY  r  r[  r  r  c                   sV   |  || t fdd| jD }t| j|| j| j}t|p"d| || j	 S )Nc                 3  r  r   rq   r  rh  rq   rr   r  	  r  z'Sort.store_reduction.<locals>.<genexpr>r]  )
r   rz   rU  rS   sortry  r  r  r^  rT  )rM  rX  rY  r[  r  r|   r  rq   rh  rr   r    s   zSort.store_reductionc                 C  r  )Nr  rq   rQ  rq   rq   rr   r  	  r   zSort.get_reduction_typec                 C  r]  r   )r  rQ  rq   rq   rr   r  	  r_  zSort.get_reduction_sizec                 C  r]  r   r  rQ  rq   rq   rr   r   	  r_  zSort.get_sizec                 C  r]  r   r*  rQ  rq   rq   rr   r.  	  r_  zSort.get_pointwise_sizec                 C  r  r   )r   r  r  rQ  rq   rq   rr   r  	  r   zSort.index_lengthr  c                 C  r  r   )r9  r  r  r/   r  r   r  rq   rq   rr   r>  	  r  zSort.inner_fn_argsc                 C  r  r   )r9  r  r  r/   r  r   r7   r  r  rq   rq   rr   r  !	  r  z#Sort.inner_fn_free_unbacked_symbolsr   r  'tuple[Callable[[list[Expr]], Any], ...]r  r   r
   r  c	                   s*  g 	d   	 d d  	  g
t jtjs$d gt S t jj}
|
t
}d}t	j
jo=|
t||}|sGd gt S ttksQJ |
t|drj	fddttD S d 
fd
d	
fddttD }|D ]}|  q|S )Nr0   r+  c                   r  r  r  r  r  rq   rr   r   L	  r  zSort.create.<locals>.<listcomp>r   r  
sort_indexru   rE  c                   r  r   r  )r   r  )r  r  r  rq   rr   r   V	  r  zSort.create.<locals>.reindexc                   sD   g | ]}t td| | 	|
 d qS ))r   r   ry  r  rU  r   r  r  r   r  rT  r  r  rq   )ra   r0  r  r  )r  r   ry  rU  r   r  r  r   r   r  r  rq   rr   r   [	  s,    )r   r  r  r  ru   rE  )rU   r   r  r3   SORTr   r   r  rQ   r1   r  persistent_reductionsr  r   r  r   r  )r1  r   ry  rU  r   r  r  r  r  r   r   
sort_numel
max_rblockis_persistent_kernelre  r  rq   )r  r  r   ry  rU  r   r  r  r   r   r  r  rr   r0  '	  s0    




zSort.creater  r  )
rX  r  rY  r  r[  r  r  r  ru   rv   r  r  r  rQ  )r   r  ry  rx  rU  r  r   rF  r  r   r  rj   r  rj   r  rB   r   r
   ru   r  )rm   rn   ro   rp   r  rR  r  r  r  r   r.  r  r>  r  rM  rB   r  r0  rO  rq   rq   r'  rr   r    s.   
 








r  c                 C  s(   z	t | dd W dS  ty   Y dS w )NFfreezeT)r6  rw  r	  rq   rq   rr   r   w	  s   r   c                 C  s@   zt | dd\}}| r|  | W S  ty   Y dS w NFr  )r6  should_pad_stridespad_stridesis_contiguousrw  )r   _bufferr5  rq   rq   rr    is_contiguous_storage_and_layout	  s   
r  r  want_contiguousstride_order'Optional[Sequence[Union[int, Integer]]]r  r  tuple[StorageBox, Layout]c           	      C  s   t | trt| j|||||dS t | tr)t| j|||||d\}}| | j fS t | trc|r[|r?|   |   s>J n|durK| j	||d n|durW| j
||d n|   t| |  fS t | trvt| j|d\}}|| jfS t)z
    Try to simplify x into a StorageBox and a Layout.

    allow_padding only affect how we apply stride_order. When allow_padding
    is True, we have the freedom to add padding when applying the stride_order.
    r  r  r  r  r  Nr  r  )rx   ra   r6  r4  
StorageBoxr   Bufferr  r  r  r  r  r9  r5  rw  )	r   r  r  r  r  r  r   r5  bufferrq   rq   rr   r6  	  sR   






r6  c                 C  s2   zt | dd\}}||W S  ty   Y dS w r  )r6  is_stride_orderedrw  )r   r  r  r5  rq   rq   rr   "is_stride_order_storage_and_layout	  s   r  c                   @  s   e Zd ZU ded< dBddZdCdd	ZdDddZdEddZedFddZ	dGddZ
dHddZdIddZdJddZdKd d!ZdLd%d&ZdMd(d)ZdNd+d,Zd-d. Zd/d0 ZdMd1d2ZdMd3d4ZdOd6d7ZdPd9d:Zd;d< ZdQd?d@ZdAS )RBaseViewrb   r4  ru   r  c                 C  
   | j  S r   r4  r  rQ  rq   rq   rr   r  	  r  z!BaseView.get_unbacked_symbol_uses*Callable[[Sequence[Expr]], Sequence[Expr]]c                 C  s   t d|  )Nzmake_reindexer NYI on r  rQ  rq   rq   rr   make_reindexer	  r  zBaseView.make_reindexerr  c                   &   | j   |  d fdd}|S )Nr   r  ru   r   c                   r   r   rq   rh  innerr   rq   rr   rY  	  r   z&BaseView.make_indexer.<locals>.indexer)r   r  ru   r   )r4  r  r  rM  rY  rq   r  rr   r  	     
zBaseView.make_indexerr  c                   r  )Nr   r  ru   rT   c                   r   r   rq   rh  r  rq   rr   r`  	  r   z$BaseView.make_loader.<locals>.loaderri  )r4  r  r  rM  r`  rq   r  rr   r  	  r  zBaseView.make_loaderrs  c                 C  r  r   )r4  r   rQ  rq   rq   rr   r   	     
zBaseView.dtyperz  c                 C  r  r   r4  r   rQ  rq   rq   rr   r   	  r  zBaseView.get_layoutr  c                 C  r  r   r4  r   rQ  rq   rq   rr   r   	  r  zBaseView.get_devicerE  c                 C  r   r   rq   rQ  rq   rq   rr   ra  	  r   zBaseView.get_origin_noder   c                 C  r  r   r4  r  rQ  rq   rq   rr   r  	  r  zBaseView.get_namer  c                 C  r  r   r  rQ  rq   rq   rr   r.  	  r  zBaseView.get_pointwise_sizer  r   rv   c                 C     | j |S r   r4  r  r  rq   rq   rr   r   
  r   zBaseView.mark_reuserj   c                 C  r  r   r4  r  rQ  rq   rq   rr   r  
  r  zBaseView.has_exceeded_max_readsr  c                 C  r  r   r4  r  rQ  rq   rq   rr   r  
  r  zBaseView.realizec                 C  r  r   r4  r  rQ  rq   rq   rr   r  	
  r  zBaseView.realize_hintc                 C  r  r   r4  r  rQ  rq   rq   rr   r  
  r  zBaseView.get_storage_numelc                 C  r  r   r4  r  rQ  rq   rq   rr   r  
  r  zBaseView.is_externc                 C  r  r   )r4  is_module_bufferrQ  rq   rq   rr   r  
  r  zBaseView.is_module_bufferrS  c                 C  r  r   r4  r[  rQ  rq   rq   rr   r[  
  r  zBaseView.get_read_namesr  c                 C  sF   t tdd t|  |  jW  d    S 1 sw   Y  d S r:  )r   r   r   r9   r  r   r  rQ  rq   rq   rr   rZ  
  s   $zBaseView.get_readsc                 C  s"   | }t |tr|j}t |ts|S r   )rx   r  r4  )rM  r   rq   rq   rr   r  
  s
   

zBaseView.unwrap_viewr   r  c                 C  s2   |   }ttd||}t||  ||  dS rb  )r  r   r   rf  rV  r   r   rg  rq   rq   rr   r  %
  s   zBaseView.constant_to_deviceNr  )ru   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )rm   rn   ro   rp   r  r  r  r  r   r   r   r   ra  r  r.  r  r  r  r  r  r  r  r[  rZ  r  r  rq   rq   rq   rr   r  	  s0   
 



		











r  c                   @  s@   e Zd ZU ded< edd Zedd Zdd	d
Zdd Z	dS )r}   rE  r   c                 C  s   t jj}tttj|}|  }dgt|t|  t| }t|t|ks)J t	t|D ]A}|| dkrF|| dus?J || ||< q/|| du s\t jjj
jt|| dddr]q/|j|| ||  dddkspJ dq/|S )	zReplace `-1` with correct sizesNr  r0   Tsize_obliviousr   fallbackzKBroadcast failed in ExpandView({x.get_size()}, {new_size}) on dimension {i})rU   r   r   ry   rn  r   r  r   r   r   r   evaluate_exprr  r   )r   new_sizer   old_sizer   rq   rq   rr   _normalize_size5
  s"    zExpandView._normalize_sizec           
      C  s   |  ||}t|r\t|\}}t|t|j }|dksJ tjjg| }t|j	|jD ]\}}|
tjjjjt|dddsC|ntjj q-t|j|jt|||j}	t||	dS t||dS )Nr   r0   Tr  r3  )r4  r   )r  r   r6  r   r   r   r5  r6  r   r   r  rU   r   r   r   r  r  r7  r   r   ry   r8  r9  r}   )
r1  r   r  r:  r;  skipr<  r   r   r=  rq   rq   rr   r0  P
  s.   
zExpandView.createru   r  c                 C  r]  r   r  rQ  rq   rq   rr   r   l
  r_  zExpandView.get_sizec                   s4   |   }| j   t|t   fdd}|S )Nc                   sR   t | d  } t| t ksJ tt D ]} | dkr&tjj| |< q| S r  )ry   r   r   r   r5  r6  )r   r   actualr  rq   rr   r   t
  s   z*ExpandView.make_reindexer.<locals>.reindex)r   r4  r   )rM  targetr   rq   r  rr   r  o
  s
   
	zExpandView.make_reindexerNr  )
rm   rn   ro   rp   r  r  rM  r0  r   r  rq   rq   rq   rr   r}   1
  s   
 


r}   c                   @  s@   e Zd ZU ded< edd Zedd Zdd	d
Zdd ZdS )PermuteViewrE  dimsc                   s   |  |}t|ttt|ksJ t|r<t|\} t j j fdd|D  fdd|D  j	}t
||dS t||dS )Nc                      g | ]} j | qS rq   r  r   r;  rq   rr   r   
  r   z&PermuteView.create.<locals>.<listcomp>c                   r  rq   r   r   r  rq   rr   r   
  r   r3  )r4  r  )_map_neg_dimsr+   r   r   r   r6  r7  r   r   r8  r9  r  )r1  r   r  r:  r=  rq   r  rr   r0  
  s   
zPermuteView.createc                   s    fdd D S )Nc                   s$   g | ]}|d kr
|nt  | qS r   r  )r   r'  r  rq   rr   r   
     $ z-PermuteView._map_neg_dims.<locals>.<listcomp>rq   )r1  r  rq   r  rr   r  
     zPermuteView._map_neg_dimsru   r  c                   sD   t | | jt tt| jksJ | j   fdd| jD S )Nc                   r   rq   rq   r   r  rq   rr   r   
  r   z(PermuteView.get_size.<locals>.<listcomp>)r+   r  r  r   r   r4  r   rQ  rq   r  rr   r   
  s
   

zPermuteView.get_sizec                   s^   dd t | jD   fddtt| jD  t ttt| jks'J  fdd}|S )Nc                 S  r   rq   rq   )r   r   r  rq   rq   rr   r   
  r   z.PermuteView.make_reindexer.<locals>.<dictcomp>c                   r   rq   rq   r   invrq   rr   r   
  r   z.PermuteView.make_reindexer.<locals>.<listcomp>c                   s    fddD S )Nc                   r   rq   rq   r   r   rq   rr   r   
  r   z?PermuteView.make_reindexer.<locals>.reindex.<locals>.<listcomp>rq   r   r  r   rr   r   
  r  z+PermuteView.make_reindexer.<locals>.reindex)r   r  r   r   r+   )rM  r   rq   r  rr   r  
  s
   zPermuteView.make_reindexerNr  )	rm   rn   ro   rp   rM  r0  r  r   r  rq   rq   rq   rr   r  
  s   
 


r  c                   @  s6   e Zd ZeddddZedddZdddZdS )SqueezeViewNr'  c                  s>  t |rst|\}}g }g } d ur(t tsJ dd kr& t|jk s(J tt|j|jD ]0\}\}}	 d u rJ|dkrI|	| |	|	 q1| krY|	| |	|	 q1|dksaJ dq1t
|j|j|||j}
t||
dS  d u rt|dd | D S |   dksJ t| fddt| D S )	Nzexpected integer dim argumentr   r0   zexpected squeezed size to be 1r3  c                 S     g | ]}|d kr|qS r4  rq   r   rq   rq   rr   r   
  r   z&SqueezeView.create.<locals>.<listcomp>c                   s   g | ]
\}}| kr|qS rq   rq   r   r   r   r  rq   rr   r   
      )r   r6  rx   r   r   r   r   r   r   r  r7  r   r   r8  r9  r9  r0  r   )r1  r   r'  r:  r;  r  r<  r   r   r   r=  rq   r  rr   r0  
  s:   


"zSqueezeView.creater   r  c                   s@   dd | D }dd t | D t|  d
 fdd	}||fS )Nc                 S  r  r4  rq   r   rq   rq   rr   r   
  r   z(SqueezeView.squeezer.<locals>.<listcomp>c                 S  s   g | ]
\}}|d kr|qS r4  rq   r  rq   rq   rr   r   
  r  r   list[sympy.Expr]ru   tuple[sympy.Expr, ...]c                   sT   t | t ksJ |  d tjjg  }t| D ]\}}|||< qt|S )N )r   r   r5  r6  r   rz   )r   r1  r   r   lengthnot_onerq   rr   r   
  s
   "
z%SqueezeView.squeezer.<locals>.reindex)r   r  ru   r  )r   r   )r   r  r   rq   r  rr   squeezer
  s
   zSqueezeView.squeezerru   rv   c                 C  s   t d)Nzuse SqueezeView.create())AssertionError)rM  r4  rq   rq   rr   r^  
  r  zSqueezeView.__init__)r   r  r  )rm   rn   ro   rM  r0  r  r  r^  rq   rq   rq   rr   r  
  s    %r  c                   @  sT   e Zd ZU ded< ded< dd Zdd	d
ZdddZeZedd Z	dddZ
dS )GenericViewrE  r   r  r   c                 C  r]  r   )r   rQ  rq   rq   rr   r  
  r_  zGenericView.make_reindexerru   r   c                 C  sB   dd t t| jD }t| |}ddtt| d| S )Nc                 S     g | ]}t tj|qS rq   )rP   r/   rN  )r   r7  rq   rq   rr   r   
      z+GenericView.reindex_str.<locals>.<listcomp>zlambda , r  )r   r   r   ry   r   rp  rn  r   )rM  	index_old	index_newrq   rq   rr   reindex_str
  s
   zGenericView.reindex_strc                 C  s$   |  | jd| j d|   gS )Nsize=zreindex=)rr  r4  r   r	  rQ  rq   rq   rr   r,  
  s   zGenericView.__str__c                 C  s   | |t ||dS )Nr4  r   r   )ry   )r1  r   r  r   rq   rq   rr   r0  
  r  zGenericView.creater  c                 C  r]  r   r  rQ  rq   rq   rr   r     r_  zGenericView.get_sizeNr  r  )rm   rn   ro   rp   r  r	  r,  rL  rM  r0  r   rq   rq   rq   rr   r  
  s   
 


r  c                   @  sH   e Zd Zedd Zedd Zedd Zedd Zed	d
 Z	dS )r9  c                 C  s<   t | } t |}tjjjj}|t | dr| | } | S r  )r   r  rU   r   r   r   r  Lt)r   r   r  rq   rq   rr   handle_negative_index  s   

zView.handle_negative_indexc           	        s  t |ttfs	J | | |\ }tjj |r|S d}t	t
 dks/t	t
|dkr1d}d|v rD fdd}| |t||dS t|sJ|rw|r[t|s[t|t| }t|dd\}}t|j|j|t||j}t||dS |  |}| |t||dS )	NFr   Tc                   s   t dgt  S r  )rz   r   r   r  rq   rr   fake_reindex"  r  z!View.create.<locals>.fake_reindexr  )r  r3  )rx   rz   ry   resolve_negative_sizer   rU   r   r   statically_known_list_equalsr   r%   r  ExternKernelrequire_exact_stridesr   r   r6  r7  r   r   r8  r9  r:  )	r1  r   r  unbacked_symbols_in_sizesr  r:  r;  r=  r   rq   r  rr   r0    s6   zView.createc                 C  s   dd |D }dd | D } t |}tt|D ]}|| dkr3tjj||< tt| t|||<  nqtj	j
t| t| | |fS )Nc                 S  r  rq   rU   r   r   r  r  rq   rq   rr   r   A      z.View.resolve_negative_size.<locals>.<listcomp>c                 S  r  rq   r  r  rq   rq   rr   r   B  r  r  )ry   r   r   r   r5  Oner,   rQ   rU   r   r   guard_equals)r  r  r   rq   rq   rr   r  ?  s   zView.resolve_negative_sizec              	   C  sX   z	|  ||}W |S  ttfy+   t|g}|  ||}|  ||}t||}Y |S w r   )_dynamic_reshape_indexerr  
IndexErrorrQ   r   )r1  r  r  r   flatr   r   rq   rq   rr   r:  N  s   
zView.dynamic_reshape_indexerc                   sJ  t jjj}dd tt|D  tt |}t| }g |r|r| }| \}}|dkr>	t
jj |	||f n|dkrH|	| n||||kr^	| t jj|| n||||k r||||k r| \}}	|| | }||	 }||||k sn	| t jj|| nK||||krt
jj}
|}	t||
| |
| }
||||kr| }	t||
| |
| }
|| }||||kst jj|| nt|r|s!|r| }t jj|d 	t
jj |s|r| \}}t jj|d |s  tt| ksJ  fdd}|S )zG
        Perform a reshape entirely by modifying indexing math
        c                 S  r  rq   )rP   r/   VIEWr   rq   rq   rr   r   b  r  z1View._dynamic_reshape_indexer.<locals>.<listcomp>r0   c                   sH   t | t ksJ t | t ftt|  t fddD S )Nc                 3  s    | ]}t | V  qd S r   )rR   r  replacementsrq   rr   r    r[  zAView._dynamic_reshape_indexer.<locals>.reindex.<locals>.<genexpr>)r   r{   r   rz   r   r[  	view_exprr  rr   r     s   $z.View._dynamic_reshape_indexer.<locals>.reindex)rU   r   r   r   r   r   ry   r   r/  r  r   r5  r6  r  r  r.   r  reverse)r  r  r   	stack_new	stack_oldsize_oldvarsize_newvar2	size_new2divisormodulusr   rq   r  rr   r  Z  sj   



 zView._dynamic_reshape_indexerN)
rm   rn   ro   r  r  rM  r0  r  r:  r  rq   rq   rq   rr   r9    s    

.

r9  c                      s   e Zd ZU dZded< d/ fddZd0d	d
ZeZdd Zd1ddZ	d2ddZ
edd Zd3ddZdd Zd4ddZd5ddZd6d d!Zd"d# Zd7d%d&Zd8d9d*d+Zd:d-d.Z  ZS );r9  z*Pretend our storage has a different layoutrz  r5  ru   rv   c                   s2   t    t| jtrt| d| j  d S d S )Nr4  )r&  rR  rx   r4  r  r   rL  r  rQ  r'  rq   rr   rR    s   
zReinterpretView.__post_init__r   c                 C     |  | j| jgS r   )rr  r4  r5  rQ  rq   rq   rr   r,    s
   zReinterpretView.__str__c                 C  r  r   r  rQ  rq   rq   rr   r    r  zReinterpretView.get_namer  c                 C     | j jS r   )r5  r   rQ  rq   rq   rr   r     r  zReinterpretView.get_devicerE  c                 C  r   r   rq   rQ  rq   rq   rr   ra    r   zReinterpretView.get_origin_nodec                 C  r,  r   )r5  r   rQ  rq   rq   rr   r     r  zReinterpretView.dtyper  c                 C     t | jjS r   )ry   r5  r   rQ  rq   rq   rr   r     r   zReinterpretView.get_sizec                 C  r-  r   )ry   r5  r   rQ  rq   rq   rr   r    r   zReinterpretView.get_strider  c                      d fdd}|S )Nr   r  ru   rT   c                   sF    j  }t  || } j j jjkr!t| j jjS |S r   )r5  r  rS   loadr  r   r4  to_dtype_bitcast)r   rY  
tmp_loaderrQ  rq   rr   r`    s
   
z+ReinterpretView.make_loader.<locals>.loaderr   r  ru   rT   rq   r  rq   rQ  rr   r    s   zReinterpretView.make_loaderr  c                 C  r  r   )r5  r  rQ  rq   rq   rr   r    r  zReinterpretView.make_indexerc                 C  r]  r   r5  rQ  rq   rq   rr   r     r_  zReinterpretView.get_layoutc                 C  r   r   rq   rQ  rq   rq   rr   r    r   zReinterpretView.freeze_layoutr  c                 C  s$   t | jjt | jjB t | jjB S r   )r%   r5  r   r   r8  rQ  rq   rq   rr   r    s   


z(ReinterpretView.get_unbacked_symbol_usesNr  r  c                 C  s@   t jjj| j| jj| jj| jj|d ur|j	nt jjj	| jj
dS rW  )rU   r   wrapper_codecodegen_reinterpret_viewr4  r5  r   r   r8  	writeliner   r  rq   rq   rr   r    s   z!ReinterpretView.codegen_referencer   c                 C  r  r  rq   rQ  rq   rq   rr   r    r   zReinterpretView.num_readsr  r  r  r  r  r  r  r  r  r   r  r  )rm   rn   ro   __doc__rp   rR  r,  rL  r  r   ra  r   r   r   r  r  r  r   r  r  r  r  rO  rq   rq   r'  rr   r9    s(   
 








r9  c                   @  sT   e Zd ZU dZded< edd Zddd	ZeZe	d
d Z
dddZdddZdS )	DtypeViewz(Pretend our storage has a different typers  target_dtypec                 C  sD   t |rt|\}}t|j||j|j|j}t||dS t||dS )Nr3  )r4  r9  )	r   r6  r7  r   r   r   r8  r9  r8  )r1  r   	new_dtyper:  r;  r=  rq   rq   rr   r0    s   zDtypeView.createru   r   c                 C  r+  r   )rr  r4  r9  rQ  rq   rq   rr   r,    r  zDtypeView.__str__c                 C  r]  r   )r9  rQ  rq   rq   rr   r     s   zDtypeView.dtyper  c                 C  r  r   r4  r   rQ  rq   rq   rr   r     r  zDtypeView.get_sizer  c                   s   j    fdd}|S )Nc                   s   t  | jjjS r   )rS   r0  r9  r4  r   rh  r  rM  rq   rr   r`    s   z%DtypeView.make_loader.<locals>.loaderr4  r  r  rq   r<  rr   r    s   
zDtypeView.make_loaderNr  r  r  )rm   rn   ro   r7  rp   rM  r0  r,  rL  r   r   r   r  rq   rq   rq   rr   r8    s   
 



r8  c                   @  s&   e Zd Zedd ZedddZdS )		SliceViewc                   s   t jj| | tdd ||fD rtjtjnjj	fdd  fdd}||dd}|||}||fS )zz
        Normalize start and end such that both are in the range
        [0, x.get_size()[dim]] and start <= end.
        c                 s  r  r   r  r  rq   rq   rr   r  %  r  z0SliceView.normalize_start_end.<locals>.<genexpr>c                   s<    | |r| n | |}||r|}|S ||}|S r   )statically_known_geqr$  )r   lowerupperclamped_lowerclamped_full)max_funcmin_funcr   rq   rr   clamp,  s   
z,SliceView.normalize_start_end.<locals>.clampc                   s$   | d u r|S  | }  | ||S r   )r  )r
  r@  rA  r0  )rF  r1  dim_sizerq   rr   
clamp_wrap7  s   z1SliceView.normalize_start_end.<locals>.clamp_wrapr   )
rU   r   r   r   rs  r   MinMaxevaluate_minevaluate_max)r1  r   r'  startendrH  rq   )rF  r1  rG  rD  rE  r   rr   normalize_start_end  s   zSliceView.normalize_start_endr0   Tc                   s  t tt jsdksJ zdkr!|dkr!dkr!|W S W n	 ty+   Y nw t| |r>| | |\}t| d   < t	|rzt
|\}}t|j}	|	   |	 < t|j|j|	|j|j    }
t||
dS  fdd}t||dS )Nr   l    r0   r3  c                   sD   t | t ksJ d|  d t| } |     |  < | S )Nzwrong ndim r  )r   ry   r   r'  r  rM  steprq   rr   r   c  s   $z!SliceView.create.<locals>.reindexr  )r   r  rx   r   	TypeErrorry   r   rO  r-   r   r6  r   r7  r   r   r8  r9  r>  )r1  r   r'  rM  rN  rQ  rF  r:  r;  r<  r=  r   rq   rP  rr   r0  A  s6   

zSliceView.createN)r0   T)rm   rn   ro   rM  rO  r0  rq   rq   rq   rr   r>    s
    
$r>  c                   @  sF   e Zd ZU ded< ded< dddZdd
dZdddZdddZdS )BaseConstantrs  r   r  r   ru   r  c                 C  r  Nrq   rq   rQ  rq   rq   rr   r   r  r   zBaseConstant.get_sizer  c                 C  r]  r   r-  rQ  rq   rq   rr   r   u  r_  zBaseConstant.get_devicerE  c                 C  r   r   rq   rQ  rq   rq   rr   ra  x  r   zBaseConstant.get_origin_noder  c                 C  r  r   r*   rQ  rq   rq   rr   rZ  {  r_  zBaseConstant.get_readsNr  r  r  r  )rm   rn   ro   rp   r   r   ra  rZ  rq   rq   rq   rr   rS  m  s   
 


rS  c                   @  sD   e Zd ZU ded< ded< ded< dd	d
ZdddZdddZdS )Constantr
   r   rs  r   r  r   ru   r  c                   r.  )Nr   r  ru   rT   c                      t  j jS r   )rS   rS  r   r   r   rQ  rq   rr   r`    r  z$Constant.make_loader.<locals>.loaderr2  rq   r  rq   rQ  rr   r       zConstant.make_loaderr  c                 C  r   r   rq   rQ  rq   rq   rr   r    r   zConstant.realizerb   c                 C     t | j| j|dS )N)r   r   r   )rU  r   r   r  rq   rq   rr   r    r  zConstant.constant_to_deviceNr  r  r  )rm   rn   ro   rp   r  r  r  rq   rq   rq   rr   rU    s   
 

rU  c                   @  s:   e Zd ZU ded< ded< ded< dd	d
ZdddZdS )IndexingConstantr
   r   rs  r   r  r   ru   r  c                   r.  )Nr   r  ru   rT   c                   rV  r   )rS   r  r   r   r   rQ  rq   rr   r`    r  z,IndexingConstant.make_loader.<locals>.loaderr2  rq   r  rq   rQ  rr   r    rW  zIndexingConstant.make_loaderrb   c                 C  rX  )N)r   r   r   )rY  r   r   r  rq   rq   rr   r    r  z#IndexingConstant.constant_to_deviceNr  r  )rm   rn   ro   rp   r  r  rq   rq   rq   rr   rY    s   
 
rY  r   c                 C  s    t dd t| t||D S )Nc                 s  s&    | ]\}}}|d kp||kV  qdS r  rq   )r   leftrightr   rq   rq   rr   r    s
    
z2is_contiguous_strides_for_shape.<locals>.<genexpr>)r  r   r   r   )r   r#  rq   rq   rr   is_contiguous_strides_for_shape  s
   r\  c                 C  s   t j| j S r   )r1   padding_alignment_bytesitemsizert  rq   rq   rr   get_align_for_dtype  r   r_  c                   @  s$   e Zd ZdZd
ddZdddZd	S )r  zxAbstract base for Layout, MultiOutputLayout, NoneLayout.
    Represents the memory layout of the output of an Operation.ru   r  c                 C  r  r   r  rQ  rq   rq   rr   r     r  zOutputSpec.get_devicer   c                 C  r  r   r  rQ  rq   rq   rr   storage_size  r  zOutputSpec.storage_sizeNr  r  )rm   rn   ro   r7  r   r`  rq   rq   rq   rr   r    s    
r  c                   @  s   e Zd Zdedfd4ddZd5ddZeZd6ddZd7ddZe	d8ddZ
d7ddZd7d d!Zd"d# Ze	d$d% Zd&d' Zd(d) Zd*d+ Zd9d-d.Zd7d/d0Zd:d2d3ZdS );rz  Nr   r   r  r   rs  r   rE  r   Optional[list[Expr]]r8  r   ru   rv   c                 C  sn   |d u r	t |}|| _|| _t|t|ks!J d| d| tdd |D s,J || _|| _|| _d S )Nr
  	, stride=c                 s  s    | ]
}t |ttfV  qd S r   )rx   r   r   r   rq   rq   rr   r    s    z"Layout.__init__.<locals>.<genexpr>)	r   r   r   r   r   r  r   r   r8  )rM  r   r   r   r   r8  rq   rq   rr   r^    s   
$
zLayout.__init__r   c                 C  sr   d}| j dkrd| j  }| jjd u rdnd| jj }t| j d| jj | d| j d| j d| j | d	S )
Nrf  r   z	, offset=:z('z', z, size=rb  r  )r8  r   r   r   rm   r   r   r   )rM  r8  device_index_strrq   rq   rr   r,    s   
"zLayout.__str__c                 C  r]  r   r-  rQ  rq   rq   rr   r     r_  zLayout.get_devicerj   c                 C  s   t | j| jS r   )r\  r   r   rQ  rq   rq   rr   r    r  zLayout.is_contiguousr#  r!  r-  c                 C  sV   t | }|dvs| d dkrdS t|t| | D ]\}}}|dkr(||kr( dS qdS )N)r      r0   FT)r   r   r!   )r#  r-  ndimrZ  r[  r   rq   rq   rr   is_channels_last_contiguous  s   
z"Layout.is_channels_last_contiguousc                 C  sJ   t | jtttt| j| jD ]\}}}|dkr"||kr" dS qdS )Nr0   FT)r   r   reversedr   r   ry   r   )rM  rZ  r[  r   rq   rq   rr   is_transposed  s   zLayout.is_transposedc                   s   t jt  ksJ dd tjD }fdd|D } fdd|D  dd }|  dgt   }tt  D ]
}|| | | < q<tt  d D ]'}|| ||d  k}t|tsqtjj	j
|| ||d  kd	d
}|rv dS qOd	S )Nc                 S  s*   g | ]\}}t jjj|d ddkr|qS )r   r  r0   rU   r   r   r   )r   r   r'  rq   rq   rr   r     s
    z,Layout.is_stride_ordered.<locals>.<listcomp>c                   r  rq   r  r   rQ  rq   rr   r     r   c                   r   rq   rq   r   r   rq   rr   r     r   c                   s   t |   fdd| D S )Nc                      g | ]}  |qS rq   r   )r   element
sorted_arrrq   rr   r   
  r   zDLayout.is_stride_ordered.<locals>.sorted_indices.<locals>.<listcomp>)r  )arrrq   rm  rr   sorted_indices     z0Layout.is_stride_ordered.<locals>.sorted_indicesr  r0   Tr  F)r   r   r   r   r   rx   rj   rU   r   
_shape_envr  )rM  r   non_1_indicesr   rp  stride_orderedr   exprrq   )r   rM  rr   r    s*   
zLayout.is_stride_orderedc                 C  s:   dgt ttdt| jd  }t|g| }| |S Nr   r0   )ry   rh  r   r   r   r  r  rq   rq   rr   is_channels_last_stride_ordered  s   "
z&Layout.is_channels_last_stride_orderedc                 C  s*  t |}t| dkr| S tjst|| r| S t }t|dr)|j	
ddr)| S tdd t| |D s8| S t| }t|}dd tt| D }d	||d < d}t|d	d
 d	dD ]*\}	}
||	d	  }|| ||  }|tjkr|| dkrt||| }d}|||
< q]|s| S t jd	7  _|S )z
        The padding does not change stride order but makes sure all strides larger
        than the threshold are multiple of align.
        r   metadislike_paddingFc                 s  s     | ]}t |ttjfV  qd S r   )rx   r   r   r   r   rq   rq   rr   r  >  rs  z&Layout._pad_strides.<locals>.<genexpr>c                 S  r   r   rq   r   rq   rq   rr   r   G  r   z'Layout._pad_strides.<locals>.<listcomp>r0   N)rM  T)r_  r   r1   pad_channels_lastrz  rg  rU   get_current_noder  rx  getr  r  chainr   r   r   r   padding_stride_thresholdrF   r   num_comprehensive_padding)
in_stridesr   r   aligncurrent_fx_noder  r   new_stridespaddedrankr   prev_idxr   rq   rq   rr   _pad_strides$  s@   


zLayout._pad_stridesc                 C  s6   t | tsJ | jd usJ | | j| j| j| _d S r   )rx   r   r   r  r   r   rQ  rq   rq   rr   r  _  s   zLayout.pad_stridesc                 C  s   t jot| tS r   )r1   comprehensive_paddingrx   r   rQ  rq   rq   rr   r  d  r  zLayout.should_pad_stridesc                 C  s8   t | tr| S |  r|   t| j| j| j| j| jS r   )	rx   r7  r  r  r   r   r   r   r8  rQ  rq   rq   rr   as_fixedg  s   
zLayout.as_fixedr  c                 C  s(   t jsJ dt| j d|   S )Nzconvert z to FixedLayout first)r   r;  r   rm   r  r  rQ  rq   rq   rr   r  u  s   zLayout.make_indexerc                 C  s<   | j |j ko| j|jko| j|jko| j|jko| j|jkS r   r   r   r   r   r8  )rM  otherrq   rq   rr   __eq__{  s   



zLayout.__eq__
sympy.Exprc                 C  s   t | j| j| jS r   )r   r   r   r8  rQ  rq   rq   rr   r`    r  zLayout.storage_size)r   r  r   rs  r   rE  r   ra  r8  r   ru   rv   r  r  r  )r#  r!  r-  r!  ru   rj   r  ru   r  )rm   rn   ro   r   r^  r,  rL  r   r  r  rg  ri  r  rw  r  r  r  r  r  r  r`  rq   rq   rq   rr   rz    s*    





#
:

	rz  c                   @  s   e Zd ZdZdddZdS )r7  z A Tensor layout we cannot changeru   r  c                   s    fdd}|S )z1A closure containing math to read a given elementc                   sf   t | t  jksJ t | t  jksJ  j}t|  j jD ]\}}}|dkr0|||  }q!|S r  )r   r   r   r8  r   )r   r  r   r   szrQ  rq   rr   rY    s   z)FixedLayout.make_indexer.<locals>.indexerrq   r  rq   rQ  rr   r    s   	zFixedLayout.make_indexerNr  )rm   rn   ro   r7  r  rq   rq   rq   rr   r7    s    r7  c                      s   e Zd ZdZdZedd Zedd Zedd Zed	d
 Z	edd Z
dddZdddZdd Zdd Zdd fddZ  ZS )r   z(A Tensor layout we are allowed to changeFc                 C  sN   t | dkrg S tjjg}t| dd  D ]}|||d   qtt|S )Nr   r0   r  )r   r   r5  r  rh  r  ry   )sizesreversed_stridesr   rq   rq   rr   r     s   
z!FlexibleLayout.contiguous_stridesc                 C  s\   t tt| t |ksJ | |ftjj}dgt| }|D ]}|||< || |  }q|S )z
        Create a stride based on the order the dimensions should be filled in.

        In this format, channels last would be:
            [1, 3, 2, 0]
        N)r+   r   r   r   r5  r  )r  r   next_strider-  r   rq   rq   rr   fill_ordered  s   $zFlexibleLayout.fill_orderedc                 C  s0   t tt| t |ksJ t|}t| |S )z
        Create a stride based on the sorted order of a permuted range.

        In this format, channels last would be:
            [3, 0, 2, 1]
        )r+   r   r   r   r   r  )r  r   r   rq   rq   rr   rt    s   zFlexibleLayout.stride_orderedc                 C  sP   |t jkrt| tS |t jkrt| tS |t jkr t| S t	
d| t)aq  
        Create a stride based on a memory format.

        Memory format is translasted into a stride order,
        so channels_last is the same as:
            FlexibleLayout.stride_ordered(sizes, [3, 0, 2, 1])

        This interface does not support memory_format `torch.preserve_format`
        which should be used to deduce a format from another source
        z>stride_ordered_for_memory_format, unsuppored memory_format: %s)r   channels_lastr   rt  NHWC_STRIDE_ORDERchannels_last_3dNHWDC_STRIDE_ORDERcontiguous_formatr   r  r  rw  )r  memory_formatrq   rq   rr    stride_ordered_for_memory_format  s   



z/FlexibleLayout.stride_ordered_for_memory_formatc                 C  sD   t | t |ks
J dd |D }ttt ||jd}t| |S )z
        Create a stride that has the same stride order as given stride

        For example, if given stride is [1000, 1, 100, 10],
        the fill order should be [1, 3, 2, 0]
        c                 S  r  rq   rj  r  rq   rq   rr   r     r  z/FlexibleLayout.same_ordered.<locals>.<listcomp>r  )r   r  r   __getitem__r   r  )r  r   r   rq   rq   rr   same_ordered  s   zFlexibleLayout.same_orderedc                 C  sD   |  | j|}|  r|r| || j| j}t| j| j| j|| jS r   )rt  r   r  r  r   r7  r   r8  )rM  r   r  r<  rq   rq   rr   as_stride_order  s   zFlexibleLayout.as_stride_orderc                 C  s:   |}|   r|r| || j| j}t| j| j| j|| jS r   )r  r  r   r   r7  r   r8  )rM  r  r  r<  rq   rq   rr   as_exact_strides  s   zFlexibleLayout.as_exact_stridesc                 C  @   |  | j|}|  r| || j| j}t| j| j| j|| jS r   )r  r   r  r  r   r7  r   r8  )rM  r   r<  rq   rq   rr   as_fill_order     zFlexibleLayout.as_fill_orderc                 C  r  r   )r  r   r  r  r   r7  r   r8  )rM  r   r<  rq   rq   rr   as_same_order  r  zFlexibleLayout.as_same_orderNru   rv   c                   s2   |r	t ||}nt |}t |||| d S r   )r   r  r   r&  r^  )rM  r   r   r   r  r-  r'  rq   rr   r^    s   
zFlexibleLayout.__init__r  r   r  )rm   rn   ro   r7  r;  r  r   r  rt  r  r  r  r  r  r  r^  rO  rq   rq   r'  rr   r     s$    






r   c                      s4   e Zd ZdZd fddZdd	d
Zdd Z  ZS )NonOwningLayoutz,Is a view into the storage of another tensorviewUnion[BaseView, TensorBox]ru   rv   c                   s,   |  }t |j|j|j|j || _d S r   )r   r&  r^  r   r   r   r   r  )rM  r  r5  r'  rq   rr   r^  )  s   
zNonOwningLayout.__init__r  c                 C     |    S r   )r  r  rQ  rq   rq   rr   r  3  r   zNonOwningLayout.make_indexerc                 C  s4   | j  j}|dkrdS ddlm} tjj||S )Nr   Tr0   )	ALIGNMENT)	r  r   r8  utilsr  rU   r   r   statically_known_multiple_of)rM  r8  r  rq   rq   rr   maybe_guard_aligned6  s
   z#NonOwningLayout.maybe_guard_aligned)r  r  ru   rv   r  )rm   rn   ro   r7  r^  r  r  rO  rq   rq   r'  rr   r  &  s
    

r  c                   @     e Zd ZdZdS )CommBufferTypesymm_memN)rm   rn   ro   SYMM_MEMrq   rq   rq   rr   r  ?      r  c                      s4   e Zd ZU dZded< ded< d
 fdd	Z  ZS )CommBufferLayoutax  
    A layout that signifies the buffer is a comm buffer.
    In terms of striding, the layout is identical to `FixedLayout`.

    Buffers with this layout do not participate in in-place reuse - it can be
    neither the source nor the target for in-place reuse.

    For detailed motivation and usage of this layout, see
    NOTE [lowering-time collective optimization].
    r  comm_buffer_typer   
group_namer5  r   c                   sR   t |tstd| d| }t j|j|j|j|j	|j
d || _|| _d S )NzJA `CommBufferLayout` can only be initialized with a `FlexibleLayout` (got z).r  )rx   r   r  r  r&  r^  r   r   r   r   r8  r  r  )rM  r5  r  r  fixedr'  rq   rr   r^  R  s    

zCommBufferLayout.__init__)r5  r   r  r  r  r   )rm   rn   ro   r7  rp   r^  rO  rq   rq   r'  rr   r  C  s
   
 r  c                   @  sb   e Zd ZU ded< ejdd dZded< ejdd dZded	< dddZdd Z	dddZ
dS )
NoneLayoutr  r   c                   C     dgS r  rq   rq   rq   rq   rr   r  u  r  zNoneLayout.<lambda>default_factoryr  r   c                   C  r  r  rq   rq   rq   rq   rr   r  v  r  r   ru   r   c                 C  r  r  rq   rQ  rq   rq   rr   r`  x  r   zNoneLayout.storage_sizec                 C     | S r   rq   rQ  rq   rq   rr   r  {  r   zNoneLayout.as_fixedc                 C  r]  r   r-  rQ  rq   rq   rr   r   ~  r_  zNoneLayout.get_deviceNr  r  )rm   rn   ro   rp   r  r  r   r   r`  r  r   rq   rq   rq   rr   r  j  s   
 

r  c                      sx   e Zd Zd fddZeddd	Zejddd	Zd ddZd!ddZdd Z	e
d"ddZdd Zd#ddZ  ZS )$MutationLayoutSHOULDREMOVEr  rb   ru   rv   c                   s@   t  | | | d  || _|   }tj	
| d S r   )r&  r^  r  r   r   r  
get_bufferr  rU   r   mark_buffer_mutated)rM  r  r   r'  rq   rr   r^    s   z#MutationLayoutSHOULDREMOVE.__init__rE  c                 C  r  r   )real_layoutr   rQ  rq   rq   rr   r     r  z!MutationLayoutSHOULDREMOVE.strider   r   c                 C  r   r   rq   )rM  r   rq   rq   rr   r     r  r  c                 C  r  r   )r  r`  rQ  rq   rq   rr   r`    r   z'MutationLayoutSHOULDREMOVE.storage_sizer  c                   s,    fdd  | j }t|tsJ d|S )Nc                   sB   t | tr
 | jS t | tr |  S t | tr | jS | S r   )rx   r  r  r  r  
MutableBoxr4  )r  unwrap_viewsrq   rr   r    s   




z;MutationLayoutSHOULDREMOVE.get_buffer.<locals>.unwrap_viewsz1MutationLayoutSHOULDREMOVE must refer to a buffer)r  rx   r  )rM  r  rq   r  rr   r    s   
	z%MutationLayoutSHOULDREMOVE.get_bufferc                 C  r  r   )r  r5  rQ  rq   rq   rr   r    r  z&MutationLayoutSHOULDREMOVE.real_layoutFc              	   C  s   |   tj|  t|tr|j}|  |s6t	j
| | | dd t| | D dj}|   t|jjtsCJ t||j_|jS )Nc                 S      g | ]\}}t jj||qS rq   rU   r   r   r  r   r  r  rq   rq   rr   r         z;MutationLayoutSHOULDREMOVE.realize_into.<locals>.<listcomp>re  )r  rU   r   r  r  rx   ra   r4  r  rV  r0  r   r   r  r   r   r5  r   r  )r1  srcdstunsafe_aliasrq   rq   rr   realize_into  s(   

z'MutationLayoutSHOULDREMOVE.realize_intoc                 C  r  r   rq   rQ  rq   rq   rr   r    r   z#MutationLayoutSHOULDREMOVE.as_fixedr  c                 C  r  r   )r  r  rQ  rq   rq   rr   r    r  z'MutationLayoutSHOULDREMOVE.make_indexer)r  rb   ru   rv   ru   rE  )r   r   ru   rv   r  )ru   r  r  r  )rm   rn   ro   r^  r   r   setterr`  r  r  rM  r  r  r  rO  rq   rq   r'  rr   r    s    

"r  c                      sB  e Zd ZU ded< ded< dQ fddZdRd
dZdSddZdTddZdUddZe	dVddZ
dWddZdXddZdYdd ZdZd"d#Zd[d$d%Zd&d' Zd(d) Zd\dQd+d,ZdQd-d.ZdQd/d0Z	*d\dQd1d2Zd3d4 Zd]d6d7Zd^d_d;d<Zd=d> Zd`d@dAZd`dBdCZdadEdFZdbdHdIZdbdJdKZdcdLdMZdddOdPZ   Z!S )er  r  r   r  r5  ru   rv   c                   s   t    | dd  d S r  )r&  rR  rN  rQ  r'  rq   rr   rR    s   
zBuffer.__post_init__r  c                 C  r  r   )r   r  rQ  rq   rq   rr   r    r   zBuffer.make_indexerr   c                 C  s   | j sJ | | j S r   r   rQ  rq   rq   rr   r    r	  zBuffer.get_namer  c                 C  r  r   )r  r   rQ  rq   rq   rr   r     r   zBuffer.get_devicerb  c                 C  r   r   rq   rQ  rq   rq   rr   rc    r   zBuffer.get_defining_oprs  c                 C  r  r   )r   r   rQ  rq   rq   rr   r     r  zBuffer.dtyper  c                 C     g |   jS r   )r   r   rQ  rq   rq   rr   r     r  zBuffer.get_sizerE  c                 C  r  r   )r   r   rQ  rq   rq   rr   r    r  zBuffer.get_strider   c                 C  r  r   )r   r8  rQ  rq   rq   rr   
get_offset  r  zBuffer.get_offsetrz  c                 C  s"   t | jtr	| jS tt| jjr   )rx   r5  rz  rw  r   rm   rQ  rq   rq   rr   r     s   zBuffer.get_layoutc                 C  r]  r   r3  rQ  rq   rq   rr   r    r_  zBuffer.get_output_specc                 C  r  r   )r  rQ  rq   rq   rr   r    r  zBuffer.get_storage_numelc                 C  s0   t | jtrt | jts| j | _d S d S d S r   )rx   r5  rz  r  r  rQ  rq   rq   rr   r  	  s
   zBuffer.freeze_layoutFc                 C  &   t | jtsJ | jj||d| _d S Nr  )rx   r5  r   r  r  rq   rq   rr   r    s   z&Buffer.freeze_layout_with_stride_orderc                 C  "   t | jtsJ | j|| _d S r   )rx   r5  r   r  r  rq   rq   rr   r       z$Buffer.freeze_layout_with_fill_orderc                 C  r  r   )rx   r5  r   r  r  rq   rq   rr   r    r  z$Buffer.freeze_layout_with_same_orderc                 C  r  r  )rx   r5  r   r  r  rq   rq   rr   r    s   z'Buffer.freeze_layout_with_exact_stridesc                 C  r  r  r  rQ  rq   rq   rr   r  #  r  zBuffer.is_zero_elementsr  c                   s(      rtt  dS  fdd}|S )Nrt  c                   s      }t jp
d|| S r\  )r  rS   r/  r   r   rY  rQ  rq   rr   r`  +  s   z"Buffer.make_loader.<locals>.loader)r  r	   rU  r   r  rq   rQ  rr   r  &  s   zBuffer.make_loaderNr  r  c                 C  r  r   r  r  rq   rq   rr   r  1  r  zBuffer.codegen_referencec                 C  r   r   rq   rQ  rq   rq   rr   r  4  r   zBuffer.decide_layoutre  c                 C     t | jtr| jj gS dS rT  )rx   r5  r  r  r  rQ  rq   rq   rr   r  7     z#Buffer.get_inputs_that_alias_outputc                 C  r  rT  )rx   r5  r  r  r  rQ  rq   rq   rr   r  <  r  zBuffer.get_mutation_namesrS  c                 C  s   t |  gS r   )r+   r  rQ  rq   rq   rr   r[  A  r  zBuffer.get_read_namesr  c                 C  r  r   r*   rQ  rq   rq   rr   r  D  r_  zBuffer.get_unbacked_symbol_usesc                 C  r  r   r*   rQ  rq   rq   rr   r  G  r_  zBuffer.get_unbacked_symbol_defsc                 C  r   r   rq   rQ  rq   rq   rr   r  J  r   zBuffer.realizerj   c                 C  r  r  rq   rQ  rq   rq   rr   should_allocateM  r  zBuffer.should_allocater  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  )"rm   rn   ro   rp   rR  r  r  r   rc  r   r   r   r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r[  r  r  r  r  rO  rq   rq   r'  rr   r    sB   
 

















r  c                   @  s0   e Zd ZdddZdddZejZdd	d
ZdS )OperationBufferru   r  c                 C  s   | gS r   rq   rQ  rq   rq   rr   r  U  r_  zOperationBuffer.get_outputsr  c                 C  r  r   rq   rQ  rq   rq   rr   rc  X  r   zOperationBuffer.get_defining_oprv   c                 C  s   t |  t|  d S r   )r  rR  r  rQ  rq   rq   rr   rR  ^  s   
zOperationBuffer.__post_init__Nr  ru   r  r  )rm   rn   ro   r  rc  r  r  rR  rq   rq   rq   rr   r  R  s
    

r  c                   @     e Zd ZdddZdS )InputBufferru   r   c                 C  r  r  rq   rQ  rq   rq   rr   r  d  r   zInputBuffer.num_readsNr  )rm   rn   ro   r  rq   rq   rq   rr   r  c      r  c                   @  r  )DonatedBufferaY  
    Represents a donated buffer which is a saved tensor that is not alias to any
    fwd inputs, fwd user outputs, and bwd outputs. We generally cannot inplace
    reuse the input tensor memory during backward since it might be used in another
    function. However, donated buffer can be inplace reused during backward
    to save memory.
    N)rm   rn   ro   r7  rq   rq   rq   rr   r  h  r  r  c                   @  s.   e Zd ZU dZded< dddZdddZdS )rf  Nr  rd  ru   r  c                   r.  )Nr   r  ru   rT   c                   s,       }ttj   j|| S r   )	r   r  rS   r/  rU   r   constant_namer  rd  r  rQ  rq   rr   r`  v  s
   z*ConstantBuffer.make_loader.<locals>.loaderr2  rq   r  rq   rQ  rr   r  u  s   zConstantBuffer.make_loaderr   r  rb   c                 C  s   t tj|  || jdS N)r   r5  )rf  rU   r   r  r  r5  r  rq   rq   rr   r    s   z!ConstantBuffer.constant_to_devicer  r  )rm   rn   ro   rd  rp   r  r  rq   rq   rq   rr   rf  r  s   
 

rf  c                   @  s@   e Zd ZdddZdddZddddZdddZdddZdS )NoneAsConstantBufferru   r  c                 C  r  r   r*   rQ  rq   rq   rr   rZ    r_  zNoneAsConstantBuffer.get_readsr  c                 C  r  r   r*   rQ  rq   rq   rr   r    r_  z-NoneAsConstantBuffer.get_unbacked_symbol_usesNr  r  r   c                 C  s
   t jjjS r   )rU   r   r4  none_strr  rq   rq   rr   r    r  z&NoneAsConstantBuffer.codegen_referencer  c                 C  s
   t d dS Nr-  )r  rQ  rq   rq   rr   r    r  z$NoneAsConstantBuffer.get_output_specrj   c                 C  r  r  rq   rQ  rq   rq   rr   r    r   z&NoneAsConstantBuffer.has_tensor_outputr  r  r   r  r  r  )rm   rn   ro   rZ  r  r  r  r  rq   rq   rq   rr   r    s    


r  c                   @  s6   e Zd ZU ded< dddZddddZdddZdS )r   r   ru  ru   r  c                 C  
   t | jS r   )r%   ru  rQ  rq   rq   rr   r    r  z.ShapeAsConstantBuffer.get_unbacked_symbol_usesNr  r  r   c                 C  s   t jj| jS r   )rU   r   r4  codegen_sizevarru  r  rq   rq   rr   r    r  z'ShapeAsConstantBuffer.codegen_referencerj   c                 C  r  r  rq   rQ  rq   rq   rr   r    r   z'ShapeAsConstantBuffer.has_tensor_outputr  r   r  r  )rm   rn   ro   rp   r  r  r  rq   rq   rq   rr   r     s
   
 
r   c                      s   e Zd ZU ded< d>ddZd?dd	Zd@ddZdAddZdBddZdCddZ	dD fddZ
dEddZdFddZdGd d!ZedHd#d$Z	%	%dIdJd+d,Ze	%dKd-d.ZdLd0d1Zd>d2d3ZdMd5d6ZdMd7d8ZdNd<d=Z  ZS )Or  r  r4  ru   r  c                 C  s(   | j dur| j S t| jdr| jj S dS )z
        Returns self.name if it exists, otherwise returns the name of the data node if that exists.
        If neither exist, returns None.
        Nr   )r   r  r4  rQ  rq   rq   rr   get_computed_buffer_name  s
   
z'ComputedBuffer.get_computed_buffer_namer   c                 C  r  r   r4  r  rQ  rq   rq   rr   r    r  zComputedBuffer.num_readsr  c                 C  r  r   r4  rZ  rQ  rq   rq   rr   rZ    r  zComputedBuffer.get_readsrS  c                 C  r  r   r  rQ  rq   rq   rr   r[    r  zComputedBuffer.get_read_namesr  c                 C  sz   t tdd, | j r"t|  | j | j W  d    S t|  | j	 W  d    S 1 s6w   Y  d S r:  )
r   r   r   r4  r  r9   get_store_functionr.  r  r   rQ  rq   rq   rr   r    s   
$zComputedBuffer.get_read_writesr  c                 C  s.   t |  t |  B t |  B | j B S r   )r%   r   r  r  r4  r  rQ  rq   rq   rr   r    s   


z'ComputedBuffer.get_unbacked_symbol_usesr  c                   s6   |   s| jtjjvr|  dkr| j S t  S r  )	r  r   rU   r   mutated_buffersr  r4  r  r&  rQ  r'  rq   rr   r    s   

zComputedBuffer.make_loaderCallable[..., None]c                 C  sV   |     }t| jtttfrt| jj	| j
|S t| jts"J t| jj| j
|S r   )r   r  r  rx   r4  r  ru  r  r	   r  r   rV  ra  r  rq   rq   rr   r    s
   z!ComputedBuffer.get_store_functionOptional[list[int]]c                   s   t | jtrYt| j | j \\}}|  j	}t
dd |D s&J fdd|D }|rYt | jttfrA| j| n|  fdd|D }ddlm} |||  S dS )	al  
        If our layout is still flexible, try to determine the stride order based on stride orders of reads.

        TODO(jansel): A better algorithm here would look at downstream consumers of this
                      value and try to do global graph-level layout optimization.
                      This is also something just begging to be autotuned.
        c                 s  s"    | ]}t |tjtjfV  qd S r   )rx   r2   StarDep	MemoryDepr  rq   rq   rr   r    s
    
z0ComputedBuffer.get_fill_order.<locals>.<genexpr>c                   s.   g | ]}t |tjrt|jd d  D qS )c                 S  s   i | ]}|d kr|t jjqS r   r  )r   vrq   rq   rr   r   
      z<ComputedBuffer.get_fill_order.<locals>.<listcomp>.<dictcomp>)rx   r2   r  rR   r   r  )r  rq   rr   r   	  s    
z1ComputedBuffer.get_fill_order.<locals>.<listcomp>c                   s   g | ]
}t jj| qS rq   rU   r   r   r  r   ru  )r  rq   rr   r     s    r0   pick_loop_orderN)rx   r5  r   r2   r  r4  r.  r  r  r  r  ru  r  r   	schedulerr  r   )rM  
index_varsr   r  stride_lengthsr  rq   )r  r  rr   r     s*   


zComputedBuffer.get_fill_orderrv   c                 C  s6   t | jtr|  }|r| | d S |   d S d S r   )rx   r5  r   r   r  r  r  rq   rq   rr   r    s   zComputedBuffer.decide_layoutetuple[tuple[list[sympy.Expr], list[sympy.Expr]], LoopBody, tuple[list[sympy.Expr], list[sympy.Expr]]]c           
      C  s   t j| j | j dd\}}ttd|   t	| 
 |  r$|n|d d |g|R  }W d    n1 s:w   Y  g }g }g }g }| D ]+\}}	||d v rd|rYJ || ||	 qK||d v slJ || ||	 qK||f|||ffS )Nqr_   rd  r0   r   )r2   r  r4  r.  r  r   r   rf  r   r;   r  r  itemsr  )
rM  r   
var_rangesr2  r  reduce_vars
index_sizereduce_sizer  r   rq   rq   rr   get_default_sizes_body%  s2   



z%ComputedBuffer.get_default_sizes_bodyNextra_indexing_constraints*Optional[tuple[dict[Any, Any], list[Any]]]recompute_sizes_body_funcOptional[Callable[..., Any]]:tuple[tuple[list[sympy.Expr], list[sympy.Expr]], LoopBody]c                   s    \\}}}\}}|r|||f|||f\\}}}\}}g |j  |durmt|tr4t|dks6J |\}}	t|tsAJ t|	tsHJ tdd |	D sSJ |j	}
|
|ks`J |
|f fdd|	D }	 |	7  g |
 tjtjs|   fdd}|| }tt ptj }|||||\}}}|||||\}}}tj||d	d
\\}}}t|||||g|||}||f|fS )an  
        This is a main place where we do loop transformations in a
        backend-agnostic way.

        Here we:
            1) Remove any 1 dimensions
            2) Fuse contiguous dimensions together
            3) Reorder dimensions based on stride orders

        Optional argument extra_indexing_constraints can be used to append additional
        indexing expressions to existing ones derived from buffer's body. This can be useful
        to fuse scheduler nodes with compatible ranges, e.g. (s0*s1*...,) and (s0, s1, s2, ...)
        on CPU by preventing indexing simplifications and obtaining index/reduce ranges for
        the scheduler node compatible with other nodes.
        Optional argument recompute_sizes_body_func can be used to recompute sizes and body
        on the default body. This can be useful to append additional loop transformations.
        Nr   c                 s  s    | ]}t |tV  qd S r   )rx   r   )r   frq   rq   rr   r  t  r[  z6ComputedBuffer.simplify_and_reorder.<locals>.<genexpr>c                   s   g | ]}| vr|qS rq   rq   r  )index_formulasrq   rr   r   |      z7ComputedBuffer.simplify_and_reorder.<locals>.<listcomp>c           	        s\    | ||\}}}|| } |r'tjj| |t | |\}}}t||}n|}|||fS r   )_apply_loop_reorderingrU   r   r   _simplify_loopsr5   r   )	x_varssupport_varsr  simplify_loopsreindex0r   r   _pruner   r  memory_addrsrM  rq   rr   simplify_and_reorder  s   



zAComputedBuffer.simplify_and_reorder.<locals>.simplify_and_reorderpr_   )r  indexing_exprsr|   rx   rz   r   r{   ry   r  r  get_write_exprsrU   r   r  r3   PREFER_STORE_LOOP_ORDERextendget_read_exprsrM   r  r1   loop_ordering_after_fusionr2   index_vars_no_squeezer;   )rM  r  r  r  r  r2  r  r  extra_indexing_rangesextra_indexing_exprexpected_var_rangesr  r  should_merge_loopsiter_rangesiter_reindexr   reduce_rangesreduce_reindex	iter_varsr  rq   r  rr   r  F  sx   



z#ComputedBuffer.simplify_and_reorderc              
     s   ddl m} |du rg }z* fdd|D }t|t|kr)t|d t ks+J tt|||}W n  tyV   tjrLt	dt
t | ttt}Y nw fdd|D t|t|fS )	zU
        Shuffle the order of loops around to hopefully improve performance.
        r0   r  Nc                   s   g | ]}t jj| qS rq   r  r  )r  r  rq   rr   r         z9ComputedBuffer._apply_loop_reordering.<locals>.<listcomp>r   z%Did not simplify complex index:
%s
%sc                   r   rq   rq   r   )r  rq   rr   r     r   )r  r  r   ry   rh  	Exceptionr1   r  r  warningr{   r   r   r   r   )r  r  r  r  priority_idxr  r-  r   rq   )r  r  r  rr   r    s,   
z%ComputedBuffer._apply_loop_reorderingr  c                 C  r  r   r4  r  rQ  rq   rq   rr   r    r  z!ComputedBuffer.get_reduction_sizec                 C  r  r   r4  r  rQ  rq   rq   rr   r    r  z!ComputedBuffer.get_reduction_typerj   c                 C  r  r   )r4  r  rQ  rq   rq   rr   r    r  zComputedBuffer.is_no_opc                 C  r  NTrq   rQ  rq   rq   rr   r    r   zComputedBuffer.should_allocater   r  rb   c                 C  r  )rc  r4  r  r  rq   rq   rr   r       z!ComputedBuffer.constant_to_devicer  r  r  r  r  r  r  )ru   r  )ru   r  r  )ru   r  NN)r  r  r  r  ru   r   r   r  r  r  )rm   rn   ro   rp   r  r  rZ  r[  r  r  r  r  r   r  rE   r  r  r  r  r  r  r  r  r  rO  rq   rq   r'  rr   r    s2   
 









'"s
#


r  c                      sb   e Zd ZdZd! fd
dZd"ddZdd Zd#ddZd$ddZd%ddZ			d&d'dd Z
  ZS )(TemplateBufferzt
    Represents a Triton (in the future other type) of template operator
    that we can fuse an epilogue onto.
    r5  rz  inputsSequence[IRNode]make_kernel_renderr  ru   rv   c                   s@   t  jd |d t|| _|| _tj| | _	tj
|  d S r  )r&  r^  InputsKernelunwrap_storager*  r,  rU   r   register_bufferr   register_operation)rM  r5  r*  r,  r'  rq   rr   r^    s
   zTemplateBuffer.__init__r  c                 C  s   | j ddS )NT	normalize)r9   rQ  rq   rq   rr   r     r   zTemplateBuffer.get_read_writesc              	     s   |   |     fdd}tj||  d|d}| jD ]j   fdd}| jtj| dddjO  _q|S )Nc                   s"   t |dksJ t | dS )Nr   fake)r   rS   r^  r  )rY  r   rq   rr   dummy  r  z1TemplateBuffer.extract_read_writes.<locals>.dummyrq   r1  c                   s(   t |dksJ t  |  d S r  )r   rS   r/  r  r  )rY  inprq   rr   r4    s   T)	r  r   r  r2   r9   r   r*  r5  r  )rM  r2  r4  depsrq   )rY  r5  r   rr   r9     s   


z"TemplateBuffer.extract_read_writesr  c                 C  s   t jjS r   )r   r5  r  rQ  rq   rq   rr   r    r  z!TemplateBuffer.get_reduction_sizer  c                 C  r   r   rq   rQ  rq   rq   rr   r    r   z!TemplateBuffer.get_reduction_typerj   c                 C  r  r%  rq   rQ  rq   rq   rr   r  "  r   zTemplateBuffer.should_allocateNr  r  r  r  c                 C  s   |   dfd fS rT  r  )rM  r  r  rq   rq   rr   r  %  s
   z#TemplateBuffer.simplify_and_reorder)r5  rz  r*  r+  r,  r  ru   rv   r  r  r  r  r(  )r  r  r  r  )rm   rn   ro   r7  r^  r  r9   r  r  r  r  rO  rq   rq   r'  rr   r)    s    



r)  c                      sB   e Zd Z		dd fdd	ZdddZdddZdddZ  ZS )TritonTemplateBufferNmutated_inputsOptional[Iterable[IRNode]]allowed_prologue_inpsOptional[OrderedSet[str]]ru   rv   c                   s   t  ||| |_g_|durFtjjjtjjjf}t	j
jj}||v s0J d| d| jd    j fdd|D 7  _|rM|_dS t _dS )a  
        NOTE:[TritonTemplates with multiple outputs]
        We want the ability for TritonTemplates to output multiple tensors. Triton
        kernels have no notion of outputs and this is done by creating tensors that
        are then mutated by the kernel. Currenlty our STORE_OUTPUT codegen doesn't
        support creating multinode outputs for triton templates.
        We work around this by creating an extra input buffer during the lowering
        and we mark them as mutated inputs.
        Nz$Mutated inputs are only allowed for z	 but got r   c                      g | ]}t t d |qS r-  MutationOutputr  r   r  r   rM  rq   rr   r   S  r  z1TritonTemplateBuffer.__init__.<locals>.<listcomp>)r&  r^  r8  outputsr   rS   higher_orderflex_attentionflex_attention_backwardrU   r   current_noder  r*  r   r+   r:  )rM  r5  r*  r,  r8  r:  allowed_setrF  r'  rA  rr   r^  4  s&   


zTritonTemplateBuffer.__init__r  c                 C  r]  r   )rB  rQ  rq   rq   rr   r  \  r_  z TritonTemplateBuffer.get_outputsrS  c                 C  r]  r   )r:  rQ  rq   rq   rr   get_allowed_prologue_inps_  r_  z.TritonTemplateBuffer.get_allowed_prologue_inpsr   c                 C  s   d| j  d}|S )NzTritonTemplateBuffer(layout=r  r3  )rM  r   rq   rq   rr   r,  b  s   zTritonTemplateBuffer.__str__r(  )r8  r9  r:  r;  ru   rv   r  r  r  )rm   rn   ro   r^  r  rH  r,  rO  rq   rq   r'  rr   r7  3  s    
(
r7  c                      sf   e Zd ZdZd fddZdddZd ddZdd Zd ddZd!ddZ	d"ddZ
d ddZ  ZS )#ChoiceCallera.  
    Represents a possible choice used in autotune_process.py.
    During autotuning, self.benchmark() is first called to get benchmark result,
    and if this choice is selected, self.output_node() is called to get the output_node.

    Children classes: TritonTemplateCaller, CUDATemplateCaller.
    r   r   rf   r  r5  rz  descriptionru   rv   c                   s&   t    || _|| _|| _|| _d S r   )r&  r^  r   r5  rf   rJ  )rM  r   rf   r5  rJ  r'  rq   rr   r^  s  s
   

zChoiceCaller.__init__rT  c                G  s   |   }t||d|iS )Nr   )to_callabler@   	benchmark)rM  r   r   algorq   rq   rr   rL    rq  zChoiceCaller.benchmarkc                 C  r  r   r  rQ  rq   rq   rr   	call_name  r   zChoiceCaller.call_namec                 C  r  r   r  rQ  rq   rq   rr   rK    r   zChoiceCaller.to_callablec                 C  r  r   r  rQ  rq   rq   rr   hash_key  r   zChoiceCaller.hash_keyra   c                 C  r  r   r  rQ  rq   rq   rr   output_node  r   zChoiceCaller.output_node<dict[str, Union[PrimitiveInfoType, list[PrimitiveInfoType]]]c                 C  s   i S )zRInformation returned here is logged to the autotune log file when that is enabled.rq   rQ  rq   rq   rr   	info_dict  r  zChoiceCaller.info_dictc                 C  r  )Nunsupported_choicerq   rQ  rq   rq   rr   autoheuristic_id  r   zChoiceCaller.autoheuristic_id)
r   r   rf   r  r5  rz  rJ  r   ru   rv   )ru   rT  r  )ru   ra   )ru   rQ  )rm   rn   ro   r7  r^  rL  rN  rK  rO  rP  rR  rT  rO  rq   rq   r'  rr   rI  j  s    




rI  c                   @  r  )TritonTemplateCallerBaseru   r
   c                 C  r  r   r  rQ  rq   rq   rr   get_make_kernel_render  r   z/TritonTemplateCallerBase.get_make_kernel_renderN)ru   r
   )rm   rn   ro   rV  rq   rq   rq   rr   rU    r  rU  c                      sb   e Zd ZdZd fddZed ddZed!ddZej	d"ddZ
d#ddZd$ddZ  ZS )%MultiTemplateBufferaG  
    Represents a Buffer with multiple backing implementation choices.

    Choices can be TritonTemplates or ExternKernels. During scheduling if there is a potential
    epilogue we will benchmark each of the choices with the epilogue to determine an implementation.
    Otherwise, the fastest base choice will be chosen.
    r5  rz  r*  rg   choice_timings'Callable[[], dict[ChoiceCaller, float]]unfiltered_choiceslist[ChoiceCaller]r:  rS  ru   rv   c                   s>   t  j||d |d || _d | _|| _tdd |D | _d S )N)r5  r*  r,  r:  c                 s  s0    | ]}t |tpt |tjjjo|jV  qd S r   )rx   rU  r   	_inductorselect_algorithmExternKernelCallerhas_out_variant)r   choicerq   rq   rr   r    s    

z/MultiTemplateBuffer.__init__.<locals>.<genexpr>)r&  r^  _choice_timings_fn_choice_timingsoriginal_inputsr  _output_plannable)rM  r5  r*  rX  rZ  r:  r'  rq   rr   r^    s   zMultiTemplateBuffer.__init__rj   c                 C  r]  )z^
        Are all possible choices TritonTemplates or Extern Kernels with out variants
        )rd  rQ  rq   rq   rr   output_plannable  s   z$MultiTemplateBuffer.output_plannabledict[ChoiceCaller, float]c                 C  s   | j d u r
|  | _ | j S r   )rb  ra  rQ  rq   rq   rr   rX    s   

z"MultiTemplateBuffer.choice_timingscallerrU  c                 c  sR    t |tjjjsJ | j|jksJ | j}| | _z	d V  W || _d S || _w r   )rx   r   r\  r]  TritonTemplateCallerr5  r,  rV  )rM  rg  renderrq   rq   rr   swap_as_triton_caller  s   
z)MultiTemplateBuffer.swap_as_triton_callerc                 C  sJ   t |tjjjs
J |  |jjksJ |  |jj	ksJ |
 | _d S r   )rx   r   r\  r]  rh  r   r5  r   r  r   rV  r,  )rM  rg  rq   rq   rr   finalize_as_triton_caller  s   z-MultiTemplateBuffer.finalize_as_triton_callertuple[ChoiceCaller, float]c                 C  s    t | j| jjd}|| j| fS )Nr  )rt  rX  r|  )rM  
min_choicerq   rq   rr   get_min_choice  s   z"MultiTemplateBuffer.get_min_choice)r5  rz  r*  rg   rX  rY  rZ  r[  r:  rS  ru   rv   r  )ru   rf  )rg  rU  )rg  rU  ru   rv   )ru   rl  )rm   rn   ro   r7  r^  r   re  rX  r  r  rj  rk  rn  rO  rq   rq   r'  rr   rW    s    
rW  c                      s&   e Zd Zd fddZd	d
 Z  ZS )CUDATemplateBufferworkspace_sizer   templaterW   ru   rv   c                   s    t  ||| || _|| _d S r   )r&  r^  rp  rq  )rM  r5  r*  r,  rp  rq  r'  rq   rr   r^    s   
zCUDATemplateBuffer.__init__c                 C  s   | j d ur| j S dS r  )rp  rQ  rq   rq   rr   r    r   z%CUDATemplateBuffer.get_workspace_size)rp  r   rq  rW   ru   rv   )rm   rn   ro   r^  r  rO  rq   rq   r'  rr   ro    s    ro  c                      s,   e Zd Zd fddZd	 fddZ  ZS )
CppTemplateBufferru   rv   c                   s&   t  ||| || _|| _d | _d S r   )r&  r^  rq  r`  rB  )rM  r5  r*  r,  rq  r`  r'  rq   rr   r^    s   
zCppTemplateBuffer.__init__rz  c                   sV   t | jtr&t | jtsJ | jd }t |tsJ |j}t |ts$J |S t  S r  )	rx   r5  MultiOutputLayoutrB  r   r  rz  r&  r   )rM  first_outputr5  r'  rq   rr   r     s   

zCppTemplateBuffer.get_layoutr  r  )rm   rn   ro   r^  r   rO  rq   rq   r'  rr   rr    s    rr  c                   @  sX   e Zd ZU ded< dddZddd	ZedddZedd Z	dddZ
dddZdS )r-  r  r*  ru   r  c                   s   t tj  }tj | jD ]#}t|tr | fdd|D  qt|tr&q|	 |
  qt tj  fdd|  D }tj||t  dS )Nc                 3      | ]	} |  V  qd S r   r  r  r  rq   rr   r    r  z/InputsKernel.get_read_writes.<locals>.<genexpr>c                 3  ru  r   r  r@  rv  rq   rr   r        
)r  writesindex_exprs)r+   r2   r6   r  r*  rx   ry   updater   rq  r  r  
ReadWrites)rM  r  inputrx  rq   rv  rr   r    s    


zInputsKernel.get_read_writesr  c                 C  r  r   r  rQ  rq   rq   rr   rZ  %  r  zInputsKernel.get_readsr   rb   c                 C  sz   t |tr|j}t |tr|j}t |trt |tst|}t |tr)| |S t |t	r0|S t |t
tfs;J ||S r   )rx   ra   r4  r  r  r9  r  realize_inputunwrap_storage_for_inputTorchBindObjectr  r1  r   rq   rq   rr   r~  (  s   





z%InputsKernel.unwrap_storage_for_inputc                 C  s@   g }| D ]}t |trdd |D }nt|}|| q|S )Nc                 S  r  rq   )r-  r~  r   rq   rq   rr   r   @  r   z/InputsKernel.unwrap_storage.<locals>.<listcomp>)rx   ry   r-  r~  r  )r*  
inputs_newr   rq   rq   rr   r.  ;  s   

zInputsKernel.unwrap_storagerj   c                 C  r  r%  rq   rQ  rq   rq   rr   r  F  r   zInputsKernel.is_externr   c                 C  r  r  rq   rQ  rq   rq   rr   r  I  r   zInputsKernel.num_readsNr  r  )r   rb   ru   rb   r  r  )rm   rn   ro   rp   r  rZ  rM  r~  r  r.  r  r  rq   rq   rq   rr   r-    s   
 




r-  c                   @  s    e Zd Zd	ddZd
ddZdS )	NopKernelru   rj   c                 C  r  r%  rq   rQ  rq   rq   rr   r  N  r   zNopKernel.is_no_opr  c                 C  r  r   r*   rQ  rq   rq   rr   rZ  Q  r_  zNopKernel.get_readsNr  r  )rm   rn   ro   r  rZ  rq   rq   rq   rr   r  M  s    
r  c                   @  s@   e Zd ZdZedd ZedddZedd ZdddZdS )ConcatKernelzn
    There isn't actually a real kernel for concat, we just change the
    storage for the upstream data.
    c                 C  s  |d   }|d  }t|d  }dg}|| g}d|  kr)t|k s,J  J tdt|D ]Z}||  }	|||  t|	t|ksLJ ||  |ksVJ ||   |ks`J tt|D ]}
|
|krw||
 |	|
  ||
< qftjj	
||
 |	|
 ||
< qf|||  q3t|}tjrt|||d j}tt|D ]!}|| }t|r| }t|trt|j|jrt|} nqtdd |D }tjjjd }t|tsJ |du rtdd |D rt|}td t||||dg d}t|}g }tt|D ]N}| || t j!|||| || dd	}|j"| t|| j#t$r6|| j#% }n|| j#}|& rVt'||   j(rVt)|sV||*  q	t|dkrntj+|t,j-rntj.| tj/||_0| 1|j"|_"tj2| |S )
Nr   r0   c                 s  r  r   )r   r  rq   rq   rr   r    r  z&ConcatKernel.create.<locals>.<genexpr>Fc                 s  sB    | ]}d |j v o|j d  jtjdp|j d  jtjdV  qdS )r
  r  N)rx  r  r   r  r  r   argrq   rq   rr   r    s    

)r   r   r   r   r   r5  r*  )rF  )3r   r   ry   r   r   r   r  rU   r   r   r  r   r   r1   r  rz  r  r   r   r   rx   r7  rg  r   r   r!   rs  rF  r   r  r  r  r>  r0  r*  r4  r  r  is_input_bufferrM   r   rL   r  r  r3   FOREACHregister_operation_listr/  r   r.  r0  )r1  r*  r'  r   r   r  offsets_startoffsets_endr   
input_sizer  output_strider   r5  any_input_is_storage_and_layoutfx_node_argsconcat_kernelkernelop_namesinput_bufferinput_unwrappedrq   rq   rr   r0  [  s   
 



 zConcatKernel.createNc                 C  s   t |tr| |j|S t |jtrCt |jjtr|jjsdS |d u r%dS t|	 t|	 ks3dS t
dd t|	 |	 D S t |jjtoPt |jt S )NFTc                 s  r/  r   r0  r1  rq   rq   rr   r    r2  z=ConcatKernel.can_realize_into_without_copy.<locals>.<genexpr>)rx   ra   can_realize_into_without_copyr4  rW  r5  r7  re  r   r  r  r   r   ExternKernelAlloc)r1  r  r  rq   rq   rr   r    s$   
z*ConcatKernel.can_realize_into_without_copyc              	   C  s   t |tst|rt|\}}t||d}t |tsJ |t |tr*| |j|S t |trJ|  t	|jds;J | 
||rJt||j_|jS tj| | | dd t| | D d}| ||S )Nr3  r5  c                 S  r  rq   r  r  rq   rq   rr   r     r  z-ConcatKernel.realize_into.<locals>.<listcomp>re  )rx   r9  r   r6  ra   r  r4  r  r  r  r  r  r5  rV  r0  r   r   r  r   r   )r1  r  r  r:  r5  pwrq   rq   rr   r    s,   


	zConcatKernel.realize_intoru   rj   c                 C  r  r%  rq   rQ  rq   rq   rr   r    r   zConcatKernel.should_allocater   r  )	rm   rn   ro   r7  rM  r0  r  r  r  rq   rq   rq   rr   r  U  s    
`
 r  c                      s&  e Zd ZU dZded< ejedZded< dZ	ded	< dZ
d
ed< dZd
ed< ejedZded< dZded< dZded< dZded< ejedZded< ejedZded< 							dfdg fddZdhddZdid!d"Zd#d$ Zd%d& Zdgd'd(Zd)d* Zdjdkd+d,Zdld-d.Zd/d0 Zed1d2 Zedmd4d5Z ed6d7 Z!ed8d9 Z"ed:d; Z#e			<dndodAdBZ$edpdCdDZ%edpdEdFZ&edGdH Z'edIdJ Z(edKdL Z)dgdMdNZ*dOdP Z+djdqdSdTZ,dUdV Z-dWdX Z.dpdYdZZ/dgd[d\Z0d]d^ Z1d_d` Z2didadbZ3drdddeZ4e4Z5  Z6S )sr  rq   ztuple[Any, ...]constant_argsr  zdict[str, Any]r   NzOptional[ReinterpretView]output_viewr  python_kernel_namecpp_kernel_namezIterable[str]ordered_kwargs_for_cpp_kernelzFOptional[Union[torch._ops.OpOverload, torch._ops.HigherOrderOperator]]op_overloadzOptional[list[dict[str, Any]]]arg_propertiesz#Optional[dict[str, dict[str, Any]]]kwarg_propertiesz"dict[sympy.Symbol, pytree.KeyPath]unbacked_bindingszlist[MutationOutput]mutation_outputsru   rv   c                   sn   t  j|||d || _|r|ni | _|| _|
| _| | | | |	| _| 	  i | _
g | _tjj| _d S Nr  )r&  r^  r  r   r  r  set_cpp_kernel_nameset_python_kernel_namer  collect_arg_kwarg_propertiesr  r  rU   r   rF  fx_node)rM  r   r5  r*  r  r   r  r  r  r  r  r'  rq   rr   r^    s    

zExternKernel.__init__r  c                 C  s   | g| j S r   )r  rQ  rq   rq   rr   r  4  r   zExternKernel.get_outputsr  c                 C  r  r   r*   rQ  rq   rq   rr   r  7  r_  z%ExternKernel.get_unbacked_symbol_defsc                 C  s   t | jtjjrdd | jjjD ndd tt| j	D | _
t | jtjjr1dd | jjjD ni | _t | jtjjrW| jsJdd | jjjD | _dd | jjjD | _d S d S )Nc                 S  s$   g | ]}|j s|j|j|jd qS ))r   r   r*  )
kwarg_onlyr   	real_typer*  r  rq   rq   rr   r   >  s    z=ExternKernel.collect_arg_kwarg_properties.<locals>.<listcomp>c                 S  s   g | ]}i qS rq   rq   r   rq   rq   rr   r   H  r   c                 S  s   i | ]}|j |j|jd qS ))r   r*  )r   r  r*  r  rq   rq   rr   r   K  r  z=ExternKernel.collect_arg_kwarg_properties.<locals>.<dictcomp>c                 S     g | ]}|j r|jqS rq   r  r   r  rq   rq   rr   r   V  
    c                 S  s   g | ]}|j r|qS rq   )r  r  rq   rq   rr   r   Y  s
    )rx   r  r   _ops
OpOverload_schema	argumentsr   r   r*  r  allarg_propertiesr  schema_kwargsrQ  rq   rq   rr   r  :  s*   

z)ExternKernel.collect_arg_kwarg_propertiesc                 C  s$   t | jtr|   |   d S d S r   )rx   r5  r   apply_constraintr  rQ  rq   rq   rr   r  ]  s   zExternKernel.decide_layoutc                 C  s$   t | |\}}|r|| d S d S r   )rJ   r6  )rM  wrapper
origin_str_detailed_origin_strrq   rq   rr   codegen_commentb  s   zExternKernel.codegen_commentc                 C  r  r   r  rM  r  rq   rq   rr   codegeng  r   zExternKernel.codegenc                 C  s   || _ tjjrt| jtjjsd S | j}| j d u rB|j	dkr;|j
dkr+|jdd n|jdd}d| d| _ d S |jj| _ d S d S )Natenr0  .r   r   z
at::_ops::z::call)r  rU   r   cpp_wrapperrx   r  r   r  r  	namespace_overloadnamerm   r  replacer  r   )rM  r  r  opnamerq   rq   rr   r  j  s   




z ExternKernel.set_cpp_kernel_namec                 C  sd   || _ |d ur	d S | j}|d u rd S t|tjjr"d|j | _ d S |jdd d|j | _ d S )Nztorch.ops.higher_order.z._ops.z.ops.r  )	r  r  rx   r   r  HigherOrderOperatorrm   rn   r  )rM  r  r  rq   rq   rr   r    s   z#ExternKernel.set_python_kernel_namec                 C  s:   |    }r	|jntjj}tjjrtjj| j|S | j	S r   )
r   r   rU   r   device_typer  r4  get_c_shim_func_namer  r  )rM  dr   rq   rq   rr   get_kernel_name  s   zExternKernel.get_kernel_namec                 C  s:   t j|  |  |  |  |  |  d}|  |S )N)r   r   r  r  rF  rD  )	rV  r0  r   r   r  r   ra  r^  r  )r   r  rq   rq   rr   
copy_input  s   zExternKernel.copy_inputituple[Any, list[Any], list[Any], Callable[[Any, Any], Any], Optional[dict[sympy.Symbol, pytree.KeyPath]]]c                   sx  ||d}t |\} g g }g }|D ]/}t|to"t|t  d r.|| qt|tjr>tj	j
jj|d d}|| q fdd}	fdd|D }|D ]}
t|
rbt|
dd	 qVg }|D ]n}
t|
ts|
 tj	jv r|tj	j|
   qgt|
ts|
 tj	jv r|tj	j|
   qgt|
tr||
  qgt|
tjjjr|
jj}|
jjd
kr|d usJ |tjj|   qg|t|
dd qg|	||\}}||i |}d }tjj }rt |tj!| t"||tj!j#$d}t|t%t&fs	|gn|}|D ]'}t|tj'r3|j(r3d}tj	j!j#$dd  }r/| d| }|tj	_)q||||	|fS )Nr   r  )r   c                   sd   g }t | }t |}D ]}|r|t| q|t| qt| }|dg |di fS )Nr   r   )iterr  nextpytreetree_unflattenr|  )new_tensor_argsnew_non_tensor_argsr  
it_tensorsit_non_tensors	is_tensorr3  )	args_specis_arg_tensorrq   rr   unflatten_args  s   z3ExternKernel.process_kernel.<locals>.unflatten_argsc                   rk  rq   r}  r  r1  rq   rr   r     r   z/ExternKernel.process_kernel.<locals>.<listcomp>Tr  r  )r   r
  zEsparsity not handled. Please file issue for sparse inference weights.stack_tracez Found from : 
 )*r  tree_flattenr  rx   rb   GeneratorStater   r   rU   r   r   r   create_symintnoder   r6  r  r  	constantstorchbind_constantsr  get_real_objr   r\  irr   r   r   r  default_generatorsclone_stater   	fake_moder&   rF  r$   rx  r|  ry   rz   Tensor	is_sparsedisable_cudagraphs_reason)r1  r  r   r   binded_args	args_flattensor_argsnon_tensor_argsr  r  r   example_argsdevice_indexnew_args
new_kwargsexample_outputr  r   example_out_lir   msgr  rq   )r  r1  r  rr   process_kernel  s~   



zExternKernel.process_kernelc              	   C  sV  t |tsJ t |tr|S | }tj| }|dus J | }|durQd|j	v rQt |j
trQ|j	d jtjdsG|j	d jtjdrQ|t|  n|  tj| dd\}}|d }| |}tjj||}tjj||}	tjj||}
t||	|
 }||krtd|	|
| tt|jt |! |" | |	|
dd	S )
z
        In order to pass this to an extern kernel we need a
        ReinterpretView not a View.  This allows us to avoid some
        unneeded copies.
        Nr
  r  r3  r_   r   z@convert_to_reinterpret_view failed: stride=%s offset=%s index=%sr  r3  )#rx   r  r9  r  rU   r   r  r  ra  rx  r5  r   r  r   r  r  r  r!   r   r  r2   r  r  r   r  stride_vars
offset_varrN   r  r  rw  r4  r7  r  r   )r1  r   x_unwrap_viewr  x_unwrap_view_fx_node
index_argsr  r  r   r-  r8  expectedrq   rq   rr   convert_to_reinterpret_view  sf   






z(ExternKernel.convert_to_reinterpret_viewc                 C  s  |d u rt  S t|tjtjjjtfrt|dS t|t	r.t
jtj|j| | dS t|tr5|S t|tr@| |jS t|trQt| |j| dS t|trp|  t| rpz| |W S  tyo   Y nw t|tr{|  |S t|ttfr|S |  |S )N)ru  )r   r   r3  )!r  rx   r   r   r   r   r   r   r   rU  rU   r   add_tensor_constantr   r+  r   r   r   rf  ra   r}  r4  r9  r   r  r  r   r  r  rw  r  NonTensorObjr  r  rq   rq   rr   r}  Y  s<   







zExternKernel.realize_inputc                 C  sD   t |rt| dkr|S | D ]
}|dkr|  S q| |S rv  )r   r   r  r  )r1  r   r   rq   rq   rr   require_stride1z  s   
zExternKernel.require_stride1Fr   Optional[Sequence[int]]r  r  c              	     s  |d us
 d us
J   dv r sS trt trI|r=tddt|r6ttj	j
 jn||d S tddd | d S t ttfrs|r[ |sh rst  j rs d urqt S S t trt  trtdt  tr|r  |s rt   j rS ttr|r |sǈ rt  j rɈS ttrtjtrtjtst rt jtsz!| j_|r| j||dW S  r| j |dW S W n
 t y   Y nw d } } d urItj	j
 fdd	t!t" D }|D ]}t#j$j%&|d
dq;| 'tdd|| d |ret|scJ S |r|d urr d ustJ t#j$j%(|t S S )N)r   r0   TF)r  r  r  r  r  zHthe MutationLayoutSHOULDREMOVE's real layout shouldn't be FlexibleLayoutr  c                   s4   g | ]}  | d r | dr|qS )r   r   )r%  r?  r   r   r  r   r   rq   rr   r     s    z0ExternKernel.require_strides.<locals>.<listcomp>r   r0   ))r  r   rx   r   r   r6  r  r   rU   r   r   
size_hintsr   r7  r  r  r*  r   r>  r  r  r  r  ra   r4  r  r9  r  r  r  require_stride_orderr  rw  r   r   r   r\  loweringslice_r  r  )r1  r   r   r  r  expanded_dims	orig_sizer'  rq   r  rr   require_strides  s   
	


	


zExternKernel.require_stridesc                 C     | j |||dS )N)r  r  r  )r1  r   r  r  rq   rq   rr   r  !  s   z"ExternKernel.require_exact_stridesc                 C  r  )N)r   r  r  )r1  r   r   r  rq   rq   rr   r  '     z!ExternKernel.require_stride_orderc                 C     |  |tS r   )r  r  r  rq   rq   rr   require_channels_last+  r'  z"ExternKernel.require_channels_lastc                 C  r  r   )r  r  r  rq   rq   rr   require_channels_last_3d/  r'  z%ExternKernel.require_channels_last_3dc              	   C  s    |  |tttt| S r   )r  ry   rh  r   r   r   r  rq   rq   rr   require_contiguous3  s    zExternKernel.require_contiguousc                 C  r   r   rq   rQ  rq   rq   rr   r  7  r   zExternKernel.apply_constraintc                 C  s   t |ttfs	J t |trt|}| jsJ dt|}t| j}||k rQtd| j||  t||D ]}| j| d }|	||v rH|| n| j| d  q5|S )Nz/ExternKernel.arg_properties should not be emptyzv%s has %d unprovided positional arguments. Will check if they are in the keyword arguments or will use default values.r   r*  )
rx   ry   rz   r  r   r  r  r  r   r  )rM  r   r   n_args
n_pos_argsr   arg_namerq   rq   rr   fill_non_provided_args:  s(   	

z#ExternKernel.fill_non_provided_argsr  rC  c           	      C  s   t jjrig }d }|r"| jr"t| jt|ksJ ddd | jD }t| jD ]?\}}|d ur@||| }|r=|dnd }nt| j| }| jrY|t| jk rY| j| dnd }|	t jj
|| q'|S tt jj
j| jS )NzDnames passed to codegen_const_args does not match self.constant_argsc                 S  s   i | ]}| d |qS r   )r|  r  rq   rq   rr   r   h  r  z3ExternKernel.codegen_const_args.<locals>.<dictcomp>r   )rU   r   r  r  r   r  r   r|  r*  r  r4  val_to_arg_strrn  )	rM  r  r  name_to_arg_propertiesr   r   proptype_r   rq   rq   rr   codegen_const_args\  s0   
zExternKernel.codegen_const_argsc                 C  s   t jjr| jd ur| g | j| j| j}d}n| j}d}g }t|D ]4\}}t jjrN| j	r6|t
| j	k s:J d| j	| d}|t jj|| q$|t jj| q$|rb||   |S )NFTz-Invalid access to ExternKernel.arg_propertiesr   )rU   r   r  r  r  r*  r  r   r   r  r   r|  r  r4  r  r  r  )rM  r*  need_codegen_constant_argsr   r   r   r  rq   rq   rr   codegen_args|  s&   zExternKernel.codegen_argsc                 K  sX   ||v r	| |S || jv r| j |S | jr%|| jv r%| j | dS t| d)zGiven an argument name, queries for values in (in order):
        1. any provided kwargs for this function.
        2. the class self.kwargs member.
        3. any available default arguments in self.allarg_properties.r*  z not in self.allarg_properties)r|  r   r  r  )rM  r  r   rq   rq   rr   get_kwargs_value  s   

zExternKernel.get_kwargs_valuec                 C  s   t jjrR| jd urt| jdkrg S g }| jD ]8}|r |dkr q| |}t|t	j
r1|| q| jrB|| jv rB| j|dnd }|t jj|| q|S dd | j D }|S )Nr   r   r   c                 S  s(   g | ]\}}| d t jj| qS r!  rU   r   r4  r  )r   kr  rq   rq   rr   r     s    z/ExternKernel.codegen_kwargs.<locals>.<listcomp>)rU   r   r  r  r   r  r  r  rx   r   r   r  r  r|  r4  r  r   r  )rM  skip_outr   r  r  r  rq   rq   rr   codegen_kwargs  s,   


zExternKernel.codegen_kwargsc              	   C  st   t jr6tjjs8t|  dkrd S tjj|  }tjj| 	 }|
d|   d| d| d d S d S d S )Nr   zassert_size_stride(r  r  )r1   size_assertsrU   r   r  rQ   r   r4  codegen_shape_tupler  r6  r  )rM  r  r   r   rq   rq   rr   codegen_size_asserts  s   z!ExternKernel.codegen_size_assertsc                 C  s   |   }|  }|g g|fS )zD
        get output sizes and strides, for template_codegen
        )r   r  )rM  _size_striderq   rq   rr   get_group_stride  s   zExternKernel.get_group_stridec                   s  t jj|  }|  }fdd|D }dd tt|D ttt||jdd}dd t	|D fddttD }fd	d|D | 
 }|}t jj||g\}}}	td
\}
 tt| fdd|D }tt||}|t|fS )zC
        Manually get canonicalization of the output index
        c                   rk  rq   )r   r  )r   rq   rr   r     r   z-ExternKernel.canonicalize.<locals>.<listcomp>c                 S  s   g | ]	}t d | qS )r  )rO   r   rq   rq   rr   r     r  T)r  r!  c                 S  r   rq   rq   r   rq   rq   rr   r     r   z-ExternKernel.canonicalize.<locals>.<dictcomp>c                   r   rq   rq   r   r   rq   rr   r     r   c                   r   rq   rq   r   )r  rq   rr   r     r   cc                   r   rq   rq   r  )add_varrq   rr   r     r   )rU   r   r   r   r  r   r   r  r  r   r  r  r:   r{   r   rR   r   r  rz   )rM  r  r-  index_orderr   rY  r   	new_sizesr   r
  r   replacementrq   )r!  r  r   r   rr   canonicalize  s$   
 zExternKernel.canonicalizec                 C  sD   t tj  }| jD ]}|t|O }q	| j D ]}|t|O }q|S r   )r+   r   r   r  maybe_free_unbacked_symbolsr   r|   )rM  r3  r  rq   rq   rr   r    s   
z%ExternKernel.get_unbacked_symbol_usesr   c                   sP   t  dd }d|g}| fddt D 7 }|d j  |S )Nr  zpython_kernel_name=c                   s$   g | ]}|j  d t |j  qS r!  )r   r   )r   r  rQ  rq   rr   r     s    z(ExternKernel.__str__.<locals>.<listcomp>r#  )r   r  fieldsr  rF  rr  )rM  kernel_namerj  rq   rQ  rr   r,    s   
zExternKernel.__str__rq   NNNNrq   Nr  r  r  r   r  r  ru   rv   )r  r  ru   rv   )ru   r  )NNF)r   r  r  r  r  )r  rC  r  )7rm   rn   ro   r  rp   r  r  r{   r   r  r  r  ry   r  r  r  r  r  r  r^  r  r  r  r  r  r  r  r  r  r  r  rM  r  r  r}  r  r  r  r  r  r  r	  r  r  r  r  r  r  r  r  r%  r  r,  rL  rO  rq   rq   r'  rr   r     s   
 


#


m
D
 
	 



" 

	


r  c                      sB   e Zd ZdddZ							dd fddZdd
dZ  ZS )ExternKernelOutru   rv   c                 C  s   |  | g |  | jdd}|  }tjjr!| jdkr!d}n|  }|   }r.|j	ntjj
}|||  | jr@| j nd || d S )NT)r  ztorch::inductor::_mm_plus_mmaoti_torch__mm_plus_mm_out)r  r  r  r  rU   r   r  r  r   r   r  generate_extern_kernel_outr  r  )rM  r  r   r(  r  r   rq   rq   rr   r    s    

zExternKernelOut.codegenrq   Nc
           
        sF   t  d || |||pi d ||||	
 tj| | _tj|  d S r   )r&  r^  r.  rU   r   r/  r   r0  )
rM  r5  r*  r  r   r  r  r  r  r  r'  rq   rr   r^  #  s   zExternKernelOut.__init__rj   c                 C  r  r%  rq   rQ  rq   rq   rr   r  >  r   zExternKernelOut.should_allocater  r)  r  )rm   rn   ro   r  r^  r  rO  rq   rq   r'  rr   r+    s    
r+  c                      s   e Zd Zd	 fddZ  ZS )
RandomSeedscountr   r   r  ru   rv   c                   sF   t t j}t jt|t j|gdg |j|j|ggddtj	j
d d S )Nr  zaten.randint.low_outzat::_ops::randint_low_out::call)r5  r*  r  r  r  r  )r   r%  r  r&  r^  r7  rt  rE  r  randintlow_out)rM  r/  r   limitsr'  rq   rr   r^  C  s   
zRandomSeeds.__init__)r/  r   r   r  ru   rv   rm   rn   ro   r^  rO  rq   rq   r'  rr   r.  B      r.  c                      sH   e Zd ZdddZ						dd fddZdd
dZdd Z  ZS )r  ru   rv   c                 C  sL   |  | g |  |  }tjj| | t| jt	r$| 
| d S d S r   )r  r  r  rU   r   r4  generate_extern_kernel_allocrx   r5  rz  r  rM  r  r   rq   rq   rr   r  W  s   
zExternKernelAlloc.codegenrq   Nc	           	        sL   t  d || |||pi d ||||
 g | _tj| | _tj|  d S r   )	r&  r^  r.  rB  rU   r   r/  r   r0  )	rM  r5  r*  r  r   r  r  r  r  r'  rq   rr   r^  ^  s   zExternKernelAlloc.__init__rj   c                 C  r  r  rq   rQ  rq   rq   rr   r  |  r   z!ExternKernelAlloc.should_allocatec                 C  r  r   r  rQ  rq   rq   rr   r    r   z"ExternKernelAlloc.apply_constraintr  )rq   NNNrq   Nr  )rm   rn   ro   r  r^  r  r  rO  rq   rq   r'  rr   r  V  s    

r  c                      s>   e Zd ZdZd fddZddd	Zd
d ZdddZ  ZS )r?  zP
    An output buffer that represents the mutation of a pre-existing buffer
    mutating_noder  ru   rv   c                   sD   t  jd |d | }tj| |g| _|| _tj| | _	d S r  )
r&  r^  r  rU   r   r  mutation_namesr7  r/  r   )rM  r5  mutated_noder7  mutated_node_namer'  rq   rr   r^    s   zMutationOutput.__init__c                 C  r]  r   )r7  rQ  rq   rq   rr   rc    r_  zMutationOutput.get_defining_opc                 C  r]  r   )r8  rQ  rq   rq   rr   r    r_  z!MutationOutput.get_mutation_namesrj   c                 C  r  r  rq   rQ  rq   rq   rr   r    r   zMutationOutput.should_allocate)r7  r  ru   rv   r  r  )	rm   rn   ro   r7  r^  rc  r  r  rO  rq   rq   r'  rr   r?    s    
r?  c                      sP   e Zd ZU dZi Zded< e	ddddZ	dd fddZdddZ	  Z
S )TMADescriptora$  
    An IR node representing a host-side TMA descriptor in the Triton API
    (the ones obtained via create_{1d,2d}_tma_descriptor calls). Mostly
    useful for user-defined Triton kernels relying on host-side TMA; but
    can, in principle, be used for Inductor's Triton templates, too.
    zdict[Any, TMADescriptor]_CACHENr+  rb   r  list[Union[int, torch.SymInt]]
block_dimselement_sizer  c                 C  s8   t ||||f}|| jvrt||||| j|< | j| S r   )idr<  r;  )r1  r+  r  r>  r?  r  rq   rq   rr   r0    s   

zTMADescriptor.createru   rv   c                   s   t |dv sJ t |t |ksJ |d u r| j}|| _|| _|| _|| _t | j| _|g}g | j| j| j}t 	d t
t|| d|t|d  tj| | _tj|  d S )N)r0   r   r3  )r   r   r^  r+  r  r>  r?  r  r&  r^  r  r9  r   rz   rU   r   r/  r   r0  )rM  r+  r  r>  r?  r*  r  r'  rq   rr   r^    s>   
zTMADescriptor.__init__c                 C     | |  d S r   )generate_tma_descriptorr  rq   rq   rr   r    r  zTMADescriptor.codegenr   )r+  rb   r  r=  r>  r=  r?  r  )
r+  rb   r  r=  r>  r=  r?  r  ru   rv   r  )rm   rn   ro   r7  r<  rp   rM  r0  r^  r  rO  rq   rq   r'  rr   r;    s   
 	-r;  c                      s\   e Zd Zdd ZdddZd fdd	Zdd
dZd fddZdddZdddZ	  Z
S )UserDefinedTritonKernelc                   s   ddl m} ddlm} || j g }g }g }t |ret dr0| fdd j	D  nt ds7J | j
 t drR jD ]}| jj|  qEnt d	sYJ | j  j} j  |||fS )
Nr   )	Autotuner)kernel_side_tablerestore_idxc                 3  s    | ]	} j j| V  qd S r   )r   	arg_namesr   r  rq   rr   r    rw  zBUserDefinedTritonKernel.get_kernel_and_metadata.<locals>.<genexpr>restore_value	reset_idxreset_to_zero)triton.runtime.autotunerrD  *torch._higher_order_ops.triton_kernel_wraprE  
get_kernel
kernel_idxrx   r  r  rF  rI  rJ  r  r   rG  rK  configs)rM  rD  rE  rP  restore_value_argsreset_to_zero_argsr   rq   rH  rr   get_kernel_and_metadata  s,   




z/UserDefinedTritonKernel.get_kernel_and_metadataru   rv   c              	     s  ddl m}  \ }}}| |j||j\}}}fddjD }	t fdd jD }
g }g }g }t	
|	 tt	d|D ]f\}}|| t|trd||  ||  qGt|ttttjfr{|| |t| qG||
v r|d |t qG|d u r	 | r|d |t qG|  qGtd	t| d
| | |j|||||d d d S )Nr   )triton_version_uses_attrs_dictc                   s   i | ]}|  |qS rq   r  )r   r  rQ  rq   rr   r     r  z3UserDefinedTritonKernel.codegen.<locals>.<dictcomp>c                   r  rq   )rG  r   rH  rq   rr   r     r   z3UserDefinedTritonKernel.codegen.<locals>.<listcomp>rf  r  zUnsupported arg type: r  T)	arg_typesraw_argstriton_metar  r   )torch._inductor.utilsrT  rS  !define_user_defined_triton_kernelr   gridr  r+   
constexprsr  r}  r  r   repeatr  rx   rb   r  r   r   rT  rj   r   r   r   r/  rw  r  generate_kernel_callr   )rM  r  rT  rP  rQ  rR  new_namerX  extra_launch_args
named_argsconstexpr_namesr   rV  raw_args_filteredr   r  rq   )r  rM  rr   r    sp   
	



	



zUserDefinedTritonKernel.codegenr  c                   s   t   t| jB S r   )r&  r  r%   r[  rQ  r'  rq   rr   r  O  s   z0UserDefinedTritonKernel.get_unbacked_symbol_usesc                 C  r  r   r*   rQ  rq   rq   rr   r  T  r_  z0UserDefinedTritonKernel.get_unbacked_symbol_defsc                  sZ  g }i }g }   D ]3\}}	t|	tr4t|	}
||v r*tj|
g|| R  }
||
 |
||< q
||	 |	||< q
t	|dksFJ |d 
 _t d tjd|t|| |_|_ \}}}} fdd|jD _ddlm} t	|dkr|d jni } fdd||i  |D _fddjD _tj d S )Nr   r-  c                   s   g | ]}| v r|qS rq   rq   r  kernel_argsrq   rr   r   x  r  z4UserDefinedTritonKernel.__init__.<locals>.<listcomp>)identify_mutated_tensorsc                   r   rq   rq   r   r  rd  rq   rr   r     s    c                   s    g | ]}t t jd | qS r=  )r?  r  r   r@  rQ  rq   rr   r     s    )r  rx   ra   r-  r~  r}  r;  r0  r  r   r   r   r&  r^  r  rz   rO  r[  rS  rG  r  rM  rf  r   mutable_argsr  rU   r   r0  )rM  rO  r[  tma_descriptor_metadatare  r*  r   r  r  r  r   r  rP  r   rf  autotuned_kwargsr'  )re  rM  rr   r^  W  sL   








z UserDefinedTritonKernel.__init__r  c                 C  r  r   )ry   r  rQ  rq   rq   rr   r    r  z#UserDefinedTritonKernel.get_outputsr  c                 C  r]  r   r-  rQ  rq   rq   rr   r     r_  z"UserDefinedTritonKernel.get_devicer  r  r  r  )rm   rn   ro   rS  r  r  r  r^  r  r   rO  rq   rq   r'  rr   rC    s    
K

5rC  c                      H   e Zd ZdZdddZdddZd	d
 ZdddZd fddZ  Z	S )InplaceBernoulliFallbackE
    This needs to be a custom class to handle mutation properly
    ru   rv   c                 C  s   dd | j D \}tjjr)||   d| ddtt| j	 d|j
  d S ||   d| ddtt| j	 d|j
  d S )Nc                 s      | ]}|  V  qd S r   r  r   r   rq   rq   rr   r    r  z3InplaceBernoulliFallback.codegen.<locals>.<genexpr>rm  r  z, NULL)r  )r*  rU   r   r  r6  r  rp  rn  reprr  ending)rM  r  r   rq   rq   rr   r    s   ,,z InplaceBernoulliFallback.codegenrj   c                 C  r  r  rq   rQ  rq   rq   rr   r    r   z(InplaceBernoulliFallback.should_allocatec                 C     | j d  gS r  r*  r  rQ  rq   rq   rr   r    r  z+InplaceBernoulliFallback.get_mutation_namesr  c                 C  r  r   r*   rQ  rq   rq   rr   r    r_  z1InplaceBernoulliFallback.get_unbacked_symbol_defsc                   sV   t  jd t| d| |g||d tj|  tj	| | _
tj|  d S )Nr-  r  )r&  r^  r  r   r.  rU   r   r  r  r/  r   r0  )rM  r  r   r  r'  rq   rr   r^    s   
z!InplaceBernoulliFallback.__init__r  r  r  
rm   rn   ro   r7  r  r  r  r  r^  rO  rq   rq   r'  rr   rl        


rl  c                      sX   e Zd ZdZdddZdddZd	d
 ZdddZd fddZe	ddddZ
  ZS )InplaceCopyFallbackrm  ru   rv   c                 C  s    |   \}}}|||| d S r   )r  codegen_device_copy)rM  r  r  r  non_blockingrq   rq   rr   r    s   zInplaceCopyFallback.codegenrj   c                 C  r  r  rq   rQ  rq   rq   rr   r    r   z#InplaceCopyFallback.should_allocatec                 C  rs  r  rt  rQ  rq   rq   rr   r    r  z&InplaceCopyFallback.get_mutation_namesr  c                 C  r  r   r*   rQ  rq   rq   rr   r    r_  z,InplaceCopyFallback.get_unbacked_symbol_defsc                   sJ   t  jd |||ddd tj|d   tj| | _tj|  d S )Nz
aten.copy_aoti_torch_copy_)r  r  r   )	r&  r^  rU   r   r  r  r/  r   r0  )rM  r5  r*  r  r'  rq   rr   r^    s   zInplaceCopyFallback.__init__Frz  c                   s6    fdd||fD }|f}t t| d||}|S )Nc                   rk  rq   r  rp  r  rq   rr   r     r   z.InplaceCopyFallback.create.<locals>.<listcomp>r-  )rx  r  r   )r1  r  r  rz  r*  r  r  rq   r  rr   r0    s   zInplaceCopyFallback.creater  r  r  r  )rz  rj   )rm   rn   ro   r7  r  r  r  r  r^  rM  r0  rO  rq   rq   r'  rr   rx    s    


rx  c                   @  s@   e Zd ZdZdddZdddZd	d
 ZdddZdddZdS )MutatingFirstArgExternKernelrm  ru   rv   c                 C  sJ   g dd | j D tt| j}||   dd| d|j  d S )Nc                 s  rn  r   ro  rp  rq   rq   rr   r    r  z7MutatingFirstArgExternKernel.codegen.<locals>.<genexpr>rm  r  r  )r*  rn  rq  r  r6  r  rp  rr  )rM  r  argrefsrq   rq   rr   r    s   
z$MutatingFirstArgExternKernel.codegenrj   c                 C  r  r  rq   rQ  rq   rq   rr   r    r   z,MutatingFirstArgExternKernel.should_allocatec                 C  rs  r  rt  rQ  rq   rq   rr   r    r  z/MutatingFirstArgExternKernel.get_mutation_namesr  c                 C  r  r   r*   rQ  rq   rq   rr   r    r_  z5MutatingFirstArgExternKernel.get_unbacked_symbol_defsc                 C  r  r%  rq   rQ  rq   rq   rr   has_side_effects  r   z-MutatingFirstArgExternKernel.has_side_effectsNr  r  r  )	rm   rn   ro   r7  r  r  r  r  r~  rq   rq   rq   rr   r|    s    

	
r|  c                      s   e Zd Zd fddZ  ZS )ResizeStorageBytesru   rv   c                   s   t |ts	J dt jd t| d| |g|fd tj	|
  tj| | _tj|  d| _d| _tjj|j
  d S )NzTODO: dynamic shapesr-  )r  z"inductor_ops.resize_storage_bytes_z&torch::inductor::resize_storage_bytes_)rx   r   r&  r^  r  r   r.  rU   r   r  r  r/  r   r0  r  r  never_reuse_buffersrq  r4  )rM  variabler  r'  rq   rr   r^  	  s   
zResizeStorageBytes.__init__r  r3  rq   rq   r'  rr   r    r4  r  c                      s(   e Zd Zd fddZd	ddZ  ZS )
SetSourceTensorKernelru   rv   c                   s   |   t j| ||gdtjjjjd t	j
j|j  t	j
j|  t	j
j|   | }tt|d|| tt|d|| g| _d S )Nz!torch.ops.aten.set_.source_Tensor)r  r  r-  )r  r&  r^  r   r   rS   r  set_source_TensorrU   r   r  rq  r4  r  r   r?  r  r  )rM  self_tensorstorage_tensorr   r'  rq   rr   r^    s   

zSetSourceTensorKernel.__init__re  c                 C  s   | j d  | j d  gS rv  rt  rQ  rq   rq   rr   r  +  s   z2SetSourceTensorKernel.get_inputs_that_alias_outputr  r  )rm   rn   ro   r^  r  rO  rq   rq   r'  rr   r    s    r  c                      sP   e Zd ZdZdddZdddZd	d
 ZdddZdddd fddZ  Z	S )ScatterFallbackz
    This needs to be a custom class to handle mutation properly.
    This class handles both aten.scatter_ and aten.scatter_reduce_.
    It also handle the case `src` being a scalar properly.
    ru   rv   c              
   C  s   | j d }tjjrddd}||v r|| }| jr%dd | jD \}}}ndd | jD \}}| jd }|||| jd	 ||g| j| j	| j|| 
  d S )
Nr  rv  ru  )rq  multiplyc                 s  rn  r   ro  rp  rq   rq   rr   r  ?  r  z*ScatterFallback.codegen.<locals>.<genexpr>c                 s  rn  r   ro  rp  rq   rq   rr   r  A  r  r0   r   )r   rU   r   r  src_is_tensorr*  r  generate_scatter_fallbackr  r  r  )rM  r  r  get_operator_enumr   r   r  rq   rq   rr   r  6  s$   


zScatterFallback.codegenrj   c                 C  r  r  rq   rQ  rq   rq   rr   r  M  r   zScatterFallback.should_allocatec                 C  rs  r  rt  rQ  rq   rq   rr   r  P  r  z"ScatterFallback.get_mutation_namesr  c                 C  r  r   r*   rQ  rq   rq   rr   r  S  r_  z(ScatterFallback.get_unbacked_symbol_defsNTr  include_selfr'  r   r  r  r  c          
   
     s   t |t _ jr fdd|||fD }|f}	n fdd||fD }||f}	t jd t| d ||	||dt|ddg|d t	j
|  t	j
  _t	j
  d S )	Nc                   rk  rq   r  rp  rQ  rq   rr   r   e  r   z,ScatterFallback.__init__.<locals>.<listcomp>c                   rk  rq   r  rp  rQ  rq   rr   r   h  r   r-  r  r  r  )r  r  r  )rx   ra   r  r&  r^  r  r   r.  r   rU   r   r  r  r/  r   r0  )
rM  r  r   r'  r   r  r  r  tensorsr  r'  rQ  rr   r^  V  s&   
zScatterFallback.__init__r  r  r  )r'  r   r  r  r  rj   ru   rv   rv  rq   rq   r'  rr   r  /  s    


r  c                      rk  )IndexPutFallbackzQ
    This needs to be a custom class to handle mutation and indices properly
    ru   rv   c           	      C  s   dd | j D ^}}}g }t|}t| jD ]\}}| j| d ur)|t| q|tjjj	 q|j
|  |||g|  R   d S )Nc                 s  rn  r   ro  rp  rq   rq   rr   r    r  z+IndexPutFallback.codegen.<locals>.<genexpr>)r*  r  r   r  r  r  rU   r   r4  r  generate_index_put_fallbackr  r  )	rM  r  r   r|   valid_indicesr  iter_valid_indicesr   r   rq   rq   rr   r    s   zIndexPutFallback.codegenrj   c                 C  r  r  rq   rQ  rq   rq   rr   r    r   z IndexPutFallback.should_allocatec                 C  rs  r  rt  rQ  rq   rq   rr   r    r  z#IndexPutFallback.get_mutation_namesr  c                 C  r  r   r*   rQ  rq   rq   rr   r    r_  z)IndexPutFallback.get_unbacked_symbol_defsc           	   	     s   | _ dd |D } fdd||g|D }d}t jd t| d ||fd||d tj j	d 
  tj  _tj  d S )	Nc                 S  s   g | ]}|d ur|qS r   rq   r   rq   rq   rr   r     r   z-IndexPutFallback.__init__.<locals>.<listcomp>c                   rk  rq   r  r  rQ  rq   rr   r     r   aoti_torch_index_put_outr-  zaten.index_put_)r  r  r  r   )r  r&  r^  r  r   r.  rU   r   r  r*  r  r/  r   r0  )	rM  r  r   r  r|   
accumulater  r  r  r'  rQ  rr   r^    s    	zIndexPutFallback.__init__r  r  r  rv  rq   rq   r'  rr   r  z  rw  r  c                   @  s"   e Zd Zedd ZdddZdS )	
DeviceCopyc                 C  s   |  stdd | D rtjjs||S tj	| tj	|
  td |f}tt|| | d| |g|S )Nc                 s  s    | ]	}|t jjv V  qd S r   )rU   r   r  r  rq   rq   rr   r    r  z$DeviceCopy.create.<locals>.<genexpr>zDeviceCopy in input programr  )r  r  r[  r1   aot_inductoruse_runtime_constant_foldingr  rU   r   add_device_infor   rI   r  r   r   r   r}  )r1  r   r   rz  r  rq   rq   rr   r0    s(   

zDeviceCopy.createru   rv   c                 C  s\   |   }t|dksJ | jr||d | j |d  d S ||d |  |d  d S )Nr   r   r0   )r  r   r  ry  r  r6  rq   rq   rr   r    s   zDeviceCopy.codegenNr  )rm   rn   ro   rM  r0  r  rq   rq   rq   rr   r    s    
r  c                      sJ   e Zd ZdZdddZdddZd fd
dZdddZdddZ  Z	S )r~   z;
    The result of a call to aten._local_scalar_dense.
    ru   r  c                 C  r  r   r*   rQ  rq   rq   rr   rZ    r_  zDynamicScalar.get_readsrj   c                 C  r  r  rq   rQ  rq   rq   rr   r    r   zDynamicScalar.should_allocaterv   c                   s<   |   t d ttdd| |g || _|| _d S Nr  r-  )	r  r&  r^  r  r   r   r.  symkeypath)rM  r  r  r4  r'  rq   rr   r^    s   
zDynamicScalar.__init__r  c                 C  s   t | jgS r   )r+   r  rQ  rq   rq   rr   r    r   z&DynamicScalar.get_unbacked_symbol_defsc                 C  rA  r   )codegen_dynamic_scalarr  rq   rq   rr   r    r  zDynamicScalar.codegenr  r  r  r  )
rm   rn   ro   r7  rZ  r  r^  r  r  rO  rq   rq   r'  rr   r~     s    


r~   c                      sR   e Zd ZdZdddZdddZd fd
dZdddZdd ZdddZ	  Z
S )r   z5
    The result of a call to aten._assert_scalar
    ru   r  c                 C  r  r   r*   rQ  rq   rq   rr   rZ    r_  zAssertScalar.get_readsrj   c                 C  r  r  rq   rQ  rq   rq   rr   r    r   zAssertScalar.should_allocaterv   c                   s,   t  d ttddg  || _|| _d S r  )r&  r^  r  r   r   scalarr  )rM  r  r  r'  rq   rr   r^    s   
zAssertScalar.__init__c                 C  r  r%  rq   rQ  rq   rq   rr   r~    r   zAssertScalar.has_side_effectsc                 C  r  r   )r%   r  rQ  rq   rq   rr   r     r  z%AssertScalar.get_unbacked_symbol_usesc              	   C  s   t jsd S tt|  }tjjr3d| d}tjjj	| j
dd}|d| d| j d| d d S tjjj| j
dd}|d	| d
 |dt| j d ||   d d S )Nzstd::to_string(r  F)r  zif (!(z()) { throw std::runtime_error("Expected z but received " + z); }zif not rc  z    raise RuntimeError(z = None)r1   scalar_assertsr  r  r  rU   r   r  r4  codegen_cpp_sizevarr  r6  r  codegen_python_sizevarrq  r  )rM  r  symbol
symbol_strsizevarrq   rq   rr   r    s"   zAssertScalar.codegenr  r  r  )rm   rn   ro   r7  rZ  r  r^  r~  r  r  rO  rq   rq   r'  rr   r     s    


r   c                   @  s   e Zd ZU ded< ded< dS )ExternKernelNoder   r   zexport_schema.Noder   Nrl   rq   rq   rq   rr   r     s   
 r  c                      s   e Zd Z	d"ddd# fddZd#ddZd$d
dZdd Zedd Zdd Z	dd Z
dd Zdd Zd#ddZed%ddZedd Z fd d!Z  ZS )&FallbackKernelNr  ru   rv   c                  sN  t  j|t|t||d d _| _t|tjjtjj	fs,J d| dt
| d| _| _|d u r8i n| _tj j g  _g  _t jtjj	rRd S d j v r[d S  jj}tjj jrs j|d   d S |jrt|std|   j j\}	}d fdd}
tjj ||	|D ]	\}}|
|| qd S )Nru  Fz#Fails to create FallbackKernel for r  z not supported_c10d_functionalr   z'NYI: Can't generate FallbackKernel for ru   rv   c                   s   t  jtjrt |ttfsJ t jrt |ttfrJ |d u r%d S  jd u r,d S d fdd}t	 jrK|d urG|D ]}|| q@d S d S t jsSJ || d S )Nru   rv   c                   s>   j |    jjrjtt|  d|  d S d S r  )	alias_namesr  r  
alias_infois_writer  r?  r  r   r   )inforM  rq   rr   	add_alias  s   zPFallbackKernel.__init__.<locals>.handle_aliasing_and_mutation.<locals>.add_aliasr  )
rx   r   r   ListTypery   rz   library_utilsis_tensor_like_typer  is_tensorlist_like_type)r  r  r  optional_tensor_argrQ  )r  rr   handle_aliasing_and_mutations  s"   

z=FallbackKernel.__init__.<locals>.handle_aliasing_and_mutationr  )!r&  r^  rz   use_runtime_dispatchr  rx   r   r  r  r  r   r  r  r   rU   r   warn_fallbackr  r  r8  r   r  _libraryr  mutates_and_returns_first_argr  r  
is_mutabler   rw  r*  r  
zip_schema)rM  r5  r  r  nontensor_argsr  r   r  schemar   r  r  r  r'  rQ  rr   r^  '  sN   zFallbackKernel.__init__c                 C  s   | |  | jt| dd S Nr  )(codegen_unbacked_symbol_defs_for_outputsr  rB  r   r  rq   rq   rr   codegen_unbacked_symbol_defs  s   z+FallbackKernel.codegen_unbacked_symbol_defsr  c                 C  :   t | dd  }rttjjj|}|d usJ | S t S r  r   r'   rU   r   r   r   r  r+   rM  r  resolvedrq   rq   rr   r       
z'FallbackKernel.get_unbacked_symbol_defsc                   s   t jG dd d  fdd| jD }| || j\}}tjjr=t| j	t
jjr=| ||}dd t| j	jj|D }ndd |D }| j| |S )Nc                   @      e Zd ZU ded< dddZdS )	z)FallbackKernel.codegen_args.<locals>.Shimr
   refru   r   c                 S  r]  r   )r  rQ  rq   rq   rr   rL    r_  z2FallbackKernel.codegen_args.<locals>.Shim.__repr__Nr  )rm   rn   ro   rp   rL  rq   rq   rq   rr   Shim     
 r  c                   s   g | ]} |  qS rq   ro  r  r  rq   rr   r     r   z/FallbackKernel.codegen_args.<locals>.<listcomp>c                 S  s"   g | ]\}}t jj||jqS rq   )rU   r   r4  r  r  )r   paramr   rq   rq   rr   r     s    c                 S  r  rq   r  r  rq   rq   rr   r     r  )r  	dataclassr*  r  r  rU   r   r  rx   r  r   r  r  r  r   r  r  r   rz  )rM  r  r   r   rq   r  rr   r    s   zFallbackKernel.codegen_argsc                 C  s   | rdd | D }|d S t |tjr|jS t |ttfrItdd |D }dd |D }t|dkr7|d S |D ]}t|j	rD|  S q9|d S d S )Nc                 S  s   g | ]
}|  r|  qS rq   r  r  rq   rq   rr   r     r  z.FallbackKernel.find_device.<locals>.<listcomp>r   c                 s  s    | ]	}t d |V  qd S r   )r  find_devicer  rq   rq   rr   r    rw  z-FallbackKernel.find_device.<locals>.<genexpr>c                 S  s   g | ]}|r|qS rq   rq   )r   r   rq   rq   rr   r     r   r0   )
rx   r   r  r   ry   rz   r+   r   rM   r   )r  r  devices
device_setr   rq   rq   rr   r    s$   
zFallbackKernel.find_devicec                 C  s"   t | jtjjr
dS t| j S r  )rx   r  r   r  r  r#   r  rQ  rq   rq   rr   r~    s   zFallbackKernel.has_side_effectsc                 C  r]  r   )r  rQ  rq   rq   rr   r    r_  z+FallbackKernel.get_inputs_that_alias_outputc                 C  s   t | jdks	J | jS r  )r   r8  rQ  rq   rq   rr   r    s   z!FallbackKernel.get_mutation_namesc                   sP  t d j ttsJ jj\}	|}fddj
D }j}tjjs9g ||S td d }|||}dd  t|tjjjr]||d |d j}n|jj}t|dkr|jrmjnj}|d j} ||g}	n fddt|jD }	t tjj ||	i d	d
}
tjj !|
 g ||S )Nz4Extern kernel node added for node %s with target %s.c                   s   g | ]}j |fi  qS rq   rU  rg  )r   rM  rq   rr   r     r  z<FallbackKernel.export_extern_kernel_node.<locals>.<listcomp>c                 S  s   t | tjr(|}t |ttfrt|dksJ |d }tjjtj	|
 ddS t | tjrBt |  tjrBtjjdd |D dS tdt|  )	Nr0   r   r   )	as_tensorc                 S  s   g | ]
}t j| d qS )r   )export_schemaTensorArgumentr  )r   r   rq   rq   rr   r     s    zZFallbackKernel.export_extern_kernel_node.<locals>.handle_single_output.<locals>.<listcomp>)
as_tensorszUnsupported return type )rx   r   
TensorTypery   rz   r   r  Argumentr0  r  r  r  getElementTypeRuntimeErrorr   )return_typeoutputr   rq   rq   rr   handle_single_output  s"   
zFFallbackKernel.export_extern_kernel_node.<locals>.handle_single_outputr   r0   c                   s   g | ]
\}} |j |qS rq   r  )r   return_schemar  )r  rq   rr   r     s    
)r  r*  rB  metadata)r   r   )"r  r  r  r  rx   r  r  r*  r  r  r  rU   r   aot_moder   serialize_inputsr   _higher_order_ops	torchbindCallTorchBindr  returnsr  r   rB  r  r  r   r  r  rV   r   extern_kernel_nodesr  )rM  r   ordered_kwargsr  
serializernamed_argumentsr  rB  r  output_argumentsr   rq   )r  r   rM  rr   export_extern_kernel_node  sL   




z(FallbackKernel.export_extern_kernel_nodec                   s:  j }|jdkr+t|tjjsJ tjjr*ddl	m
} t||vr*td| d_n|jdkr:t|tjjs9J ntjjrAd_dfd	d
}ddd  jrZ|  n<g   }tjjrt|tjjrt fddt||jjD r|  ntjj| tjtr  d S )Nr  r   )inductor_fallback_opszG%s is missing a c-shim implementation, using proxy executor as fallbackT
_quantizedru   rv   c               	     sD   d }    }   j j|  j| jr j d S  j d S r   )r  ,generate_fallback_kernel_with_runtime_lookupr  r  r  r  rB  r  )r   exported_argsr  rq   rr   do_runtime_dispatchF  s   
z3FallbackKernel.codegen.<locals>.do_runtime_dispatchr   torch.JitTyperj   c                 S  s(   t | tjpt | tjot |  tjS r   )rx   r   
NumberTypeOptionalTyper  r  rq   rq   rr   	is_numberU  s   z)FallbackKernel.codegen.<locals>.is_numberc                 3  s&    | ]\}}d |v o |j V  qdS )zc10::complexNr  )r   arg_strop_arg)r  rq   rr   r  c  s
    
z)FallbackKernel.codegen.<locals>.<genexpr>r  )r   r  ru   rj   )r  r  rx   r   r  r  rU   r   r  torchgen.aoti.fallback_opsr  r   r  r!  r  r  r  r  rs  r   r  r  r4  generate_fallback_kernelr5  rz  r  r  )rM  r  r  r  r  r   rq   )r  rM  r  rr   r  /  sF   




zFallbackKernel.codegenr  r   c                 C  s"   t | j| jt|  t|  S r   )r7  r   r   rG   r   r   )r  rq   rq   rr   tensor_to_layoutu  s   

zFallbackKernel.tensor_to_layoutc                   s   t jf}||vrtjjnt }|  j|g|R i |\}}}}	}
W d    n1 s-w   Y   ||}|d u rJ t|d||||	|
dn|sPJ d t	|d||||	|
d fdd|g }t
|tttfrw|_|S |g_|S )Nr-  r  z"Not sure where to find device infoc                   s   t ttfrt fddttD S t tr, fdd D S t tj	r;t
 S t trBS t tjrLjjS d u sZJ dt dd S )Nc                 3  s,    | ]} | t |fg V  qd S r   r   r   generate_outputr  r  rq   rr   r    s
    
zAFallbackKernel.create.<locals>.generate_output.<locals>.<genexpr>c                   s*   i | ]\}}| |t |fg qS rq   r  )r   r  r
  r  rq   rr   r     r8  zBFallbackKernel.create.<locals>.generate_output.<locals>.<dictcomp>zFallbackKernel output type z is not supported)rx   ry   rz   r   r   r   r{   r  r   r  MultiOutputr  r   SymIntr   ru  )r  r  r1  r  packed)r  r  rr   r    s,   



z.FallbackKernel.create.<locals>.generate_output)r  *_fused_moving_avg_obs_fq_helper_functionalrU   r   r  r   r  r  r  rs  rx   ry   rz   r{   rB  )r1  r  r   r   fake_incorrect_kernelscontextr  r  r  r  r  r   rB  rq   r  rr   r0  ~  sL   	
	
zFallbackKernel.createc                   s
   t   S r   )r&  r  rQ  r'  rq   rr   r    r  zFallbackKernel.apply_constraintr   r  r  )r  r   )rm   rn   ro   r^  r  r  r  r  r  r~  r  r  r  r  r  rM  r0  r  rO  rq   rq   r'  rr   r  &  s(    	
l




OF
Fr  c                      s<   e Zd ZdZdddZdddZd	d
d fddZ  ZS )ComplexViewz9View a complex number as two dtyped numbers or vice versaru   rj   c                 C  r  r  rq   rQ  rq   rq   rr   r    r   zComplexView.should_allocatere  c                 C  rs  r  rt  rQ  rq   rq   rr   r    r  z(ComplexView.get_inputs_that_alias_outputNr  rv   c                  s   t  j||||||d d S )Nr  )r&  r^  )rM  r5  r  r  r  r  r  r'  rq   rr   r^    s   

zComplexView.__init__r  r  r  )rm   rn   ro   r7  r  r  r^  rO  rq   rq   r'  rr   r     s    

r   c                   @  r  )	rs  r  r   ru   r  c                 C  r]  r   r-  rQ  rq   rq   rr   r     r_  zMultiOutputLayout.get_deviceNr  )rm   rn   ro   rp   r   rq   rq   rq   rr   rs    r  rs  c                      sN   e Zd Zdd ZdddZd fddZdddZdddZdddZ  Z	S )r  c                 C  s   t |dkrW|d \}}t|tr!| | d| d|dd  S t|tr=tjj|| 	 t
|}| ||dd  S t|trR| | d| d|dd  S td||S )Nr   []r0   z['z']znon supported index type: )r   r  ry   codegen_list_tuple_accessrz   rU   r   r4  codegen_tuple_accessr  r   r{   r  )rM  basenamer  ityper   tuple_accessrq   rq   rr   r    s   
 

 
z%MultiOutput.codegen_list_tuple_accessru   rv   c                 C  s2   | |  | | jd  | j | | d S r  )codegen_multi_outputr  r  r*  r  r  r  rq   rq   rr   r    s
   zMultiOutput.codegenr5  r  r  list[tuple[Any, ...]]c                   s8   t  d ||gd tj| | _tj|  || _d S rT  )r&  r^  rU   r   r/  r   r0  r  )rM  r5  r|  r  r'  rq   rr   r^    s   
zMultiOutput.__init__r  c                 C  s   | j d  S r  )r*  r  rQ  rq   rq   rr   r    r  z$MultiOutput.get_unbacked_symbol_usesrj   c                 C  s&   t | jdkrt| jd trdS dS )Nr0   r   TF)r   r*  rx   rr  rQ  rq   rq   rr   r    s
   zMultiOutput.should_allocatere  c                 C  s   dd | j D S )Nc                 S  s.   g | ]}t |trt| d kr| qS r   )rx   r  r   r  r  r   r5  rq   rq   rr   r   #  s    z<MultiOutput.get_inputs_that_alias_output.<locals>.<listcomp>)r*  rQ  rq   rq   rr   r  "  s   z(MultiOutput.get_inputs_that_alias_outputr  )r5  r  r  r	  ru   rv   r  r  r  )
rm   rn   ro   r  r  r^  r  r  r  rO  rq   rq   r'  rr   r    s    


r  c                   @  s  e Zd ZU dZded< dtddZdud	d
ZdvddZdwddZdxddZ	dyddZ
dzd{ddZd|dd Zd}d!d"Zd~d#d$Zd}d%d&Z	'ddd+d,Zdd-d.Zdd1d2Z	'ddd4d5Zdd7d8Zdd:d;Zdd<d=Zdd?d@ZddBdCZddEdFZdtdGdHZdtdIdJZddMdNZddPdQZdydRdSZddTdUZddVdWZ ddYdZZ!dd\d]Z"dd_d`Z#dzddcddZ$e%ddfdgZ&ddidjZ'ddkdlZ(ddndoZ)e%dpdq Z*dydrdsZ+e+Z,dS )r  zC
    TensorBox / StorageBox allow in-place mutation of Tensors
    rb   r4  ru   rj   c                 C  r  r   r  rQ  rq   rq   rr   r  5  r  z!MutableBox.has_exceeded_max_readsr  c                 C  r  r   r  rQ  rq   rq   rr   r   8  r  zMutableBox.get_devicer  c                 C  r  r   r=  rQ  rq   rq   rr   r  ;  r  zMutableBox.make_loaderr  c                 C  r  r   )r4  r  rQ  rq   rq   rr   r  >  r  zMutableBox.make_indexerr!  c                 C  r  r   )r4  r  rQ  rq   rq   rr   r  A  r  zMutableBox.get_strider   c                 C  r  r   r  rQ  rq   rq   rr   r  D  r  zMutableBox.get_nameNr  r  c                 C  r  r   )r4  r  r  rq   rq   rr   r  G  r   zMutableBox.has_large_inner_fnr  r   rv   c                 C  r  r   r  r  rq   rq   rr   r  J  r   zMutableBox.mark_reusec                 C  r  r   r  rQ  rq   rq   rr   r  M  r  zMutableBox.realize_hintc                 C  r  r   )r4  r  rQ  rq   rq   rr   r  P  r  zMutableBox.unwrap_viewc                 C  r  r   )r4  r  rQ  rq   rq   rr   r  S  r  zMutableBox.freeze_layoutFr   r  r  c                 C     | j ||S r   )r4  r  r  rq   rq   rr   r  V  r  z*MutableBox.freeze_layout_with_stride_orderc                 C  r  r   )r4  r  r  rq   rq   rr   r  [  r   z(MutableBox.freeze_layout_with_fill_orderr   r  c                 C  r  r   )r4  r  r  rq   rq   rr   r  ^  r   z(MutableBox.freeze_layout_with_same_orderr  c                 C  r  r   )r4  r  r  rq   rq   rr   r  a  r  z+MutableBox.freeze_layout_with_exact_stridesr  c                 C  r  r   )r4  r  rQ  rq   rq   rr   r  f  r  zMutableBox.get_read_writesr  c                 C  r  r   r  rQ  rq   rq   rr   rZ  i  r  zMutableBox.get_readsc                 C  r  r   r  rQ  rq   rq   rr   r  l  r  zMutableBox.num_readsr]   c                 C  r  r   r  rQ  rq   rq   rr   r  o  r  zMutableBox.get_storage_numelr  c                 C  r  r   r$  rQ  rq   rq   rr   r  r  r  zMutableBox.get_reduction_typer  c                 C  r  r   r#  rQ  rq   rq   rr   r  u  r  zMutableBox.get_reduction_sizec                 C  r  r   r  rQ  rq   rq   rr   r  x  r  zMutableBox.is_externc                 C  r  r   )r4  r  rQ  rq   rq   rr   r  {  r  zMutableBox.is_no_opr   r  c                 C  r  r   r&  r  rq   rq   rr   r  ~  r   zMutableBox.constant_to_devicere  c                 C  r  r   )r4  r  rQ  rq   rq   rr   r    r  zMutableBox.get_mutation_namesc                 C  r  r   )r4  r  rQ  rq   rq   rr   r    r  zMutableBox.get_operation_namec                 C  r  r   )r4  r  rQ  rq   rq   rr   r    r  z'MutableBox.get_inputs_that_alias_outputc                 C  r  r   r  rQ  rq   rq   rr   r    r  zMutableBox.realizer  c                 C  r  r   r  rQ  rq   rq   rr   r    r  z#MutableBox.get_unbacked_symbol_usesrS  c                 C  r  r   r  rQ  rq   rq   rr   r[    r  zMutableBox.get_read_namesrb  c                 C  r  r   )r4  rc  rQ  rq   rq   rr   rc    r  zMutableBox.get_defining_opr  r  c                 C  r  r   )r4  r  r  rq   rq   rr   r    r   zMutableBox.codegen_referencer  c                 C  r  r   r4  r  rQ  rq   rq   rr   r5    s   
zMutableBox.layoutrz  c                 C  r  r   r  rQ  rq   rq   rr   r     r  zMutableBox.get_layoutc                 C  r  r   r  rQ  rq   rq   rr   r    r  zMutableBox.get_output_specr  c                 C  r  r   r;  rQ  rq   rq   rr   r     r  zMutableBox.get_sizec                 C  r,  r   )r4  r   rQ  rq   rq   rr   r     r  zMutableBox.dtypec                 C  sn   t | jtrt| j dt| jj d}d}| jj}nt| j d}| j}d}|tt||g}d|S )Nrm  z))r  
)rx   r4  r  r   rm   ro  r   rp  )rM  line0endlr  rj  rq   rq   rr   r,    s   


zMutableBox.__str__r  r  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )-rm   rn   ro   r7  rp   r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  rZ  r  r  r  r  r  r  r  r  r  r  r  r  r[  rc  r  r   r5  r   r  r   r   r,  rL  rq   rq   rq   rr   r  -  s\   
 
































r  c                   @  s   e Zd Zedd ZdS )ra   c                 C  s   t | tr| S tt| S r   )rx   r   ra   r  )r4  rq   rq   rr   r0    s   
zTensorBox.createN)rm   rn   ro   r  r0  rq   rq   rq   rr   ra     s    c                   @  sT   e Zd Zdd Zdd ZdddZdd
dZdddZdd ZdddZ	dd Z
dS )r  c                 C  s&   t | jttfr| j tjjv S dS r  )rx   r4  r  r9  r  rU   r   graph_inputsrQ  rq   rq   rr   r    s   zStorageBox.is_input_bufferc                 C  s   t | jto| j tjjv S r   )rx   r4  rf  r  rU   r   r  rQ  rq   rq   rr   r    s   zStorageBox.is_module_bufferru   r  c                 C  s   t | jtttttfr| j S t | jtt	t
tfs!J t| j| j }| j }td t| j | j | j d| jd| _tj| j| j_tj| j | j| j_|| j_|| j_| jjS )Nr  r  )rx   r4  r  r-  r  r9  r)  r  rV  r  ru  r  r   ra  r^  r   r   r   r   rU   r   r/  r   r0  rB  rF  rD  )rM  rF  rD  rq   rq   rr   r    s<   



	
zStorageBox.realizerv   c                 C  s4   t | jttfr| j jdkr|   dS dS dS )zL
        Called on buffers we expect to be forced to realize later.
        r0   N)rx   r4  rV  r  rA  nontrivial_read_countr  rQ  rq   rq   rr   r    s   zStorageBox.realize_hintrj   c                 C  s"   t | jto|  tjkp|  S r   )rx   r4  rV  r  r1   realize_acc_reads_thresholdr  rQ  rq   rq   rr   r    s   z!StorageBox.has_exceeded_max_readsc                   sh   |dkr2t | jttfr2t| jr'| j  ddg}t fdd|D r'dS |  tj	kp1| 
 S dS )zj
        A heuristic to decide if we should realize a tensor
        that is used multiple times.
        r0   expsigmoidc                 3  s    | ]}| j v V  qd S r   )used_opsr  opcountrq   rr   r    r[  z5StorageBox.should_realize_on_reuse.<locals>.<genexpr>TF)rx   r4  rV  r  r  rA  rs  r  r1   realize_reads_thresholdr  )rM  r  	heavy_opsrq   r  rr   should_realize_on_reuse  s   

z"StorageBox.should_realize_on_reuser  r   c                 C  s   |  |r|   d S d S r   )r  r  r  rq   rq   rr   r    s   
zStorageBox.mark_reusec                 C  r  r   r  rQ  rq   rq   rr   r    r  zStorageBox.num_readsNr  r  r  r  )rm   rn   ro   r  r  r  r  r  r  r  r  rq   rq   rq   rr   r    s    

!


r  c                   @  s*   e Zd ZU ded< ded< dZded< dS )Subgraphr   r   ztorch.fx.GraphModulegraph_moduleNzOptional[GraphLowering]r   )rm   rn   ro   rp   r   rq   rq   rq   rr   r    s   
 r  buffersr+  c                 C  s,   dd | D } t tdd | D t | k S )Nc                 S  s"   g | ]}t |tr| n|qS rq   )rx   r9  r  r   r  rq   rq   rr   r   &  s    z(_has_aliased_buffers.<locals>.<listcomp>c                 s  r  r   )r@  r  rq   rq   rr   r  +  r  z'_has_aliased_buffers.<locals>.<genexpr>)r   r+   )r  rq   rq   rr   _has_aliased_buffers%  s   r  c                      s\   e Zd ZU dZded< dZded< dZded< d fddZedddZ	dddZ
  ZS )InvokeSubgraphNOptional[Subgraph]subgraphzOptional[list[TensorBox]]operandsOptional[list[MultiOutput]]rB  r  list[TensorBox]r5  rs  ru   rv   c                   s6   t  jd ||d || _tj| | _tj|  d S r  )r&  r^  r"  rU   r   r/  r   r0  )rM  r"  r#  r5  r'  rq   rr   r^  4  s   zInvokeSubgraph.__init__c                   s\  t jjjd }dd |D } fdd|D }dd }g }t|D ] \}}t|tr0|| q!|||  }	| 	||	 q!|}|jd u rqt jj
|j||jd|_t |j |jj|  W d    n1 slw   Y  |jj}
d }|D ]}t|ts| } nqy|d usJ t||t|dd	dfddfddt|
D }
|
_|
S )Nr  c                 S     g | ]}|j d  qS r  rx  r  rq   rq   rr   r   D  r   z)InvokeSubgraph.create.<locals>.<listcomp>c                   rk  rq   r  r  r  rq   rr   r   I  r   c                 S  s   dd | D S )Nc                 S  s$   g | ]}t |tjr|jjn|qS rq   )rx   r   r  r   ru  r   rq   rq   rr   r   L  r  zBInvokeSubgraph.create.<locals>.handle_sym_expr.<locals>.<listcomp>rq   r  rq   rq   rr   handle_sym_exprK  r  z.InvokeSubgraph.create.<locals>.handle_sym_exprgmexample_inputssubgraph_namer-  )r"  r#  r5  r  rb   indr   c                   sH   t | ttfr	| S tt|  |  |  |  | 	 j
d t|fgS )Nr  )rx   r   r  r  r7  r   r   r   r  r   r8  ry   )r  r-  )invoke_subgraphrq   rr   create_outputs  s   z,InvokeSubgraph.create.<locals>.create_outputc                   s   g | ]	\}} ||qS rq   rq   r   r   r  )r/  rq   rr   r     r  )r  rb   r-  r   )rU   r   rF  r   r   rx   r   r  r   r  make_subgraphr  r   set_graph_handlerrungraph_outputsr   r   rs  rB  )r1  r"  r#  fx_operandsfake_operandsr(  new_operandsr   operandexample_striderB  r   rq   )r1  r/  r.  rr   r0  @  sJ   


zInvokeSubgraph.createc                 C  rA  r   )codegen_invoke_subgraphr  rq   rq   rr   r    r  zInvokeSubgraph.codegen)r"  r  r#  r%  r5  rs  ru   rv   )r"  r  r  )rm   rn   ro   r"  rp   r#  rB  r^  rM  r0  r  rO  rq   rq   r'  rr   r   .  s   
 Fr   c                      s~   e Zd ZU dZded< dZded< dZded< dZded< dZd	ed
< d  fddZ	e
d!ddZd"ddZd#ddZ  ZS )$ConditionalNr   	predicate7Optional[list[Union[TensorBox, ShapeAsConstantBuffer]]]r#  r!  true_subgraphfalse_subgraphr$  rB  rb   -list[Union[TensorBox, ShapeAsConstantBuffer]]r  r5  rs  r  ,Optional[dict[sympy.Symbol, pytree.KeyPath]]ru   rv   c           	        sj   || _ || _|| _|| _t|g| \}}t jd |||d |d ur&|| _tj	
| | _tj	|  d S N)r   r5  r*  r  )r<  r#  r>  r?  _split_by_sym_typer&  r^  r  rU   r   r/  r   r0  )	rM  r<  r#  r>  r?  r5  r  sym_argsr  r'  rq   rr   r^    s   	zConditional.__init__ra   true_fnfalse_fnc              	     s   |}fdd|D }tjjjd }dd |D }||fD ]/}|jd u rOtjj|j||jd|_t|j |jj	|  W d    n1 sJw   Y  q |jj
}|jj
}	d|fd|	ffD ]\}
}t|rrtd|
 d	| q`t|t|	ksJ ||	ftt||	D ]5\}\}}| | ksJ |||f| | ksJ |||f| j| jksJ |||fqtd
d |g| D }ttjjjtjjjdd }|d usJ dt||||t|d|dddd  fddtt|tjjjd D }|_|S )Nc                   rk  rq   r  r  r  rq   rr   r     r   z&Conditional.create.<locals>.<listcomp>r  c                 S  r&  r  r'  r  rq   rq   rr   r     r   r)  rE  rF  zVOutput aliasing is currently not supported in compiled torch.cond. The outputs of the z% subgraph of torch.cond are aliased: c                 s  s"    | ]}t |ts| V  qd S r   )rx   r   r   )r   orq   rq   rr   r    s    
z%Conditional.create.<locals>.<genexpr>r  zcannot determine devicer-  )r<  r#  r>  r?  r5  r  r   Union[int, torch.SymInt]ru   Union[int, sympy.expr]c                 S  s   t | tr| S | jjS r   )rx   r   r   ru  )r   rq   rq   rr   _maybe_expr  s   
z'Conditional.create.<locals>._maybe_exprc              
     sf   g | ]/\}\}}t t| |  fd d| D  fdd| D | jdt|fgqS )c                   r   rq   rq   r   r  rJ  rq   rr   r     r   z1Conditional.create.<locals>.<listcomp>.<listcomp>c                   r   rq   rq   rK  rL  rq   rr   r     r   r  )	r  r7  r   r   r   r   r   r8  ry   )r   r   r  merged_output)rJ  conditionalrq   rr   r     s    
r
  )r   rH  ru   rI  )r}  rU   r   rF  r   r1  r  r   r2  r3  r4  r  r  r   r   r   r   r   r   r8  r  r'   r   r   rx  r|  r;  rs  rB  )r1  r<  rE  rF  r#  r5  r6  r"  true_outputsfalse_outputsr   rB  r   tofor   r  rq   )rJ  r1  rN  rr   r0    sp   

$
	zConditional.createc                 C  s*   | |  ||  | jt| di  d S r  )codegen_conditionalr  r  rB  r   r  rq   rq   rr   r    s   
zConditional.codegenr  c                 C  r  r  r  r  rq   rq   rr   r    r  z$Conditional.get_unbacked_symbol_defs)r<  rb   r#  r@  r>  r  r?  r  r5  rs  r  rA  ru   rv   )r<  ra   rE  r  rF  r  r#  r@  r  r  )rm   rn   ro   r<  rp   r#  r>  r?  rB  r^  rM  r0  r  r  rO  rq   rq   r'  rr   r;    s   
 
Vr;  r   	list[Any]-tuple[list[ShapeAsConstantBuffer], list[Any]]c                 C  s<   g }g }| D ]}t |tr||j q|| q||fS r   )rx   r   r  ru  )r   non_sym_argsrD  r  rq   rq   rr   rC    s   
rC  c                      st   e Zd ZU dZded< dZded< dZded< dZded< dZded	< d fddZ	e
dddZdddZ  ZS )	WhileLoopNr=  carried_inputsadditional_inputsr!  cond_subgraphbody_subgraphr$  rB  r@  r  r5  rs  ru   rv   c                   sZ   || _ || _|| _|| _t|| \}}t jd |||d tj	| | _
tj|  d S rB  )rX  rY  rZ  r[  rC  r&  r^  rU   r   r/  r   r0  )rM  rX  rY  rZ  r[  r5  rD  r  r'  rq   rr   r^  -  s   zWhileLoop.__init__cond_fnbody_fnc              	     s   fdd|D } fdd|D }|| }t jjjd t jjjd  }dd |D }||fD ]/}|jd u r^t jj|j||jd|_t |j |jj|  W d    n1 sYw   Y  q/|jj	}	|jj	}
t
|
rrtd|
 t|	d	ks|J |	|	d
 }t|ts| tjksJ |t| d
ksJ |t|d
ksJ d|d
  }t|t|
ksJ ||
ftt||
D ]N\}\}}ddd}|| |  || |  | | ksJ ||||f| | ksJ |||f| j| jksJ |||fqt||||t|ddfddt|
D }t||D ]\}}| t jjv rBt jj|  q,|_|S )Nc                   rk  rq   r  r  r  rq   rr   r   M  r   z$WhileLoop.create.<locals>.<listcomp>c                   rk  rq   r  r  r  rq   rr   r   N  r   r  c                 S  r&  r  r'  r  rq   rq   rr   r   R  r   r)  zOutput aliasing is currently not supported in compiled torch.while_loop. The outputs of the body_fn subgraph of torch.while_loop are aliased: r0   r   z9torch.while_loop is assumed to have at least one operand.	lhs_exprslist[Union[int, sympy.expr]]	rhs_exprsru   rv   c                 S  s(   t | |D ]\}}tjj|| qd S r   )r   rU   r   r   r  )r_  ra  lhsrhsrq   rq   rr   _guard_list_equalsy  s   z,WhileLoop.create.<locals>._guard_list_equalsr-  )rX  rY  rZ  r[  r5  c              
     sF   g | ]\}}t t| | | | | jd  t|fgqS )r  )	r  r7  r   r   r   r  r   r8  ry   r0  )
while_looprq   rr   r     s    )r_  r`  ra  r`  ru   rv   ) rU   r   rF  r   r1  r  r   r2  r3  r4  r  r  r   rx   r   r   r   rj   r   r   r   r   r  r   r8  rW  rs  r  r  r  rq  rB  )r1  r\  r]  rX  rY  
all_inputsfx_all_inputsfake_all_inputsr"  cond_outputsbody_outputsr  r   r   opbord  rB  r5  r   rq   )r1  re  rr   r0  E  sr   


 &
	zWhileLoop.createc                 C  rA  r   )codegen_while_loopr  rq   rq   rr   r    r  zWhileLoop.codegen)rX  r@  rY  r@  rZ  r  r[  r  r5  rs  ru   rv   )r\  r  r]  r  rX  r@  rY  r@  r  )rm   rn   ro   rX  rp   rY  rZ  r[  rB  r^  rM  r0  r  rO  rq   rq   r'  rr   rW  %  s   
 frW  c                      s@   e Zd Z	dddd fddZd fdd	ZdddZ  ZS )r   Nr  ru   rv   c             	     s~   t  j|||||d |d ddlm} dd |D }	||g ||	R |}
|
d us+J |
| _tjj|
d | _	| tjj|
< d S )N)r   r  r   )get_effect_keyc                 S  s    g | ]}t |tr|jn|qS rq   )rx   r  r   )r   r  rq   rq   rr   r     s    z,EffectfulKernel.__init__.<locals>.<listcomp>)
r&  r^  torch._higher_order_ops.effectsrn  effect_typerU   r   effectful_opsr|  prev_effect_buffer)rM  r5  r  r  r  r  r   r  rn  uncovered_argsrp  r'  rq   rr   r^    s$   
zEffectfulKernel.__init__r  c                   s0   t   }| jd ur|jt| j  |S r   )r&  r  rr  r  rq  r2   r  r  )rM  r  r'  rq   rr   r    s   

zEffectfulKernel.get_read_writesrj   c                 C  r  r%  rq   rQ  rq   rq   rr   r~    r   z EffectfulKernel.has_side_effectsr   r  r  r  )rm   rn   ro   r^  r  r~  rO  rq   rq   r'  rr   r     s    	 
r   c                   @  r  )r  Nr  rq   rq   rq   rr   r    s    r  c                   @  s\   e Zd ZU ddlmZ ded< ded< dd ZddddZdddZdddZ	dddZ
d	S )r  r   )FakeScriptObjectr   r   +Union[FakeScriptObject, torch.ScriptObject]r   c                 C  r]  r   r   rQ  rq   rq   rr   r    r_  zTorchBindObject.get_nameNr  r  ru   c                 C  r]  r   r   r  rq   rq   rr   r    r_  z!TorchBindObject.codegen_referencec                 C  r]  r   r  rQ  rq   rq   rr   	get_value  r_  zTorchBindObject.get_valuetorch.ScriptObjectc                 C  s   t | jtjr
| jS | jjS r   )rx   r   r   ScriptObjectreal_objrQ  rq   rq   rr   r    s   zTorchBindObject.get_real_objr   c                 C  sB   |   }t| }t|d }dd |D }tdd |dS )Nr   c                 S  s(   g | ]}t |tjr| |  qS rq   )rx   r   r  r?  numelr  rq   rq   rr   r     s    
z1TorchBindObject.get_buf_bytes.<locals>.<listcomp>c                 S  s   | | S r   rq   )r   yrq   rq   rr   r    s    z/TorchBindObject.get_buf_bytes.<locals>.<lambda>)r  r{   __obj_flatten__r  r  r  r  )rM  real_script_obj	flat_dict
flat_elems
flat_sizesrq   rq   rr   get_buf_bytes  s   zTorchBindObject.get_buf_bytesr   r  )ru   ru  )ru   rw  r  )rm   rn   ro   "torch._library.fake_class_registryrt  rp   r  r  rv  r  r  rq   rq   rq   rr   r    s   
 

r  c                   @  s2   e Zd ZU ded< ded< dd ZddddZdS )r  r   r   r  r   c                 C  r]  r   r   rQ  rq   rq   rr   r    r_  zGeneratorState.get_nameNr  r  ru   c                 C  r]  r   r   r  rq   rq   rr   r    r_  z GeneratorState.codegen_referencer   r  )rm   rn   ro   rp   r  r  rq   rq   rq   rr   r    s
   
 r  c                   @  sH   e Zd ZdddZdddZddddZedddZedddZdS )_CollectiveKernelru   rj   c                 C  r  r  rq   rQ  rq   rq   rr   r    r   z!_CollectiveKernel.should_allocatec                 C  r  r%  rq   rQ  rq   rq   rr   r~    r   z"_CollectiveKernel.has_side_effectsNr  r  rv   c                 C  sB   t | jtjju sJ d| j}|jj| _dd |jjD | _	d S )Nz,Setting cpp kernel needs a valid op_overloadc                 S  r  rq   r  r  rq   rq   rr   r   "  r  z9_CollectiveKernel.set_cpp_kernel_name.<locals>.<listcomp>)
r   r  r   r  r  r  r   r  r  r  )rM  r  r  rq   rq   rr   r    s   
z%_CollectiveKernel.set_cpp_kernel_namer*  !Union[TensorBox, list[TensorBox]]c                   s  t jj | j||g|R i |\}}}}}	W d    n1 s!w   Y  |	r1J | d|	 |D ]}
|
  q3|d   | t d||||t|}j	
 fdd|D  j
dd |D  d|v rj	tt d|d  j|d   d S d S )Nr  r   r-  c                   r<  r=  r>  r@  r   r  rq   rr   r   G  r  z4_CollectiveKernel.create_inplace.<locals>.<listcomp>c                 S  s   g | ]}|  qS rq   r  r
  rq   rq   rr   r   K  r   r   )rU   r   r  r  r  r   r  r  tree_leavesr  r  r  r  r?  r  )r1  r  r*  r   r   _example_outputr  r  r  r  
tensor_arginpsrq   r  rr   create_inplace,  s>   


z _CollectiveKernel.create_inplacec                   s   t jj  j||g|R i |\}}}}}	W d    n1 s!w   Y  |	r1J | d|	 |D ]}
|
  q3t|tr` ||} t|d|||| fddt	|D _
j
S   |||||g_
S )Nr  r-  c                   s(   g | ]\}}t  |t|fgqS rq   )r  r  ry   )r   r   r+  r1  r  rq   rr   r     s    z9_CollectiveKernel.create_out_of_place.<locals>.<listcomp>)rU   r   r  r  r  rx   ry   r  rs  r   rB  r  )r1  r  r*  r   r   r  r  r  r  r  r  r   rq   r  rr   create_out_of_placei  sD   


z%_CollectiveKernel.create_out_of_placer  r   r*  )r*  r  ru   rv   )r*  r  )	rm   rn   ro   r  r~  r  rM  r  r  rq   rq   rq   rr   r    s    

<r  c                      s4   e Zd Zdd ZedddZd fd
dZ  ZS )_WaitKernelc                 C  s`   | j d }t|tr|j d gS t|tr.|j d }t|tr,|jd \}}|j | gS g S g S r  )r*  rx   r  r  r  )rM  r5  collr   r   rq   rq   rr   get_volatile_reads  s   




z_WaitKernel.get_volatile_readsr5  ra   ru   rv   c           	      C  s   t jj | ||\}}}}}W d    n1 sw   Y  |r*J | d| | t| d||||}|jtt| d|| d S )Nr  r-  )	rU   r   r  r  r  r   r  r  r?  )	r1  r  r5  r  r  r  r  r  r  rq   rq   rr   create_wait  s(   

z_WaitKernel.create_waitr  c                   s6   t   }|  }|D ]}|jt|  q|S r   )r&  r  r  r  rq  r2   r  r  )rM  r  volatile_readsvrr'  rq   rr   r    s
   
z_WaitKernel.get_read_writes)r5  ra   ru   rv   r  )rm   rn   ro   r  rM  r  r  rO  rq   rq   r'  rr   r    s
    r  r   r   r  c                 C  sd   t | ttfrt| S t | ttfr%ttj  }| D ]}|t	|O }q|S t | t
jr/t| S t S r   )rx   r)   r   r%   rz   ry   r+   r   r   r&  r   r  )r   r3  r   rq   rq   rr   r&    s   r&  )rs   rt   ru   rv   )r   r   ru   r   )r   r   ru   r   )r   r   r   r   ru   r   r   )r   r   r   r   ru   r   )r   r   ru   r   r  )r   r   r   rj   ru   rv   )r   rb   r   rj   ru   r   )r   r   r   rj   ru   r   )r   r   ru   r   )r   r  ru   r  )r   r
  ru   rj   )r   r  r  r   ru   rj   )r   r!  r"  r!  r#  r!  ru   rj   )r+  r,  r-  r.  ru   r,  )r   rP  r   rs  ru   rT   )ry  r   r   rs  rz  rj   ru   r{  )r   rb   ru   rj   )TFNFN)r   rb   r  rj   r  rj   r  r  r  rj   r  r  ru   r  )r   rb   r  r   ru   rj   )r   r!  r#  r!  ru   rj   )r   rs  ru   r   )r  r+  ru   rj   )r   rT  ru   rU  )r   r   ru   r  (  
__future__r   r  r  r  r  loggingtextwraprD  r  collections.abcr   r   r   r   r   enumr   r	   r
   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   unittest.mockr   r   r   r   r   torch._export.serde.schema_exportserder  r  torch._library.utilsr  r  r  torch._loggingr   torch.fxtorch.utils._pytree_pytreer  torch._dynamo.utilsr   torch._export.serde.serializer   *torch._higher_order_ops.auto_functionalizer   torch._inductorr   torch._prims_commonr   r   r    r!   r"   torch._subclasses.fake_tensorr#   %torch.fx.experimental.symbolic_shapesr$   r%   r&   r'   r(   r)   torch.utils._ordered_setr+   torch.utils._sympy.functionsr,   r-   r.   torch.utils._sympy.symbolr/   rf  r1   r2   codegen.commonr3   r4   r5   r6   r7   r8   r9   r:   	loop_bodyr;   ops_handlerr<   r=   r>   r?   runtime.benchmarkingr@   runtime.hintsrA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   virtualizedrS   rT   rU   torch.fx.noderV   codegen.cuda.cuda_templaterW   r   rX   rY   r   rp   r  __version__r  r  ImportErrorrZ   r[   r\   r   r]   rT  r^   	getLoggerrm   r  ro  r  r{   r   rc   r  re   r   r   r   r   r   r  r  r   r   r   r   r  r  r  r  r  r*  r>  rb   r  r  rU  rV  ri  rx  r  r  INNER_FN_TYrS  r_  rf  ru  r  r  r   r  r6  r  r  r}   r  r  r  r9  r9  r8  r>  rS  rU  rY  r\  r_  r  rz  r7  r   r  r  r  r  r  r  r  r  r  rf  r  r   r  r)  r7  rj   ry   PrimitiveInfoTyperI  rU  rW  ro  rr  r-  r  r  r  r+  r.  r  r?  r;  rC  rl  rx  r|  r  r  r  r  r  r~   r   r  r  r   rs  r  r  ra   r  r  r  r   r;  rC  rW  r   r  r  r  r  r  r&  rq   rq   rq   rr   <module>   s8   , H
*

"	


& tD  #?     7:& _ E <
_N+: S(R L 'T{
  JE"40GA ,      5-J 0*0K/$9   &= W	\  /" 8