o
    Ih#                    @  s:  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlmZ d dlmZmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlm  mZ d dlmZ d dlm Z m!Z! d dl"m#Z# d d	l$m%Z% d d
l&m'Z' d dl(m)Z)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z: ddl;m<Z< ddl:m=Z=m>Z> ddl?m@Z@ ddlAmBZB ddlmCZCmDZDmEZEmFZFmGZGmHZHmIZI ddlJmKZK ddlLmMZMmNZNmOZOmPZPmQZQmRZRmSZS ddlTmUZU ddlVmWZWmXZXmYZY erd dlZm[Z[m\Z\ d dl]Z]ddl^m_Z_ eQ j`Zaebejcejedf Zeee:jfeRf Zgdcd"d#Zhddd&d'Zided+d,Zjdfd.d/Zkdgd2d3Zlemedenf Zoeebeenejf d4f eeogebend4f f f Zp	dhdid=d>Zqdjd?d@ZrejsG dAdB dBZtG dCdD dDZuG dEdF dFZvejsG dGdH dHevZwejsG dIdJ dJevZxejsG dKdL dLevZyG dMdN dNevZzejsG dOdP dPevZ{ejsG dQdR dRe{Z|ejsG dSdT dTe{Z}ejsG dUdV dVe{Z~G dWdX dXe{ZejsG dYdZ dZevZejsG d[d\ d\eZejsG d]d^ d^eZedZG d_d` d`eNZG dadb dbeZdS )k    )annotationsN)count)AnyCallableOptionalTYPE_CHECKINGUnion)Expr)dtype)countersdynamo_timed)DebugPrinterManager)MultiKernelState)	cache_dir)CallMethodKeyConvertIntKeyDivideByKeyresolve_unbacked_bindingsSymTypes)_get_qualified_name)
OrderedSet)SingletonInt)symbol_is_typeSymT   )async_compileconfigir)output_code_log)IRNodeReinterpretView)triton_heuristics)DeviceProperties)cache_on_selfget_benchmark_nameLineContextsympy_product	sympy_str
sympy_substriton_version_uses_attrs_dict)V   )ArgNameCodeGenDeferredLineIndentedBufferPythonPrinterWorkspaceArgWorkspaceZeroMode)cexpr)	config_ofshould_unwrap_unspec_argsignature_to_meta)IteratorSequence)GraphLoweringnode
BufferLikereturnReuseKeyc                 C  s,   t j| }|  |  tt jj|fS N)r*   graphget_allocation_storage_sizeget_device_or_error	get_dtyper'   sizevarssimplify)r:   storage_size rF   S/var/www/vscode/kcb/lib/python3.10/site-packages/torch/_inductor/codegen/wrapper.pybuffer_reuse_keyT   s
   rH   	input_buf
output_bufc                 C  s   |   |  kr
dS |  | krdS tjjtj| }tjjtj|}t|t|ksDtjj|d| rFtjj	||rFdS dS )NFgffffff?T)
rA   rB   r*   r?   rC   rD   r@   r'   statically_known_geqstatically_known_leq)rI   rJ   
input_sizeoutput_sizerF   rF   rG   can_match_buffer_size`   s"   

rO   argtorch.Argumentstrc           
      C  s   ddl m}m} t| j}|dkr&| jd ur | jjr d| dS d| dS ||v r0|| }|S | D ]2\}}t	|d |}t
|dkrf|d }||v sXJ d	| d
| || }	| d|	 d  S q4td| )Nr+   )CONTAINER_PYTHON_TO_CPPPYTHON_TO_CPPTensorzat::&z const&z\[([a-zA-Z_]+)]r   zunsupported z type in convert_arg_type: <>zunsupport python_type: )cpprS   rT   repr	real_type
alias_infois_writeitemsrefindalllenAssertionError)
rP   rS   rT   python_typecpp_typepy_containercpp_containercontainer_matchcontained_typecpp_contained_typerF   rF   rG   convert_arg_type   s(   

rj   retc                 C  sT   t | j}ddd}||d }|d usJ d| |dkr(| jd ur(|d7 }|S )Nz
at::Tensorzstd::vector<at::Tensor>)rU   zList[Tensor]zNYI return type: rU   rV   )rZ   r[   getr\   )rk   rc   python_to_cpprd   rF   rF   rG   convert_return_type   s   
rn   kerneltorch._ops.OpOverloadc                 C  s   | j j}| j j}t|}|dksJ d|dkrt|d }n|dkr3ddd |D }d| d}d	d |D }| d
d| dS )Nr   z#must have at least one return valuer+   , c                 S     g | ]}t |qS rF   )rn   ).0rrF   rF   rG   
<listcomp>       z%get_cpp_op_schema.<locals>.<listcomp>zstd::tuple<rX   c                 S  s    g | ]}t | d |j qS ) )rj   namers   rP   rF   rF   rG   ru      s     ())_schema	argumentsreturnsra   rn   join)ro   argsr~   num_returnscpp_return_valuetuple_returnscpp_arg_typerF   rF   rG   get_cpp_op_schema   s   r   .rx   configslist[triton.Config]gridslist[TritonGrid]wrapperOptional[PythonWrapperCodegen]tuple[str, str]c              	     s  t  d!dd d" fd	d
}d#d$fdd}d }|d| d r2tjjr2j nt }  | t|dkrX||d \}}	|d| d|	  n`t|dks`J t|t|ksjJ t	t
  }
tt||dd ddD ]<\}}|jrdd |j D }d|}nd}||\}}	d| d | }||
v rq{|
| ||d| d |	  q{W d    n1 sw   Y  W d    n1 sw   Y  | fS )%NitemUnion[int, sympy.Expr]r<   
sympy.Exprc                 S  s   t | tjr| S t| S r>   )
isinstancesympyr	   Integer)r   rF   rF   rG   _convert_to_sympy_expr      z@user_defined_kernel_grid_fn_code.<locals>._convert_to_sympy_exprgrid
TritonGridc                   sb   du st | r| | fS t fdd| D }|tjjr.tfdd|D fS dfS )a'  
        This function return a tuple of two values: the first one is for the real grid
        which is used in the generated code; the second one is an example grid with
        concreate values which is used in the autotune block to run the generated
        kernels at compile time.
        Nc                 3  s    | ]} |V  qd S r>   rF   rs   g)r   rF   rG   	<genexpr>       zKuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>c                 3       | ]}  |t|V  qd S r>   generate_example_arg_valuetyper   )r   rF   rG   r      s
    
)callabletuplecodegen_python_shape_tupler   tritonautotune_at_compile_time)r   
sympy_grid)r   r   rF   rG   determine_grid   s   	
z8user_defined_kernel_grid_fn_code.<locals>.determine_gridlinerR   example_gridOptional[str]c                   s@    |  rtjjr jvrj |p|  d S d S d S d S r>   )	writeliner   r   r   kernel_autotune_nameskernel_autotune_calls)r   r   )rx   outputr   rF   rG   r      s   

z3user_defined_kernel_grid_fn_code.<locals>.writelinegrid_wrapper_for_def z(meta):r+   r   zreturn c                 S     t | d jS Nr+   ra   kwargsxrF   rF   rG   <lambda>      z2user_defined_kernel_grid_fn_code.<locals>.<lambda>Tkeyreversec                 S  s    g | ]\}}d | d| qS )zmeta['z'] == rF   )rs   rx   valrF   rF   rG   ru     s    z4user_defined_kernel_grid_fn_code.<locals>.<listcomp>z and Trueif z	: return )r   r   r<   r   )r   r   r>   )r   rR   r   r   )r/   r   r   r   r   indent
contextlibnullcontextra   r   rR   sortedzipr   r^   r   addgetvalue)rx   r   r   r   r   r   fn_namekernel_autotune_calls_indentr   r   seencguards	statementrF   )r   rx   r   r   rG    user_defined_kernel_grid_fn_code   sN   

	

 r   c                   s^   t  j| jdd ddlm  ddlm t| jg fdd|  	 S )zg
    Given a triton kernel function pointer collect the transitive closure of
    its dependencies
    Tstripr   )JITFunction)	constexprc              	     s  t dd t| jD }| jjdi }| jjjD ]}|v r!q|| jjv r| jj| }t| rM	  
d j|jdd | | qt|tttfr	  t|rgd|jd}n|}|| }rt|trd	|j d
|j }nd	|}
| | d|  n

| d|  | q||v r|dkrt|dr|jdrˈ
d|j d|j d|  | qd S )Nc                 s  s     | ]}|j d kr|jV  qdS )LOAD_GLOBALN)opnameargval)rs   instrF   rF   rG   r   4  s    
z^user_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse.<locals>.<genexpr>__annotations__z@triton.jitTr   ztl.constexpr(r{   : . = tl
__module__r   zfrom z import z as )r   disBytecodefn__globals__rl   __code__co_namesr   newliner   splicesrcr   intrR   boolvaluer   r   __name__hasattr
startswith)
cur_kernelunqualified_loadsglobal_annotationssymbol_namesymbol
symbol_str
annotationannotation_coder   compile_wrapperr   symbols_includedtraverserF   rG   r   /  sT   










zKuser_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse)
r/   r   r   r   r   triton.languager   r   r   r   )ro   rF   r   rG   9user_defined_triton_kernel_transitive_closure_source_code   s   :r   c                   @  s&   e Zd ZU ded< ded< dd ZdS )SymbolicCallArgrR   innerr   
inner_exprc                 C  s
   t | jS r>   )rR   r   selfrF   rF   rG   __str__s     
zSymbolicCallArg.__str__N)r   r   __qualname__r   r   rF   rF   rF   rG   r   m  s   
 r   c                      s:   e Zd Z fddZdddZdd
dZdddZ  ZS )MemoryPlanningStatec                   s    t    tt| _d| _d S Nr   )super__init__collectionsdefaultdictlist
reuse_pooltotal_allocated_buffer_sizer   	__class__rF   rG   r  x  s   

zMemoryPlanningState.__init__r   r=   r<   r   c                 C  s   t | j|d S r>   )r   r  rl   )r   r   rF   rF   rG   __contains__     z MemoryPlanningState.__contains__FreeIfNotReusedLinec                 C  s   | j |  }|jrJ |S r>   )r  pop	is_reusedr   r   r   rF   rF   rG   r    s   
zMemoryPlanningState.popr   Nonec                 C  s   |j rJ | j| | d S r>   )r  r  appendr  rF   rF   rG   push  s   
zMemoryPlanningState.push)r   r=   r<   r   )r   r=   r<   r  )r   r=   r   r  r<   r  )r   r   r   r  r	  r  r  __classcell__rF   rF   r  rG   r   w  s
    

r   c                   @     e Zd ZdS )WrapperLineNr   r   r   rF   rF   rF   rG   r        r  c                   @  s2   e Zd ZU ded< ded< dddZdddZdS )EnterSubgraphLinePythonWrapperCodegenr   r9   r?   r<   r  c                 C  s   | j | j j d S r>   )r   push_computed_sizescomputed_sizesr   rF   rF   rG   __post_init__     zEnterSubgraphLine.__post_init__coder/   c                 C  s   | j | j |  d S r>   )r   push_codegened_graphr?   	do_indentr   r  rF   rF   rG   codegen  s   zEnterSubgraphLine.codegenNr<   r  r  r/   r<   r  r   r   r   r   r  r!  rF   rF   rF   rG   r    s
   
 
r  c                   @  s*   e Zd ZU ded< dddZdd	d
ZdS )ExitSubgraphLiner  r   r<   r  c                 C  s   | j  | j _d S r>   )r   pop_computed_sizesr  r   rF   rF   rG   r    r
  zExitSubgraphLine.__post_init__r  r/   c                 C  s   | j   |  d S r>   )r   pop_codegened_graphdo_unindentr   rF   rF   rG   r!    s   
zExitSubgraphLine.codegenNr"  r#  r$  rF   rF   rF   rG   r%    s   
 
r%  c                   @  s(   e Zd ZU ded< ded< dd	d
ZdS )EnterDeviceContextManagerLiner   
device_idxzOptional[int]last_seen_device_guard_indexr  r/   r<   r  c                 C  s   t jjrO|d t jjr,| jd u r |t jj  d d S | j| jks*J dd S | jd u rC|t jj	  d| j d d S |d| j d d S |dt jj
| j d |  |t jj| j d S )	N
z) stream_guard(stream, this->device_idx_);z4AOTInductor only supports running on one CUDA devicez device_guard(z);zdevice_guard.set_index(with :)r*   r?   cpp_wrapperr   aot_moder+  
device_opscpp_aoti_stream_guardr*  cpp_aoti_device_guarddevice_guardr  
set_devicer   rF   rF   rG   r!    s$   


z%EnterDeviceContextManagerLine.codegenNr#  )r   r   r   r   r!  rF   rF   rF   rG   r)    s   
 r)  c                   @     e Zd ZdddZdS )	ExitDeviceContextManagerLiner  r/   r<   r  c                 C  s   t jjs
|  d S d S r>   )r*   r?   r/  r(  r   rF   rF   rG   r!       z$ExitDeviceContextManagerLine.codegenNr#  r   r   r   r!  rF   rF   rF   rG   r7    s    r7  c                   @  s4   e Zd ZU ded< dddZdddZdddZdS )MemoryPlanningLiner  r   stater   r<   c                 C  s   | S )zFirst pass to find reuserF   r   r;  rF   rF   rG   plan  s   zMemoryPlanningLine.planr  r/   r  c                 C     dS )zSecond pass to output codeNrF   r   rF   rF   rG   r!    s    zMemoryPlanningLine.codegenrR   c                 C  sr   g }t | D ]#}|jdkrqt| |j}||j d|jtju r%| n|  qt| j	 dd
| dS )zF
        Emits a string representation that fits on one line.
        r   =rz   rq   r{   )dataclassesfieldsrx   getattrr  r   r   Bufferget_namer   r   )r   r   fieldr   rF   rF   rG   r     s   
"zMemoryPlanningLine.__str__Nr;  r   r<   r:  r#  r<   rR   )r   r   r   r   r=  r!  r   rF   rF   rF   rG   r:    s
   
 

r:  c                   @  s*   e Zd ZU ded< dddZdddZdS )AllocateLiner;   r:   r;  r   r<   r:  c                 C  s   | j  tjjv rt| jS t| j }tj	r+||v r+|
|}d|_t| j|j | j S | j  jdkrM| j| j }|d urM| jtttj|d7  _| S )NTcpur+   )r:   rD  r*   r?   removed_buffersNullLiner   rH   r   allow_buffer_reuser  r  	ReuseLinerA   r   static_shape_for_buffer_or_noner  r   	functoolsreduceoperatormul)r   r;  r   	free_linestatic_shaperF   rF   rG   r=    s   


zAllocateLine.planr  r/   r  c                 C  s2   | j  tjjvsJ | j| j }|| d S r>   )r:   rD  r*   r?   rJ  r   make_buffer_allocationr   r   r  r   rF   rF   rG   r!    s   zAllocateLine.codegenNrF  r#  )r   r   r   r   r=  r!  rF   rF   rF   rG   rH    s   
 
rH  c                   @  s6   e Zd ZU ded< dZded< dd
dZdddZdS )r  r;   r:   Fr   r  r;  r   r<   r:  c                 C  sl   t | j dkr| S t| jjtjr| S | jrJ | j t	j
jv r(t| jS tjr4|t| j|  | S r   )ra   r:   get_inputs_that_alias_outputr   layoutr   MultiOutputLayoutr  rD  r*   r?   rJ  rK  r   r   rL  r  rH   r<  rF   rF   rG   r=    s   

zFreeIfNotReusedLine.planr  r/   r  c                 C  s8   | j  tjjvsJ | js|| j| j  d S d S r>   )	r:   rD  r*   r?   rJ  r  r   r   make_buffer_freer   rF   rF   rG   r!    s   zFreeIfNotReusedLine.codegenNrF  r#  )r   r   r   r   r  r=  r!  rF   rF   rF   rG   r  
  s
   
 
r  c                   @  s>   e Zd ZU ded< ded< dZded< dddZdddZdS )rM  r;   r:   	reused_asTr   
delete_oldr;  r   r<   r:  c                 C  sL   | j  tjjv r| j tjjv sJ t| jS | j tjjvs$J | S r>   )r:   rD  r*   r?   rJ  r[  rK  r   r<  rF   rF   rG   r=  '  s
   
zReuseLine.planr  r/   r  c                 C  sL   | j  tjjvsJ | j tjjvsJ || j| j | j| j	 d S r>   )
r:   rD  r*   r?   rJ  r[  r   r   make_buffer_reuser\  r   rF   rF   rG   r!  .  s
   zReuseLine.codegenNrF  r#  )r   r   r   r   r\  r=  r!  rF   rF   rF   rG   rM  !  s   
 
rM  c                   @  r  )rK  Nr  rF   rF   rF   rG   rK  6  r  rK  c                   @  sH   e Zd ZU ded< ded< edddZedd
dZedddZdS )CommBufferLiner  r   	ir.Bufferr:   r<   r   c                 C  sF   ddl m} | j }| j }||rtd| j t||j S )Nr   )is_symbolicz-The size of a comm buffer can't be symbolic: )torch._inductor.utilsr`  r:   	get_numelrB   rb   r   itemsize)r   r`  numelr
   rF   rF   rG   size?  s   


zCommBufferLine.sizeir.CommBufferTypec                 C      | j  }t|tjsJ |jS r>   )r:   get_output_specr   r   CommBufferLayoutcomm_buffer_typer   rX  rF   rF   rG   rj  K     
zCommBufferLine.comm_buffer_typerR   c                 C  rg  r>   )r:   rh  r   r   ri  
group_namerk  rF   rF   rG   rm  Q  rl  zCommBufferLine.group_nameNr<   r   )r<   rf  rG  )r   r   r   r   propertyre  rj  rm  rF   rF   rF   rG   r^  :  s   
 r^  c                   @  s"   e Zd Zd
ddZedd Zd	S )CommBufferAllocateLiner  r/   r<   r  c                 C  sx   | j  tjjvsJ | j  }| j  }| j  }t| j  }t| j 	 }|
| | j| j| j||||| d S r>   )r:   rD  r*   r?   rJ  
get_devicerB   r   get_size
get_strider   make_allocation_linerj  rm  r   )r   r  rx   devicer
   shapestriderF   rF   rG   r!  Z  s$   


zCommBufferAllocateLine.codegenc                 C  s^   | t jjkr(| d|| d|| d| d|j d| dtdd dS td	|  )
Nz = empty_strided_p2p(rq   z, torch.device("cuda:z"), group_name="z", alloc_id=r   l    r{   zUnsupported comm buffer type: )r   CommBufferTypeSYMM_MEMcodegen_shape_tupleindexrandomrandintNotImplementedError)rj  rm  r   rx   ru  r
   rv  rw  rF   rF   rG   rt  n  s$   

z+CommBufferAllocateLine.make_allocation_lineNr#  )r   r   r   r!  staticmethodrt  rF   rF   rF   rG   rp  X  s    
rp  c                   @  r6  )	CommBufferFreeLiner  r/   r<   r  c                 C  s,   | j | j}|| d| jj d d S )Nz # z buffer free)r   rZ  r:   r   rj  r   rV  rF   rF   rG   r!    s   zCommBufferFreeLine.codegenNr#  r9  rF   rF   rF   rG   r    s    r  c                      s  e Zd ZdZ fddZe	dFdGddZdHddZdIddZdHddZ	dJddZ
dHddZedHdd ZedHd!d"ZdKd%d&ZedLd(d)ZdHd*d+ZdMd-d.ZdNd0d1ZdHd2d3ZdHd4d5ZdHd6d7ZdOd9d:ZdPd<d=ZdLd>d?ZdHd@dAZdHdBdCZdFdQdEdFZdGdH ZdIdJ ZdKdL ZdMdN Z dOdP Z!dRdQdRZ"dSdSdTZ#dHdUdVZ$dTdXdYZ%dUd\d]Z&dUd^d_Z'dUd`daZ(dbdc Z)ddde Z*dVdkdlZ+dWdndoZ,dpdq Z-drds Z.dtdu Z/			dXdYd|d}Z0d~d Z1dPddZ2dd Z3dd Z4dd Z5dd Z6dZddZ7dd Z8d[ddZ9dd Z:ddd\ddZ;ddd\ddZ<d]ddZ=d^ddZ>d_ddZ?d_ddZ@dRddZA	dFd`ddZBdaddZCdd ZDdd ZEdd ZFdd ZG			dbdcddĄZHddddǄZIdedd˄ZJdFdfdd΄ZKdgdd҄ZLdgddԄZMddք ZNdd؄ ZOddڄ ZPdd܄ ZQddބ ZRdd ZSdd ZTdd ZUdhddZVdd ZWdddddddiddZXdd ZYdd ZZdd Z[dFddZ\djddZ]	dFddZ^dkddZ_dlddZ`dmddZadnddZbdoddZcdpddZddqddZedd ZfdFddZgdd ZhdrddZidd Zjdsd%d&Zkd'd( Zld)d* Zmdtd-d.Zndud0d1Zod2d3 Zpd4d5 Zqd6d7 Zrd8d9 Zsd:d; Zted<d= Zued>d? Zved@dA ZwedBdC ZxedDdE Zy  ZzS (v  r  zB
    Generate outer wrapper in Python that calls the kernels.
    c                   s  t    t  _t  _t  _t  _t  _t  _	t  _
t  _t  _t  _tt   _i  _t  _g  _d _d _d _d _d _tjjrRdnd _tjjr[dnd _d  _d _i  _ tt   _!t  _"d  _# $  g  _%g  _& '   (   )  tjj*stjj+, D ]
\}} -|| qtt.   _/tt.   _0i  _1t23d  j4 _4t23d d fd
d}| _5i  _6tt   _7t8  _9tt   _:i  _;t<t=j>j?t=j>j@d _Ag  _Bd S )N #r  z
std::move(r{   Tr   rR   r<   c                   s(    j |  tjjr j|  d S d S r>   )importsr   r   r   r   r   )r   r   rF   rG   add_import_once  s   z6PythonWrapperCodegen.__init__.<locals>.add_import_once)debug_printer_leveluse_array_ref)r   rR   r<   r  )Cr   r  r   _names_iterr/   r  headerprefixsuffixkernel_declarationswrapper_callkernel_autotune_defsr   subgraph_definitionsr   rR   r   src_to_kernelkernel_numel_exprlinesdeclaredeclare_maybe_referenceendingcommentnone_strr*   r?   r/  
move_beginmove_endr+  supports_intermediate_hooksuser_defined_kernel_cacheunbacked_symbol_declsr  launcher_fn_nameset_launcher_fn_namecodegened_graph_stackcomputed_sizes_stackwrite_headerwrite_prefix!write_kernel_autotune_defs_headerr0  constant_reprsr^   write_constant
BufferName	allocatedfreedreusesrO  	lru_cachewrite_get_raw_streamr  _metas
_meta_varsr   multi_kernel_statealready_codegened_subgraphsallocated_workspacesr   r   aot_inductor debug_intermediate_value_printerallow_stack_allocationdebug_printeradditional_files)r   rx   hashedr  r  r   rG   r    sp   

zPythonWrapperCodegen.__init__Nis_subgraphr   subgraph_namer   parent_wrapperr   partition_signatures$Optional[ir.GraphPartitionSignature]c                 C  s.   | r|d usJ |d usJ t |||S t S r>   )SubgraphPythonWrapperCodegenr  )r  r  r  r  rF   rF   rG   create  s   zPythonWrapperCodegen.creater<   r  c                 C  s
   d| _ d S )Ncall)r  r   rF   rF   rG   r    r   z)PythonWrapperCodegen.set_launcher_fn_namerx   rR   r  c                 C  s   | j | d|  d S )Nz = None  # )r  r   )r   rx   r  rF   rF   rG   r    r   z#PythonWrapperCodegen.write_constantc              	   C  s   t jj }d}|d ur|jd urd|j }d}ttjjdkr#d}| j	j
d| dtj d| dd	d
 | jj
dd	d
 zddlm} | jj
dd	d
 W n ttfyY   Y nw tjre| jd d S d S )Nr  z
# AOT ID: r   zRfrom torch._inductor.codegen.debug_utils import _print_debugging_tensor_value_infoz
                aH  
                from ctypes import c_void_p, c_long, c_int
                import torch
                import math
                import random
                import os
                import tempfile
                from math import inf, nan
                from cmath import nanj
                from torch._inductor.hooks import run_intermediate_hooks
                from torch._inductor.utils import maybe_profile
                from torch._inductor.codegen.memory_planning import _align as align
                from torch import device, empty_strided
                from z import AsyncCompile
                from torch._inductor.select_algorithm import extern_kernels
                from torch._inductor.codegen.multi_kernel import MultiKernelCall
                z
            Tr   a  
                aten = torch.ops.aten
                inductor_ops = torch.ops.inductor
                _quantized = torch.ops._quantized
                assert_size_stride = torch._C._dynamo.guards.assert_size_stride
                empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
                reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
                alloc_from_pool = torch.ops.inductor._alloc_from_pool
                async_compile = AsyncCompile()
            )_SymmetricMemoryzs
                empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
                zfrom torch.cuda import nvtx)torch_guardsTracingContexttry_getaot_graph_namer   r   r  r  r  r   r   r   r  torch._C._distributed_c10dr  AttributeErrorImportErrorannotate_trainingr   )r   contextaot_config_commentaot_inductor_debug_utilsr  rF   rF   rG   r    sB   
z!PythonWrapperCodegen.write_headerr  c                 C     d S r>   rF   )r   r  rF   rF   rG   include_extra_header5     z)PythonWrapperCodegen.include_extra_headerc                 C     | j dtj d d S )Na	  
                import torch
                from torch._dynamo.testing import rand_strided
                from torch._dynamo.utils import preserve_rng_state
                from torch._inductor.select_algorithm import AlgorithmSelectorCache
                from aH   import AsyncCompile

                async_compile = AsyncCompile()
                generate_example_value = AlgorithmSelectorCache.generate_example_value
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
            )r  r   r   r   r   rF   rF   rG   r  8  s
   z6PythonWrapperCodegen.write_kernel_autotune_defs_headerc                 C  sn   dt j d}tjjr| j| | jtj	j
d tj	js5| jj|dd | jtj	j
d d S d S )NzU
            import triton
            import triton.language as tl
            from z+ import start_graph, end_graph
            get_raw_streamTr   )r!   r   r   r   r   r   r   r   r*   r?   r1  import_get_raw_stream_asr/  r  )r   
import_strrF   rF   rG   write_triton_header_onceH  s   z-PythonWrapperCodegen.write_triton_header_oncec                 C  sD   t jjr| jtjjd tjj	s | j
tjjd d S d S )Nr  )r   r   r   r   r   r*   r?   r1  r  r/  r  r   rF   rF   rG    write_get_raw_stream_header_onceZ  s   z5PythonWrapperCodegen.write_get_raw_stream_header_oncemetaTritonMetaParamsc                 C  sv   t |}|| jvr6dt| j }|| j|< | j| d|  tjjr6| j| d|  | j	
| | j| S )Nr  r   )rZ   r  ra   r  r   r   r   r   r   r  r   )r   r  varrF   rF   rG   add_meta_oncee  s   


z"PythonWrapperCodegen.add_meta_once	list[str]c                   s    fdd   D S )Nc                   s   g | ]}|  jqS rF   )codegen_referencer  rs   r   r   rF   rG   ru   r  s    z8PythonWrapperCodegen.get_output_refs.<locals>.<listcomp>)get_graph_outputsr   rF   r   rG   get_output_refsp  s   
z$PythonWrapperCodegen.get_output_refsc                 C  r  r>   rF   r   rF   rF   rG   mark_output_typev  r  z%PythonWrapperCodegen.mark_output_type>dict[str, Union[ir.TensorBox, ir.TorchBindObject, sympy.Expr]]c                 C     t jjS r>   )r*   r?   graph_inputsr   rF   rF   rG   get_graph_inputsy  s   z%PythonWrapperCodegen.get_graph_inputslist[IRNode]c                 C  r  r>   )r*   r?   graph_outputsr   rF   rF   rG   r  ~     z&PythonWrapperCodegen.get_graph_outputsc              
   C  s   |    D ]B\}}t|tjtjfrq|tjj	vs t|tj
r!qt| dkr*q| | }| | }| jd| d| d| d qd S )Nr   zassert_size_stride(rq   r{   )r  r^   r   r   r	   r   TorchBindObjectr*   r?   graph_input_namesGeneratorStater&   rr  r   rs  r  r   )r   rx   bufre  rw  rF   rF   rG   codegen_input_size_asserts  s   "z/PythonWrapperCodegen.codegen_input_size_assertsc                 C  sj   | j d |   D ]&\}}t|tjtjfrqd| d}| j | d| d}| j | qd S )Nz(# make sure graph inputs are not nan/infzassert not z.isnan().any().item()z.isinf().any().item())	r  r   r  r^   r   r   r	   r   r  )r   rx   r  r   rF   rF   rG   codegen_input_nan_asserts  s   z.PythonWrapperCodegen.codegen_input_nan_assertsc                 C     | j d d S )NzV

            async_compile.wait(globals())
            del async_compile
            )r  r   r   rF   rF   rG   write_async_compile_wait  s   z-PythonWrapperCodegen.write_async_compile_waitinput_namesc                 C  s@   d |}t|dkr|d7 }| j| d | jd d S )Nrq   r+   ,z = argszargs.clear())r   ra   r  r   )r   r  lhsrF   rF   rG   
write_args  s
   
zPythonWrapperCodegen.write_argsr   c                 C  s8   t jr| jd d}|S | jd| j d d}|S )Na  
                class Runner:
                    def __init__(self, partitions):
                        self.partitions = partitions

                    def recursively_apply_fns(self, fns):
                        new_callables = []
                        for fn, c in zip(fns, self.partitions):
                            new_callables.append(fn(c))
                        self.partitions = new_callables

                    def call(self, args):
                r   z
                def z(args):
                r+   )r   graph_partitionr  r   r  r   prefix_indentrF   rF   rG   !write_launcher_fn_call_get_indent  s   	z6PythonWrapperCodegen.write_launcher_fn_call_get_indentc                 C  r  r>   )r*   r?   r  r   rF   rF   rG   get_graph_input_names  r  z*PythonWrapperCodegen.get_graph_input_namesc                 C  s   | j d usJ |   |  }| j|< tjjr$| jt	j
j  t	j
 }tjr6| jd| d |   }rA| | |   |   W d    d S 1 sTw   Y  d S )Nz0training_annotation = nvtx._device_range_start(''))r  r  r  r  r   r   r   debug_sync_graphr   r*   r?   r1  synchronizeget_training_phaser  r  r  codegen_inputs"codegen_input_size_and_nan_asserts)r   r  phaser  rF   rF   rG   r    s    



"z!PythonWrapperCodegen.write_prefixc                 C  s$   t jr|   t jr|   d S d S r>   )r   size_assertsr  nan_assertsr  r   rF   rF   rG   r    s
   z7PythonWrapperCodegen.codegen_input_size_and_nan_assertsr*  c                 C  sX   |    d| }tjjr| j| d| d tjjr|S | | d| d |S )Nstream = get_raw_stream(r{   )	r  r   r   r   r   r   r*   r?   r/  )r   r*  r?   rx   rF   rF   rG   r    s   
z)PythonWrapperCodegen.write_get_raw_streamc                 C  s
   | j d S )N)r  r   rF   rF   rG   get_codegened_graph  r   z(PythonWrapperCodegen.get_codegened_graphc                 C     | j | d S r>   )r  r  )r   r?   rF   rF   rG   r       z)PythonWrapperCodegen.push_codegened_graphc                 C  
   | j  S r>   )r  r  r   rF   rF   rG   r'    r   z(PythonWrapperCodegen.pop_codegened_graphc                 C  s   ddl m} | j||S )Nr   )deepcopy)copyr  r  r  )r   r  r  rF   rF   rG   r     s   z(PythonWrapperCodegen.push_computed_sizesc                 C  r  r>   )r  r  r   rF   rF   rG   r&    r   z'PythonWrapperCodegen.pop_computed_sizesc                 C  s   t | j S r>   )nextr  r   rF   rF   rG   next_kernel_suffix     z'PythonWrapperCodegen.next_kernel_suffixc                 C  s   |  t|| j tjjr=|   | j dtj	j
| d | j  | j tj	j
| | j d| d| d || _d S )Nr-  r.  r  r   r{   )r   r)  r+  r   r   r   r  r   r*   r?   r1  r4  r  r5  )r   r*  rF   rF   rG   codegen_device_guard_enter  s    


z/PythonWrapperCodegen.codegen_device_guard_enterc                 C  s&   |  t  tjjr| j  d S d S r>   )r   r7  r   r   r   r   r(  r   rF   rF   rG   codegen_device_guard_exit  s   z.PythonWrapperCodegen.codegen_device_guard_exitoutput_refsc                 C  s2   |r| j dd| d  d S | j d d S )Nzreturn (rq   , )z	return ())r  r   r   )r   r  rF   rF   rG   generate_return#  s   z$PythonWrapperCodegen.generate_returnresultr/   c                 C  r  r>   rF   r   r  rF   rF   rG   generate_before_suffix)  r  z+PythonWrapperCodegen.generate_before_suffixc                 C  sB   t jrd| jt| jdkrdnd }|d| d d S d S )Nrq   r+   r  r  z-
                runner = Runner(partitions=[z{])
                call = runner.call
                recursively_apply_fns = runner.recursively_apply_fns
                )r   r  r   all_partition_namesra   r   )r   r  all_partition_name_listrF   rF   rG   generate_after_suffix,  s   
z*PythonWrapperCodegen.generate_after_suffixc                 C  r  r>   rF   r  rF   rF   rG   generate_end:  r  z!PythonWrapperCodegen.generate_endc                 C  s   |  || d S r>   )generate_extern_kernel_alloc)r   fallback_kernelr   rF   rF   rG   generate_fallback_kernel=  r  z-PythonWrapperCodegen.generate_fallback_kernelc              
   C  s   t |jtj}| }| }| }| j}tj	r"d|v r"d| }|r9| 
| j | dd| d|  d S | 
| j | d| dd| d|  | jrrtjrt|d urvtd d  d	7  < | 
d
|jd| d d S d S d S d S )Nview_as_complexz.clone()rz   rq   r{   r   inductorintermediate_hooksr+   zrun_intermediate_hooks()r   rX  r   
NoneLayoutrD  get_origin_nodeget_kernel_namer  r   memory_planningr   r  r   r  generate_intermediate_hooksr   rx   )r   extern_kernelr   	no_returnoutput_nameorigin_nodekernel_namer  rF   rF   rG   r  @  s.   
*$z1PythonWrapperCodegen.generate_extern_kernel_allocro   outout_viewr   ru  c                 C  sz   t jjj}|||d d d |d|r|n|  | | | dd| d W d    d S 1 s6w   Y  d S )Nexternzout=rz   rq   r{   )r*   r?   wrapper_coder  set_printer_argsr  r   r   )r   ro   r'  r(  r   ru  debug_printer_managerrF   rF   rG   generate_extern_kernel_out]  s   
	"z/PythonWrapperCodegen.generate_extern_kernel_outFc                   s   |j }|j}|rtdd |D }tdd |D }|j  d}d fdd|D }d fdd|D }t |j}d}| d	|j	 d
}| d| d| d| }	| d|	 d}
|
S )Nc                 s      | ]
}t jj|V  qd S r>   r*   r?   rC   atomically_apply_size_hintrs   drF   rF   rG   r   p  s    zEPythonWrapperCodegen._generate_tma_descriptor_call.<locals>.<genexpr>c                 s  r.  r>   r/  r1  rF   rF   rG   r   q  s    
z.data_ptr()rq   c                 3      | ]	}t  |V  qd S r>   r  val_to_arg_strrs   dimr   rF   rG   r   w  s    c                 3  r3  r>   r4  r6  r   rF   rG   r   x  s    
z$triton.tools.experimental_descriptorz.create_d_tma_descriptorrz   r{   )
dims
block_dimsr   tensorr  r   r  r5  element_sizerank)r   descapply_size_hintsr9  r:  ptrr<  r  r   r   r  rF   r   rG   _generate_tma_descriptor_calll  s$   z2PythonWrapperCodegen._generate_tma_descriptor_callc                 C  s.   |  |}|j d| | j }| | d S Nr   )rA  rx   r  r   )r   r>  r  r   rF   rF   rG   generate_tma_descriptor  s   
z,PythonWrapperCodegen.generate_tma_descriptorc           	      C  sf   | dd tt| }|dr|d dg| 7 }n|r(|dt| 7 }|d7 }| | d S )Nrz   r  zaten.scatter_reducerq   r  z	, reduce=r{   )r   maprR   r   rZ   r   )	r   r   inputscpp_kernel_namepython_kernel_namesrc_is_tensorrP  r   r   rF   rF   rG   generate_scatter_fallback  s   

z.PythonWrapperCodegen.generate_scatter_fallbackc                 C  s4   dd | d}||||g}| | || d S )N[rq   ])r   r   wrap_kernel_call)r   ro   r   indicesvalues
accumulateindices_strr   rF   rF   rG   generate_index_put_fallback  s   z0PythonWrapperCodegen.generate_index_put_fallbackbuf_namerG  rF  codegen_argsop_overloadOptional[torch._ops.OpOverload]c              	   C  s&   |  | d| dd| d d S )Nr   rz   rq   r{   )r   r   )r   rR  rG  rF  rS  rT  raw_argsoutputsrF   rF   rG   ,generate_fallback_kernel_with_runtime_lookup  s   &
zAPythonWrapperCodegen.generate_fallback_kernel_with_runtime_lookupc                 C  s6   t d | |W  d    S 1 sw   Y  d S )NPythonWrapperCodegen.generate)r   	_generate)r   is_inferencerF   rF   rG   generate  s   
$rY  c                 C  s   t jrdS dS )Nr   r+   )r   r  r   rF   rF   rG   get_wrapper_call_indent  s   z,PythonWrapperCodegen.get_wrapper_call_indentc                 C  s6  t jr|   t }|| j |d || j tj	j
r*tj	jr*tj	jr*t }|| j t }|| j  t jrE| | t jrL|   |rVt jrV|   n|   t jjrft jjsf|   | jD ]}t|trw| | j qi| j| qi| ! }| "  t jj#r| jtj	j$%  t jr| &  t jjrt jjs| '  t jjr| (  t j)rt js| jd | *| W d    n1 sw   Y  | +  || j, | - }|| || j W d    n1 sw   Y  | .| || j/ | 0| | 1| | 2| |3 | j43 fS )Nr  z+nvtx._device_range_end(training_annotation))5r   profile_bandwidthr  r/   r   r  r   r  r*   r?   r0  r/  is_const_graphr  r   	ExitStackenter_contextr  r   profiler_mark_wrapper_call#generate_profiler_mark_wrapper_callgenerate_start_graphr   memory_planmemory_plan_reuser   store_cubinr   !generate_reset_kernel_saved_flagsr  r   r  r!  r  r  r  r1  r  generate_end_graph generate_save_uncompiled_kernelsgenerate_and_run_autotune_blockr  r  finalize_prefixr  r]  r  r  r  r  add_benchmark_harnessgetvaluewithlinemapr  )r   r[  r  stackr   r  wrapper_call_indentrF   rF   rG   rZ    sl   






+



zPythonWrapperCodegen._generatec              
   C  s   | j d i }| j  d | j  }tjtjkrDtj	t
 ddd}||d |j}W d   n1 s9w   Y  td| zt|| W dS  ty` } ztd	| |d}~ww )
z
        Compose self.kernel_autotune_defs and self.kernel_autotune_calls into a single block of
        code and execute it to trigger Triton kernel compilation and auto-tuning
        zQ
            async_compile.wait(globals())
            del async_compile
        r,  z.pyF)dirr  deletezutf-8NzAuto-tuning code written to %sz%Failed to run autotuning code block: )r  r   r   r   r   levelloggingDEBUGtempfileNamedTemporaryFiler   writeencoderx   debugexec	ExceptionRuntimeError)r   scopetuning_codef	file_patherF   rF   rG   rk    s8   z4PythonWrapperCodegen.generate_and_run_autotune_blockc                 C  s"   ddl m} || | j| _d S )Nr+   )MemoryPlanner)r   r  r=  r  )r   r  rF   rF   rG   re  '  s   z PythonWrapperCodegen.memory_planc                 C  s  t j }| jr2t| jd tr2| jd jj|vr2| j  | jr2t| jd tr2| jd jj|vst	 g}g }t
t| jD ]/}| j| }t|trV||d | j|< q?t|trb|t	  q?t|trn||  q?||  t|dks~J tdd |D }d S )Nr  r   c                 s  s    | ]}|j V  qd S r>   )r  )rs   srF   rF   rG   r   I  s    
z9PythonWrapperCodegen.memory_plan_reuse.<locals>.<genexpr>)r*   r?   get_output_namesr  r   r:  r:   rx   r  r   rangera   r=  r  r  r%  sum)r   	out_namesplanning_statespast_planning_statesir   _total_allocated_buffer_sizerF   rF   rG   rf  ,  s8   





z&PythonWrapperCodegen.memory_plan_reuser   ir.TensorBox
bound_varsOrderedSet[sympy.Symbol]c           	   	     sp  | j  td  fdd}td  fdd}t|tjr<t|tjr)||v r+d S  | d|  || d S t|t	j
rt| D ]#\}}t|tjrk||vrk | d|| d| d || qHt| D ]#\}}t|tjr||vr | d|| d| d || qrd S t|t	jrd S t|t	jrd S tjjjrd S tdt| )	Nc                         |  d|  d |  dS )Nz_size = z.size()_sizer   rx   r  rF   rG   sizeofU     
zDPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.sizeofc                   r  )Nz
_stride = z	.stride()_strider  r  r  rF   rG   strideofZ  r  zFPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.strideofr   rJ  rK  zUnknown value type: )r  rO  r  r   r   r	   Symbolr   r   r   	TensorBox	enumeraterr  rs  r  r  r  	_inductorr   r  rb   r   )	r   rx   r   r  r  r  r7  re  rw  rF   r  rG   codegen_input_symbol_assignmentM  s:    
 

z4PythonWrapperCodegen.codegen_input_symbol_assignmentc                 C  sX   t tj  }|  }dd | D dd | D  }|D ]\}}| ||| qdS )z$Assign all symbolic shapes to localsc                 S  s$   g | ]\}}t |tjr||fqS rF   r   r   r  rs   kvrF   rF   rG   ru     s
    z7PythonWrapperCodegen.codegen_inputs.<locals>.<listcomp>c                 S  s$   g | ]\}}t |tjs||fqS rF   r  r  rF   rF   rG   ru     s   $ N)r   r   r  r  r^   r  )r   r  r  rE  rx   r   rF   rF   rG   r  w  s   z#PythonWrapperCodegen.codegen_inputssymsympy.Symbolc                 C  sd   t |tjr.t|tjr0|| jv rd S | j| tj	j
j| }| | dt|  d S d S d S rB  )r   r   r  r   r   PRECOMPUTED_SIZEr  r   r*   r?   rC   inv_precomputed_replacementsr   pexpr)r   r  exprrF   rF   rG   ensure_size_computed  s   
z)PythonWrapperCodegen.ensure_size_computedc                 C  r  r>   rF   r   rF   rF   rG   rl    r  z$PythonWrapperCodegen.finalize_prefixTrD   r   r	   rD   c                C  s   t d)Nz8codegen_cpp_sizevar is only implemented for cpp_wrapper!)r}  r   r   rD   rF   rF   rG   codegen_cpp_sizevar  r  z(PythonWrapperCodegen.codegen_cpp_sizevarc                C  s   t ||dS )Nr  )r  r  rF   rF   rG   codegen_python_sizevar  r
  z+PythonWrapperCodegen.codegen_python_sizevarc                 C  
   |  |S r>   )r  )r   r   rF   rF   rG   codegen_sizevar  r   z$PythonWrapperCodegen.codegen_sizevarbasenamer{  c                 C  s   | d| dS )NrJ  rK  rF   )r   r  rx   r{  rF   rF   rG   codegen_tuple_access  r  z)PythonWrapperCodegen.codegen_tuple_accessrv  Sequence[Expr]c                 C  sN   g t | j|}t|dkrdS t|dkrd|d  dS dd| dS )Nr   ()r+   rz   r  rq   r{   )rD  r  ra   r   )r   rv  partsrF   rF   rG   r     s   z/PythonWrapperCodegen.codegen_python_shape_tuplec                 C  r  r>   )r   )r   rv  rF   rF   rG   rz    r   z(PythonWrapperCodegen.codegen_shape_tuplec                 C  s.   d d|t|t|| || |gS )Nzalloc_from_pool({})rq   )formatr   r  rR   r   )r   rx   offsetr
   rv  rw  rF   rF   rG   codegen_alloc_from_pool  s   z,PythonWrapperCodegen.codegen_alloc_from_poolr   Callable[..., None]c                 C  s   ||j jkr+||j jkr+||j jkr+|d ur&||jkr&d|  d| dS |  S | |}| |}| |}|d urW||jkrWd|  d| d| d| d| dS d|  d| d| d| d	S )Nzaten.view.dtype(rq   r{   z#aten.view.dtype(reinterpret_tensor(z), zreinterpret_tensor()rX  re  rw  r  r
   rD  r   r  )r   datare  rw  r  r   r
   rF   rF   rG   codegen_reinterpret_view  s   




( z-PythonWrapperCodegen.codegen_reinterpret_viewnon_blockingc                 C  s    |  | d| d| d d S )Nz.copy_(rq   r{   r  )r   r   dstr  rF   rF   rG   codegen_device_copy  s    z(PythonWrapperCodegen.codegen_device_copyc                 C  s$   |  | j | d| | j  d S rB  )r   r  r  )r   rx   r   rF   rF   rG   codegen_multi_output  s   $z)PythonWrapperCodegen.codegen_multi_outputc                 C  s0  dd |j D \}t|jdkr| |j d| d not|jdkr9t|jd tr9| |j d| d nSt|jdkrt|jd tr| |j d	| d | d
|j d|jd j d|j d|jd j d	 | |j d|j d|jd j  nt	d|j | |
  d d S )Nc                 s  s    | ]}|  V  qd S r>   r  )rs   trF   rF   rG   r     r   z>PythonWrapperCodegen.codegen_dynamic_scalar.<locals>.<genexpr>r   r   .item()r+   z = 1 if z.item() else 0z_undivided = zassert z_undivided % z
 == 0, f'{z_undivided} not divisible by 'z_undivided // unrecognized keypath z = None)rE  ra   keypathr   r  r   r   r   divisorrb   rD  )r   r:   r  rF   rF   rG   codegen_dynamic_scalar  s&   
z+PythonWrapperCodegen.codegen_dynamic_scalarc              	     s:   fdd} fdd} fdd}  g d     jdd	d
 tjj D ]\}} d|  ||| |	 |j
|j q,ttjjdkrl d tjj D ]\}} d|  ||| qZtjj D ]}\}}t|tjrttjjj|d trqrt|tjrttjjdkr d  d|  |||  qrt|tjr||tjjj|dd qrt|tjr||d|j
j d qrdd | D }dd | D }	||||	| |   qrdd!tjj"  d}
 d|
   d W d    d S 1 sw   Y  d S )Nc                   s8     |  d| d| d| d| d
 d S )Nz = rand_strided(rq   
, device='	', dtype=r{   )r   r   )rx   rv  rw  ru  r
   r   r   rF   rG   add_fake_input  s   zFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_fake_inputc                   s     |  d|  d S rB  r  )rx   r   r   rF   rG   add_expr_input     zFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_expr_inputc                   s8   dd l }t|tjsJ  |  d||d d S )Nr   z = pickle.loads(r{   )pickler   r  ScriptObjectr   dumps)rx   r   r  r  rF   rG   add_torchbind_input  s    zKPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_torchbind_input)r  r  z3def benchmark_compiled_module(times=10, repeat=10):z
                from torch._dynamo.testing import rand_strided
                from torch._inductor.utils import print_performance
                Tr   zglobal r   zimport pickle*   fallbackztorch.cuda.default_generators[z].graphsafe_get_state()c                 S     g | ]}t jjj|d dqS r  r  r*   r?   rC   	size_hintr  rF   rF   rG   ru   8      zBPythonWrapperCodegen.benchmark_compiled_module.<locals>.<listcomp>c                 S  r  r  r  r  rF   rF   rG   ru   <  r  zcall([rq   z])zfn = lambda: z8return print_performance(fn, times=times, repeat=repeat))#
writelinesr   r   r*   r?   	constantsr^   r   re  rw  ru  r
   ra   torchbind_constantsr  r   r   r  rC   
var_to_valrl   r   r   r  get_real_objr	   r  r  r{  rr  rs  rq  rB   r   keys)r   r   r  r  r  rx   r   torchbind_objrv  rw  call_strrF   r  rG   benchmark_compiled_module  sn   


$z.PythonWrapperCodegen.benchmark_compiled_modulec                 C  sh   t jsdS | | |g d |  |ddt  dg W d   dS 1 s-w   Y  dS )zL
        Append a benchmark harness to generated code for debugging
        N)r  r  zif __name__ == "__main__":zBfrom torch._inductor.wrapper_benchmark import compiled_module_mainzcompiled_module_main('z', benchmark_compiled_module))r   benchmark_harnessr  r  r   r$   r   r   rF   rF   rG   rm  L  s   

"z*PythonWrapperCodegen.add_benchmark_harnessr&  kernel_bodymetadatagpucpp_definitionc                 C  sf   t jjrd| d| }| j| tjjrd S |r| dnd}d| | d| }| j| d S )Nz

r   r,  r  )	r   r   r   r  r   r*   r?   r/  r  )r   r&  r  r  r  r  bodymetadata_commentrF   rF   rG   define_kernel^  s   z"PythonWrapperCodegen.define_kernelfn_codec                 C  r  r>   )r  r   )r   r  rF   rF   rG   define_subgraph_launcher_fnq  r  z0PythonWrapperCodegen.define_subgraph_launcher_fnr   "list[list[Union[int, sympy.Expr]]]c           (   	     s  ddl m} ddlm}m}	m}
 ddlm m}m	}m
}m} ddlm}m} |  |j}g i g g }fdd	d5 fdd	}t|jD ]\}}||jv r^|| |ddd qJ|vrcqJ| }| d u rx|| |ddd qJt|tjr||||d qJt|tjr||||| | d qJt|tjr|||||j | |jjd qJt|ttjfot j!j"#|d}||||||d qJt$d dd |jD d}|t%&t j!' i t()|dt*dgd}|rt+||d< |r
t+||d< t,|dkr |	- }g t.tj/|d }n]d6fdd i fd!d|D }|r>t,|t,|ks@J g }t0t1||d"d# dd$D ]\}}|2||g t.t3|g t.t4|d% qN|
j|g t.t56 d&}g 7 }t8|j9g}t,|dkr6 D ]}t|tjtjfs|2| q|2t5| |:t5| t+|}|| j;v rg | j;| |R S | d't,| j; } t< }!t=jj>r|!?d(| d) n	|!?d(|d) | |d*< |@|A  |!B|  |!Bd+g t.||d,|d-|d. tC|}"t=jj>r$|"Dd/| d0d/|  d0}"|!B|" t j!' }#|!?d1|#jE d2 tFG|j9\}$}%tFH|j9}&d3|& d4|% }'| I| |!J |' | |f| j;|< | ||fS )7Nr   )patch_triton_dtype_reprr   )config_to_dict	FixedGridPrecomputedGridr+   )ConstexprArgKernelArgTypeSizeArg	TensorArgTMADescriptorArg)gen_common_triton_importsTritonKernelc                   s    |   |  d S r>   )r  )idxrP   )arg_indices	signaturerF   rG   add_to_signature  s   
zPPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_to_signatureFc                   s   |rt  r
| | |jv r|j |j< d S d S |jv s"J |r=t  r1|  |jd n| | d|j< d S |rRt  rK|  |jd d |j< d S | | d S )Nr  r+   )r)   rx   )r  rP   is_constexprequals_1equals_none)r  r  r  r   rF   rG   add_arg  s"   


zGPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_argr  T)r  )r  )rx   bufferr
   )rx   r  r
   r  )r  c                 S  rr   rF   )r,   r  rF   rF   rG   ru     rv   zJPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.<listcomp>)
size_dtyperM  argdefs)rM  )r  ru  r  r   restore_valuereset_to_zeror  r   r<   r   c                   sx   t | tjr0g | j}|s| S |jtd |D ]}| v rqtdt   |< qt|  S t | t	s7J t
| S )N)r   _launcher_s)r   r   r	   free_symbolssortrR   r  ra   r(   r   r   )r  symbolsr  )extra_launcher_argsrF   rG   rename_sizes_for_launcher  s   



zYPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.rename_sizes_for_launcherc                   s   g | ]	}g t  |qS rF   )rD  )rs   r   )r  rF   rG   ru   (  s    c                 S  r   r   r   r   rF   rF   rG   r   -  r   zHPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.<lambda>r   )r   pythonrY   )	grid_typeprecomputed_gridsr
  _zasync_compile.triton(z, '''r&  zG
            @triton_heuristics.user_autotune(
                configs=z ,
                inductor_meta=z,
                triton_meta=z{,
                filename=__file__,
                custom_kernel=True,
            )
            @triton.jit
            r   rz   z''', device_str='r  z# Original path: r.  )FFF)r  r   r<   r   )Ktorch.utils._tritonr  runtime.triton_heuristicsr  r  r  commonr  r  r  r  r  r   r  r  r   r  	arg_names
constexprsr   r   TMADescriptorrC  rD  rB   r    r  rX  r  r   r   r   r*   r?   rC   statically_known_equalsr6   r"   r  get_current_device_or_throwdictfromkeysr4   r   ra   setup_grid_as_argsrD  sympifyr   r   r  r  r3   rR   rN  r  idr   extendr  r/   r   unique_user_kernel_namesr   updateinductor_meta_commonr   r   replacer   inspectgetsourcelinesgetsourcefiler  r   )(r   ro   r   r   restore_value_argsreset_to_zero_argsr   r  r  r  r  r  r  r  r  r  r  original_nameequal_to_1_argsr   r  r   rP   r  triton_signaturetriton_metainductor_metaextra_launcher_call_argsr  r   cfg	cache_keyrx   r   
kernel_srccurrent_devicer  linenosrcfiler  rF   )r  r  r  r  r
  r   r  r  rG   !define_user_defined_triton_kernelt  s*  	$









z6PythonWrapperCodegen.define_user_defined_triton_kernelr  c                 C  sN   | d|j  d}|d ur|d| 7 }| | dt|j  t||jS )Nr  rd  r   )r  r   r  rd  r   )r   r&  treer  r  rF   rF   rG   generate_numel_expry  s
   z(PythonWrapperCodegen.generate_numel_exprwsr1   c              
   C  s.  |  }t| |}|jtjkr| | nP|jtjkr)| | | | | n<|jtjkr`| j	
|}|rMt|trBt|jtsDJ t|j||_n| | | | | || j	|< nt|jtjjr| jtj| ||j|jtjj|jfdd |jtjkr| jt| | d S d S d S )N)r+   )rv  rw  )rD  rH  	zero_moder2   UNINITIALIZEDr   ZERO_ON_CALLmake_zero_bufferZERO_PER_GRAPHr  rl   r   r:   r1   maximumrb   r   r   r   r   r  make_allocationru  r
   r*   r?   rC   r  r   )r   r6  rx   r   priorrF   rF   rG   generate_workspace_allocation  sF   





z2PythonWrapperCodegen.generate_workspace_allocationc                 C  s$   |j tjkr| t| | d S d S r>   )r7  r2   r;  r   r  )r   r6  rF   rF   rG   generate_workspace_deallocation  s   z4PythonWrapperCodegen.generate_workspace_deallocationc                 C  s   | d| j  S )Nz.zero_())r  )r   rx   rF   rF   rG   r:    r  z%PythonWrapperCodegen.make_zero_bufferc                 C  s   | dd | d| j S )Nrz   rq   r{   )r   r  )r   rx   	call_argsrF   rF   rG   rL    s   z%PythonWrapperCodegen.wrap_kernel_callc                 C  s8   | j d | j dtjj d || j   d S )Nz*from torch.profiler import record_functionzwith record_function('graph_z_inductor_wrapper_call'):)r  r   r*   r?   graph_idra  r   )r   ro  rF   rF   rG   rc    s
   z8PythonWrapperCodegen.generate_profiler_mark_wrapper_callc                 C  r  )Nzstart_graph())r  r   r   rF   rF   rG   rd    r  z)PythonWrapperCodegen.generate_start_graphc                 C  s   | j dtjd d S )Nz
end_graph(r{   )r  r   r   profile_bandwidth_outputr   rF   rF   rG   ri    r   z'PythonWrapperCodegen.generate_end_graphc                 C  r  )NU
            for kernel in globals().values():
                if isinstance(kernel, zU.CachingAutotuner):
                    kernel.cuda_kernel_saved = False
            r  r   r!   r   r   rF   rF   rG   rh    s
   z6PythonWrapperCodegen.generate_reset_kernel_saved_flagsc                 C  s   | j dtj d dS )a[  
        Precompile and save the CUBINs of the Triton kernels that haven't
        been precompiled and saved as a side effect of running the generated
        JIT model (Python wrapper). This can happen when the model contains
        control flow: only one pass through the control flow operators covers
        the kernels that are saved, the remaining kernels are not launched,
        hence not saved. The main purpose of this codegen is to compile and
        save the Triton kernels outside the active control flow path for
        subsequent AOTInductor code generation and compilation.
        rD  a  .CachingAutotuner):
                    if not kernel.cuda_kernel_saved:
                        if len(kernel.launchers) == 0:
                            kernel.precompile()
                        kernel.save_gpu_kernel(
                            grid=(0, 0, 0),   # use dummy grid
                            stream="stream",  # use dummy stream
                            launcher=kernel.launchers[0],
                        )
            NrE  r   rF   rF   rG   rj    s
   z5PythonWrapperCodegen.generate_save_uncompiled_kernelsc                   s   dd   fdd|D S )Nc                 S  sJ   t | trt| r| d S | S t | ttttfrt| S ttj	j
| S )Nr  )r   rR   r5   r   floatr   r   r  r*   r?   rC   rD   )rP   rF   rF   rG   wrap_arg  s
   
zAPythonWrapperCodegen.prepare_triton_kernel_call.<locals>.wrap_argc                   s   g | ]} |qS rF   rF   ry   rG  rF   rG   ru     rv   zCPythonWrapperCodegen.prepare_triton_kernel_call.<locals>.<listcomp>rF   )r   rA  rF   rH  rG   prepare_triton_kernel_call  s   	z/PythonWrapperCodegen.prepare_triton_kernel_callc                   s0  t |trt |tjr|j }tj|}n tj	|d ur(|}tj|}n|d us0J dd| }|}t
dd | D }t
dd tj|D }t
dd | D }	| }
| }tjjj| jtjd}d| d	|	 d
|
 d| d	| d	| d} j| d|  t |tjr j|dd}|} j| d|  |S t|tjst |trt |tr| jv r|S |d u rdS |}t |tr|j}|tjjj v rtjjj | }ttjjj!|tjdS t |tt"t#t$frt|S t |t%rdd	& fdd|D  dS t'dt(| )NzBV.graph.get_buffer(arg) and raw_arg can't be None at the same timetmp_arg_c                 s  $    | ]}t jjj|tjd V  qdS r  Nr*   r?   rC   r0  r   unbacked_symint_fallbackrs   r  rF   rF   rG   r         
zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>c                 s  rK  rL  rM  rO  rF   rF   rG   r     rP  c                 s  rK  rL  rM  rO  rF   rF   rG   r     rP  r  zgenerate_example_value(rq   z, 'z', r{   r   T)r>  r?  r  rJ  c                 3  r   r>   r   rs   ar   rF   rG   r   C      rK  zUnsupported type ))r   torch_dtyper   r  r;  rD  r*   r?   
get_buffertry_get_bufferr   rr  get_allocation_sizers  rq  rB   rC   r  
get_layoutr  r   rN  r   r   rA  
issubclassr   Basicr   rR   r  r   r  r0  r   rF  r   r  r   r~  r   )r   rP   arg_typeraw_argr{  rR  r  re  allocation_sizerw  ru  r
   r  r   rF   r   rG   r     st   




*


 z/PythonWrapperCodegen.generate_example_arg_valuec                   s2   t |trdd fdd|D  d S t|S )NrJ  rq   c                 3  s    | ]}  |V  qd S r>   )_grid_dim_str)rs   r   r   rF   rG   r   J  s    z5PythonWrapperCodegen._grid_dim_str.<locals>.<genexpr>rK  )r   r  r   r  )r   grid_per_dimrF   r   rG   r^  G  s   
z"PythonWrapperCodegen._grid_dim_str)ru  r   	arg_typesrV  r*  c             
   C  s  |pt j }|s|jdks| | || dS | |}d|}t	| |j
t j}	|sGd|	 d}
| | d| d| d|
 d dS |   tjjr|| jvr|durbt|t|ksfJ di }g }|du rvdgt| }nt|t|ksJ d	tt|||D ][\}\}}}d}t|trd
t|v r|d
\}}t|trtd|r|}|||< n||vr| ||||}|||< n|| }n| ||||}||du r|n| d
|  q| j| dd| d|	 d | jdddd | D  d | j| t jjrdS t jjj }|!|||d | | | d| d|	 d W d   dS 1 sCw   Y  dS )z
        Generates kernel call code.

        triton: Defines whether the backend uses Triton for codegen. Otherwise it uses the CUDA language when gpu=True,
                and C++ when gpu=False.
        rI  Nrq   z	c_void_p(r{   r   rz   z$call_args and arg_types do not matchz#call_args and raw_args do not matchr?  z^(workspace|semaphore)z.run(z	, stream=del c                 s      | ]}|V  qd S r>   rF   ry   rF   rF   rG   r         z<PythonWrapperCodegen.generate_kernel_call.<locals>.<genexpr>r,  )"r*   r?   r  r   r   rL  rI  r   r  r  r{  r  r   r   r   r   ra   r  r   r   rR   splitrT  r_   matchr   r  r   rN  r   r/  r*  r  r+  )r   r&  rA  ru  r   r`  rV  r*  call_args_strstream_name
stream_ptrtensor_argsall_argsr  rP   r[  r\  r   arg_strr,  rF   rF   rG   generate_kernel_callO  sz   







"

$z)PythonWrapperCodegen.generate_kernel_callc                 C  r  r>   )r  r  )r   r   rF   rF   rG   r     r  zPythonWrapperCodegen.writelinec                 C  s   |D ]}|  | qd S r>   r  )r   r  r   rF   rF   rG   r    r8  zPythonWrapperCodegen.writelinesc                 C  s   | j t| d S r>   )r  r  r%   )r   ctxrF   rF   rG   ra    r  z"PythonWrapperCodegen.enter_contextc                   s   ddl m}m} | rdd l}t|trt|jjS t|t	j
r$t|S t|ttfrEtjG dd d tt| fdd|D S t|tjjrPt|S t|tjtjtfr^| S | rlt||jjrl||S t|tjrv| S t|S )Nr   )dtype_to_stringhas_triton_packagec                   @  s   e Zd ZU ded< dd ZdS )z1PythonWrapperCodegen.val_to_arg_str.<locals>.Shimr   refc                 S  s   | j S r>   )rp  r   rF   rF   rG   __repr__  s   z:PythonWrapperCodegen.val_to_arg_str.<locals>.Shim.__repr__N)r   r   r   r   rq  rF   rF   rF   rG   Shim  s   
 rr  c                 3  s     | ]} t |V  qd S r>   r4  rQ  rr  r   rF   rG   r     rS  z6PythonWrapperCodegen.val_to_arg_str.<locals>.<genexpr>)r  rn  ro  r   r   r   r  r:   r  r   r	   r   r  r@  	dataclassrZ   r   r  _ops
OpOverloadr   r   rC  
MutableBoxr    r  languager
   r  )r   r  type_rn  ro  r   rF   rs  rG   r5    s,   
z#PythonWrapperCodegen.val_to_arg_strr  r;   c                 C  sP   |  }| }t| }ttj|}t| }| |	 |||||S r>   )
rq  rB   r   rr  r*   r?   rW  rs  r=  rD  )r   r  ru  r
   rv  allocation_shaperw  rF   rF   rG   rU    s   z+PythonWrapperCodegen.make_buffer_allocationc              
   C  s   |d u r|}|  |}|  |}|  |}	|jdv r-| d|j d| d|	 d| d
}
n| d| d|	 d|j d| d
}
||krN|
d	| d|	 d }
|
S )
N)rI  cudaxpuz = empty_strided_rz   rq   r{   z = empty_strided(r  r  z.as_strided()r   r   )r   rx   ru  r
   rv  rw  rz  rz  codegen_allocation_shape_tuplecodegen_stride_tupler'  rF   rF   rG   r=    s:   


	z$PythonWrapperCodegen.make_allocationr  c              	   C  s(   | j  | d| | j d| j d| 	S )Nr     rw   )r  r  r  )r   new_nameold_namer  rF   rF   rG   make_tensor_alias	     (z&PythonWrapperCodegen.make_tensor_alias%Union[BufferLike, ir.TorchBindObject]c                 C  s   d|   S )Nra  rD  )r   r  rF   rF   rG   rZ  
	  s   z%PythonWrapperCodegen.make_buffer_freenames_to_delc                 C  s   dd dd |D  S )Nra  rq   c                 s  rb  r>   rF   )rs   rx   rF   rF   rG   r   	  rc  z:PythonWrapperCodegen.make_free_by_names.<locals>.<genexpr>)r   )r   r  rF   rF   rG   make_free_by_names	  r   z'PythonWrapperCodegen.make_free_by_namesr  r  del_linec              	   C  s(   | j  | d| | | j d| j d	S )Nr   r   reuse)r  r  r  )r   r  r  r  rF   rF   rG   codegen_exact_buffer_reuse	  r  z/PythonWrapperCodegen.codegen_exact_buffer_reuseoldnewr\  c                 C  s   |  |  ks
J | }| }d}|tj vr%|r%d| | }| | kr<| | kr<| |||S | 	|| | d| j
j}| j | d| | d| j dS )N;z; r   r   r  r  )rB   rD  r*   r?   r  rZ  rr  rs  r  r  r  r   r  r  )r   r  r  r\  r  r  r  reinterpret_viewrF   rF   rG   r]  	  s    "z&PythonWrapperCodegen.make_buffer_reuseviewir.ReinterpretViewc                 C  s8   |  t|| j | d|  | j d| j d d S )Nr   r  z alias)r   r.   r  r  r  r  )r   rx   r  rF   rF   rG   codegen_deferred_allocation#	  s   &z0PythonWrapperCodegen.codegen_deferred_allocationr_  c                 C  sR  |  }|tjjv s|| jv st|tjrd S | j| t|	 tj
tjfr.| s.d S | }t|tjr:d S t|tjrBd S t|tjrt|jtjs]J dt|j d|j t|jjtjsmJ t|jjt|jjjtjs~J t|jj| |jjj | ||j d S t|tjr| t| | d S | t| | d S )Nzunexpected r   )rD  r*   r?   rJ  r  r   r   DonatedBufferr   get_defining_opExternKernelAllocMultiOutputshould_allocaterh  MutationLayoutSHOULDREMOVEr  NonOwningLayoutr  r    r   r  
StorageBoxrC  codegen_allocationr  ri  r   rp  rH  )r   r  rx   rX  rF   rF   rG   r  +	  sB   


 "z'PythonWrapperCodegen.codegen_allocationc                 C  s   |  }t|tjtjfr| | | d S t| tjr)| t	| | d S | 
|s0d S | j| | t| | d S r>   )rD  r   r   InputBufferr  r   rZ  rh  ri  r  	can_reuser  r   r  )r   r  rx   rF   rF   rG   codegen_freeS	  s   
z!PythonWrapperCodegen.codegen_freec                 C  sf   |  }|tjjv p1|tjjv ottjj| tj p1|tjj	v p1|tjj
v p1|tjjv p1|| jv  S r>   )rD  r*   r?   rJ  r  r   graph_inputs_originalr   r  r  r  never_reuse_buffersr  )r   input_bufferoutput_bufferrx   rF   rF   rG   r  g	  s   


	
zPythonWrapperCodegen.can_reusec                 C  s$   |  | jv o| j|   |  kS r>   )rD  r  )r   r  reused_bufferrF   rF   rG   	did_reusew	  s   zPythonWrapperCodegen.did_reuser  r  c                 C  s`   t ||sJ | | | j|  | j|  | | j| < | t| || d S r>   )	rO   r  r  r   rD  r  r  r   rM  )r   r  r  rF   rF   rG   codegen_inplace_reuse	  s   
z*PythonWrapperCodegen.codegen_inplace_reusec                 C  s,   t |}|| jv r|S | j| | j| S r>   )rR   r  r   r  )r   r   rx   rF   rF   rG   codegen_unbacked_symbol_decl	  s
   

z1PythonWrapperCodegen.codegen_unbacked_symbol_declr$  rW  r   unbacked_bindings,Optional[dict[sympy.Symbol, pytree.KeyPath]]c                   sp   t tjjj|}|sd S | D ]%\}d
 fdd  fdd}| | | d	|  | j  qd S )Nr  rR   r  pytree.KeyPathc                   s:  |dkr| S t |dkr3t|d tr3t|d tjr3 |  d|d j d|d j d|dd  S t|d trL |  d|d j d|dd  S t|d tjr}tjj	rk d	|d j d
|  d|dd  S  |  d|d j d|dd  S t|d t
r |  d|d j d|dd  S td| )NrF   r   r   r+   r   rz   r{   r  z	std::get<z>(rJ  rK  z.__floordiv__(r  )ra   r   r   pytreeSequenceKeyrx   r  r*   r?   r/  r   r  rb   )r  r  )gorF   rG   r  	  s*   *$&"$zIPythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs.<locals>.goc                    s   t jjrEtdkr+d }  d  t| tjr(t| jdkr(dd  S S td t	j
s5J  d j  dd  S  S )Nr+   r   )r*   r?   r/  ra   rD  r   r   r  rM  r  r  r  )r'  r  r  r$  rW  rF   rG   go_outer	  s   


 
zOPythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs.<locals>.go_outerr   )r  rR   r  r  )	r   r*   r?   rC   	shape_envr^   r   r  r  )r   r$  rW  r  r  r  rF   r  rG   (codegen_unbacked_symbol_defs_for_outputs	  s   
z=PythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputsc                   s    fdd}fdd}z? j j dj  |  tj}tj jj|d W d    n1 sAw   Y  |  W   d S   w )Nc                    sT   t jjt  ksJ tjj D ]\} }j |  d| j  qd S rB  )ra   r?   r  r   r   r  r  )inner_inputouter_input)outer_inputsr   subgraphrF   rG   _codegen_subgraph_prefix	  s   zSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_prefixc                    sR   t jjt  ksJ tjj D ]\} }| d|   j  qd S rB  )ra   r?   r  r   r   r  r  )inner_outputouter_output)outer_outputsr   r  rF   rG   _codegen_subgraph_suffix	  s   zSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_suffix subgraph: )parent_graph)	r  r?   r   r  rx   r*   set_graph_handlercodegen_subgraphr'  )r   r  r  r  r  r  r  rF   )r  r  r   r  rG   codegen_subgraph_by_inlining	  s   		z1PythonWrapperCodegen.codegen_subgraph_by_inliningc                 C  sh   t |t |jjksJ d|jj d| t|jj|D ]\}}| | j | d| | j  qd S )Nzgraph_input_names:z, outer_inputs: r   )ra   r?   r  r   r   r  r  )r   r  r  r  r  r  rF   rF   rG   codegen_subgraph_prefix
  s   "z,PythonWrapperCodegen.codegen_subgraph_prefixpartition_idir.GraphPartitionSignaturec           	   	   C  s   |j }|j}d| t|dkrdnd }dd |D }d|t|dkr*dnd }| d| d| d	 d
d | D }|rO| dd|  | d| d| d| d | d| d dS )z'Generate code to call a graph partitionrq   r+   r  r  c                 S     g | ]}|  qS rF   r  )rs   r:   rF   rF   rG   ru   %
  rv   z?PythonWrapperCodegen.codegen_partition_call.<locals>.<listcomp>	partition	_args = [rK  c                 S  s   g | ]\}}|r|qS rF   rF   )rs   rx   
deallocaterF   rF   rG   ru   +
  s
    ra  rz   z) = self.partitions[z](partition_args)zdel partition_argsN)input_deallocationoutput_nodesr   r  ra   r   r^   )	r   r  r  r  r  rE  output_namesrW  r  rF   rF   rG   codegen_partition_call
  s"   z+PythonWrapperCodegen.codegen_partition_callnum_partitionsc                 C  s   dd t |D | _d S )Nc                 S  s   g | ]}d | qS )
partition_rF   )rs   r  rF   rF   rG   ru   8
  s    z@PythonWrapperCodegen.set_all_partition_names.<locals>.<listcomp>)r  r  )r   r  rF   rF   rG   set_all_partition_names7
  r  z,PythonWrapperCodegen.set_all_partition_namesc              	   C  s   |j j}d|}t|dkr|d7 }d|t|dkrdnd }| |j j d| d |d t| D ]
}| d|  q7| d| d	|j j d|j j d
 d S )Nrq   r+   r  r  r  rK  ra  rz   z) = r  )r?   r  r   ra   r   rx   )r   r  r  r  r  inner_inputsouter_output_namesr  rF   rF   rG   codegen_subgraph_call:
  s   
z*PythonWrapperCodegen.codegen_subgraph_callc              	   C  s  t jjr| ||| d S | |j | d | | j d|j  | ||| t j}|j	|j_	|jj| j
vr{t |j% tdd |j \}}W d    n1 sYw   Y  W d    n1 shw   Y  | j
|jj | |j | ||| d S )Nr  r  r  F)r*   r?   r0  r  r  r   r  rx   r  r/  r  r  r   patchr!  r   r  r   r  )r   r  r  r  r  subgraph_coder  rF   rF   rG   r  O
  s&   

z%PythonWrapperCodegen.codegen_subgraphc                   sb   |   |   dt|j  dd |jD } fddtt|jD }| |j|| d S )N = [None] * c                 S  r  rF   r  rs   r  rF   rF   rG   ru   o
  rv   z@PythonWrapperCodegen.codegen_invoke_subgraph.<locals>.<listcomp>c                      g | ]
}  d | dqS rJ  rK  rF   rs   r  r  rF   rG   ru   p
      )rD  r   ra   rW  rE  r  r  r  )r   invoke_subgraphr  r  rF   r  rG   codegen_invoke_subgraphk
  s
   z,PythonWrapperCodegen.codegen_invoke_subgraphc                   s   |   dd |jD } fddtt|jD }|j }t|jtj	s+| d}| 
  dt|j  | 
d| d | 
t| |jj | |j|| | 
t|  | 
d | 
t| |jj | |j|| | 
t|  d S )	Nc                 S  r  rF   r  r  rF   rF   rG   ru   v
  rv   z<PythonWrapperCodegen.codegen_conditional.<locals>.<listcomp>c                   r  r  rF   r  r  rF   rG   ru   w
  r  r  r  r   r.  zelse:)rD  operandsr  ra   rW  	predicater  r   r   ShapeAsConstantBufferr   r  true_subgraphr?   r  r%  false_subgraph)r   conditionalr  r  r  rF   r  rG   codegen_conditionals
  s   


z(PythonWrapperCodegen.codegen_conditionalc           
        s:  |   dd |jD }dd |jD }|   dt|  t|D ]\}}|   d| d|  q$g  fddtt|D |}  dg}t|}|d t| }	| d	 | t| |j	j
 | |j	|| | d
|d  d | t|  | t| |jj
 | |j||	 | t|  d S )Nc                 S  r  rF   r  r  rF   rF   rG   ru   
      z;PythonWrapperCodegen.codegen_while_loop.<locals>.<listcomp>c                 S  r  rF   r  r  rF   rF   rG   ru   
  r  r  rJ  z] = c                   r  r  rF   r  r  rF   rG   ru   
  r  _cond_resultzwhile True:zif not r   z: break)rD  carried_inputsadditional_inputsr   ra   r  r  r  r  cond_subgraphr?   r  r%  body_subgraph)
r   
while_loopouter_carried_inputsouter_additional_inputsr  inpcond_outer_inputscond_outer_outputsbody_outer_inputsbody_outer_outputsrF   r  rG   codegen_while_loop
  sD   
z'PythonWrapperCodegen.codegen_while_loopc                 C  s^   z$t | dd r
W d S t| tr| W S tjj| }|d u r |W S t|W S  ty.   Y d S w )Nr  )rB  r   r   r*   r?   
_shape_env_maybe_evaluate_staticr|  )r   r   rF   rF   rG   statically_known_int_or_none
  s   

z1PythonWrapperCodegen.statically_known_int_or_nonec                 C  s4   g }| D ]}t |}|d u r d S || q|S r>   )r  r  r  )lstr  r   numrF   rF   rG   %statically_known_list_of_ints_or_none
  s   
z:PythonWrapperCodegen.statically_known_list_of_ints_or_nonec                 C     t | d uS r>   )r  r  )r  rF   rF   rG    is_statically_known_list_of_ints
  s   z5PythonWrapperCodegen.is_statically_known_list_of_intsc                 C  s   t |  S r>   )r  r  rr  r  rF   rF   rG   rN  
  s   z4PythonWrapperCodegen.static_shape_for_buffer_or_nonec                 C  r  r>   )r  rN  r  rF   rF   rG   !can_prove_buffer_has_static_shape
  s   z6PythonWrapperCodegen.can_prove_buffer_has_static_shaper>   )r  r   r  r   r  r   r  r  r"  )rx   rR   r  rR   r<   r  )r  rR   )r  r  r<   rR   r<   r  r<   r  r<   r  )r  r  rn  )r*  r   r<   rR   rG  )r*  r   r<   r  )r  r  r<   r  r  r/   r<   r  )ro   rR   r'  rR   r(  r   r   r  ru  rR   r<   r  )F)NNN)
rR  rR   rG  rR   rF  rR   rS  r  rT  rU  )rx   rR   r   r  r  r  )r  r  )r   r	   rD   r   r<   rR   )r   r	   r<   rR   )r  rR   rx   rR   r{  rR   r<   rR   )rv  r  r<   rR   )r   r  r<   rR   )r  r   )NTN)
r&  rR   r  rR   r  r   r  r   r  r   )r  rR   )r   r  )r&  rR   r  r   )r6  r1   )NN)r&  rR   )r  r;   )r  )r  r  )r  r  )r  rR   r  rR   r  rR   )r  r;   r  r;   r\  r   )rx   rR   r  r  r<   r  r  r_  )r  r_  r  r_  )r$  rR   rW  r   r  r  r<   r  )r  r   r  r  )r  r   ){r   r   r   __doc__r  r  r  r  r  r  r  r  r#   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r'  r  r&  r	  r  r  r  r  r  r  r  r  r-  rA  rC  rI  rQ  rX  r\  r]  rZ  rk  re  rf  r  r  r  rl  r  r  r  r  r   rz  r  r  r  r  r  r  rm  r  r  r3  r5  r?  r@  r:  rL  rc  rd  ri  rh  rj  rI  r   r^  rl  r   r  ra  r5  rU  r=  r  rZ  r  r  r]  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rN  r  r  rF   rF   r  rG   r    s   R=
		O#!*]  '	Sa# (	P-*	r  c                      s   e Zd ZdZ	d3d4 fd	d
Zd5ddZd5ddZdd Zdd Zdd Z	d6ddZ
d7ddZd8ddZd8d d!Zd9d#d$Zd:d&d'Zd;d)d*Zd< fd-d.Zed5d/d0Zed5d1d2Z  ZS )=r  a  
    A wrapper codegen that generates code for a subgraph. For most of the
    methods, we rely on the implementation in the PythonWrapperCodegen. But we
    override a few functions to produce cleaner code (like avoiding writing
    imports twice in the output code)
    Nr  rR   r  r  r  r  c                   s    || _ || _|| _t   d S r>   )r  r  r  r   r  )r   r  r  r  r  rF   rG   r  
  s   z%SubgraphPythonWrapperCodegen.__init__r<   r  c                 C  s   | j | _d S r>   )r  r  r   rF   rF   rG   r  
  s   z1SubgraphPythonWrapperCodegen.set_launcher_fn_namec                 C  r  r>   rF   r   rF   rF   rG   r  
  r  z)SubgraphPythonWrapperCodegen.write_headerc                 C  r  r>   rF   r  rF   rF   rG   rm  
  r  z2SubgraphPythonWrapperCodegen.add_benchmark_harnessc                 C  r  r>   rF   r  rF   rF   rG   r  
  r  z6SubgraphPythonWrapperCodegen.benchmark_compiled_modulec                 C  r  r>   rF   r   rF   rF   rG   r    r  z5SubgraphPythonWrapperCodegen.write_async_compile_waitc                 C  r  r>   )r  r	  r   rF   rF   rG   r	    s   
z/SubgraphPythonWrapperCodegen.next_kernel_suffixr  r/   c                 C  r  r>   rF   r  rF   rF   rG   r    r  z2SubgraphPythonWrapperCodegen.generate_after_suffixr   c                 C  s   | j d| j d d}|S )Nz
            def z(args):
            r+   )r  r   r  r  rF   rF   rG   r    s   z>SubgraphPythonWrapperCodegen.write_launcher_fn_call_get_indentc                 C  r>  r   rF   r   rF   rF   rG   r]    r  z4SubgraphPythonWrapperCodegen.get_wrapper_call_indentr  c                 C      | j  }r
|j}|S tjj}|S r>   )r  input_nodesr*   r?   r  )r   r  rE  rF   rF   rG   r    s
   
z-SubgraphPythonWrapperCodegen.get_graph_inputsr  c                 C  s(   | j  }rt|j }|S tjj}|S r>   )r  r  r  r  r*   r?   r  )r   r  namesrF   rF   rG   r     s
   
z2SubgraphPythonWrapperCodegen.get_graph_input_namesr  c                 C  r  r>   )r  r  r*   r?   r  )r   r  rW  rF   rF   rG   r  '  s
   
z.SubgraphPythonWrapperCodegen.get_graph_outputsr  r_  c                   s0   |  }| j }r||jv rd S t | d S r>   )rD  r  r  r   r  )r   r  rx   r  r  rF   rG   r  .  s   z/SubgraphPythonWrapperCodegen.codegen_allocationc                 C     | j   d S r>   )r  r  r   rF   rF   rG   r  8  s   z5SubgraphPythonWrapperCodegen.write_triton_header_oncec                 C  r  r>   )r  r  r   rF   rF   rG   r  A  s   z=SubgraphPythonWrapperCodegen.write_get_raw_stream_header_oncer>   )r  rR   r  r  r  r  r"  rG  r  rn  r  r  r  r  )r   r   r   r   r  r  r  rm  r  r  r	  r  r  r]  r  r  r  r  r#   r  r  r  rF   rF   r  rG   r  
  s*    





	

	

r  )r:   r;   r<   r=   )rI   r;   rJ   r;   )rP   rQ   r<   rR   )rk   rQ   r<   rR   )ro   rp   r<   rR   r>   )
rx   rR   r   r   r   r   r   r   r<   r   rG  )
__future__r   r  r   r@  r   rO  r"  rt  rQ  r|  r_   rv  	itertoolsr   typingr   r   r   r   r   r   r	   r  
torch._opstorch.utils._pytreeutils_pytreer  r
   rT  torch._dynamo.utilsr   r   #torch._inductor.codegen.debug_utilsr   $torch._inductor.codegen.multi_kernelr   %torch._inductor.runtime.runtime_utilsr   %torch.fx.experimental.symbolic_shapesr   r   r   r   r   torch.fx.noder   torch.utils._ordered_setr    torch.utils._sympy.singleton_intr   torch.utils._sympy.symbolr   r   r  r   r   r   	codecacher   r   r    runtimer!   runtime.hintsr"   r#   r$   r%   r&   r'   r(   r)   virtualizedr*   r  r,   r-   r.   r/   r0   r1   r2   	cpp_utilsr3   triton_utilsr4   r5   r6   collections.abcr7   r8   r   r?   r9   doprintr  r   ru  rR   r=   rC  r;   rH   rO   rj   rn   r   r  r   r  r   r   r   rt  r   r   r  r  r%  r)  r7  r:  rH  r  rM  rK  r^  rp  r  r  r  r  rF   rF   rF   rG   <module>   s   $	$	




*	
XM	")                a