o
    Ih                     @   s@  d Z ddlZddlZddlZddlZddlZddlmZmZ ddl	m
Z
mZmZmZ ddlZddlm  mZ ddlmZmZmZ ddlmZmZ ddlmZmZmZmZ ddlm Z m!Z!m"Z" dd	l#m$Z$m%Z% dd
l&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z:m;Z; ddl<m=Z= ddl>m?Z? erddl@mAZA e$eBdZCe$eBdZDdd ZEdd ZFdd ZGG dd dZHG dd dZIeH ZJg d ZKe=eeejLejJjMjNjOgZPeQ aRd!d" ZSG d#d$ d$ZTd%aUd%aVd%aWejXd1d&d'ZYejXd(d) ZZd2d+d,Z[d-d. Z\d/d0 Z]dS )3a  
Provides functionality for compiling PyTorch's autograd (automatic differentiation) system.

This module implements compiled autograd, which traces and optimizes backward pass
computations at runtime. The key components are:

- AutogradCompilerInstance: Traces and compiles autograd graphs using FX
- Context managers (_enable/_disable): Control when compiled autograd is active
- Utility functions: Support graph manipulation, tensor operations, and hooks

Compiled autograd can significantly improve backward pass performance by removing
Python overhead and enabling additional optimizations. It works by capturing
backward computations into an FX graph that can be compiled and optimized,
while maintaining the same semantics as eager mode autograd.
    N)Counterdefaultdict)AnyOptionalTYPE_CHECKINGUnion)call_backward	call_hookFakeCompiledAutogradEngineGetItemSourceLocalSource)countersget_chromium_event_loggerlazy_format_graph_codeset_locals_to_stealcompile_contextCompileContext	CompileId)getArtifactLoggertrace_structuredclone_preserve_strides)FakeTensorMode)GraphModule)BackwardState)	decomposedisable_autocast_cachedisable_proxy_modes_tracingfetch_object_proxyProxyTorchDispatchModePythonKeyTracertrack_tensor_tree)
DimDynamicShapeEnv)preserve_node_metaset_stack_trace)
OrderedSet)CapturedTraceback)Proxycompiled_autogradcompiled_autograd_verbosec                   C   s   t jjjdS )Nr,   )torch_logging	_internal	log_stateis_artifact_enabled r2   r2   S/var/www/vscode/kcb/lib/python3.10/site-packages/torch/_dynamo/compiled_autograd.py snapshot_verbose_logging_enabledF   s   
r4   c                   C   s   t jjjjS N)r-   	_inductorconfigtriton
cudagraphsr2   r2   r2   r3   snapshot_cudagraph_enabledL      r:   c                 C   s   | d urt | S | S r5   r   )xr2   r2   r3   maybe_cloneP   s   r=   c                   @   $   e Zd Zdd Zdd Zdd ZdS )OpNamespacec                 C   s   t  | _d S r5   )r   custom_function_name_counterselfr2   r2   r3   __init__\   r;   zOpNamespace.__init__c                    s   |rd| }| j | }| j |  d7  < | | }t| |r!J t||| |r5t| |tj  |S tjj fdd}t| || |S )NCppNode   c                     s    | i |S r5   r2   argskwargsresultr2   r3   run_non_traceable_cpp_in_eagerm      z7OpNamespace.add.<locals>.run_non_traceable_cpp_in_eager)r@   hasattrOpsetattrr-   _dynamoallow_in_graphdisable)rB   namefnis_custom_functionis_traceablecountrK   r2   rI   r3   add_   s   
	zOpNamespace.addc                 C   s
   t | |S r5   )getattr)rB   rS   r2   r2   r3   gett   s   
zOpNamespace.getN)__name__
__module____qualname__rC   rX   rZ   r2   r2   r2   r3   r?   [   s    r?   c                   @   r>   )rN   c                 C   s   || _ || _|| _d| _d S )Nz#torch._dynamo.compiled_autograd.ops)rT   rU   r[   r\   )rB   rS   rT   rU   r2   r2   r3   rC   y   s   
zOp.__init__c                 O   s   | j |i |S r5   )rT   )rB   rG   rH   r2   r2   r3   __call__      zOp.__call__c                 C   s   | j d | j S )N.)r\   r[   rA   r2   r2   r3   __repr__   r_   zOp.__repr__N)r[   r\   r]   rC   r^   ra   r2   r2   r2   r3   rN   x   s    rN   )inputssizesscalarshookspacked_datac                 C   s   t tt| d d dS )N)compiled_autograd_idframe_idframe_compile_idr   )rg   r2   r2   r3   make_compile_context   s   rj   c                   @   s   e Zd Zd`ddZdd ZedefddZd	ee	j
 d
ee deeeef  deeeeef   fddZdee fddZdd Zdede	jjjdee fddZdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Z d+d, Z!d-efd.d/Z"d0d1 Z#d2d3 Z$d4d5 Z%dee fd6d7Z&d8d9 Z'd:d; Z(d<d= Z)d>d? Z*d@dA Z+edBdC Z,edDdE Z-dFdG Z.dHdI Z/dJdK Z0dLdM Z1dNdO Z2dPdQ Z3dRdS Z4dTdU Z5	dadeeeeef   fdVdWZ6dXefdYdZZ7d[ed\ed]ee	jj8 fd^d_Z9dS )bAutogradCompilerInstancereturnNc                 C   sT   || _ t | _| jj| _t | _tdd| jd| _t	 | _
t| j
d| _d | _d S )NT)allow_fallback_kernelsallow_non_fake_inputs	shape_envsymbolic)compiler_fn
contextlib	ExitStackstackcloser%   ro   r   fake_tensor_moder"   	fx_tracerr!   
proxy_modehooks_proxy)rB   rq   r2   r2   r3   rC      s   


z!AutogradCompilerInstance.__init__c                 C   s    t |tjsJ | jj||dS )N)source)
isinstancer-   Tensorrv   from_tensor)rB   r<   rz   r2   r2   r3   	wrap_fake   s   z"AutogradCompilerInstance.wrap_fakec                 C   s   t t| |S r5   r   )rS   idxr2   r2   r3   rz      rL   zAutogradCompilerInstance.sourcerb   rc   rd   originsc                    sn  t d d  d7  < tt _t j _ j  t  _	t
 jd j	d jidd d  _i  _tj  j_tjjtd j_i  j_i  _ fdd	tD \} _ _ _ _ jt   |\}}} fd
dt!|D } "|||  fddt!|D } "| j|}	t!|D ]\}
}|	|
  j|j#< qt!|D ]:\}} $d|}t%|t&r j'(||t)j*||< qt%|t+rֈ j'j, j'j-||t)j*d||d||< qt.dt/| "| j| t!|D ]\}
} j|
  j|j#< q jt0i   j j1  j j2  jt3   j1j'd usJ  j1j'} jtjj4j56| t7t89 |||fS )Nr+   capturesrE   graph_idTlog_pt2_compile_event)
tracer_clsc                 3   s"    | ]} j d |di V  qdS )placeholderr2   N)rw   create_proxy).0rS   rA   r2   r3   	<genexpr>   s
    
z9AutogradCompilerInstance.begin_capture.<locals>.<genexpr>c              	      s$   g | ]\}}  | d |qS )rb   )r~   rz   )r   r   r<   rA   r2   r3   
<listcomp>   s    z:AutogradCompilerInstance.begin_capture.<locals>.<listcomp>c              	      s*   g | ]\}} j | d |tjqS )rc   )ro   $create_unspecified_symint_and_symbolrz   r$   DYNAMIC)r   r   valrA   r2   r3   r      s    
rd   )rz   dynamic_dim)hintrz   zUnexpected scalar type: ):r   nextCOMPILE_COUNTERidrj   r   	__enter__timetime_nsstart_time_nsr   log_event_startaot_graph_cls_nameaot_graph_infosr-   nnModulerw   rootfxGraphr"   graphtensor_attrssymnode_proxy_lookup_graph_placeholderssizes_proxyscalars_proxyry   packed_data_proxyrt   enter_contextr&   	enumeratebind_objects_to_proxiesnoderz   r{   intro   r   r$   r   floatcreate_symfloatnodecreate_unspecified_symbolAssertionErrortyper   rv   rx   r   experimentalsymbolic_shapes_suppress_guardsstrr   current_compile_id)rB   rb   rc   rd   r   
args_proxyinputs_originssizes_originsscalars_originsproxiesisymintr   r   rz   symvalenvr2   rA   r3   begin_capture   s   











z&AutogradCompilerInstance.begin_capturecompile_reasonsc                    s&    sJ t ddd  fddd d S )Nartifactc                   S   
   dddS )N!compiled_autograd_compile_reasonsjsonrS   encodingr2   r2   r2   r2   r3   <lambda>#     z>AutogradCompilerInstance.log_compile_reasons.<locals>.<lambda>c                      s    S r5   r2   r2   r   r2   r3   r   '  s    metadata_fn
payload_fn)r   )rB   r   r2   r   r3   log_compile_reasons  s   

z,AutogradCompilerInstance.log_compile_reasonsc                    s   fdd   D } j}|j|j~tjjfdd}	jjd|	||g|R i dd |d ur:j	|  fdd}
|
 }fd	d
}tj
jjj||d}tj|}|S )Nc                       g | ]}  |qS r2   to_proxyr   erA   r2   r3   r   >      zDAutogradCompilerInstance.proxy_call_aot_backward.<locals>.<listcomp>c                    s"   t jjjj| | g|R  }|S r5   )r-   
_functorch_aot_autogradruntime_wrappers_backward_prologue_functional)ctx_saved_tensorsctx_symints	flat_argsout)maybe_subclass_metadatametadatar2   r3   call_aot_bwd_prologueF  s   
zOAutogradCompilerInstance.proxy_call_aot_backward.<locals>.call_aot_bwd_prologuecall_functionkindtargetrG   rH   c                     s  dd } | j j}fddt|td u D } }t|tjks*J fdd|D }||d t|< d urD| d}i d }j jjD ]p}|j	dkrb|| j
|< |d7 }qO|j	d	kr~t|jdkspJ fd
d|jd D }qO|j	dkr|j}j|}	tjj|	tj | jd|	di }
|
|< qO|j	dkrjj|fdd}
|
|< qOtd|d usJ dd   fdd|D }|| |S )Nc                 S   s,   d}| j D ]}|jdkr|d7 }q |S |S )Nr   r   rE   )nodesop)r   num_argsr   r2   r2   r3   
num_inputsb  s   

zkAutogradCompilerInstance.proxy_call_aot_backward.<locals>.copy_paste_aot_backward_graph.<locals>.num_inputsc                       g | ]} | qS r2   r2   r   r   )pgradsr2   r3   r   o  s    zkAutogradCompilerInstance.proxy_call_aot_backward.<locals>.copy_paste_aot_backward_graph.<locals>.<listcomp>c                    r   r2   r   r   rA   r2   r3   r   u  r   r   r   rE   outputc                    s2   g | ]}t |tjjrtj|  jn|qS r2   )r{   r-   r   Noder*   rw   r   n)rB   value_remapr2   r3   r     s    get_attrr2   r   c                    s    |  S r5   r2   )r   )r   r2   r3   r     s    ziAutogradCompilerInstance.proxy_call_aot_backward.<locals>.copy_paste_aot_backward_graph.<locals>.<lambda>zshouldn't get herec                   S   s<   t   tdddddW  d    S 1 sw   Y  d S )Nr   {   r   r-   zerosr2   r2   r2   r3   dummy  s   $zfAutogradCompilerInstance.proxy_call_aot_backward.<locals>.copy_paste_aot_backward_graph.<locals>.dummyc                    s$   g | ]}t |tjjr  n|qS r2   )r{   r-   r   r*   )r   o)r   r2   r3   r     s    )
_bw_moduler   ranger   _get_compiled_autograd_symintslensymintsappendr   r   r   rG   r   rw   get_fresh_qualnamerO   r   rY   create_node	node_copyr   r   )r   r   	pall_argsr   psymintsargs_idxpoutputsr   rS   qualnamerJ   outputs)ctxpbackward_stater   rB   )r   r   r3   copy_paste_aot_backward_grapha  sX   









zWAutogradCompilerInstance.proxy_call_aot_backward.<locals>.copy_paste_aot_backward_graphc                    sX   t jj fdd}tj|}jjd|t|i d}	 }
|g|g |S )Nc                     s   j |  dS )N)
is_runtime)creation_fn)unwrapped_argsr  subclass_metar2   r3   make_subclass  rL   zkAutogradCompilerInstance.proxy_call_aot_backward.<locals>.proxy_subclass_constructor.<locals>.make_subclassr   r   )r-   rP   rQ   pytreetree_mapr   rw   r   tupleallocate_dummyr   )r
  r  r  r  punwrapped_argspoutputr   rA   r	  r3   proxy_subclass_constructor  s   zTAutogradCompilerInstance.proxy_call_aot_backward.<locals>.proxy_subclass_constructor)make_subclass_override)r   _forward_clsr   r   r-   rP   rQ   rw   r   ry   r   r   r   _backward_epilogue_functionalr  r  r   )rB   pinputspsaved_tensorssaved_tensorspctxr  maybe_backward_state_idxr   CompiledFunctionr   r  r  r  resultspresultsr2   )r  r   r   r  r   rB   r3   proxy_call_aot_backward*  s>   

H
z0AutogradCompilerInstance.proxy_call_aot_backwardbackward_idxr  r  c              
   C   s  | j d usJ | j | }| |}| |}	t|jdr'| ||	||||}
n| jjdt||	g|R i d}
|
d us<J t > g }t	|D ]&\}}|d u sT|
| d u rZ|
d  qF|\}}}}|
tj||||d qF| ||
 W d    t|S 1 sw   Y  t|S )N_aot_idr   r   )sizedtypelayoutdevice)ry   r   rM   r  r  rw   r   r   r   r   r   r-   emptyr   r  )rB   rb   output_metadatasr  r  r  r  r  r  r  r   grad_insr   output_metadatar#  r$  r"  r!  r2   r2   r3   proxy_call_backward  sN   	


	


z,AutogradCompilerInstance.proxy_call_backwardc                 C   s>   ||  | | |  | | f}| t|d gd S )N   )rc   stridesstorage_offset
proxy_callcopy_slices_prologue)rB   rb   baseviewrG   r2   r2   r3   call_copy_slices_prologue  s   	z2AutogradCompilerInstance.call_copy_slices_prologuec                 C   s    |  t||||fd gt| S r5   )r-  copy_slices_epiloguer   )rB   needs_input_gradrJ   res
grad_slicer2   r2   r3   call_copy_slices_epilogue  s
   
z2AutogradCompilerInstance.call_copy_slices_epiloguec                 C   s8   t   tddgW  d    S 1 sw   Y  d S )Nr   i[r   rA   r2   r2   r3   r    s   $z'AutogradCompilerInstance.allocate_dummyc                 C   s   t ||||S )zBinds ops.fn_name = fn)opsrX   )rB   fn_namerT   rU   rV   r2   r2   r3   bind_function  s   z&AutogradCompilerInstance.bind_functionc                 C   s    t |}| ||g|R |S )z:Proxies a call to ops.fn_name(grads, *args) into the graph)r7  rZ   r-  )rB   r8  gradsrG   r(  r   r2   r2   r3   apply_functional  s   
z)AutogradCompilerInstance.apply_functionalc                    sn   t |\}}t fdd|}jjd||i d fdd|D }| fddtt|D  |S )z*Proxies a call to fn(*args) into the graphc                    s
     | S r5   r   )r   rA   r2   r3   r     s   
 z5AutogradCompilerInstance.proxy_call.<locals>.<lambda>r   rF   c                    s   g | ]}   qS r2   )r  )r   _rA   r2   r3   r   !      z7AutogradCompilerInstance.proxy_call.<locals>.<listcomp>c                    r   r2   r2   r   )	proxy_outr2   r3   r   "  r=  )r  tree_flattenr  rw   r   r   r   r   )rB   rT   rG   r(  r   r<  
proxy_argsrJ   r2   )r>  rB   r3   r-    s   "z#AutogradCompilerInstance.proxy_callc                 C   sX   t d}t| j|g|R }| jjd||i d}t|t|ks$J | || |S )zEProxies a call to ops.validate_outputs(outputs, *args) into the graphvalidate_outputsr   rF   )	r7  rZ   r  r  r   rw   r   r   r   )rB   r<  r  rG   r(  r   r@  new_proxy_outputsr2   r2   r3   rA  %  s   
z)AutogradCompilerInstance.validate_outputsc                 C   sJ   |  |}|  |}| jjdtj||fi d}|  }| |g|g |S )Nr   rF   )r   rw   r   r-   rX   r  r   )rB   old_varnew_varold_var_proxynew_var_proxyr>  rJ   r2   r2   r3   
accumulate0  s   

z#AutogradCompilerInstance.accumulatec                    s(    j dt|g fdd|D R |S )Nr   c                    r   r2   r   r   r<   rA   r2   r3   r   @  r   z<AutogradCompilerInstance.proxy_call_hook.<locals>.<listcomp>)rw   r   r	   )rB   hookrG   rH   r2   rA   r3   proxy_call_hook:  s   z(AutogradCompilerInstance.proxy_call_hookc                 C   sN   | j d usJ | j | }| j| }| j||dd}|  }| |g|g |S )Nunpack_hook	hook_type)ry   r   rJ  r  r   )rB   hook_iddata_idrI  dataproxyr   r2   r2   r3   rK  E  s   

z$AutogradCompilerInstance.unpack_hookr   c                 C   s|   | j d usJ | j | }| j||| dd}t  t|| ||< | || g|g W d    |S 1 s7w   Y  |S )Ntensor_pre_hookrL  )ry   rJ  r   r=   r   )rB   rb   rN  r   rI  rQ  r2   r2   r3   rR  R  s   

z(AutogradCompilerInstance.tensor_pre_hookc                 C   sn   | j d usJ | j | }| j||dd}t  dd |D }| || W d    |S 1 s0w   Y  |S )Npre_hookrL  c                 S      g | ]}t |qS r2   r=   rH  r2   r2   r3   r   h  r=  z5AutogradCompilerInstance.pre_hook.<locals>.<listcomp>ry   rJ  r   r   )rB   rb   rN  rI  r   r2   r2   r3   rS  _  s   

z!AutogradCompilerInstance.pre_hookc                 C   sp   | j d usJ | j | }| j|||dd}t  dd |D }| || W d    |S 1 s1w   Y  |S )N	post_hookrL  c                 S   rT  r2   rU  rH  r2   r2   r3   r   v  r=  z6AutogradCompilerInstance.post_hook.<locals>.<listcomp>rV  )rB   r  rb   rN  rI  r   r2   r2   r3   rW  l  s   

z"AutogradCompilerInstance.post_hookc                 C   s|   t |tjsJ | jd usJ | j| }| j||dd}t  t|g}| ||g W d    |S 1 s7w   Y  |S )Npost_acc_grad_hookrL  )r{   r-   r|   ry   rJ  r   r=   r   )rB   inputrN  rI  rQ  r2   r2   r3   rX  z  s   


z+AutogradCompilerInstance.post_acc_grad_hookc                 C   sB  i }d}t |j}|d jdksJ |d }t |j }tt}|| |d ks+J |t| d }|| |d ks=J t|D ]>\}	}
|sS|
jd j	j
dkrSd}qA|
jd j	j
d	k}t|
jd  dk}|r|rt |
j }td
d |D r|
||	< qA|r| D ]}
td|
 |
jd  |
jd< qt | S g S )NFr   rb   rE   r   cudaTcpuc                 s   sB    | ]}t |jtjjo|jjd v pt |jto|jj V  qdS ))primsatenN)r{   r   r-   _ops
OpOverload	namespacerN   rU   r   userr2   r2   r3   r     s    	

zDAutogradCompilerInstance.move_graph_nodes_to_cuda.<locals>.<genexpr>zMoving node %s from cpu to cuda)listr   r   userskeysr   r   r   metar$  r   r!  allvaluesverbose_logdebugr[  )rB   r   to_movehas_cuda_inputsr   rb   inputs_usersfirst_getitem_idxlast_getitem_idxr   r   is_cpu	is_scalar
node_usersr2   r2   r3   move_graph_nodes_to_cuda  s:   
	z1AutogradCompilerInstance.move_graph_nodes_to_cudac                 C   s6   t |tjjo|jdko|jtjjjj	tjjj
jfv S )Nr   )r{   r-   r   r   r   r   r7  r^  sym_sizer   	sym_numeldefault)rB   r   r2   r2   r3   is_sym_node  s   z$AutogradCompilerInstance.is_sym_nodec                    s   t   t| jjjddD ]\}} |j  q|tt	d ks$J  fdd}t| jjj
}| jj| t| jjj
}td||  d S )Nr   )r   rE   c                    s0   |  v p| j dkp| j dkp| j dko| jtv S )Nr   r   r   )r   r   _impure_targetsr   unpack_nodesr2   r3   	is_impure  s   z/AutogradCompilerInstance.dce.<locals>.is_impurezDCE removed %d nodes)r(   r   rw   r   
find_nodesupdatere  rf  r   r   r   eliminate_dead_coderj  rk  )rB   r   r   r}  beforeafterr2   r{  r3   dce  s   zAutogradCompilerInstance.dcec                 C   s   t | jj| jj|S r5   )r   rw   r   r   )rB   r   r2   r2   r3   create_graph_module  s   z,AutogradCompilerInstance.create_graph_modulec              	      s  j dtjdi  j  j ddj |fi  g t	 r,
j jj jjD ]}dD ]}||jv r@|j|= q5q1tddd fddd	                   d
j  t dg td dddd}td| td| td fddd fdd}t  j!dt"# djij$dd j%&d d d  |' fS )Nr   r2   r   )tensor_metaexample_valuer   r   c                   S   r   )N&compiled_autograd_graph_pre_reorderingstringr   r2   r2   r2   r2   r3   r     r   z6AutogradCompilerInstance.end_capture.<locals>.<lambda>c                      s&   t  jj jjd j djddS )NCompiledAutogradPreReorderingFprint_output)r   rw   r   r   r   print_readabler2   rA   r2   r3   r      s    r   r  rb   zCompiled autograd graphT)include_deviceinclude_stridecoloredz%scompiled_autograd_graphc                      s    j ddS )NFr  )r  r2   )r   r2   r3   r   '  s    )r   c              	      s   zWda  D ]}||  jdd||< qt / tj | |||||W  d    W  d    W da S 1 s:w   Y  W d    n1 sIw   Y  W da d S W da d S da w )NT)non_blockingF)in_compiled_autograd_region
pin_memoryr[  _disablerj   r   )compiled_fnrb   rc   rd   re   packed_inputsr   )runtime_inputs_to_moverB   r2   r3   runtime_wrapper*  s   0z=AutogradCompilerInstance.end_capture.<locals>.runtime_wrapperr+   r   r   )(rw   r   r
   _exec_final_callbacks_stubrt   ru   r   
create_argr   r:   rt  r   r   rg  r   rename_aot_dispatcher_nodesdelay_unpack_hook_nodesreorder_tensor_pre_hook_nodes'reorder_pre_hook_nodes_to_schedule_asapreorder_accumulate_grad_nodes%reorder_pre_hook_nodes_to_mimic_eager reorder_post_acc_grad_hook_nodesreorder_post_hook_nodesr  r  r   r   r   compiled_autograd_loginforj  rk  r   log_event_endr   r   r   r   __exit__rq   )rB   r  r   fieldlazy_graph_coder  r2   )r   r  rB   r3   end_capture  sz   



z$AutogradCompilerInstance.end_capturec                 C   s
  | j du rdS dtjjjdtjjjfdd}tt}| j D ]\}}|d }|d }d}|d	 j	}|| r=d
||  }||  d7  < t
|j}	t|	}
|
dusTJ z|
jdkrct|	}
|
jdksZW n	 tym   Y qw zt
| jj	j}t|D ]}t| qzt|}|jdkr|||
st|}|jdkr|||
r|
jdkr|jdkr|jst|}q|||
std| | d
|
j |_t|
jD ]\}}d| | d
|j |j| _qt|	}
t|}|
jdkr|jdksW q ty   td| j ||| Y qw dS )z
        Renames nodes as they appear in the AOTDispatcher backward graphs, prefixed by AOT id
        e.g. AOTDispatcher backward graph X's `sin_Y` -> `aotX_sin_Y`
        Ncaaotc                 S   s   | j |j k}|st| j dot|j do| j j|j jk}|sFt| j drFt|j drF|j  dkrFt|jddrF| j  |jd  k}|o]| j|jko]| j|jko]t| j	t|j	kS )Nr[   rS   zaten::reshapeoriginal_aten)
r   rM   r[   rS   rg  rZ   r   r   r   all_input_nodes)r  r  target_matchr2   r2   r3   
is_similarH  s0   




zHAutogradCompilerInstance.rename_aot_dispatcher_nodes.<locals>.is_similarca_node_start_idxaot_id aot_gmr<  rE   r   r   zIFailed to match %s%s (NodeCall %s) nodes with AOT backward graph %s nodes)r   r-   r   r   r   r   r   r   itemsr   iterr   r   r   StopIterationrw   r   re  rS   r   r  rj  rk  )rB   r  aot_id_counternodecall_indexr  r  r  aot_id_postfix	aot_graphaot_itaot_nodeca_itr<  ca_noder   inpr2   r2   r3   r  @  st   






z4AutogradCompilerInstance.rename_aot_dispatcher_nodesc                 C   s   dd | D }|S )Nc                 S   s    g | ]}t |tjju r|qS r2   )r   r-   r   r   r   r2   r2   r3   r     s     z:AutogradCompilerInstance.get_all_nodes.<locals>.<listcomp>r2   )rG   r   r2   r2   r3   get_all_nodes  s   z&AutogradCompilerInstance.get_all_nodesc                 C   s8   | j dks| j dkr| jtjkr| jd j dkrdS dS )Nr   r   r   TF)r   r   operatorgetitemrG   rz  r2   r2   r3   is_placeholder  s   

z'AutogradCompilerInstance.is_placeholderc                 C   s   | j jjdtjjjjdD ]:}|jd |jd }}d}|j	t
jkr)|}|jd }t||g}||jurG| |sG|| |durG|| qdS )a  
        Usage of AOTAutograd causes all the accumulate_grad_ nodes to get pushed to the end of
        the graph.  This differs from eager mode, which schedules them as soon as possible. This
        pass attempts to reorder the graph to mimic eager behavior.
        r   r   r   r   rE   N)rw   r   r~  r-   r7  inductoraccumulate_grad_rw  rG   r   r  r  maxprevr  r   )rB   r   
param_node	grad_nodegetitem_nodeargr2   r2   r3   r    s   



z6AutogradCompilerInstance.reorder_accumulate_grad_nodesc                 C   sD   | j jjdtdD ]}|jdddkrq	t|j}|| q	dS )zp
        We can delay unpack hooks until they are needed, even later than in the eager autograd engine.
        r   r  rM  NrK  )	rw   r   r~  r	   rH   rZ   minre  prepend)rB   r   
first_userr2   r2   r3   r    s   

z0AutogradCompilerInstance.delay_unpack_hook_nodesc                 C   sl   | j jjdtdD ]*}|jdddkrq	|jd }|jd }||jur3| |s3|	| |	| q	dS )a  
        Usage of AOTAutograd causes all the tensor_pre_hook nodes to get pushed
        to the end of the graph. This differs from eager mode, which schedules
        them as soon as possible. This pass attempts to reorder the graph to
        mimic eager behavior.
        r   r  rM  NrR  r   rE   )
rw   r   r~  r	   rH   rZ   rG   r  r  r   )rB   r   r  
input_noder2   r2   r3   r    s   




z6AutogradCompilerInstance.reorder_tensor_pre_hook_nodesc                 C   s   | j jjdtdD ]s}|jdddkrq	|jd }| |jd }g }g }|g}|D ]}|jdkrJ|j	t
jkrJ||jd  || || q+t||D ]\}}	|| ||	 qPt|}
|
|jur|| |
s||
| |D ]}|| qtq	dS )a  
        In this function, we schedule the pre hooks as soon as possible. This
        does not match eager behavior (schedule pre hook right before its
        registered node), but it can make acc grad be scheduled properly when
        the pre hooks are registered to them. After reordering acc grad node, we
        will reorder the pre hooks again to mimic eager behavior.
        r   r  rM  NrS  r   rE   )rw   r   r~  r	   rH   rZ   rG   r  r   r   r  r  r   zipremover  r  r  )rB   r   r  input_nodes	to_remove	to_append
hook_blockr   abr  r2   r2   r3   r    s4   





z@AutogradCompilerInstance.reorder_pre_hook_nodes_to_schedule_asapc                 C   s   g }| j jjdtdD ]}|jdddkrq|| qt|D ]D}|jd }t	|j
 }t|dkr6q!tdd |D sAJ tt|d j
 }||jure|| || |D ]}|| q]q!dS )	a%  
        Usage of AOTAutograd causes all the pre_hook nodes to get pushed to the
        end of the graph. This differs from eager mode, which schedules them
        right before their registered node execution. This pass attempts to
        reorder the graph to mimic eager behavior.
        r   r  rM  NrS  r   c                 s   s&    | ]}|j d ko|jtjkV  qdS )r   N)r   r   r  r  rb  r2   r2   r3   r   #  s
    
zQAutogradCompilerInstance.reorder_pre_hook_nodes_to_mimic_eager.<locals>.<genexpr>)rw   r   r~  r	   rH   rZ   r   reversedrG   rd  re  rf  r   rh  r   r  r  )rB   	pre_hooksr   hook_getitem_nodere  registered_noder  r2   r2   r3   r    s.   




z>AutogradCompilerInstance.reorder_pre_hook_nodes_to_mimic_eagerc                 C   s   g }| j jjdtdD ]}|jdddkrq|| qt|D ]<}|jd }|jd }d}t	|j
 D ]}|jdkrJ|jtjjjjkrJ|} nq6|dusSJ d|| || q!dS )	a  
        Usage of AOTAutograd causes all the post_acc_grad_hook nodes to get
        pushed to the end of the graph. This differs from eager mode, which
        schedules them as soon as possible. This pass attempts to reorder the
        graph to mimic eager behavior.
        r   r  rM  NrX  r   rE   z8post_acc_grad_hook must have corresponding acc grad node)rw   r   r~  r	   rH   rZ   r   r  rG   rd  re  rf  r   r   r-   r7  r  r  rw  )rB   post_acc_grad_hooksr   r  r  acc_grad_noder   r2   r2   r3   r  /  s.   





z9AutogradCompilerInstance.reorder_post_acc_grad_hook_nodesc                    sl  g }| j jjdtdD ]  jdddkrq|  qt|D ]  jd } jd } jd }t	|dkr9q!g }|
t| |D ]}|
 fd	d
t|j D  qDt|}|jdkr|jtjjjjkr|jd }d}	t|j D ]}
|
jdkr|
jtkr|
jdddkr|
}	qx|	dur|	| |  q!| jur| |s|| |  q!dS )a  
        Usage of AOTAutograd causes all the post_hook nodes to get pushed to the
        end of the graph. This differs from eager mode, which schedules them as
        soon as possible. This pass attempts to reorder the graph to mimic eager
        behavior.
        r   r  rM  NrW  r   rE      c                 3   s:    | ]}|j d kr|jtkr jdddks|V  qdS )r   rM  NrW  )r   r   r	   rH   rZ   rb  rz  r2   r3   r   p  s    

zCAutogradCompilerInstance.reorder_post_hook_nodes.<locals>.<genexpr>rX  )rw   r   r~  r	   rH   rZ   r   r  rG   r   extendrd  re  rf  r  r   r   r-   r7  r  r  rw  r  r  )rB   
post_hooksr  output_nodesr  input_nodes_and_usersr  r  r  post_acc_grad_hook_noder   r2   rz  r3   r  V  sN   













z0AutogradCompilerInstance.reorder_post_hook_nodesc                    s   |d u rd S t |tr fdd|D S t |tr$t fdd|D S t |tjtjfr3 j|j S t |tjs;|S t	 j
|}t |tjjjjsLJ |jS )Nc                    r   r2   r   rH  rA   r2   r3   r     r   z5AutogradCompilerInstance.to_proxy.<locals>.<listcomp>c                 3   s    | ]}  |V  qd S r5   r   rH  rA   r2   r3   r     s    z4AutogradCompilerInstance.to_proxy.<locals>.<genexpr>)r{   rd  r  r-   SymIntSymFloatr   r   r|   r    rw   r   r   proxy_tensor_ProxyTensorrQ  )rB   tr  r2   rA   r3   r     s   

z!AutogradCompilerInstance.to_proxyc                    s   t  tjjrB|r5t|t|ksJ g }tt|D ]}|| \}}| ||d  | |  q| n fddtt|D  t|t ksLJ t| d | j	d  S )Nc                    r   r2   r2   r   r   r2   r3   r     r=  zDAutogradCompilerInstance.bind_objects_to_proxies.<locals>.<listcomp>constanttracer)
r{   r-   r   r*   r   r   set_node_originr   r#   rw   )rB   objectsr   r   bound_proxiesr   r  	node_namer2   r  r3   r     s   z0AutogradCompilerInstance.bind_objects_to_proxiesindexc                 C   s4   | j d usJ | j | }t }t||d | jd |S )Nr  )ry   r   r#   rw   )rB   r  rQ  bw_stater2   r2   r3   bind_backward_state  s
   
z,AutogradCompilerInstance.bind_backward_stater  r  pyobjc           	      C   s   d}|d ur-|j }t|dr-|jd u rtd|| _|j}t| jjj	||jj
d| j|< | | d| d}t  d }|d|}t| d S )	Nr  r   zThis compiled backward function was saved by AOTAutogradCache, which does not support
                    compiled autograd. Please turn off AOTAutogradCache using `TORCHINDUCTOR_AUTOGRAD_CACHE=0`.)r  r  r  z (NodeCall )rZ  z:raw_stack_trace = CapturedTraceback.extract().format()[-1])r  rM   _lazy_backward_infoRuntimeErrorr   r   r   rw   r   r   	bw_moduler   r)   extractformatreplacer'   )	rB   r  r  r  maybe_aot_idforward_clsnew_coderaw_stack_tracenew_stack_tracer2   r2   r3   r    s(   

z(AutogradCompilerInstance.set_node_originrl   Nr5   ):r[   r\   r]   rC   r~   staticmethodr   rz   rd  r-   r|   r   r   r   r  r   r   r   r  autogradfunctionBackwardCFunctionr   r)  r1  r6  r  r9  r;  r-  rA  rG  rJ  rK  rR  rS  rW  rX  rt  rx  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r  Functionr  r2   r2   r2   r3   rk      s    

c
 
3
1`b

	$#'<

rk   Fc              
   c   s    |rt |tu sJ ddlm} |jjdkr%dazd V  W dad S daw dd l}|jj	j
tt| |\}}t rC|jj	j
t daz)|jd d V  W d    n1 sZw   Y  W |sdda|jj	j
|| d S |ssda|jj	j
|| w )Nr   )
eval_frameforce_eagerTF)r   booltorch._dynamor	  _stancestance%compiled_autograd_enabled_force_eagertorch._inductor.cudagraph_trees_CrP   r+   set_autograd_compiler	functoolspartialrk   r4   set_verbose_loggerrj  compiled_autograd_enabledr  set_multithreading_enabled)rq   dynamicr	  r-   prior_compilerprior_dynamicr2   r2   r3   _enable  sD   


r  c               
   c   s`    t jjjd d\} }dazd V  W | rdat jjj| | d S | r&dat jjj| | w )NFT)r-   r  rP   r+   r  r  )r  r  r2   r2   r3   r    s$   

r  rl   c                   C   sH   da trJ tjjjd d tjjjd  tjjj  t	
 ad S )NF)r  r  r-   r  rP   r+   r  r  clear_cache	itertoolsrW   r   r2   r2   r2   r3   reset#  s   r  c                 C   sT   | d }| ||}|d usJ || || }	||||	}
||
|
jtjdgS )Nr   )memory_format)new_empty_stridedcopy_
as_stridedcloner-   contiguous_format)rb   
base_sizesbase_stridesbase_storage_offset
view_sizesview_stridesview_storage_offsetgradrJ   offsetr5  r2   r2   r3   r.  0  s   	
r.  c                 C   sf   d gt |  }tt | D ]#}| | r0|| d u rq|dkr*|||  |||< q|| ||< q|S )Nr   )r   r   r!  )r3  rJ   r4  r5  grad_inputsr   r2   r2   r3   r2  D  s   
r2  )Fr  )^__doc__rr   r  r  r  r   collectionsr   r   typingr   r   r   r   r-   torch.utils._pytreeutils_pytreer  torch._dynamo.external_utilsr   r	   r
   torch._dynamo.sourcer   r   torch._dynamo.utilsr   r   r   r   torch._guardsr   r   r   torch._loggingr   r   torch._prims_commonr   torch._subclassesr   torch.fxr   %torch.fx.experimental._backward_stater   "torch.fx.experimental.proxy_tensorr   r   r   r    r!   r"   r#   %torch.fx.experimental.symbolic_shapesr$   r%   torch.fx.tracebackr&   r'   torch.utils._ordered_setr(   torch.utils._tracebackr)   torch.fx.proxyr*   r[   r  rj  r4   r:   r=   r?   rN   r7  r   r  r  r  rw  ry  rW   r   rj   rk   r  r  r  contextmanagerr  r  r  r.  r2  r2   r2   r2   r3   <module>   s~   $	


	        C)

