o
    Ih.$                     @   sH  d Z ddlZddlmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZ dd Zdejj fddZ!dejj dee" fddZ#dejj dee" fddZ$de%fddZ&de'ee"  fddZ(dd Z)G dd dZ*ede* d  d$d"d#Z+dS )%a  
This module implements CUDA graphs support for TorchDynamo backends.

CUDA graphs allow for capturing and replaying GPU operations, which can significantly
reduce CPU overhead in GPU-accelerated PyTorch models. This module provides:

- CUDA graph creation and management for both forward and backward passes
- Input mutation detection and handling
- Device compatibility checking
- Stack trace management for debugging
- Integration with TorchInductor's cudagraph trees

The backend supports two main modes:
1. cudagraphs: Full CUDA graph support with both forward and backward pass optimization
2. cudagraphs_inner: Lower-level CUDA graph implementation used for benchmarking

Key components:
- CudagraphsBackend: Main backend class for CUDA graph integration
- Mutation detection utilities to ensure graph safety
- Device mapping and compatibility checks
- Stack trace collection for debugging
    N)defaultdict)Optional)config)aot_autograd)	boxed_nop)BoxedDeviceIndex'check_multiple_devices_or_any_cpu_nodesformat_default_skip_messageget_mutation_stack_traceget_placeholder_info#log_cudagraph_skip_and_bump_counter)	BoxedBoolcount_tangents%get_first_incompatible_cudagraph_nodenum_fw_fixed_argumentsoutput_node)StorageWeakRef   )register_backendc                 C   s  dd }t t}d}t }| jD ]q}|jdkr3t||jtjr.|t||j	  
| |d7 }q|jdkrt|jds?q|jj}t|jD ]8\}}|t|jk rY|j| }	n|j|jvr`qH|j|j }	d}
|jrq|jjrqd	}
|
r||t||	j	  O }qHq|S )
Nc                 S   s   d| v r| d S | d S )Nvalfake_result )metar   r   U/var/www/vscode/kcb/lib/python3.10/site-packages/torch/_dynamo/backends/cudagraphs.pymeta_fk7   s   z%find_input_mutations.<locals>.meta_fkr   placeholderr   call_function_schemaFT)r   setnodesop
isinstancer   torchTensorr   _typed_storageaddhasattrtargetr   	enumerate	argumentslenargsnamekwargs
alias_infois_write)gr   inputs	input_idxmutated_inputsnschemaiargargumentmut_argr   r   r   find_input_mutations6   s:   



r:   gmc                 C   sD   i }| j jD ]}|jdd }t|tjr|j|vr|||j< q|S )Nr   )graphr   r   getr!   r"   r#   device)r;   device_node_mappingr4   tr   r   r   get_device_node_mapping]   s   
rA   	aot_modelreturnc                 C   s2   t | jtt| }|sd S t| j}t||S N)r:   r<   r   ranger   r
   )rB   	num_fixedmutation_indicesplaceholdersr   r   r   3check_for_mutation_ignore_cuda_graph_managed_tensorf   s
   

rI   c                 C   sN   t jst| | }r|S tt|  }r|S t|  }r%td|j dS d S )Nzincompatible op ())r   (cudagraph_backend_support_input_mutationrI   r   rA   r   r	   r,   )rB   rF   mut_skipskipnoder   r   r   check_for_skipq   s   rO   c                 C   s$   t tt| }|jdksJ |jS )Ncuda)nextiterrA   typeindex)r;   r>   r   r   r   get_device_index   s   rU   c                 C   s.   t | }t|jdksJ dd |jd D S )Nr   c                 S   s&   g | ]}t |tjjjr|jnd qS rD   )r!   r"   fxrN   Nodestack_trace).0r7   r   r   r   
<listcomp>   s    z$get_stack_traces.<locals>.<listcomp>r   )r   r*   r+   )r;   outputr   r   r   get_stack_traces   s
   r\   c                    sj   ddl m tdtd  d fdd	} fdd}t||tj|dd	tjj	j
d
}|| S )Nr   )cudagraphify_implTFc                    s   t | |}ttt|}t| | }r#t td|  |S  t|  ||t	| j
ddt| t| jt| jd	}d|_|S )Nzskipping cudagraphs due to Fdevice_indexis_backwardis_inferencestack_tracesrH   mutated_input_idxsT)r   r   r*   rO   r   disabler   r   rU   rE   valuer\   r   r<   r:   _boxed_call)rB   
aot_inputsra   interpfixedskip_msgoutboxed_device_indexr]   do_cudagraphsdynamo_inputsr   r   forward_cudagraphs   s,   

z&cudagraphs.<locals>.forward_cudagraphsc                    s   t  |}s	 S t }t | }r5td| tjjjjddd us)J  fdd}d|_	|S ||t
|t ddt t jt jd	}d|_	|S )Nzskipping cudagraphs due to %sF)create_if_none_existsc                    s       | S rD   )set_to_running_backward)r1   rB   managerr   r   fn   s   z3cudagraphs.<locals>.backward_cudagraphs.<locals>.fnTr^   )r   r   rO   r   r"   	_inductorcudagraph_treesget_managerre   rf   rE   rU   r\   r   r<   r:   )rB   rg   rh   ri   rj   ru   rk   )rm   r]   rn   rs   r   backward_cudagraphs   s8   
z'cudagraphs.<locals>.backward_cudagraphs)ra   )fw_compilerbw_compilerinference_compilerkeep_inference_input_mutations)F)torch._inductor.cudagraph_treesr]   r   r   r   	functoolspartialr"   _dynamor   %cudagraph_backend_keep_input_mutation)dynamo_modelro   rp   ry   aot_cudagraphsr   rl   r   
cudagraphs   s   &
r   c                   @   s(   e Zd ZdZedd Zedd ZdS )CudagraphsBackendr   c                  C   s   ddl m}  |   d S )Nr   reset_cudagraph_trees)r~   r   r   r   r   r   reset   s   
zCudagraphsBackend.resetc                 C   s
   t | |S rD   )r   )modelr1   r   r   r   __call__   s   
zCudagraphsBackend.__call__N)__name__
__module____qualname__compiler_namestaticmethodr   r   r   r   r   r   r      s    
r   )r,   compiler_fnTc                    s  t |ttfs	J  rdd |D nt|tj  tj }|tj  tj	| | |  W d   n1 s>w   Y  |  tj | tj  tj
 tjj|d |  W d   n1 spw   Y  t ttfsf fdd}|S )zBThis isn't registered as a backend, but is used in some benchmarksc                 S   s   g | ]}t |qS r   )r"   
zeros_likerY   xr   r   r   rZ      s    z$cudagraphs_inner.<locals>.<listcomp>N)streamc                     sT   t t | ks
J  rt| D ]	\}}|| q  r(dd D S S )Nc                 S   s   g | ]}|  qS r   )cloner   r   r   r   rZ     s    z1cudagraphs_inner.<locals>.run.<locals>.<listcomp>)r*   zipcopy_replay)
new_inputsdstsrccopy_inputscopy_outputsr<   static_inputsstatic_outputsr   r   run  s   zcudagraphs_inner.<locals>.run)r!   listtupler"   rP   synchronizeStreamwait_streamcurrent_streamr   	CUDAGraphr<   )r   r1   r   r   r   r   r   r   r   cudagraphs_inner   s*   





r   )TT),__doc__r   collectionsr   typingr   r"   torch._dynamor   torch._dynamo.backends.commonr    torch._dynamo.backends.debuggingr   torch._inductor.cudagraph_utilsr   r   r	   r
   r   r   torch._inductor.utilsr   r   r   r   r    torch.multiprocessing.reductionsr   registryr   r:   rV   GraphModulerA   strrI   rO   intrU   r   r\   r   r   r   r   r   r   r   <module>   s4    '	
	N