o
    Ih                      @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZmZ d dlmZ d dlmZmZmZmZ d dlmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d d	lm Z  d d
l!m"Z" d dl#m$Z$m%Z%m&Z&m'Z'm(Z( d dl)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/ erd dl0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 ddl8m9Z9 ddl:m;Z; ddl8m<Z< ddl=m>Z> ddl?m@Z@ dZAdaBe-eCdZDeEeCZFG dd dZGG dd dZHG d d! d!eIZJejKdDd$d%ZLejMG d&d' d'ZNejMG d(d) d)ZOeO ZPee"jQe"jRf ZSejMG d*d+ d+ZTejMG d,d- d-ZUG d.d/ d/eUZVG d0d1 d1ZWG d2d3 d3ZXG d4d5 d5eUZYG d6d7 d7eWeYZZG d8d9 d9eXeYZ[G d:d; d;eWeUZ\G d<d= d=eXeUZ]dEdBdCZ^dS )F    )annotationsN)IterableSequence)ThreadPoolExecutor)byrefc_size_tc_void_pCDLL)AnyCallableOptionalTYPE_CHECKINGUnion)multiprocessing)get_interface_for_device)rand_strided)ir)CppCodeCacheCUDACodeCache
DLLWrapperget_hashPyCodeCache)get_gpu_typeis_gpu)getArtifactLogger)
OrderedSet)BaseProcess)Queue)
ModuleType)TritonTemplateCaller   )WorkspaceArg)config)WorkspaceZeroMode)benchmarker)VCUDA_VISIBLE_DEVICESF
autotuningc                   @     e Zd ZdS )PingN__name__
__module____qualname__ r.   r.   T/var/www/vscode/kcb/lib/python3.10/site-packages/torch/_inductor/autotune_process.pyr)   ;       r)   c                   @  r(   )PongNr*   r.   r.   r.   r/   r1   ?   r0   r1   c                   @  r(   )!NonzeroWorkspaceNotSupportedErrorNr*   r.   r.   r.   r/   r2   C   r0   r2   deviceOptional[int]c              	   c  sx    | du r
dV  dS t jt}t| t jt< zdV  W |du r&t jt= dS |t jt< dS |du r6t jt= w |t jt< w )z
    Context manager to set the CUDA_VISIBLE_DEVICES environment variable to the
    specified single device. If device is None, don't manipulate the environment.
    N)osenvirongetr&   str)r3   currentr.   r.   r/   set_cuda_visible_deviceG   s   r:   c                   @  s   e Zd ZU dZdZded< dZded< dZded< dZded	< e	d(ddZ
e	d(ddZd)ddZd*ddZd*ddZd+ddZ	d,d-dd Zd*d!d"Zd*d#d$Zd.d*d&d'ZdS )/TuningProcessz
    Abstraction for launching a helper process to benchmark kernels. Spawns
    the parent process and uses multiprocessing queues to send benchmark
    requests and return results.
    Nr4   r3   zOptional[BaseProcess]processzOptional[Queue[Any]]request_queueresponse_queue
Queue[Any]returnNonec                 C  sF   t dtjt z	t| | W dS  ty"   t 	d Y dS w )z4
        Entry point for the child process.
        z2Entering TuningProcess child. Visible devices = %szException in TuningProcessN)
autotuning_logdebugr5   r6   r7   r&   r;   workloop	Exception	exception)r=   r>   r.   r.   r/   process_maini   s   
zTuningProcess.process_mainc                 C  s\   	 |   }|du rdS t|tr|t  nt|tr$||  n	tdt| q)z<
        Work loop for the benchmarking subprocess.
        TNzInvalid request type )	r7   
isinstancer)   putr1   BenchmarkRequest	benchmarkRuntimeErrortype)r=   r>   objr.   r.   r/   rD   z   s   

zTuningProcess.workloopboolc                 C  s   | j duo| jduo| jduS )z?
        True if the sub-process has been initialized.
        Nr<   r=   r>   selfr.   r.   r/   valid   s
   
zTuningProcess.validc                 C  s   d | _  | _| _dS )z2
        Reset to an uninitialized state.
        NrP   rQ   r.   r.   r/   clear   s   zTuningProcess.clearc                 C  s   |   rdS td}| | _| | _|j| j| j| jfd| _| jdus)J t	| j
 | j  W d   dS 1 s?w   Y  dS )z
        Create child process, request/response queues, and do the warm up.
        Set the environment to make only the provided GPU device visible
        to the process.
        Nspawn)targetargs)rS   r   get_contextr   r=   r>   ProcessrG   r<   r:   r3   start)rR   ctxr.   r.   r/   
initialize   s   


"zTuningProcess.initializerN   r
   c                 C  s&   |    | jdusJ | j| dS )z8
        Push a work item to the child process.
        N)r\   r=   rI   )rR   rN   r.   r.   r/   rI      s   zTuningProcess.put      ^@      @      ?c                 C  s   | j dusJ | jdusJ 	 z@|}d}|durB|dkrB|d8 }z	| jjdd}W n tjy9   | j  s7 Y nw |durB|dks|du rM| jj|d}|W S  tjyl   | j j}|du rg| j||d  |    w )a,  
        Get a response from the child process. Raises queue.Empty on timeout
        or if the process dies.

        This method is (so far) only used by TuningProcessPool, where torch._inductor.config entries are being used
        to populate the timeouts:

        Arguments:

            @param result_timeout: Timeout in seconds, defaults to 120.0 or to
                                   config.max_autotune_subproc_result_timeout_seconds when called by TuningProcessPool
            @param graceful_timeout: Timeout in seconds to allow graceful shutdown (SIGTERM is sent after this time).
                                    Defaults to 3.0 or to config.max_autotune_subproc_graceful_timeout_seconds
            @param terminate_timeout: Timeout in seconds after SIGTERM, until we send SIGKILL if the process
                                      remains alive. Defaults to 1.0 or to
                                      config.max_autotune_subproc_terminate_timeout_seconds.
        Returns:
            A response from the child process (Any type)
        NTr_   g      ?timeout)graceful_timeoutterminate_timeout)	r<   r>   r7   queueEmptyis_aliveexitcodekillrT   )rR   result_timeoutrb   rc   remaining_timeoutresstatusr.   r.   r/   r7      s>   
zTuningProcess.getc                 C  s8   |   r| jdusJ | jdusJ | jd dS dS )z8
        Signal the child process to terminate.
        N)rS   r<   r=   rI   rQ   r.   r.   r/   	terminate   s
   zTuningProcess.terminatec                 C  s$   | j dur| j   |   dS dS )z5
        Wait for the child process to exit.
        N)r<   joinrT   rQ   r.   r.   r/   wait   s   

zTuningProcess.wait      @c                 C  s   | j d urA|   | j j|d | j  r;td| j j | j   | j j|d | j  r;td| j j | j   | 	  d S d S )Nr`   z&Sending SIGTERM to process with PID %dz&Sending SIGKILL to process with PID %d)
r<   rm   rn   rf   rB   warningpiderrorrh   rT   )rR   rb   rc   r.   r.   r/   rh     s$   




zTuningProcess.kill)r=   r?   r>   r?   r@   rA   )r@   rO   r@   rA   )rN   r
   r@   rA   )r]   r^   r_   )r@   r
   )rp   r_   )r+   r,   r-   __doc__r3   __annotations__r<   r=   r>   staticmethodrG   rD   rS   rT   r\   rI   r7   rm   ro   rh   r.   r.   r.   r/   r;   \   s&   
 






3
	r;   c                   @  s\   e Zd ZU dZdZded< dZded< dd	d
ZdddZdddZ	dddZ
dddZdS )TuningProcessPoolz
    Maintains a pool of TuningProcesses to benchmark kernels in parallel
    across devices. By default, we create one TuningProcess per device and
    set the sub-process environment to make only that device visible.
    Nz$Optional[queue.Queue[TuningProcess]]	processeszOptional[ThreadPoolExecutor]executorr@   rA   c                 C  s   | j du | jdu ksJ | j durdS |  }td| t | _ |D ]}t|d}|  |	t
  | j 	| q$| j jD ]}t|jddtsMJ q@tt|d| _tsfdaddl}|| j dS dS )z,
        Start the child processes.
        Nz$Sub-process autotune device list: %s)r3   )ri   )max_workersTr   )ry   rz   get_device_listlogrC   rd   r   r;   r\   rI   r)   rH   r7   r1   r   lenEXIT_HANDLER_REGISTEREDatexitregisterrm   )rR   devicesr3   pr   r.   r.   r/   r\   $  s&   


zTuningProcessPool.initializeSequence[Optional[int]]c                 C  sf   t jsdgS t }t|}| }ttjv r-dd tjt dD }t	||ks+J |S t
t|S )zD
        Gather the list of devices to be used in the pool.
        Nc                 S  s   g | ]}t |qS r.   )int).0dr.   r.   r/   
<listcomp>W      z5TuningProcessPool.get_device_list.<locals>.<listcomp>,)r"   autotune_multi_devicer   r   device_countr&   r5   r6   splitr~   listrange)rR   gpu_typedevice_interfacecountr   r.   r.   r/   r|   I  s   
z!TuningProcessPool.get_device_listc                 C  s^   | j dur| j   d| _ | jdur-| jjD ]}|  q| jjD ]}|  q!d| _dS dS )z:
        Signal all child processes to terminate.
        N)rz   shutdownry   rd   rm   ro   )rR   r   r.   r.   r/   rm   ]  s   





zTuningProcessPool.terminatechoicer   floatc              	   C  s   |j dusJ | jdusJ | j }||j  z1z|tjtjtjW W | j| S  tj	yJ   t
d| d td Y W | j| S w | j| w )z
        Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
        remove it from the queue, execute the benchmark in that subprocess, and return
        the TuningProcess to the queue.
        NzFailed to benchmark choice 'z['. It will be ignored. Please debug the root cause in case the choice can bring perf gains.inf)bmreqry   r7   rI   r"   +max_autotune_subproc_result_timeout_seconds-max_autotune_subproc_graceful_timeout_seconds.max_autotune_subproc_terminate_timeout_secondsrd   re   warningswarnr   )rR   r   r<   r.   r.   r/   rV   l  s&   

zTuningProcessPool.targetchoiceslist[TritonTemplateCaller]!dict[TritonTemplateCaller, float]c                 C  sP   | j dus	J d| jdusJ i }t|| j| j|D ]\}}|||< q|S )z>
        Benchmark each choice in a separate process.
        Nz&Tuning process pool is not initialized)ry   rz   zipmaprV   )rR   r   resultsr   resultr.   r.   r/   rK     s   
zTuningProcessPool.benchmarkrt   )r@   r   )r   r   r@   r   r   r   r@   r   )r+   r,   r-   ru   ry   rv   rz   r\   r|   rm   rV   rK   r.   r.   r.   r/   rx     s   
 

%

rx   c                   @  sZ   e Zd ZU ded< ded< ded< ded< d	ed
< dZded< edddZdddZdS )
TensorMetaztorch.devicer3   ztorch.dtypedtypeztorch._prims_common.ShapeTypesizesztorch._prims_common.StrideTypestridesr   offsetNzOptional[str]nameirnodes/Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]]r@   #Union[TensorMeta, list[TensorMeta]]c              	     s   t |tr fdd|D }tdd |D sJ |S |}t |tjr*tjd|d}| }|d us4J | }|d us>J t||t	j
jj| tjdt	j
jj| tjdt	j
jj| jtjd| dS )	Nc                   s   g | ]}  |qS r.   )from_irnodesr   xclsr.   r/   r     s    z+TensorMeta.from_irnodes.<locals>.<listcomp>c                 s  s    | ]}t |tV  qd S N)rH   r   r   r.   r.   r/   	<genexpr>  s    z*TensorMeta.from_irnodes.<locals>.<genexpr>fake)r   layout)fallback)r3   r   r   r   r   r   )rH   r   allr   LayoutBuffer	get_dtype
get_devicer   r%   graphsizevars
size_hintsget_sizer"   unbacked_symint_fallback
get_stride	size_hint
get_layoutr   get_name)r   r   r   noder   r3   r.   r   r/   r     s8   
zTensorMeta.from_irnodestorch.Tensorc                 C  s   t | j| j| j| j| jdS )N)r3   r   
extra_size)r   r   r   r3   r   r   rQ   r.   r.   r/   	to_tensor  s   zTensorMeta.to_tensor)r   r   r@   r   )r@   r   )r+   r,   r-   rv   r   classmethodr   r   r.   r.   r.   r/   r     s   
 #r   c                   @  sN   e Zd ZdZdddZdddZdddZddd ddZddd ddZdS )!rJ   a1  
    Only handle triton template benchmark for now. The extern kernel benchmark
    can be done inside the same process since they usually don't cause crash.

    Important: Instances of this class and subclasses have to be serializable
    across process boundaries. Do not put CUDA Tensors in here!
    kernel_namer8   input_tensor_metar   output_tensor_meta
extra_argsIterable[Any]r@   rA   c                   sh   || _ t|tr|g}|| _t ttfr,t dkr(t fdd D s(J  d   | _|| _	d S )Nr    c                 3  s2    | ]}d D ]}t  d |t ||kV  qqdS ))r3   r   r   r   r   r   N)getattr)r   r   attrr   r.   r/   r     s    z,BenchmarkRequest.__init__.<locals>.<genexpr>r   )
r   rH   r   r   tupler   r~   r   r   r   )rR   r   r   r   r   r.   r   r/   __init__  s   

zBenchmarkRequest.__init__input_tensorsr   output_tensorCallable[[], None]c                G     t r   NotImplementedErrorrR   r   r   r.   r.   r/   make_run_fn  s   zBenchmarkRequest.make_run_fnc                 C  s   d S r   r.   rQ   r.   r.   r/   cleanup_run_fn  s   zBenchmarkRequest.cleanup_run_fnNr   Optional[torch.Tensor]r   c                G  r   r   r   rR   fnr   r   r.   r.   r/   do_bench  s   zBenchmarkRequest.do_benchc          
      G  s   t tj}|rt }|d u r't|dksJ tdd | jD }| j	 }|r3t | }t }z
| j
|d|i}W n tyO   t d td Y S w |r\t | }t }| j|g||R  }|rzt | }	t dt| |||	 |   |S )Nr   c                 s  s    | ]}|  V  qd S r   )r   r   r.   r.   r/   r     s    z-BenchmarkRequest.benchmark.<locals>.<genexpr>r   z0Skipping op due to nonzero workspace requirementr   z6InChildProcess %s: load %f, create tensor %f, bench %f)rB   isEnabledForloggingDEBUGtimer~   r   r   r   r   r   r2   infor   r   rC   r8   r   )
rR   r   r   rC   start_tscreate_tensor_elapser   load_elapseoutbench_elapser.   r.   r/   rK     s>   

zBenchmarkRequest.benchmark)
r   r8   r   r   r   r   r   r   r@   rA   r   r   r   r   r@   r   rt   r   r   r   r   r@   r   )	r+   r,   r-   ru   r   r   r   r   rK   r.   r.   r.   r/   rJ     s    


rJ   c                   @  s,   e Zd ZdZddddZdd	dddZdS )TestBenchmarkRequestz
    Supports unit testing. Defined in this file so that the TuningProcess
    sub-process knows how to unpickle these objects.
    NvalueOptional[float]r@   rA   c                 C  s
   || _ d S r   )r   )rR   r   r.   r.   r/   r   ?  s   
zTestBenchmarkRequest.__init__r   r   r   r   r   r   c                G  s   | j d u r	td| j S )NzFailed to run)r   rE   r   r.   r.   r/   rK   B  s   
zTestBenchmarkRequest.benchmarkr   )r   r   r@   rA   r   )r+   r,   r-   ru   r   rK   r.   r.   r.   r/   r   9  s
    r   c                   @     e Zd Zdddd	d
ZdS )GPUDeviceBenchmarkMixinNr   r   r   r   r   r@   r   c          	      G  s   t dd g ||D }t|dksJ d| tdd |D d}t|}t|dkr5tt|}n| }|| t|}|	  W d    |S 1 sSw   Y  |S )Nc                 s  s<    | ]}t |tjrt|jjr|jjd ur|jjV  qd S r   )rH   torchTensorr   r3   rM   indexr   tensorr.   r.   r/   r   Q  s    

z3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>r    zCan not mix devices c                 s  s$    | ]}t |jjr|jjV  qd S r   )r   r3   rM   r   r.   r.   r/   r   Z  s    

cuda)
r   r~   nextr   itercurrent_devicer3   r$   benchmark_gpusynchronize)	rR   r   r   r   device_idx_setdevice_typer   
device_idxr   r.   r.   r/   r   K  s*   



z GPUDeviceBenchmarkMixin.do_benchr   r+   r,   r-   r   r.   r.   r.   r/   r   J      r   c                   @  r   )CPUDeviceBenchmarkMixinNr   r   r   r   r   r@   r   c                G  s
   t |S r   )r$   benchmark_cpur   r.   r.   r/   r   n     
z CPUDeviceBenchmarkMixin.do_benchr   r  r.   r.   r.   r/   r  m  r  r  c                      sD   e Zd Z				d"d# fddZd$ddZdd Zd%d d!Z  ZS )&TritonBenchmarkRequestr   Nr   r8   r   r   r   r   r   module_pathmodule_cache_key
num_stagesr   	num_warpsmatrix_instr_nonkdimwaves_per_eukpackworkspace_argOptional[WorkspaceArg]r@   rA   c                   sF   t  |||| || _|| _|| _|| _|	| _|
| _|| _|| _	d S r   )
superr   r  r	  r
  r  r  r  r  r  )rR   r   r   r   r   r  r	  r
  r  r  r  r  r  	__class__r.   r/   r   z  s   
zTritonBenchmarkRequest.__init__r   r   r   r   c                  s0  t | j| j}td| j| j t|| jjt	| j
 dj_i dd l}d|jv r3dd< jjdkr<dnjj}t|}|| jjj| jd urb| j fdd}|S tt|| jtjjjjrtjg R i diS tjg R i d	d
S )Nz"benchmark module key: %s, path: %sFr   warmupcpuc                    s`   j } tj| fdtjjd}jtjkr|  g | R i dd d S )N)r    r   r3   Tstreambenchmark_run)	r   r   empty_strideduint8r3   	zero_moder#   UNINITIALIZEDzero_)workspace_sizeworkspace_tensorr   r   r   
run_methodr  
warmup_argr  r.   r/   run_with_workspace  s.   z>TritonBenchmarkRequest.make_run_fn.<locals>.run_with_workspacer  Tr  )r   load_by_key_pathr	  r  rB   rC   r   r   runr   r   __self__with_bandwidth_infoinspect	signature
parametersr3   rM   r   get_raw_streamr   r   r  rH   r   	_inductorruntimetriton_heuristicsDebugAutotuner	functoolspartial)rR   r   r   modr)  r   r   r$  r.   r!  r/   r     sn   



	
z"TritonBenchmarkRequest.make_run_fnc                 C  s$   t | j| j}t|| j  d S r   )r   r%  r	  r  r   r   
precompile)rR   r3  r.   r.   r/   r4    s   z!TritonBenchmarkRequest.precompilec                 C     d| j d| jd| jS )Nself.kernel_name=z, self.module_path=z, self.module_cache_key=)r   r  r	  rQ   r.   r.   r/   __str__     zTritonBenchmarkRequest.__str__)r   r   r   N)r   r8   r   r   r   r   r   r   r  r8   r	  r8   r
  r   r  r   r  r   r  r   r  r   r  r  r@   rA   r   r@   r8   )r+   r,   r-   r   r   r4  r7  __classcell__r.   r.   r  r/   r  w  s    
Tr  c                   @  r(   )TritonGPUBenchmarkRequestNr*   r.   r.   r.   r/   r;    r0   r;  c                   @  r(   )TritonCPUBenchmarkRequestNr*   r.   r.   r.   r/   r<    r0   r<  c                      sV   e Zd Zd fddZdd ZdddZdddZdd ZdddZd ddZ	  Z
S )!CUDABenchmarkRequestr   r8   r   r   r   r   r   source_coder@   rA   c                   sV   t  |||| || _d| _d | _d | _d| _d| _d| _t	
| jd\| _| _d S )Nr   F so)r  r   r>  r  	workspaceDLL_workspace_size_updatedhash_keysource_filer   writerR   r   r   r   r   r>  r  r.   r/   r     s   zCUDABenchmarkRequest.__init__c                 C  s*   t d|  t| jd t d|  d S )NPrecompiling %sr@  Done precompiling %s)rB   rC   r   compiler>  rQ   r.   r.   r/   r4    s   zCUDABenchmarkRequest.precompiler   r   r   r   c             	   G  s   |    |   dd t||g D }td| j| j| j| j|| j	 t
tj j}t| j| j}t
d}| jdkrStj| jd d tj|jd| _t
| j }tj|g|| j	d ||R  S )Nc                 S  s   g | ]}t | qS r.   )r   data_ptrr   r.   r.   r/   r     s    
z4CUDABenchmarkRequest.make_run_fn.<locals>.<listcomp>zqmake_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sr         r  )ensure_dll_loadedupdate_workspace_sizer   rB   rC   r   rE  rD  rB  r   r   r   r   current_streamcuda_streamr   r  zerosfloat64r3   rA  rK  r1  r2  )rR   r   r   rW   
stream_ptrr"  workspace_ptrr.   r.   r/   r     sJ   	
z CUDABenchmarkRequest.make_run_fnc              
   C  s   | j rd S |   tdd | jD }dd t|d D }ttj j	}t
| j| j}t }|g || jt|d |R   tj  |j| _td| j| j| j| j| j|| j d| _ d S )Nc                 S  s   h | ]}|j qS r.   )r   )r   metar.   r.   r/   	<setcomp>@  s    z=CUDABenchmarkRequest.update_workspace_size.<locals>.<setcomp>c                 S  s   g | ]}t d qS r   )r   )r   _r.   r.   r/   r   B  r   z>CUDABenchmarkRequest.update_workspace_size.<locals>.<listcomp>r    zupdate_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sT)rC  rN  r~   r   r   r   r   r   rP  rQ  r   rB  r   r   r   r   r   r   r  rB   rC   rE  rD  )rR   unique_input_countrW   rT  r"  c_workspace_sizer.   r.   r/   rO  ;  sH   
	

z*CUDABenchmarkRequest.update_workspace_sizec                 C  s,   | j d u rt| jd\| _ | _| _d S d S )Nr@  )rB  r   loadr>  rD  rE  rQ   r.   r.   r/   rN  _  s
   
z&CUDABenchmarkRequest.ensure_dll_loadedc                 C  s   | j d ur
| j   d | _d S r   )rB  closerA  rQ   r.   r.   r/   r   e  s   


z#CUDABenchmarkRequest.cleanup_run_fnc                 C  r5  )Nr6  z, self.source_file=z, self.hash_key=)r   rE  rD  rQ   r.   r.   r/   r7  j  r8  zCUDABenchmarkRequest.__str__r   r8   r   r   r   r   r   r   r>  r8   r@   rA   r   rt   r9  )r+   r,   r-   r   r4  r   rO  rN  r   r7  r:  r.   r.   r  r/   r=    s    

'$
r=  c                      sD   e Zd Zd fddZdd ZdddZdddZdddZ  ZS )CppBenchmarkRequestr   r8   r   r   r   r   r   r>  r@   rA   c                   s,   t  |||| || _t|| _d | _d S r   )r  r   r>  r   rD  rB  rG  r  r.   r/   r   r  s   

zCppBenchmarkRequest.__init__c                 C  s,   t d|  tj| jdd t d|  d S )NrH  r  r   rI  )rB   rC   r   r[  r>  rQ   r.   r.   r/   r4    s   zCppBenchmarkRequest.precompiler   r   r   r   c                G  s   t j| jdd| _dd t||g D }td| j| j|| j t	| j| j}t
dd | jD s4J tjgt|tt| j  |_tj|g|| jR  S )Nr  r_  c                 S  s   g | ]}|  qS r.   )rK  r   r.   r.   r/   r     r   z3CppBenchmarkRequest.make_run_fn.<locals>.<listcomp>zJmake_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%sc                 s  s    | ]	}t |tjV  qd S r   )rH   ctypesc_ulonglong)r   argr.   r.   r/   r     s    z2CppBenchmarkRequest.make_run_fn.<locals>.<genexpr>)r   r[  r>  rB  r   rB   rC   r   r   r   r   r`  ra  r~   argtypesr1  r2  )rR   r   r   rW   r"  r.   r.   r/   r     s*   zCppBenchmarkRequest.make_run_fnc                 C  s.   | j d ur	 t| j dr| j   d S d S d S )Nr\  )rB  hasattrr\  rQ   r.   r.   r/   r     s   
z"CppBenchmarkRequest.cleanup_run_fnc                 C  s   d| j S )Nr6  )r   rQ   r.   r.   r/   r7    s   zCppBenchmarkRequest.__str__r]  r   rt   r9  )	r+   r,   r-   r   r4  r   r   r7  r:  r.   r.   r  r/   r^  n  s    

r^  r   r   r@   r   c                 C  s
   t | S )zO
    Do benchmarking in a subprocess and return the perf number (latency).
    )tuning_poolrK   )r   r.   r.   r/   benchmark_in_sub_process  r  rf  )r3   r4   r   )_
__future__r   
contextlibr`  dataclassesr1  r   r5   rd   r   r   collections.abcr   r   concurrent.futuresr   r   r   r   r	   typingr
   r   r   r   r   r   torch._inductor.async_compiler   torch._dynamo.device_interfacer   torch._dynamo.testingr   torch._inductorr   torch._inductor.codecacher   r   r   r   r   torch._inductor.utilsr   r   torch._loggingr   torch.utils._ordered_setr   multiprocessing.processr   multiprocessing.queuesr   typesr    torch._inductor.select_algorithmr   codegen.commonr!   r?  r"   r#   runtime.benchmarkingr$   virtualizedr%   r&   r   r+   rB   	getLoggerr}   r)   r1   rE   r2   contextmanagerr:   	dataclassr;   rx   re  r   r   LayoutOrBufferr   rJ   r   r   r  r  r;  r<  r=  r^  rf  r.   r.   r.   r/   <module>   s   

 = 6`#
xw?