import functools
import math
from enum import IntEnum

import sympy

import torch
from . import ir
from .utils import get_dtype_size, sympy_product
from .virtualized import V


class NCCL_COLL(IntEnum):
    ALL_REDUCE = 0
    ALL_GATHER = 1
    REDUCE_SCATTER = 2


class NVIDIA_GPU_TYPE(IntEnum):
    VOLTA = 0
    AMPERE = 1
    HOPPER = 2


@functools.lru_cache
def get_gpu_type() -> NVIDIA_GPU_TYPE:
    gpu_info = torch.utils.collect_env.get_gpu_info(torch.utils.collect_env.run) or ""
    if "V100" in gpu_info:
        return NVIDIA_GPU_TYPE.VOLTA
    elif "A100" in gpu_info:
        return NVIDIA_GPU_TYPE.AMPERE
    elif "H100" in gpu_info:
        return NVIDIA_GPU_TYPE.HOPPER
    else:
        # Fall back to Ampere for unrecognized GPU types.
        return NVIDIA_GPU_TYPE.AMPERE


def get_collective_type(node: ir.IRNode) -> NCCL_COLL:
    if not isinstance(node, ir._CollectiveKernel):
        raise ValueError(f"node is not a collective kernel: {node}")

    kernel_name = node.python_kernel_name
    assert kernel_name is not None
    if "all_reduce" in kernel_name:
        return NCCL_COLL.ALL_REDUCE
    elif "all_gather" in kernel_name:
        return NCCL_COLL.ALL_GATHER
    elif "reduce_scatter" in kernel_name:
        return NCCL_COLL.REDUCE_SCATTER
    else:
        raise ValueError(f"Unsupported collective kernel: {kernel_name}")


def get_collective_input_size_bytes(node: ir.IRNode) -> int:
    sz_bytes = 0
    for inp in node.inputs:
        numel = sympy_product(inp.layout.size)
        if isinstance(numel, sympy.Integer):
            numel = int(numel)
        else:
            numel = V.graph.sizevars.size_hint(numel, fallback=0)
        sz_bytes += numel * get_dtype_size(inp.layout.dtype)
    return sz_bytes


def get_collective_group_size(node: ir.IRNode) -> int:
    if type(node) == ir._CollectiveKernel:
        from torch.distributed.distributed_c10d import _get_group_size_by_name

        return _get_group_size_by_name(node.constant_args[-1])
    else:
        raise TypeError(f"Unsupported collective type: {node}")


# The enums and tables below mirror the NCCL tuning model in
# https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc.


class NCCL_HW(IntEnum):
    NVLINK = 0
    PCI = 1
    NET = 2


class NCCL_ALGO(IntEnum):
    TREE = 0
    RING = 1


class NCCL_PROTO(IntEnum):
    LL = 0  # Low-latency protocol; LL128 and Simple are not modeled.


# Base latencies in us, indexed as [NCCL_ALGO][NCCL_PROTO].
baseLat = [
    [6.8],  # Tree (LL)
    [6.6],  # Ring (LL)
]

# Hardware latencies in us, indexed as [NCCL_HW][NCCL_ALGO][NCCL_PROTO].
hwLat = [
    [[0.6], [0.6]],  # NVLINK: Tree, Ring
    [[1.0], [1.0]],  # PCI:    Tree, Ring
    [[5.0], [2.7]],  # NET:    Tree, Ring
]

# Max LL bandwidth per channel (GB/s), indexed as [GPU generation][node-count bucket].
llMaxBws = [
    [39.0, 39.0, 20.4],  # Volta-N1 / Intel-N2 / Intel-N4
    [87.7, 22.5, 19.0],  # Ampere-N1 / AMD-N2 / AMD-N4
    [87.7, 22.5, 19.0],  # Hopper-N1 / AMD-N2 / AMD-N4
]


def estimate_nccl_collective_runtime(node: ir.IRNode) -> float:
    """
    Returns estimated NCCL collective runtime in nanoseconds (ns).

    The following heuristics are copied from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc.
    We aim to estimate the runtime as accurately as possible.

    Assumptions:
    - only ring algorithm (NCCL_ALGO_RING) is used
    - only Low-Latency protocol (NCCL_PROTO_LL) is used, i.e. neither Simple nor LL128 is used
    - 8 gpus per node  # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
    - collective is one of: allreduce, reducescatter, allgather
    """
    tensor_storage_size_bytes = get_collective_input_size_bytes(node)
    # Convert bytes to GB.
    tensor_storage_size_GB = tensor_storage_size_bytes / 1024 / 1024 / 1024

    # Assume 8 GPUs per node; when more than one node is used, each node is assumed
    # to contribute all 8 GPUs.
    num_gpus_per_node = 8
    group_size = get_collective_group_size(node)
    nNodes = math.ceil(group_size / num_gpus_per_node)
    nRanks = group_size  # total number of GPUs participating in this collective
    if nRanks <= 1:
        return 0

    # Assume ring algorithm with the low-latency protocol.
    nccl_algo = NCCL_ALGO.RING
    nccl_proto = NCCL_PROTO.LL
    coll = get_collective_type(node)

    # =============== bandwidth computation ===============
    # Compute bandwidth in GB/s, then convert to GB/ns at the end.
    bwIntra = torch._inductor.config.intra_node_bw
    bwInter = torch._inductor.config.inter_node_bw

    compCapIndex = get_gpu_type()
    index2 = nNodes - 1 if nNodes <= 2 else 2
    # Single node: index by GPU generation; multi-node: use the first (CPU) row.
    index1 = compCapIndex if nNodes == 1 else 0
    llMaxBw = llMaxBws[index1][index2]

    # Each ring step is synchronized and bottlenecked by the slowest link, which is
    # the inter-node interconnect whenever more than one node participates.
    bw = bwIntra if nNodes == 1 else bwInter
    nChannels = 2  # assume 2 channels
    busBw = nChannels * bw
    busBw = min(
        llMaxBw,
        busBw
        * (1.0 / 4.0 if (nNodes > 1 or coll == NCCL_COLL.ALL_REDUCE) else 1.0 / 3.0),
    )

    if coll == NCCL_COLL.ALL_REDUCE:
        nsteps = 2 * (nRanks - 1)
    elif coll in (NCCL_COLL.REDUCE_SCATTER, NCCL_COLL.ALL_GATHER):
        nsteps = nRanks - 1

    # Convert bus bandwidth to algorithm bandwidth
    # (tensor bytes / algorithm bandwidth = execution time).
    ratio = (1.0 * nRanks) / nsteps
    bandwidth = busBw * ratio
    # Convert GB/s to GB/ns.
    bandwidth_GB_per_ns = bandwidth / 1e9

    # =============== latency computation ===============
    # Compute latency in us, then convert to ns at the end.
    intraHw = NCCL_HW.NVLINK

    if coll == NCCL_COLL.ALL_REDUCE:
        nInterSteps = 2 * nNodes if nNodes > 1 else 0
    elif coll in (NCCL_COLL.REDUCE_SCATTER, NCCL_COLL.ALL_GATHER):
        nInterSteps = nNodes - 1

    latency = baseLat[nccl_algo][nccl_proto]
    intraLat = hwLat[intraHw][nccl_algo][nccl_proto]
    interLat = hwLat[NCCL_HW.NET][nccl_algo][nccl_proto]

    # Inter-node rings still pay the network launch overhead on every step.
    netOverhead = 0.0
    if nNodes > 1:
        netOverhead = 1.0
    intraLat = max(intraLat, netOverhead)
    latency += (nsteps - nInterSteps) * intraLat + nInterSteps * interLat
    # Convert us to ns.
    latency_ns = latency * 1e3

    # =============== final result ===============
    transport_ns = tensor_storage_size_GB / bandwidth_GB_per_ns
    return transport_ns + latency_ns
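

# --------------------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original module): the real entry point,
# estimate_nccl_collective_runtime(), needs a scheduled inductor collective IR node,
# which is awkward to build by hand. The helper below re-applies the same ring/LL
# all_reduce arithmetic to plain numbers so the heuristic can be sanity-checked in
# isolation. The bandwidth figures are assumptions, not values read from
# torch._inductor.config, and the function name is illustrative only.
def _demo_ring_allreduce_estimate_ns(
    tensor_bytes: int,
    group_size: int,
    num_gpus_per_node: int = 8,
    bw_intra_GBps: float = 300.0,  # assumed intra-node bus bandwidth (GB/s)
    bw_inter_GBps: float = 25.0,  # assumed inter-node bandwidth (GB/s)
) -> float:
    nNodes = math.ceil(group_size / num_gpus_per_node)
    nRanks = group_size
    if nRanks <= 1:
        return 0.0
    # Row 0 of llMaxBws (Volta single-node / multi-node CPU row) is assumed here.
    llMaxBw = llMaxBws[0][nNodes - 1 if nNodes <= 2 else 2]
    bw = bw_intra_GBps if nNodes == 1 else bw_inter_GBps
    busBw = min(llMaxBw, 2 * bw * (1.0 / 4.0))  # all_reduce always takes the 1/4 factor
    nsteps = 2 * (nRanks - 1)
    bandwidth_GB_per_ns = busBw * (1.0 * nRanks / nsteps) / 1e9
    nInterSteps = 2 * nNodes if nNodes > 1 else 0
    intraLat = max(
        hwLat[NCCL_HW.NVLINK][NCCL_ALGO.RING][NCCL_PROTO.LL],
        1.0 if nNodes > 1 else 0.0,  # network launch overhead
    )
    interLat = hwLat[NCCL_HW.NET][NCCL_ALGO.RING][NCCL_PROTO.LL]
    latency_us = (
        baseLat[NCCL_ALGO.RING][NCCL_PROTO.LL]
        + (nsteps - nInterSteps) * intraLat
        + nInterSteps * interLat
    )
    transport_ns = (tensor_bytes / 1024 / 1024 / 1024) / bandwidth_GB_per_ns
    return transport_ns + latency_us * 1e3


# Example (not executed at import): a 1 GiB all_reduce across 2 nodes x 8 GPUs under
# the assumed bandwidths above:
#   _demo_ring_allreduce_estimate_ns(1 << 30, group_size=16)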