o
    0hM                     @   s   U d dl Z d dlmZ d dlmZ d dlZd dlmZ d dlm	Z	 d dl
mZ G dd de	ZG d	d
 d
e	ZG dd de	ZG dd de	Zdaeeeej   ed< dejfddZdS )    N)chain)Optional_get_device_index)Function)commc                   @   $   e Zd Zedd Zedd ZdS )	Broadcastc                    s   t dd |D sJ ddd |D }|| _t|dkrdS t|| _|d  | _t|| j}g }t| j	dd  D ]\ }|sO|
 fd	d|D  q=| j|  tt|S )
Nc                 s       | ]	}|j jd kV  qdS cpuNdevicetype.0i r   P/var/www/vscode/kcb/lib/python3.10/site-packages/torch/nn/parallel/_functions.py	<genexpr>       
z$Broadcast.forward.<locals>.<genexpr>z2Broadcast function not implemented for CPU tensorsc                 S      g | ]}t |d qS Tr   r   xr   r   r   
<listcomp>       z%Broadcast.forward.<locals>.<listcomp>r   r      c                 3   s    | ]}|  V  qd S Nr   )r   outputidxr   r   r          )alltarget_gpuslen
num_inputs
get_deviceinput_devicer   broadcast_coalesced	enumerateneeds_input_gradextendmark_non_differentiabletupler   from_iterable)ctxr$   inputsoutputsnon_differentiablesinput_requires_gradr   r    r   forward   s&   


zBroadcast.forwardc                 G   s   dt j| j| jg|R   S )Nr   )ReduceAddCoalescedapplyr(   r&   r0   grad_outputsr   r   r   backward   s
   
zBroadcast.backwardN__name__
__module____qualname__staticmethodr5   r:   r   r   r   r   r	      s
    
r	   c                   @   r   )r6   c                    sL    fddt dt D | _ fddt dt D }t||S )Nc                    s   g | ]} |   qS r   r'   r   )gradsr   r   r   )   s    z.ReduceAddCoalesced.forward.<locals>.<listcomp>r   c                    s   g | ]
} ||  qS r   r   r   rA   r&   r   r   r   -   s    )ranger%   r$   r   reduce_add_coalesced)r0   destinationr&   rA   grads_r   rB   r   r5   '   s
   
 zReduceAddCoalesced.forwardc                 G   s   dt j| jg|R   S )NNN)r	   r7   r$   r8   r   r   r   r:   0   s   zReduceAddCoalesced.backwardNr;   r   r   r   r   r6   &   s
    
r6   c                   @   r   )Gatherc                    s   t dd |D sJ d|dkrd _nt|d}| _| _tdd |D  _t dd |D rI|dkrItd	d |D }td
 d _nd _t fdd|D  _	t
| j jS )Nc                 s   r
   r   r   r   r   r   r   r   ;   r   z!Gather.forward.<locals>.<genexpr>z/Gather function not implemented for CPU tensorsr   Tc                 s   s    | ]}|  V  qd S r   r@   r   r   r   r   r   D   r"   c                 s   s    | ]	}|  d kV  qdS r   N)dimr   tr   r   r   r   E       r   c                 s   s    | ]}| d V  qdS )r   N)viewrK   r   r   r   r   F   s    zvWas asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.Fc                 3   s    | ]	}|  jV  qd S r   )sizerJ   r   r0   r   r   r   O   rM   )r#   target_devicer   rJ   r.   
input_gpuswarningswarnunsqueezed_scalarinput_sizesr   gather)r0   rQ   rJ   r1   r   rP   r   r5   9   s(   

zGather.forwardc                 C   s6   t | j| j| j|}| jrtdd |D }d| S )Nc                 s   s    | ]}|d  V  qdS rI   r   )r   gr   r   r   r   X   r"   z"Gather.backward.<locals>.<genexpr>rG   )Scatterr7   rR   rV   rJ   rU   r.   )r0   grad_outputscattered_gradsr   r   r   r:   R   s   zGather.backwardNr;   r   r   r   r   rH   8   s
    
rH   c                   @   r   )rY   c           
   	   C   s   dd |D }|| _ |jjdkr| nd| _d }tj r*| jdkr*dd |D }t	|||| j |}|d urjt
|D ]-\}}tj||  tj }	|	||  ||	 W d    n1 sdw   Y  q<|S )Nc                 S   r   r   r   r   r   r   r   r   _   r   z#Scatter.forward.<locals>.<listcomp>r   c                 S   s   g | ]
}t td |qS )cuda)_get_streamtorchr   )r   r   r   r   r   r   e   s    )rJ   r   r   r'   r(   r_   r]   is_availabler   scatterr*   current_streamwait_streamrecord_stream)
r0   r$   chunk_sizesrJ   inputstreamsr2   r   r   main_streamr   r   r   r5   ]   s$   
zScatter.forwardc                 G   s    d d d t j| j| jg|R  fS r   )rH   r7   r(   rJ   )r0   rZ   r   r   r   r:   r   s    zScatter.backwardNr;   r   r   r   r   rY   \   s
    
rY   _streamsr   c                 C   sh   | j dkrdS tt| j d}|du rdS tdu rdg|  at| j du r/|| jt| j< t| j S )zBGet a background stream for copying between CPU and target device.r   N)r   getattrr_   ri   device_countindexStream)r   
device_modr   r   r   r^   {   s   

r^   )rS   	itertoolsr   typingr   r_   torch._utilsr   torch.autogradr   torch.nn.parallelr   r	   r6   rH   rY   ri   listrm   __annotations__r   r^   r   r   r   r   <module>   s   
 $