import contextlib
from typing import cast, Optional

import torch
import torch._prims_common as utils
import torch.distributed._functional_collectives as funcol
import torch.distributed.distributed_c10d as c10d
from torch import Tensor
from torch.distributed.device_mesh import DeviceMesh
from torch.distributed.tensor import DTensor, Replicate, Shard
from torch.distributed.tensor._dtensor_spec import DTensorSpec, TensorMeta
from torch.distributed.tensor._ops._embedding_ops import _MaskPartial
from torch.distributed.tensor._ops._math_ops import (
    _skip_dim,
    Reduction,
    replicate_reduction_dims,
)
from torch.distributed.tensor.placement_types import Placement

aten = torch.ops.aten

__all__ = ["loss_parallel"]


@contextlib.contextmanager
def loss_parallel():
    """
    A context manager that enables loss parallelism, where efficient parallelized loss computation
    can be performed when the input is sharded on the class dimension. Currently only the cross-entropy
    loss is supported.

    Within this context manager, one can use :func:`~torch.nn.functional.cross_entropy` or
    :class:`~torch.nn.CrossEntropyLoss` as usual, with the following assumptions on the input parameters.
    The corresponding ``backward()`` call, if any, also needs to happen under this context manager.

    Args:
        input (:class:`DTensor`):
            Input logits. Assumed to be sharded on the class dimension.
        target (Union[:class:`torch.Tensor`, :class:`DTensor`]):
            Must be ground truth class indices (class probabilities currently not supported).
            Assumed to be replicated across the ``DeviceMesh``.
        weight (Union[:class:`torch.Tensor`, :class:`DTensor`], optional):
            If given, assumed to be replicated across the ``DeviceMesh``.
        label_smoothing:
            Currently not supported.

    Returns:
        A replicated :class:`DTensor`.

    Example:
        A sharded DTensor is manually created here to showcase the usage.
        In practice, it is usually the output of a TP module.

        >>> # xdoctest: +SKIP("distributed")
        >>> from torch.distributed.tensor.parallel import loss_parallel
        >>> from torch.distributed.device_mesh import init_device_mesh
        >>> ...
        >>> device_mesh = init_device_mesh("cuda", (8,))
        >>> input = torch.randn(4, 16, device="cuda", requires_grad=True)
        >>> dist_input = distribute_tensor(input, device_mesh, placements=[Shard(1)])
        >>> target = torch.randint(16, (4,), device="cuda")
        >>> with loss_parallel():
        >>>     loss = F.cross_entropy(dist_input, target, reduction="mean")
        >>>     loss.backward()
        >>> ...
    N)_enable_custom_loss_ops_disable_custom_loss_ops r   r   Z/var/www/vscode/kcb/lib/python3.10/site-packages/torch/distributed/tensor/parallel/loss.pyr      s   *

placements.dimreturnc                 C   s6   t | dks
td| d |std| ddS )N   zLCurrently loss_parallel() only supports input on one-dimensional DeviceMesh.r   zUloss_parallel() should be enabled only when the input tensor is sharded on dimension .)len
ValueErroris_shard)r   r   r   r   r   _find_all_reduce_mesh_dimP   s   
r   meshc                 C   s`   t | tr| j|kr| S td| d| j dt | tjr'tj| ||ddS tdt|  )Nz	Expected z	 but got r   F)device_meshr   	run_checkzUnsupported type )	
isinstancer   r   RuntimeErrortorchr   
from_local	TypeErrortype)tensorr   r   r   r   r   _cast_to_dtensor\   s   

r(   op_callargskwargsc                 C   sT   t j| ||}t jj|j}t|tr|S t|tr |d S t	dt
| d)Nr   zUnexpected tensor meta type: r   )r   _op_dispatcherunwrap_to_op_infosharding_propagator_propagate_tensor_metaschemar!   r
   tupler"   r&   )r)   r*   r+   op_infotensor_metar   r   r   r/   l   s   

r/   c                 C   s   |r
| j tjks
J tj| tjjd\}}| j|tjd} | 	 dkr&| }ntj
| |dd}tj|tjjj||fd}| | }tjt||dd}	tj|	tjjj||fd}	t|	}
||
 }|sg||}|S )N)type_promotion_kind)dtypememory_formatr   T)keepdim)reduceOpgroup)r5   r#   halfutilselementwise_dtypesELEMENTWISE_TYPE_PROMOTION_KINDDEFAULTtocontiguous_formatnumelamaxfuncol
all_reducec10dReduceOpMAXnamesumexpSUMlog)xr   half_to_floatr   mesh_dimcomputation_dtyperesult_dtypeshiftedx_maxshifted_sumexpshifted_logsumexpresultr   r   r   _log_softmax   s,   


rW   c                 C   s~   t t|d }t t|d }t t|d }|j}t|j|}t| ||}t|j	|||j
|}	t|j
|j|d}
t|	|
|	jdS )Nr   r      r3   requires_grad)r   r   intbool_specr   r   r/   rW   _local_tensorr   r	   r[   )r)   r*   r+   rM   r   rN   specrO   output_tensor_metaresres_specr   r   r   _log_softmax_handler   s"   rd   c                 C   s(   t t|d }t tj|d }||S )Nr      )r   r   r#   r5   r?   )r)   r*   r+   grad_outputinput_dtyper   r   r   _log_softmax_backward_handler   s   
rh   rM   targetweightlocal_weight	reductionignore_indexinput_shapechannel_dimrO   c
                    s  |   d dk rd dtdtf fdd}
|d ur.|
|}|d us&J |
|}| | } t||k|d}| }t| d}||||	}t|  |}||||	}|	  }t||k|d}|t
jjkrzdkrz| d	d
}||fS |d urt| j}d| < ||}t| |	 }t||k|d}| }n	||k | }|t
jjkr| }||fS |t
jjkr| | }||fS )Nr   rX   r   rj   r   c                    s6   dkrdg }| j d | < | |}|S | }|S )Nr   r   )shapeview)rj   rp   wro   n_dimsr   r   _weight_view   s   
z'_nll_loss_forward.<locals>._weight_viewoffset_shape
offset_dimr   g        )r   r   r#   where	unsqueezer   _partition_valuegather_reduce_valuesqueezer   NONEvaluenew_fulllistrp   expandrI   r?   rK   MEAN)rM   ri   rj   rk   rl   rm   rn   ro   r   rO   ru   rr   local_wsafe_targetsafe_target_partial_placementsafe_target_partial_result_partialresult_reducedrV   total_weight	new_shapewsumr   rs   r   _nll_loss_forward   sJ   



r   c                    s  t t|d }|d }|d }t t|d }t t|d }| dkr%dnd}|j}	t|	j| tt|	j|g|}
t	 f|	j
j }t||
|	j
}d }|d urxt|||	j
} fddt|	j
jD }||	j
|j}|jd |jj| ksxJ |tjjkr|
}n|}t|}|||d< |d< t| t||}t|j|j|d ur|jnd ||||j||	j
 
\}}t|	j
||d}t|||jd	|fS )
Nr   r   rX   re      c                    s"   g | ]}| krt d nt qS )r   )r   r   ).0irO   r   r   
<listcomp>)  s    z-_nll_loss_forward_handler.<locals>.<listcomp>rY   rZ   )r   r   r\   r   r^   r   r   r   r   r   r   ndimr(   rangeredistributer_   rp   r   r   r   r   r/   r1   r   r	   r[   )r)   r*   r+   rM   ri   rj   rl   rm   ro   r`   target_placementsall_replicate_placementsrk   sharded_placementsoutput_placementsra   rV   r   out_specr   r   r   _nll_loss_forward_handler  s^   

r   rf   r   c                 C   s  |  dk rdnd}|tjjkr| | } ||}t||k|d}t|}t||d}|	|
 }|||	|
}|jjd usCJ |jj|jd }tj|jd |jd}|  dkrc|||< n.|  dkrp||||f< n!||d}|j}|d|j| }||||f< |||d}|  |     krdkrn n| |} |d urdd	 t|  D }|jd ||< ||}t|j}d||< ||}t|||}| | } t||k| d} |t| |  S )
NrX   r   r   rv   g      ?)devicery   c                 S   s   g | ]}d qS )r   r   )r   _r   r   r   r     s    z6_nll_loss_and_log_softmax_backward.<locals>.<listcomp>)r   r   r   r   r{   r#   rz   
zeros_liker   r   flattenr|   mask_bufferdatar?   r5   arangerp   r   	transposereshaperq   r   r   r   r}   rJ   )rf   rM   ri   rj   rl   rm   r   rn   ro   r   rO   r   
grad_inputr   masked_safe_targetgrad_update	arange_1dgrad_input_tintermidate_shapegrad_input_2dr   rr   w_targetr   r   r   "_nll_loss_and_log_softmax_backwardX  sH   


 



r   c                 C   sV  t t|d }t t|d }|d }|d }t t|d }t t|d }t t|d }	| dkr3dnd}
|j}t|j|
}tt	|j|
g|
}t
 f|jj }t|||j}|d urbt|||j}t|}|||d< |d< t|	||j|d< t| t||}t|j|j|j|d ur|jnd |||	|j|
|j|}t|j|j|d}t|||jd	S )
Nr   r   rX   re   r         rY   rZ   )r   r   r\   r   r   r^   r   r   r   r   r   r   r   r(   r   r/   r1   r   r_   rp   r	   r[   )r)   r*   r+   rf   rM   ri   rj   rl   rm   r   ro   r`   rO   r   r   ra   rV   r   r   r   r   _nll_loss_backward_handler  sX   r   c                   C   s   t jjt d S N)r   r,   _custom_op_handlersupdatecustomized_loss_opsr   r   r   r   r     s   r   c                  C   s   t D ]	} tjj|  qd S r   )r   r   r,   r   pop)	custom_opr   r   r   r     s   r   )A