from collections.abc import Sequence
from typing import cast

import torch
import torch.distributed.tensor._api as dtensor
from torch._prims_common import ShapeType
from torch.distributed.device_mesh import DeviceMesh
from torch.distributed.tensor._dtensor_spec import DTensorSpec
from torch.distributed.tensor.placement_types import (
    _StridedShard,
    Partial,
    Placement,
    Replicate,
    Shard,
)


def compute_local_shape_and_global_offset(
    global_shape: ShapeType, mesh: DeviceMesh, placements: Sequence[Placement]
) -> tuple[tuple[int, ...], tuple[int, ...]]:
    """
    Compute the local tensor shape and the global offsets into the original tensor
    of a DTensor on its current global rank. This is useful for checkpointing purposes.

    Example (2 hosts with 4 GPUs each):
    # Below is a DeviceMesh with mesh_shape of (2, 4)
    mesh = DeviceMesh(device_type="cuda",
                        mesh=[
                        [0, 1, 2, 3],
                        [4, 5, 6, 7]
                        ],
    )

    Let's say we distribute a global_tensor of shape (8,4) over the above DeviceMesh
    with placements of [Shard(0), Shard(0)].
    The local shape and global offset will be as follows:
    rank0 -- local_shape:[1, 4], global_offset:[0, 0]
    rank1 -- local_shape:[1, 4], global_offset:[1, 0]
    rank2 -- local_shape:[1, 4], global_offset:[2, 0]
    rank3 -- local_shape:[1, 4], global_offset:[3, 0]
    rank4 -- local_shape:[1, 4], global_offset:[4, 0]
    rank5 -- local_shape:[1, 4], global_offset:[5, 0]
    rank6 -- local_shape:[1, 4], global_offset:[6, 0]
    rank7 -- local_shape:[1, 4], global_offset:[7, 0]

    Let's say we distribute a global_tensor of shape (2,) over the above DeviceMesh
    with placements of [Shard(0)]. Not all ranks will have a non-empty local tensor.
    The local shape and global offset will be as follows:
    rank0 -- local_shape:[1,], global_offset:[0,]
    rank1 -- local_shape:[1,], global_offset:[1,]
    rank2 -- local_shape:[0,], global_offset:[2,]
    rank3 -- local_shape:[0,], global_offset:[2,]
    rank4 -- local_shape:[0,], global_offset:[2,]
    rank5 -- local_shape:[0,], global_offset:[2,]
    rank6 -- local_shape:[0,], global_offset:[2,]
    rank7 -- local_shape:[0,], global_offset:[2,]
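
    As a minimal usage sketch (illustrative; ``full_tensor`` is a hypothetical
    unsharded tensor with the global shape), the returned pair describes the
    slice of the global tensor owned by this rank:

        local_shape, global_offset = compute_local_shape_and_global_offset(
            full_tensor.shape, mesh, placements
        )
        local_shard = full_tensor[
            tuple(slice(o, o + s) for s, o in zip(local_shape, global_offset))
        ]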
    """
    my_coordinate = mesh.get_coordinate()

    if my_coordinate is None:
        # if this rank is not participating in the mesh, return empty shape/offset
        return ((), ())

    local_shape = list(global_shape)
    global_offset = [0] * len(global_shape)
    # stride of the shard index contributed by each mesh dim, indexed as
    # [tensor_dim][mesh_dim]; only used for strided sharding below
    shard_idx_stride_by_mesh_dim = [
        [0] * mesh.ndim for _ in range(len(global_shape))
    ]
    num_shards_by_tensor_dim = [1] * len(global_shape)

    for idx, placement in enumerate(placements):
        mesh_dim_size = mesh.size(idx)
        if isinstance(placement, Shard):
            shard_dim = placement.dim
            local_offset = [0] * len(global_shape)
            assert shard_dim < len(local_shape), (
                f"Sharding dim {shard_dim} greater than tensor ndim {len(local_shape)}"
            )
            shard_size, shard_offset = placement._local_shard_size_on_dim(
                local_shape[shard_dim],
                mesh_dim_size,
                my_coordinate[idx],
                return_offset=True,
            )

            local_shape[shard_dim] = shard_size
            local_offset[shard_dim] = shard_offset

            # if this tensor dim was already sharded by an earlier placement, the new
            # offset is relative to the previous shard, so accumulate instead of
            # overwriting
            if global_offset[shard_dim] <= local_offset[shard_dim]:
                global_offset[shard_dim] = local_offset[shard_dim]
            else:
                global_offset[shard_dim] += local_offset[shard_dim]

            num_shards_by_tensor_dim[shard_dim] *= mesh_dim_size

    # NOTE: the offset computation above assumes plain Shard placements. With
    # _StridedShard, the shard index on a tensor dim is recomputed from the mesh
    # coordinate using the split_factor encoded in the placement.
    strided_sharding = any(isinstance(p, _StridedShard) for p in placements)
    if strided_sharding:
        strided_part_seen = [False] * len(global_shape)
        strided_part_end = [False] * len(global_shape)
        for idx, placement in enumerate(placements):
            mesh_dim_size = mesh.size(idx)
            if isinstance(placement, Shard):
                shard_dim = placement.dim
                if strided_part_end[shard_dim]:
                    raise NotImplementedError(
                        f"Strided sharding does not allow Shard() to appear after "
                        f"the strided part has ended. {placement} at idx {idx} in "
                        f"{placements} violates this assumption."
                    )

                if strided_part_seen[shard_dim]:
                    strided_part_end[shard_dim] = True

                if isinstance(placement, _StridedShard):
                    strided_part_seen[shard_dim] = True
                    shard_idx_stride_by_mesh_dim[shard_dim][idx] = (
                        num_shards_by_tensor_dim[shard_dim]
                        // (placement.split_factor * mesh_dim_size)
                    )
                else:
                    num_shards_by_tensor_dim[shard_dim] //= mesh_dim_size
                    shard_idx_stride_by_mesh_dim[shard_dim][idx] = (
                        num_shards_by_tensor_dim[shard_dim]
                    )

        shard_idx = [
            sum([x * y for x, y in zip(shard_idx_stride, my_coordinate)])
            for shard_dim, shard_idx_stride in enumerate(shard_idx_stride_by_mesh_dim)
        ]
        global_offset = [x * y for x, y in zip(local_shape, shard_idx)]

    return tuple(local_shape), tuple(global_offset)








def compute_global_tensor_info(
    tensor: torch.Tensor, mesh: DeviceMesh, placements: Sequence[Placement]
) -> tuple[list[int], list[int]]:
    """
    Compute the global size and stride of a DTensor from the given local tensor.
    The local size is multiplied by `world_size` per sharding dim.
    The local stride is multiplied by `world_size` per sharding dim, as long as the
    dimension is outside the sharding dim.

    For example, if we have a local tensor with size (4, 8, 2) and stride (16, 1, 8).
    If the DTensor placements are [Shard(2)] and world_size is 2;
    then the global size is (4, 8, 4) and stride is (16 * 2, 1, 8).
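    Worked detail for the example above (illustrative): dim 2 is sharded and its
    local stride is 8, so every other dim whose stride is at least 8 is scaled by
    the mesh dim size 2 (16 -> 32), while strides 1 and 8 are left untouched,
    giving a global stride of (32, 1, 8).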

    Args:
        tensor (:class:`torch.Tensor`):
            Local tensor which DTensor will be constructed from.
        mesh (:class:`DeviceMesh`):
            Object which describes the mesh topology
            of devices for the DTensor.
        placements (Sequence[:class:`Placement`]):
            The attribute of the DTensor that describes its layout
            on the mesh topology.

    Return:
        tensor_shape: A list of ints which specifies the size of the DTensor built
            on top of the local tensor.
        tensor_stride: A list of ints which specifies the stride of the DTensor.
    """
    tensor_shape = list(tensor.size())
    tensor_stride = list(tensor.stride())
    for idx, placement in enumerate(placements):
        mesh_dim_size = mesh.size(idx)
        if placement.is_shard():
            shard_placement = cast(Shard, placement)
            if shard_placement.dim < 0:
                raise AssertionError(
                    "Shard placements should have negative dims normalized in "
                    f"the user-facing APIs: {shard_placement}"
                )
            shard_dim = shard_placement.dim

            assert shard_dim < tensor.ndim, (
                f"Sharding dim {shard_dim} greater than tensor ndim {tensor.ndim} "
                f"for placement number {idx}."
            )

            local_dim_size = tensor_shape[shard_dim]
            tensor_shape[shard_dim] = local_dim_size * mesh_dim_size

            # recover the tensor stride by rescaling every stride that is at least
            # as large as the stride of the sharded dim
            for i in range(len(tensor_stride)):
                if i != shard_dim and tensor_stride[i] >= tensor_stride[shard_dim]:
                    # rescale the stride by the shard size
                    tensor_stride[i] = tensor_stride[i] * mesh_dim_size
        elif not isinstance(placement, (Replicate, Partial)):
            raise RuntimeError(f"placement type {type(placement)} not supported!")
    return tensor_shape, tensor_stride


def try_find_mesh_from_args(
    op_call: torch._ops.OpOverload, args: Sequence[object]
) -> DeviceMesh:
    """
    Find the device mesh object from args.
    It raises a ValueError if no mesh is found.
    NOTE: we can optimize this search if needed
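
    Illustrative call sketch (``dt_a`` is a hypothetical DTensor argument); the
    search also looks one level into list/tuple arguments:

        try_find_mesh_from_args(op_call, [dt_a, 2.0])    # -> dt_a.device_mesh
        try_find_mesh_from_args(op_call, ([dt_a], 2.0))  # -> dt_a.device_mesh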
    """
    for arg in args:
        if isinstance(arg, (dtensor.DTensor, DTensorSpec)):
            return arg.device_mesh
        elif (
            isinstance(arg, (list, tuple))
            and len(arg) > 0
            and isinstance(arg[0], (dtensor.DTensor, DTensorSpec))
        ):
            return arg[0].device_mesh

    raise ValueError(f"Cannot find device mesh from args for op : {op_call}.")


def compute_local_stride(
    global_stride: ShapeType, mesh: DeviceMesh, placements: Sequence[Placement]
) -> tuple[int, ...]:
    """
    Compute the stride of a local tensor shard, given the global stride of the DTensor.
    NOTE: Currently this function is assuming the DTensor is evenly shardable.
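
    Illustrative example (mirrors compute_global_tensor_info above): with a global
    stride of (32, 1, 8), a single mesh dimension of size 2, and placements
    [Shard(2)], only strides strictly larger than the sharded dim's stride (8) are
    divided, so the local stride is (16, 1, 8).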
    """
    stride_divisors = [1] * len(global_stride)
    for mesh_idx, p in enumerate(placements):
        if p.is_shard():
            i = cast(Shard, p).dim
            # tensor dim i is sharded on mesh dim mesh_idx, so every stride larger
            # than stride[i] shrinks by that mesh dim's size
            for j in range(len(global_stride)):
                if global_stride[j] > global_stride[i]:
                    stride_divisors[j] *= mesh.size(mesh_idx)
    return tuple(
        global_stride[i] // stride_divisors[i] for i in range(len(global_stride))
    )


def normalize_to_torch_size(size) -> torch.Size:
    """
    Unify variable types of size argument to torch.Size
    Acceptable types include:
        int, Sequence[int], Tuple[int], Tuple[Sequence[int]],
        or torch.Size
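
    Illustrative examples (behavior follows from the rules above):

        normalize_to_torch_size(4)          # torch.Size([4])
        normalize_to_torch_size([2, 3])     # torch.Size([2, 3])
        normalize_to_torch_size(((2, 3),))  # torch.Size([2, 3])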
    """
    if isinstance(size, torch.Size):
        return size

    if isinstance(size, int):
        torch_size = [size]
    elif len(size) == 1 and isinstance(size[0], Sequence):
        torch_size = list(size[0])
    else:
        torch_size = list(size)
    return torch.Size(torch_size)